In [1]:
import os
import string
import re
from nltk.stem import *
import nltk
import math

In [2]:
# Global variables

# --- Paths -------------------------------------------------------------
# Directory that readFileOnly() prefixes to an article's file name.
folder_path = 'articles_all'
# File opened by the "read tokenised data" cell.
file_name = "collated_data.txt"
# File opened by the "load inverted index" cell.
inverted_index_file = "inverted_index.txt"

# --- Document store and index ------------------------------------------
# fileContents records; appended to by read_data() and by the loader cell.
file_contents_list = list()
# term -> list of [doc number, count] pairs, filled by the index loader cell.
inverted_index = dict()
# Token -> count for the current query, filled by query_to_vec().
query_to_vec_dict = dict()

# --- Remaining state -----------------------------------------------------
# NOTE(review): none of the names below is referenced by the code visible in
# this part of the notebook (only in commented-out lines); confirm before
# removing them.
file_name_number_mapping_dict = dict()
number_file_name_mapping_dict = dict()
inverted_index_intermediate = dict()
num_docs = 1
idf_dict = dict()
tf_dict = dict()
file_to_vec_dict = dict()

In [3]:
class fileContents:
    """Plain record describing one article.

    Instances are created empty and filled in attribute by attribute by the
    code that loads or parses the articles.
    """

    # Class-level defaults; kept so that reading an attribute that was never
    # assigned on an instance still yields the same value as before.
    file_name = ""        # name of the article file
    file_number = 0       # integer id used to look the article up
    contents_raw = ""     # article text before tokenisation
    contents_tokens = []  # processed tokens of the article

    def __init__(self):
        # A list defined only on the class is shared by every instance, so an
        # in-place append on one record would show up on all of them.  Give
        # each record its own list instead.
        self.contents_tokens = []

class sentenceMatch:
    """Plain record for one candidate sentence and its score against a query.

    Instances are created empty and filled in attribute by attribute.
    """

    # Class-level defaults; kept so that an attribute that was never assigned
    # on an instance still reads the same value as before.
    file_name = 0            # overwritten with the article's file name (default left as 0)
    match = 0                # score assigned by rankSentences()
    sentence_raw = ""        # sentence text as read from the article
    sentence_tokenised = []  # processed tokens of the sentence

    def __init__(self):
        # A list defined only on the class is shared by every instance; give
        # each record its own so in-place edits cannot leak between sentences.
        self.sentence_tokenised = []

In [4]:
def readFileOnly(file_num):
    """Read one article from disk and split it into rough sentences.

    Args:
        file_num: number of the article (anything ``int()`` accepts); it is
            compared against ``file_number`` of the entries in the global
            ``file_contents_list``.

    Returns:
        list[str]: the file text with newlines turned into spaces and
        ``'''``, ``''`` and commas removed, split on ``'.'``.

    Raises:
        ValueError: if no entry in ``file_contents_list`` has that number.
    """
    wanted_number = int(file_num)

    # Scan the whole list and keep the last match, as the lookup always did.
    file_name = None
    for entry in file_contents_list:
        if entry.file_number == wanted_number:
            file_name = entry.file_name

    if file_name is None:
        # Previously this fell through to an UnboundLocalError on file_name.
        raise ValueError("no file with number %r in file_contents_list" % (file_num,))

    file_path = folder_path + '/' + file_name

    with open(file_path, 'r') as f:
        data = f.read()

    # Replace line breaks with a space (as read_data does) so the last word of
    # one line is not glued to the first word of the next.
    data = data.replace("\n", " ")
    data = data.replace("'''", "").replace("''", "").replace(",", "")

    return data.split('.')

In [5]:
# check
def tokenize_and_remove_punctuations(s):
    """Strip punctuation and digits from ``s`` and tokenise what is left.

    Every character in ``string.punctuation`` and every character for which
    ``str.isdigit()`` is true is dropped; the remaining text is handed to
    ``nltk.word_tokenize`` and its result returned.
    """
    punctuation_table = str.maketrans('', '', string.punctuation)
    without_punctuation = s.translate(punctuation_table)
    without_digits = ''.join(
        filter(lambda ch: not ch.isdigit(), without_punctuation)
    )
    return nltk.word_tokenize(without_digits)

In [6]:
def get_stopwords():
    """Return the lines of ``stopwords.txt`` (current directory) as a list.

    The file is split on ``'\\n'`` exactly as before, so a trailing newline in
    the file yields a final empty-string entry.
    """
    # Use a context manager so the handle is closed promptly; the previous
    # bare open(...).read() left it to the garbage collector.
    with open('stopwords.txt', 'r') as stopword_file:
        return stopword_file.read().split('\n')

In [7]:
def parse_data(contents):
    """Return the lower-cased title and text of an article as one string.

    The input is lower-cased, then the content of the first ``<title>`` and
    ``<text>`` elements is extracted and joined as ``title + " " + text``.
    A missing element contributes an empty string; an element whose closing
    tag is missing runs to the end of the input.
    """
    contents = contents.lower()
    title = _extract_tag(contents, 'title')
    text = _extract_tag(contents, 'text')
    return title+" "+text


def _extract_tag(contents, tag):
    """Return what lies between ``<tag>`` and the next ``</tag>`` in ``contents``."""
    open_tag = '<' + tag + '>'
    close_tag = '</' + tag + '>'

    start = contents.find(open_tag)
    if start == -1:
        # Without this guard the -1 from find() was used as a real index and
        # an arbitrary slice of the document came back as the "content".
        return ""
    start += len(open_tag)

    # Search for the closing tag only after the opening one.
    end = contents.find(close_tag, start)
    if end == -1:
        return contents[start:]
    return contents[start:end]

In [8]:
def stem_words(tokens):
    """Return a list with ``PorterStemmer().stem`` applied to each token, in order."""
    stem = PorterStemmer().stem
    return list(map(stem, tokens))

In [9]:
def remove_stop_words(tokens):
    """Drop stop words and very short tokens.

    Args:
        tokens: iterable of string tokens.

    Returns:
        list: the tokens, in their original order, that are longer than two
        characters and are not listed by ``get_stopwords()``.
    """
    # A set gives constant-time membership tests; the list returned by
    # get_stopwords() was scanned once per token.
    # NOTE(review): get_stopwords() re-reads stopwords.txt on every call, and
    # some callers invoke this once per word - consider loading it once.
    stop_words = set(get_stopwords())
    return [
        token for token in tokens
        if len(token) > 2 and token not in stop_words
    ]

In [10]:
def get_vocabulary(data):
    """Return the distinct tokens found in the values of ``data``.

    Args:
        data: mapping whose values are iterables of tokens.

    Returns:
        list: each distinct token once, in order of first appearance.
    """
    # dict.fromkeys de-duplicates while preserving first-seen order.  This
    # avoids rebuilding one ever-growing list with ``tokens = tokens + ...``
    # (quadratic) and counting frequencies that were never used.
    return list(dict.fromkeys(
        token for token_list in data.values() for token in token_list
    ))

In [11]:
def preprocess_data(contents):
    """Tokenise, stop-word filter and stem every item of ``contents``.

    Returns a list with one stemmed-token list per input item, skipping items
    for which nothing survives the processing.
    """
    processed = (
        stem_words(remove_stop_words(tokenize_and_remove_punctuations(item)))
        for item in contents
    )
    return [stems for stems in processed if len(stems) != 0]

In [12]:
def read_data(path):
    """Parse every file in ``path`` into a fileContents record.

    Files are numbered from 1 in the order ``os.listdir`` yields them, and
    each record is appended to the global ``file_contents_list``.  A progress
    number is printed every 1000 files.

    Args:
        path: directory containing the article files.

    Returns:
        The global ``inverted_index``; this function does not modify it.
    """
    global file_contents_list

    curr_file_num = 1

    for filename in os.listdir(path):
        if curr_file_num%1000 == 0:
            print (curr_file_num)

        # Close each article as soon as it has been read instead of leaving
        # one open handle per file to the garbage collector.
        with open(path + '/' + filename, 'r') as article_file:
            data = parse_data(article_file.read())

        data = data.replace('\n', ' ').replace("'''", '').replace("''", '').replace(",", ' ').strip()

        file_data = fileContents()
        file_data.contents_raw = data
        # The lookups in this notebook read ``file_number``.  The number used
        # to be stored only under ``file_num``, so every record kept the
        # class default of 0.
        file_data.file_number = curr_file_num
        # Legacy alias, kept in case other code still reads ``file_num``.
        file_data.file_num = curr_file_num
        file_data.file_name = filename

        # preprocess_data receives the whitespace-separated words one by one.
        file_data.contents_tokens = preprocess_data(data.split())

        file_contents_list.append(file_data)

        curr_file_num += 1

    return  inverted_index

In [13]:
def query_to_vec(queries):
    """Record per-token counts of every query in the global ``query_to_vec_dict``.

    ``queries`` maps a key to a list of tokens.  For each list, every distinct
    token is stored with the number of times it occurs in that list; a token
    seen again in a later list overwrites the earlier count.
    """
    global query_to_vec_dict

    for token_list in queries.values():
        query_to_vec_dict.update(
            {tok: token_list.count(tok) for tok in set(token_list)}
        )

    # print(key, val)
    

In [14]:
def preprocess_queries(path):
    """Read one query per line from ``path`` and preprocess each of them.

    Every line is tokenised, stop-word filtered, stemmed and stop-word
    filtered once more.

    Args:
        path: text file with one query per line.

    Returns:
        dict: 1-based line number -> list of processed tokens.  Empty lines
        (including the one after a trailing newline) are kept and map to an
        empty list.
    """
    # Context manager so the query file is closed after reading.
    with open(path, 'r') as query_file:
        queries = query_file.read().split('\n')

    queriesDict = {}
    for i, query in enumerate(queries, start=1):
        tokens = tokenize_and_remove_punctuations(query)
        stemmed_tokens = stem_words(remove_stop_words(tokens))
        # Second pass: stemming can produce a stop word or a token that is
        # too short.
        queriesDict[i] = remove_stop_words(stemmed_tokens)
    return queriesDict

In [15]:
def processSentences(contents):
    """Preprocess each item of ``contents`` and return all tokens in one flat list.

    Every item is tokenised, stop-word filtered, stemmed and stop-word
    filtered again; the non-empty tokens that remain are collected in order.
    """
    final_tokens = []

    for fragment in contents:
        stems = stem_words(
            remove_stop_words(tokenize_and_remove_punctuations(fragment))
        )
        final_tokens.extend(
            tok for tok in remove_stop_words(stems) if len(tok) != 0
        )

    return final_tokens

In [16]:
# Read tokenised data from file.
# Each non-blank line holds four "!@#"-separated fields:
#   file name, file number, raw contents, printed token lists.
with open(file_name, 'r') as file:
    lines = file.readlines()

# Start from an empty list (emptied in place) so that re-running this cell
# replaces the loaded records instead of appending a second copy of them.
file_contents_list.clear()

for line in lines:
    if not line.strip():
        # A blank line has no fields; it used to raise IndexError below.
        continue

    content = line.split("!@#")
    file_content = fileContents()
    file_content.file_name = content[0]
    file_content.file_number = int(content[1])
    file_content.contents_raw = content[2]
    # Flatten the printed lists: merge the "'], ['" separators into commas,
    # trim the surrounding quote/bracket characters, then split.
    file_content.contents_tokens = content[3].replace("'], ['", ",").replace("\n", "").strip("'[]'").split(',')

    file_contents_list.append(file_content)
    

In [17]:
# Load inverted index

# Each line has the form "<term>\t[[doc, count], [doc, count], ...]".
with open(inverted_index_file, 'r') as file:
    lines = file.readlines()

for line in lines:
    content = line.split("\t")

    # Drop the newline and the outer brackets, then use ":" as a temporary
    # separator between the "doc, count" pairs.
    postings_text = content[1].strip('\n').strip("[]")
    inverted_index_list = [
        chunk.split(',')
        for chunk in postings_text.replace("], [", ":").split(':')
    ]

    # Convert both fields of every pair from text to int, in place.
    for pair in inverted_index_list:
        pair[0] = int(pair[0])
        pair[1] = int(pair[1].strip())

    inverted_index[content[0]] = inverted_index_list
    

In [18]:
def loadQuery(path='queries.txt'):
    """Load the queries in ``path`` and build the token -> count query vector.

    The global ``query_to_vec_dict`` is reset, filled through
    ``preprocess_queries`` and ``query_to_vec``, printed and returned.  All
    lines of the file end up in the same dictionary.

    Args:
        path: query file to read; defaults to ``'queries.txt'``, the file
            that was previously hard-coded.

    Returns:
        dict: the global ``query_to_vec_dict``.
    """
    global query_to_vec_dict
    query_to_vec_dict = {}

    queries_dict = preprocess_queries(path)
    query_to_vec(queries_dict)

    print ("Tokenised query along with frequency")
    for key, val in query_to_vec_dict.items():
        print (key, val)

    return query_to_vec_dict

In [19]:
def getIndexForQuery(query_to_vec_dict, top_k=25):
    """Score documents against the query tokens and return the best ones.

    For every query token present in the global ``inverted_index``, each
    posting ``[doc, count]`` contributes ``count / len(postings)`` to that
    document's score.  The result is printed and returned.

    Args:
        query_to_vec_dict: mapping whose keys are the query tokens.
        top_k: number of documents to keep (default 25, the previous fixed
            limit); ``None`` keeps all of them.

    Returns:
        list: ``[doc, score]`` pairs sorted by descending score.
    """
    index_for_query_dict = {}

    for tok in query_to_vec_dict:
        postings = inverted_index.get(tok)
        if not postings:
            continue

        len_doc_list = len(postings)
        for pair in postings:
            # Compute the normalised weight locally.  It used to be written
            # back into the posting, so the shared index was modified and
            # every repeated query divided the counts again.
            weight = pair[1] / len_doc_list
            index_for_query_dict[pair[0]] = index_for_query_dict.get(pair[0], 0) + weight

    index_for_query_list = [[key, val] for key, val in index_for_query_dict.items()]
    index_for_query_list.sort(key=lambda x: x[1], reverse=True)
    index_for_query_list = index_for_query_list[:top_k]

    print()
    print ("Index retrieved for query")
    print ((index_for_query_list))

    return index_for_query_list
# print (len(index_for_query[0]))

In [20]:
def extractSentences(index_for_query_list):
    """Build one sentenceMatch record per sentence of the retrieved documents.

    Args:
        index_for_query_list: sequence of entries whose first element is a
            document number (as returned by ``getIndexForQuery``).

    Returns:
        list: sentenceMatch objects with ``sentence_raw``, ``file_name`` and
        ``sentence_tokenised`` filled in; ``match`` is left at its default.
    """
    sentences_list = []

    # The loop below used to appear twice in a row, so every document was
    # read twice and every sentence was returned twice.
    for entry in index_for_query_list:
        doc_number = entry[0]
        sentences = readFileOnly(doc_number)

        # Resolve the file name once per document rather than once per
        # sentence; the last matching record wins, as before.
        doc_file_name = None
        for content in file_contents_list:
            if content.file_number == doc_number:
                doc_file_name = content.file_name

        for line in sentences:
            sentence_match = sentenceMatch()
            sentence_match.sentence_raw = line.strip()
            if doc_file_name is not None:
                sentence_match.file_name = doc_file_name

            sentence_match.sentence_tokenised = processSentences(sentence_match.sentence_raw.split())
            sentences_list.append(sentence_match)

    return sentences_list


In [21]:
def rankSentences(query_to_vec_dict, sentences_list):
    """Set ``match`` on every sentence to its score against the query.

    The score is the sum of the query counts of those query tokens that
    appear in the sentence's ``sentence_tokenised`` list; each query token is
    counted once, however often it occurs in the sentence.  Entries are
    updated in place and nothing is returned.
    """
    for candidate in sentences_list:
        candidate.match = sum(
            weight
            for tok, weight in query_to_vec_dict.items()
            if tok in candidate.sentence_tokenised
        )

    return

In [22]:
def printTopSentences(sentences_list):
    """Print the ten sentences with the highest ``match`` value.

    The input list is left untouched; sentences with equal scores keep their
    relative order.  Each sentence is printed as its file name followed by
    the raw text, then a blank line.
    """
    ranked = list(sentences_list)
    ranked.sort(key=lambda candidate: candidate.match, reverse=True)

    print ("The obtained sentences are:")
    for best in ranked[:10]:
        print (best.file_name, best.sentence_raw)
        print ()

    return


In [26]:
# End-to-end run of the retrieval pipeline defined in the cells above.
# 1. Build the token -> count query vector (loadQuery also prints it).
query_dict = loadQuery()
# 2. Score documents through the inverted index and keep the best ones.
index_for_query_list = getIndexForQuery(query_dict)
# 3. Split those documents into sentence records.
sentences = extractSentences(index_for_query_list)
# 4. Score each sentence against the query (updates the records in place).
rankSentences(query_dict, sentences)
# 5. Print the highest-scoring sentences.
printTopSentences(sentences)


Tokenised query along with frequency
ship 1
titan 1
sank 1
alien 1
collid 1

Index retrieved for query
[[34519, 0.46987881796151443], [53983, 0.41981789568345323], [39377, 0.2892165679221329], [21716, 0.23046875], [34620, 0.2136267456622937], [58419, 0.1796875], [7764, 0.1704109587249399], [11929, 0.162251375370292], [5251, 0.13651079136690647], [28765, 0.13598901098901098], [45803, 0.13165665996614473], [51678, 0.13090383588325846], [20318, 0.10480349344978165], [36465, 0.10300134892086331], [16739, 0.09894756683735981], [9869, 0.09606986899563319], [44620, 0.08733624454148471], [12116, 0.0845728711790393], [60903, 0.08100269784172662], [60681, 0.07860262008733625], [67220, 0.07778247989843419], [52225, 0.07202708421498095], [67939, 0.07175134892086331], [40743, 0.07151925518408803], [9676, 0.06593406593406594]]
The obtained sentences are:
Ship.txt At one time the steamships RMS Titanic|Titanic Olympic and Britannic were the largest ships in the world Titanic sank on her maiden voyage