In [48]:
import csv
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
import re
import pickle
import os
import math

In [49]:
# Build the English stop-word set once. The assignment rebinds the imported
# `stopwords` corpus reader to a plain set (remove_stopwords looks the name up
# as a set), so the guard keeps this cell safe to re-run: without it a second
# execution would call `.words` on the set and raise AttributeError.
if not isinstance(stopwords, set):
    stopwords = set(stopwords.words('english'))

In [50]:
# Maps term -> {document id (as str) -> occurrence count}; filled by
# add_to_inverted_index.
inverted_index = {}
# Collection size used as the numerator in get_idf.
# NOTE(review): load_and_preprocess and find_relevant_documents iterate
# range(5,100,5), which yields 19 document ids, not 20 - confirm which of the
# two is intended.
N = 20

In [51]:
def remove_stopwords(tokens):
    """Lower-case every token and drop the ones that are stop words.

    Args:
        tokens: Sequence of string tokens.

    Returns:
        A new list holding the lower-cased tokens that are not members of the
        module-level ``stopwords`` collection, in their original order.
    """
    lowered_tokens = (token.lower() for token in tokens)
    return [word for word in lowered_tokens if word not in stopwords]

In [52]:
def get_pos_tag(token):
    """Map the part-of-speech tag of a single token to a WordNet POS constant.

    The token is tagged on its own with ``nltk.pos_tag``; the first letter of
    the resulting tag selects the WordNet constant.

    Args:
        token: The word to tag.

    Returns:
        ``wordnet.NOUN``, ``wordnet.VERB``, ``wordnet.ADJ`` or ``wordnet.ADV``
        for tags starting with ``N``, ``V``, ``J`` or ``R`` respectively, and
        ``wordnet.NOUN`` for every other tag.
    """
    tag = nltk.pos_tag([token])[0][1]
    wordnet_pos_by_initial = {
        'N': wordnet.NOUN,
        'V': wordnet.VERB,
        'J': wordnet.ADJ,
        'R': wordnet.ADV,
    }
    # Tags outside the four handled families fall back to noun.
    return wordnet_pos_by_initial.get(tag[:1], wordnet.NOUN)

In [53]:
def lemmatize(tokens):
    """Replace every token with its WordNet lemma, in place.

    Each token is lemmatized with the part of speech reported by
    ``get_pos_tag`` for that token.

    Args:
        tokens: Mutable list of string tokens; its elements are overwritten.

    Returns:
        The same list object that was passed in, now holding the lemmas.
    """
    wn_lemmatizer = WordNetLemmatizer()
    for position, word in enumerate(tokens):
        tokens[position] = wn_lemmatizer.lemmatize(word, pos=str(get_pos_tag(word)))
    return tokens

In [54]:
def add_to_inverted_index(tokens,index):
    """Record one occurrence per token for a document in ``inverted_index``.

    Args:
        tokens: Iterable of terms found in the document.
        index: Document identifier; it is stored as ``str(index)``.

    Side effects:
        Updates the module-level ``inverted_index`` so that
        ``inverted_index[term][str(index)]`` holds the number of times the term
        has been added for that document.
    """
    doc_id = str(index)
    for term in tokens:
        postings = inverted_index.setdefault(term, {})
        postings[doc_id] = postings.get(doc_id, 0) + 1

In [55]:
def save(inverted_index,filename):
    """Pickle an index to ``<filename>.pkl``.

    Args:
        inverted_index: The object to serialize.
        filename: Target path without extension; ``'.pkl'`` is appended.
    """
    target_path = filename + '.pkl'
    with open(target_path, 'wb') as sink:
        pickle.dump(inverted_index, sink, protocol=pickle.HIGHEST_PROTOCOL)

In [56]:
def read():
    """Load the pickled indexes from the working directory.

    Reads ``inverted_index.pkl`` and ``bi_word_inverted_index.pkl``, publishes
    the results as the module-level ``inverted_index`` and
    ``bi_word_inverted_index``, and also returns them. Previously both objects
    were bound to local names only and discarded on return, so calling the
    function had no effect.

    Returns:
        A ``(inverted_index, bi_word_inverted_index)`` tuple.

    Raises:
        FileNotFoundError: If either pickle file is missing.
    """
    global inverted_index, bi_word_inverted_index
    # NOTE: pickle.load can execute arbitrary code; only load files that this
    # notebook produced itself.
    with open("inverted_index.pkl",'rb') as file:
        loaded_index = pickle.load(file)
    with open("bi_word_inverted_index.pkl",'rb') as file1:
        loaded_bi_word_index = pickle.load(file1)
    # Rebind the globals only after both loads succeeded, so a missing file
    # does not leave the module state half-updated.
    inverted_index = loaded_index
    bi_word_inverted_index = loaded_bi_word_index
    return inverted_index, bi_word_inverted_index

In [57]:
def load_and_preprocess():
    """Build the inverted index from the CSV splits and pickle it.

    For every ``i`` in ``range(5,100,5)`` the file ``data/data_split_<i>.csv``
    is read row by row. Each row is stripped of non-letters, tokenized, has its
    stop words removed, is lemmatized and is then added to the module-level
    ``inverted_index`` under document id ``i``. The finished index is written
    with ``save(inverted_index, "inverted_index")``.

    Raises:
        FileNotFoundError: If one of the expected CSV files is missing.
    """
    # Start from an empty index so that re-running the cell rebuilds the
    # counts instead of adding them on top of a previous run.
    inverted_index.clear()
    for i in range(5,100,5):
        # newline='' is what the csv module expects, so that line breaks inside
        # quoted fields are parsed correctly.
        with open("data/data_split_" + str(i) + ".csv", newline='') as file:
            csv_reader = csv.reader(file,delimiter=',')
            for row in csv_reader:
                # Join the fields instead of using str(row): the list repr
                # spells control characters as escapes such as "\n", and the
                # letter-only filter below would keep the "n" as part of a token.
                text = re.sub(r'[^a-zA-Z]', ' ', ' '.join(row))
                tokens = word_tokenize(text)
                tokens = remove_stopwords(tokens)
                tokens = lemmatize(tokens)
                add_to_inverted_index(tokens,i)
    save(inverted_index,"inverted_index")

In [58]:
def get_idf(df):
    """Compute the inverse document frequency for a document count.

    Args:
        df: Number of documents that contain the term.

    Returns:
        ``log10(N / df)`` using the module-level ``N``, or ``0`` when ``df``
        is zero.
    """
    if df == 0:
        return 0
    return math.log10(N / df)

In [59]:
def get_tf(documents):
    """Return the log-scaled total frequency of a term across its postings.

    The counts of all documents in ``documents`` are summed and the result is
    scaled as ``1 + log10(total)``.

    Args:
        documents: Mapping of document id to the term's count in that document.

    Returns:
        ``1 + log10(total)`` when the summed count is non-zero, otherwise ``0``
        (this includes an empty mapping).
    """
    # The original loop read from the misspelled name `docuemnts`, which raised
    # NameError for every non-empty postings dict.
    total = sum(documents.values())
    if total != 0:
        return 1 + math.log10(total)
    return 0

In [60]:
def get_cosine_score(query_vec,doc_vecs):
    """Compute the cosine similarity between a query and each document vector.

    Args:
        query_vec: Sequence of numeric query weights.
        doc_vecs: Mapping of document id to a weight sequence; each sequence is
            expected to have the same length as ``query_vec``.

    Returns:
        Dict mapping every key of ``doc_vecs`` to
        ``dot(query, doc) / (|query| * |doc|)``. A score of ``0.0`` is returned
        when either vector has zero length, because the cosine is undefined
        there and a document sharing no weight with the query should rank last.
    """
    # The query norm does not depend on the document, so compute it once.
    query_norm = math.sqrt(sum(weight ** 2 for weight in query_vec))
    cosine_scores = {}
    for doc_id, doc_vec in doc_vecs.items():
        dot_product = 0
        doc_sq_sum = 0
        for query_weight, doc_weight in zip(query_vec, doc_vec):
            dot_product += query_weight * doc_weight
            doc_sq_sum += doc_weight ** 2
        # Cosine divides by the product of the vector norms (square roots of
        # the squared sums), not by the squared sums themselves.
        denominator = query_norm * math.sqrt(doc_sq_sum)
        cosine_scores[doc_id] = dot_product / denominator if denominator else 0.0
    return cosine_scores

In [61]:
def find_relevant_documents(query_tokens):
    """Rank the indexed documents against a query and print the top five ids.

    For every query term one component is appended to the query vector
    (``get_tf(postings) * get_idf(number of postings)``) and one component to
    each document vector (the term's count in that document, ``0`` if absent).
    Documents are the ids ``str(j)`` for ``j`` in ``range(5,100,5)``. The
    vectors are scored with ``get_cosine_score`` and the ids of the five
    highest-scoring documents are printed.

    Args:
        query_tokens: Sequence of query terms, expected to be in the same form
            as the keys of the module-level ``inverted_index``.
    """
    query_vec = []
    doc_vecs = {}
    for token in query_tokens:
        # A term that was never indexed has no postings; treat it as an empty
        # postings dict instead of raising KeyError.
        documents = inverted_index.get(token, {})
        idf = get_idf(len(documents))
        tf = get_tf(documents)
        query_vec.append(tf * idf)
        for doc_number in range(5,100,5):
            key = str(doc_number)
            doc_vecs.setdefault(key, []).append(documents.get(key, 0))

    # Score and report once, after every query term has contributed its
    # component; doing this inside the loop ranked on partial vectors and
    # printed one result list per term.
    cosine_scores = get_cosine_score(query_vec,doc_vecs)
    ranked = sorted(cosine_scores.items(), key=lambda item: item[1], reverse=True)
    print("Most relevant documents are:- ")
    for doc_id, _score in ranked[:5]:
        print(doc_id)

In [None]:
# Build the inverted index from the data/data_split_<i>.csv files and pickle it
# to inverted_index.pkl.
load_and_preprocess()

In [None]:
# Run the query through the same pipeline that load_and_preprocess applies to
# the documents (letters only -> tokenize -> drop stop words -> lemmatize).
# Index keys are lower-cased lemmas, so raw `split(' ')` tokens such as
# "Running" or the empty strings produced by repeated spaces never match them.
query = input("Enter query:- ")
query_text = re.sub(r'[^a-zA-Z]', ' ', query)
query_tokens = lemmatize(remove_stopwords(word_tokenize(query_text)))
find_relevant_documents(query_tokens)