In [1]:
import csv
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
import re
import pickle
import os
import math

In [2]:
stopwords = set(stopwords.words('english'))

In [3]:
inverted_index = {}
N = 200

In [4]:
def remove_stopwords(tokens):
    tokens_wo_stopwords = []
    for i in range(0,len(tokens)):
        if tokens[i].lower() not in stopwords:
            tokens_wo_stopwords.append(tokens[i].lower())
    return tokens_wo_stopwords

In [5]:
def get_pos_tag(token):
    pos_tag = nltk.pos_tag([token])[0][1]
    if pos_tag.startswith('N'):
        return wordnet.NOUN
    elif pos_tag.startswith('V'):
        return wordnet.VERB
    elif pos_tag.startswith('J'):
        return wordnet.ADJ
    elif pos_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [6]:
def lemmatize(tokens):
    lemmatizer = WordNetLemmatizer()
    for i in range(0,len(tokens)):
        tokens[i] = lemmatizer.lemmatize(tokens[i],pos=str(get_pos_tag(tokens[i])))
    return tokens

In [7]:
def add_to_inverted_index(tokens,index):
    for i in range(0,len(tokens)):
        if tokens[i] not in inverted_index:
            inverted_index[tokens[i]] = {
                str(index): 1
            }
        else:
            if str(index) not in inverted_index[tokens[i]]:
                inverted_index[tokens[i]][str(index)] = 1
            else:
                inverted_index[tokens[i]][str(index)] = inverted_index[tokens[i]][str(index)] + 1

In [8]:
def save(inverted_index,filename):
    with open(filename + '.pkl','wb') as index:
        pickle.dump(inverted_index,index,pickle.HIGHEST_PROTOCOL)

In [9]:
def read():
    with open("inverted_index.pkl",'rb') as file:
        inverted_index = pickle.load(file)
    with open("bi_word_inverted_index.pkl",'rb') as file1:
        bi_word_inverted_index = pickle.load(file1)

In [10]:
def preprocess(data):
    data = re.sub(r'[^a-zA-Z]', ' ', data)
    tokens = word_tokenize(data)
    tokens = remove_stopwords(tokens)
    tokens = lemmatize(tokens)
    return tokens

In [11]:
def load_and_preprocess():
    for i in range(3,450,3):
        with open("data/data_split_" + str(i) + ".csv") as file:
            csv_reader = csv.reader(file,delimiter=',')
            flag = 0
            for row in csv_reader:
                tokens = preprocess(str(row))
                add_to_inverted_index(tokens,i)
    save(inverted_index,"inverted_index")

In [12]:
def get_idf(df):
    if df != 0:
        return math.log10(N/df)
    return 0

In [13]:
def get_tf(tf):
    if tf != 0:
        return (1 + math.log10(tf))
    return 0

In [14]:
def get_query_index(query_tokens):
    query_index = {}
    for i in range(0,len(query_tokens)):
        if query_tokens[i] not in query_index:
            query_index[query_tokens[i]] = 1
        else:
            query_index[query_tokens[i]] += 1
    return query_index

In [15]:
def get_cosine_scores(query_vec,doc_vecs):
    cosine_scores = {}
    for key in doc_vecs.keys():
        prod = 0
        tf_sq = 0
        tfd_sq = 0
        for i in range(0,len(query_vec)):
            prod += (query_vec[i] * doc_vecs[key][i])
            tf_sq += query_vec[i] ** 2
            tfd_sq += doc_vecs[key][i] ** 2
        if tf_sq == 0 or tfd_sq == 0:
            cosine_scores[key] = 0
        else:
            cosine_scores[key] = prod/(math.sqrt(tf_sq * tfd_sq))
    return cosine_scores

In [16]:
def find_relevant_documents(query_tokens):
    query_vec = []
    doc_vecs = {}
    tf = 0
    idf = 0
    tf_idf = 0
    documents = []
    query_index = get_query_index(query_tokens)
    for i in range(0,len(query_tokens)):
        if query_tokens[i] in inverted_index:
            documents = inverted_index[query_tokens[i]]
            idf = get_idf(len(documents.keys()))
            tf = get_tf(query_index[query_tokens[i]])
            tf_idf = tf * idf
        query_vec.append(tf_idf)
        for j in range(3,450,3):
            key = str(j)
            if key not in doc_vecs:
                doc_vecs[key] = []
            if len(documents) > 0 and key in documents:
                doc_vecs[key].append(get_tf(documents[key]))
            else:
                doc_vecs[key].append(0)
    cosine_scores = get_cosine_scores(query_vec,doc_vecs)
    cosine_scores = sorted(cosine_scores.items(), key=lambda x: x[1],reverse=True)
    print (cosine_scores)
    print("Most relevant documents are:- ")
    count = 0
    for i in range(0,len(cosine_scores)):
        if cosine_scores[i][1] != 0:
            print(cosine_scores[i][0])
            count += 1
        if count == 10:
            break

In [17]:
load_and_preprocess()

In [22]:
query = input("Enter query:- ")
query_tokens = preprocess(str(query))
find_relevant_documents(query_tokens)

Enter query:- peter governor thriller yellow fever
[('3', 0.6331089777573414), ('318', 0.6108861050433061), ('66', 0.5459085871664903), ('144', 0.5393778264005944), ('156', 0.5393778264005944), ('261', 0.5393778264005944), ('411', 0.5393778264005944), ('84', 0.5086114726454749), ('108', 0.5086114726454749), ('291', 0.5086114726454749), ('309', 0.5086114726454749), ('429', 0.5086114726454749), ('81', 0.48296623070812045), ('18', 0.32454558841692416), ('42', 0.32454558841692416), ('48', 0.32454558841692416), ('69', 0.32454558841692416), ('72', 0.32454558841692416), ('132', 0.32454558841692416), ('141', 0.32454558841692416), ('219', 0.32454558841692416), ('222', 0.32454558841692416), ('225', 0.32454558841692416), ('321', 0.32454558841692416), ('342', 0.32454558841692416), ('360', 0.32454558841692416), ('363', 0.32454558841692416), ('378', 0.32454558841692416), ('405', 0.32454558841692416), ('408', 0.32454558841692416), ('96', 0.23265350138619104), ('192', 0.23265350138619104), ('303', 0.2

In [19]:
inverted_index['yellow'], inverted_index['fever'],inverted_index['cocaine']

({'144': 1, '261': 1, '318': 1, '81': 1},
 {'108': 3, '291': 1, '309': 1, '429': 1, '84': 1},
 {'69': 2})

In [21]:
inverted_index

{'content': {'12': 3,
  '129': 3,
  '135': 1,
  '15': 1,
  '159': 1,
  '174': 22,
  '186': 3,
  '189': 24,
  '201': 2,
  '204': 1,
  '21': 1,
  '216': 1,
  '234': 25,
  '249': 1,
  '279': 1,
  '3': 1,
  '306': 1,
  '312': 1,
  '333': 27,
  '354': 2,
  '366': 28,
  '372': 1,
  '375': 1,
  '387': 1,
  '414': 1,
  '420': 1,
  '423': 1,
  '444': 29,
  '45': 1,
  '60': 3,
  '69': 1,
  '90': 19,
  '93': 4},
 'patriot': {'129': 2,
  '132': 2,
  '171': 1,
  '189': 1,
  '252': 1,
  '264': 1,
  '273': 1,
  '291': 1,
  '3': 8,
  '300': 1,
  '303': 1,
  '321': 1,
  '342': 12,
  '354': 1,
  '363': 2,
  '399': 1,
  '411': 13,
  '42': 1,
  '423': 13,
  '99': 1},
 'day': {'102': 3,
  '105': 3,
  '108': 2,
  '111': 5,
  '114': 2,
  '117': 1,
  '12': 5,
  '123': 1,
  '126': 5,
  '129': 11,
  '138': 2,
  '141': 3,
  '144': 7,
  '147': 3,
  '15': 3,
  '150': 3,
  '153': 5,
  '159': 1,
  '162': 14,
  '165': 5,
  '171': 12,
  '174': 14,
  '177': 4,
  '18': 1,
  '180': 1,
  '183': 8,
  '186': 1,
  '189': 15,