In [191]:
import csv
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
import re
import pickle
import os

In [192]:
# Build the English stop-word set once. The corpus is reached through
# ``nltk.corpus`` rather than the imported name ``stopwords`` because this
# assignment rebinds that name to a set; going through the package keeps the
# cell safe to run more than once in the same kernel.
stopwords = set(nltk.corpus.stopwords.words('english'))

In [193]:
# Module-level state shared by the indexing and query cells.
inverted_index = dict()          # token -> list of document ids containing it
bi_word_inverted_index = dict()  # "token token" pair -> list of document ids
not_result = list()              # document ids appended by load_and_preprocess

In [194]:
def remove_stopwords(tokens):
    """Lower-case every token and drop those found in the module-level ``stopwords``.

    Args:
        tokens: sequence of string tokens.

    Returns:
        A new list with the surviving tokens, lower-cased, in their original order.
    """
    lowered = (token.lower() for token in tokens)
    return [word for word in lowered if word not in stopwords]

In [195]:
def get_pos_tag(token):
    """Translate the tagger's part-of-speech tag for ``token`` into a WordNet constant.

    The token is tagged on its own (a one-element list is passed to
    ``nltk.pos_tag``) and only the first letter of the resulting tag is used.

    Args:
        token: a single word.

    Returns:
        ``wordnet.NOUN``, ``wordnet.VERB``, ``wordnet.ADJ`` or ``wordnet.ADV``;
        any tag that starts with another letter falls back to ``wordnet.NOUN``.
    """
    tag = nltk.pos_tag([token])[0][1]
    first_letter_to_wordnet = {
        'N': wordnet.NOUN,
        'V': wordnet.VERB,
        'J': wordnet.ADJ,
        'R': wordnet.ADV,
    }
    return first_letter_to_wordnet.get(tag[:1], wordnet.NOUN)

In [196]:
def lemmatize(tokens):
    """Replace each token with its WordNet lemma, in place.

    The part of speech handed to the lemmatizer comes from ``get_pos_tag``.

    Args:
        tokens: mutable list of word strings; its elements are overwritten.

    Returns:
        The same list object that was passed in, now holding the lemmas.
    """
    wnl = WordNetLemmatizer()
    for position, word in enumerate(tokens):
        part_of_speech = str(get_pos_tag(word))
        tokens[position] = wnl.lemmatize(word, pos=part_of_speech)
    return tokens

In [197]:
def add_to_inverted_index(tokens,index):
    """Record document ``index`` in the posting list of every token.

    Updates the module-level ``inverted_index`` in place. A document id is
    stored at most once per token, in first-seen order.

    Args:
        tokens: tokens of one row of the document.
        index: identifier of the document the tokens came from.
    """
    for term in tokens:
        postings = inverted_index.setdefault(term, [])
        if index not in postings:
            postings.append(index)

In [198]:
def add_to_bi_word_inverted_index(tokens,index):
    """Record document ``index`` under every pair of adjacent tokens.

    Each key of the module-level ``bi_word_inverted_index`` is the two
    neighbouring tokens joined by a single space. A document id is stored at
    most once per pair.

    Args:
        tokens: list of tokens of one row, in order.
        index: identifier of the document the tokens came from.
    """
    for left, right in zip(tokens, tokens[1:]):
        phrase = left + " " + right
        postings = bi_word_inverted_index.setdefault(phrase, [])
        if index not in postings:
            postings.append(index)

In [199]:
def save(inverted_index,filename):
    """Pickle ``inverted_index`` to ``<filename>.pkl`` with the highest protocol.

    Args:
        inverted_index: the object to serialise (any picklable value).
        filename: destination path without the ``.pkl`` extension.
    """
    target_path = filename + '.pkl'
    with open(target_path, 'wb') as handle:
        pickle.dump(inverted_index, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [202]:
def read():
    """Load the pickled indexes from the working directory into the module globals.

    Reads ``inverted_index.pkl`` and ``bi_word_inverted_index.pkl`` and rebinds
    the module-level ``inverted_index`` and ``bi_word_inverted_index`` to their
    contents. ``not_result`` (the list of known document ids) is not stored on
    disk, so it is rebuilt from the ids found in the loaded postings.

    Raises:
        OSError: if either pickle file cannot be opened.
    """
    # Without this declaration the assignments below only create locals and
    # the module-level indexes used by the query functions stay empty.
    global inverted_index, bi_word_inverted_index, not_result
    # NOTE: pickle.load can execute arbitrary code; only load files that this
    # notebook wrote itself.
    with open("inverted_index.pkl",'rb') as file:
        inverted_index = pickle.load(file)
    with open("bi_word_inverted_index.pkl",'rb') as file:
        bi_word_inverted_index = pickle.load(file)
    # Rebuild the document-id universe from the postings. A document whose
    # rows produced no tokens at all would be missing here.
    not_result = sorted({doc for postings in inverted_index.values() for doc in postings})

In [200]:
def load_and_preprocess():
    """Build both indexes from the CSV splits and pickle them.

    For every ``data/data_split_<i>.csv`` with ``i`` in ``range(5, 100, 5)``
    each row is reduced to ASCII letters, tokenized, stripped of stop words,
    lemmatized and added to ``inverted_index`` and ``bi_word_inverted_index``
    under document id ``i``. The id is also recorded in ``not_result``.
    Finally both indexes are written with ``save``.

    Raises:
        OSError: if one of the CSV files cannot be opened.
    """
    # NOTE(review): range(5, 100, 5) stops at 95; confirm whether a
    # data_split_100.csv is meant to be indexed as well.
    for i in range(5,100,5):
        # newline='' lets the csv module handle quoted embedded newlines itself.
        with open("data/data_split_" + str(i) + ".csv", newline='') as file:
            # Guard against duplicate ids when the cell is run more than once.
            if i not in not_result:
                not_result.append(i)
            for row in csv.reader(file,delimiter=','):
                # Join the cells instead of using str(row): the list repr
                # contains escape sequences (e.g. a backslash-n for a newline)
                # whose letters would leak into the tokens.
                text = re.sub(r'[^a-zA-Z]', ' ', ' '.join(row))
                tokens = word_tokenize(text)
                tokens = remove_stopwords(tokens)
                tokens = lemmatize(tokens)
                add_to_inverted_index(tokens,i)
                add_to_bi_word_inverted_index(tokens,i)
    save(inverted_index,"inverted_index")
    save(bi_word_inverted_index,"bi_word_inverted_index")

In [180]:
def and_query(words):
    """Return the ids of documents that contain every query term.

    Each term is stripped of surrounding whitespace and lower-cased before the
    lookup, because the keys of ``inverted_index`` are stored lower-cased.
    Query terms are not lemmatized here, so they must already be in the form
    that was indexed.

    Args:
        words: iterable of query term strings.

    Returns:
        A new list of document ids, in the order of the last term's posting
        list. Empty when ``words`` is empty, when any term is not indexed, or
        when no document contains all terms.
    """
    result = None
    for word in words:
        # str has no .trim(); strip() is the Python spelling.
        term = word.strip().lower()
        if term not in inverted_index:
            return []
        postings = inverted_index[term]
        if result is None:
            # Copy so callers cannot mutate the index through the return value.
            result = list(postings)
        else:
            survivors = set(result)
            result = [doc for doc in postings if doc in survivors]
        if not result:
            return []
    return [] if result is None else result

In [181]:
def or_query(words):
    """Return the ids of documents that contain at least one query term.

    Each term is stripped of surrounding whitespace and lower-cased before the
    lookup, because the keys of ``inverted_index`` are stored lower-cased.
    A term that is not indexed contributes nothing instead of raising.

    Args:
        words: iterable of query term strings.

    Returns:
        A new list of distinct document ids in first-seen order.
    """
    result = []
    seen = set()
    for word in words:
        # str has no .trim(); strip() is the Python spelling.
        term = word.strip().lower()
        for doc in inverted_index.get(term, []):
            if doc not in seen:
                seen.add(doc)
                result.append(doc)
    return result

In [182]:
# NOT
def not_query(words):
    """Return the ids of documents that contain none of the query terms.

    The candidates are the ids in the module-level ``not_result`` list. Terms
    are stripped and lower-cased before the lookup; a term that is not indexed
    excludes nothing.

    Args:
        words: iterable of query term strings.

    Returns:
        A new list of document ids in ``not_result`` order. ``not_result``
        itself is left untouched, so repeated queries see the full set.
    """
    excluded = set()
    for word in words:
        excluded.update(inverted_index.get(word.strip().lower(), []))
    # Build a fresh list: removing from not_result directly would shrink the
    # shared document list for every later query.
    return [doc for doc in not_result if doc not in excluded]

In [168]:
# Reuse the pickled indexes only when both files that read() opens are present;
# otherwise rebuild them from the CSV splits.
if os.path.isfile("inverted_index.pkl") and os.path.isfile("bi_word_inverted_index.pkl"):
    read()
else:
    load_and_preprocess()
# NOTE(review): the prompt says "and query" but the call below runs not_query;
# confirm which of the two is intended.
query = input("Enter and query here:-")
# Terms are separated by "|"; drop surrounding whitespace and empty parts so
# input such as "content | washington" or a blank line does not produce
# lookups for keys that cannot exist in the index.
words = [part.strip() for part in query.split("|") if part.strip()]
print(not_query(words))

Enter and query here:-content|washington
[5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100]
[85, 100]
