In [204]:
import csv
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
import re
import pickle
import os

In [205]:
# Bind the English stop-word list to `stopwords` as a set for fast membership tests
# in remove_stopwords. The corpus is reached through `nltk.corpus` instead of the
# imported `stopwords` name because this assignment rebinds that very name: with the
# self-referential form, re-running this cell failed (a set has no `.words` method).
stopwords = set(nltk.corpus.stopwords.words('english'))

In [206]:
# Module-level state shared by the indexing and query functions.
# Single token -> list of document ids it was added under.
inverted_index = dict()
# "first second" token pair -> list of document ids it was added under.
bi_word_inverted_index = dict()
# Document ids recorded by load_and_preprocess; not_query starts from this list.
not_result = list()

In [207]:
def remove_stopwords(tokens):
    """Lower-case the tokens and drop those found in the module-level `stopwords`.

    Args:
        tokens: list of string tokens.

    Returns:
        A new list holding the lower-cased tokens that are not stop words,
        in their original order. The input list is not modified.
    """
    lowered = (token.lower() for token in tokens)
    return [word for word in lowered if word not in stopwords]

In [208]:
def get_pos_tag(token):
    """Translate a token's NLTK part-of-speech tag into a WordNet POS constant.

    The token is tagged on its own, as a one-element list, so the tagger sees
    no surrounding context.

    Args:
        token: the word to tag.

    Returns:
        wordnet.NOUN, wordnet.VERB, wordnet.ADJ or wordnet.ADV when the tag
        starts with 'N', 'V', 'J' or 'R' respectively; wordnet.NOUN for any
        other tag.
    """
    tag = nltk.pos_tag([token])[0][1]
    # Only the first letter of the tag decides the WordNet category.
    first_letter_to_pos = {
        'N': wordnet.NOUN,
        'V': wordnet.VERB,
        'J': wordnet.ADJ,
        'R': wordnet.ADV,
    }
    return first_letter_to_pos.get(tag[:1], wordnet.NOUN)

In [209]:
def lemmatize(tokens):
    """Lemmatize every token in place using the POS reported by get_pos_tag.

    Args:
        tokens: list of string tokens; each entry is overwritten with its lemma.

    Returns:
        The same list object that was passed in, now holding the lemmas.
    """
    wnl = WordNetLemmatizer()
    for position, word in enumerate(tokens):
        # Overwrite the slot so callers holding the list see the lemmas too.
        tokens[position] = wnl.lemmatize(word, pos=str(get_pos_tag(word)))
    return tokens

In [210]:
def add_to_inverted_index(tokens,index):
    """Record that each token occurs in document `index`.

    Updates the module-level `inverted_index` in place; a document id is
    stored at most once per token.

    Args:
        tokens: list of tokens taken from the document.
        index: identifier of the document the tokens came from.
    """
    for term in tokens:
        postings = inverted_index.setdefault(term, [])
        if index not in postings:
            postings.append(index)

In [211]:
def add_to_bi_word_inverted_index(tokens,index):
    """Record every pair of adjacent tokens as occurring in document `index`.

    Each pair is keyed as "first second" (joined by one space) in the
    module-level `bi_word_inverted_index`; a document id is stored at most
    once per pair. Fewer than two tokens adds nothing.

    Args:
        tokens: list of tokens taken from the document, in order.
        index: identifier of the document the tokens came from.
    """
    for left, right in zip(tokens, tokens[1:]):
        pair = left + " " + right
        postings = bi_word_inverted_index.setdefault(pair, [])
        if index not in postings:
            postings.append(index)

In [212]:
def save(inverted_index,filename):
    """Pickle an index to `<filename>.pkl` using the highest pickle protocol.

    Args:
        inverted_index: the object to serialize (an index dict in this notebook).
        filename: target path without the `.pkl` extension.
    """
    target_path = filename + '.pkl'
    with open(target_path, 'wb') as handle:
        pickle.dump(inverted_index, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [213]:
def read():
    """Load both pickled indexes from the working directory into module state.

    Reads "inverted_index.pkl" and "bi_word_inverted_index.pkl" and rebinds
    the module-level `inverted_index` and `bi_word_inverted_index` to their
    contents. Without the `global` declaration the loaded dicts were bound to
    function locals and discarded, leaving the shared indexes empty.

    `not_result` is also rebuilt, as the sorted set of document ids found in
    the unigram postings, so that not_query has a document universe to start
    from when the indexes come from disk instead of load_and_preprocess.
    This assumes document ids are mutually comparable (they are ints when
    produced by load_and_preprocess).

    Raises:
        OSError: if either pickle file cannot be opened.
    """
    global inverted_index, bi_word_inverted_index, not_result

    # NOTE(security): pickle.load can execute arbitrary code. Only load index
    # files that were produced locally by save(); never files from elsewhere.
    with open("inverted_index.pkl",'rb') as file:
        inverted_index = pickle.load(file)
    with open("bi_word_inverted_index.pkl",'rb') as file:
        bi_word_inverted_index = pickle.load(file)

    not_result = sorted({doc for postings in inverted_index.values() for doc in postings})

In [214]:
def load_and_preprocess():
    """Build both indexes from the CSV splits and pickle them to disk.

    For each document id 5, 10, ..., 95 the file "data/data_split_<id>.csv"
    is read row by row. Every row is stringified, non-letters are replaced by
    spaces, and the text is tokenized, stop-word filtered and lemmatized
    before being added to the unigram and bi-word indexes under that id.
    Each id is recorded once in the module-level `not_result`, so re-running
    this function does not accumulate duplicate ids. Finally both indexes are
    written with save().

    Raises:
        OSError: if one of the split files cannot be opened.
    """
    for i in range(5,100,5):
        # newline='' is what the csv module asks for so that line breaks
        # inside quoted fields are parsed correctly.
        with open("data/data_split_" + str(i) + ".csv", newline='') as file:
            if i not in not_result:
                not_result.append(i)
            for row in csv.reader(file,delimiter=','):
                # str(row) is the list repr; its brackets, quotes and commas
                # are non-letters and become spaces along with everything else.
                text = re.sub(r'[^a-zA-Z]', ' ', str(row))
                tokens = word_tokenize(text)
                tokens = remove_stopwords(tokens)
                tokens = lemmatize(tokens)
                add_to_inverted_index(tokens,i)
                add_to_bi_word_inverted_index(tokens,i)
    save(inverted_index,"inverted_index")
    save(bi_word_inverted_index,"bi_word_inverted_index")

In [221]:
def and_query(words,is_bi_word=False):
    """Return the document ids that contain every one of `words`.

    Args:
        words: iterable of query terms; surrounding whitespace is stripped.
            With is_bi_word=True each term is a "first second" pair.
        is_bi_word: look terms up in `bi_word_inverted_index` instead of
            `inverted_index`.

    Returns:
        A new list of matching document ids, ordered as in the postings of
        the last term. Empty if `words` is empty, if any term is unknown, or
        if no document contains all terms. The list is always a fresh object,
        so callers may mutate it without corrupting the index.
    """
    index = bi_word_inverted_index if is_bi_word else inverted_index
    result = None
    for word in words:
        word = word.strip()
        if word not in index:
            # One unknown term makes the whole conjunction empty.
            return []
        postings = index[word]
        if result is None:
            # Copy: handing out the index's own list would let callers
            # modify the postings by accident.
            result = list(postings)
        else:
            result = [doc for doc in postings if doc in result]
        if not result:
            return []
    return [] if result is None else result

In [222]:
def or_query(words,is_bi_word=False):
    """Return the document ids that contain at least one of `words`.

    Args:
        words: iterable of query terms; surrounding whitespace is stripped.
            With is_bi_word=True each term is a "first second" pair.
        is_bi_word: look terms up in `bi_word_inverted_index` instead of
            `inverted_index`.

    Returns:
        A new list of document ids without duplicates, in first-seen order
        across the terms' postings. A term that is not in the index simply
        contributes nothing (previously it raised KeyError).
    """
    index = bi_word_inverted_index if is_bi_word else inverted_index
    result = []
    for word in words:
        for doc in index.get(word.strip(), ()):
            if doc not in result:
                result.append(doc)
    return result

In [223]:
# NOT: documents that contain none of the given words
def not_query(words,is_bi_word=False):
    """Return the document ids that contain none of `words`.

    The universe of documents is the module-level `not_result` list. It is
    only read here: the previous version removed ids from that shared list
    itself, so every call permanently shrank the universe for later calls.

    Args:
        words: iterable of terms to exclude; surrounding whitespace is
            stripped. With is_bi_word=True each term is a "first second" pair.
        is_bi_word: look terms up in `bi_word_inverted_index` instead of
            `inverted_index`.

    Returns:
        A new list with the ids from `not_result`, in the same order, minus
        every id found in the postings of any of the terms. A term that is
        not in the index excludes nothing (previously it raised KeyError).
    """
    index = bi_word_inverted_index if is_bi_word else inverted_index
    excluded = []
    for word in words:
        excluded.extend(index.get(word.strip(), ()))
    return [doc for doc in not_result if doc not in excluded]

In [None]:
# Build the indexes from the CSV data unless both pickled index files are
# already present in the working directory, in which case load them.
if not (os.path.isfile("inverted_index.pkl") and os.path.isfile("bi_word_inverted_index.pkl")):
    load_and_preprocess()
else:
    read()