In [1]:
import csv
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
import re
import pickle
import os

In [2]:
# English stop-word set consulted by remove_stopwords.
# This assignment rebinds the name `stopwords`, which was imported as the NLTK
# corpus reader. Reaching the reader through `nltk.corpus` instead of the
# rebound name keeps this cell safe to re-run: on a second run `stopwords` is
# already a set and has no `.words` method.
stopwords = set(nltk.corpus.stopwords.words('english'))

In [3]:
# Module-level indexes filled by add_to_inverted_index / add_to_bi_word_inverted_index.
# Both share the layout  key -> {document id: [positions]}  where the key is a
# single token (inverted_index) or "first second" for two adjacent tokens
# (bi_word_inverted_index).
inverted_index, bi_word_inverted_index = {}, {}

In [4]:
def remove_stopwords(tokens):
    """Lower-case the tokens and drop every one found in ``stopwords``.

    Args:
        tokens: Sequence of string tokens.

    Returns:
        A new list holding the lower-cased tokens that are not in the
        module-level ``stopwords`` collection, in their original order.
    """
    lowered_tokens = (token.lower() for token in tokens)
    return [token for token in lowered_tokens if token not in stopwords]

In [5]:
def get_pos_tag(token):
    """Translate a token's NLTK part-of-speech tag into a WordNet POS constant.

    The token is tagged on its own (a one-element list is passed to
    ``nltk.pos_tag``), so no sentence context is available to the tagger.

    Args:
        token: The word to tag.

    Returns:
        ``wordnet.NOUN``, ``wordnet.VERB``, ``wordnet.ADJ`` or ``wordnet.ADV``
        for tags starting with ``N``, ``V``, ``J`` or ``R`` respectively, and
        ``wordnet.NOUN`` for any other tag.
    """
    tag = nltk.pos_tag([token])[0][1]
    wordnet_pos_by_prefix = {
        'N': wordnet.NOUN,
        'V': wordnet.VERB,
        'J': wordnet.ADJ,
        'R': wordnet.ADV,
    }
    # Tags outside the four mapped families fall back to noun.
    return wordnet_pos_by_prefix.get(tag[:1], wordnet.NOUN)

In [6]:
def lemmatize(tokens):
    """Replace each token with its WordNet lemma, in place.

    The part of speech handed to the lemmatizer comes from ``get_pos_tag``.

    Args:
        tokens: Mutable list of string tokens; its elements are overwritten.

    Returns:
        The same list object that was passed in, now holding the lemmas.
    """
    wordnet_lemmatizer = WordNetLemmatizer()
    for position, token in enumerate(tokens):
        part_of_speech = str(get_pos_tag(token))
        tokens[position] = wordnet_lemmatizer.lemmatize(token, pos=part_of_speech)
    return tokens

In [7]:
def add_to_inverted_index(tokens,index):
    """Record where each token occurs in the module-level ``inverted_index``.

    Args:
        tokens: Sequence of tokens; a token's position is its offset within
            this sequence.
        index: Document identifier under which the positions are stored.

    Side effects:
        Appends to ``inverted_index[token][index]``, creating the nested
        entries the first time a token or document is seen.
    """
    for position, term in enumerate(tokens):
        postings = inverted_index.setdefault(term, {})
        postings.setdefault(index, []).append(position)

In [8]:
def add_to_bi_word_inverted_index(tokens,index):
    """Record every pair of adjacent tokens in ``bi_word_inverted_index``.

    The key is the two tokens joined by a single space; the stored position is
    the offset of the pair's first token within ``tokens``.

    Args:
        tokens: List of tokens to pair up.
        index: Document identifier under which the positions are stored.

    Side effects:
        Appends to ``bi_word_inverted_index[pair][index]``, creating the
        nested entries on first use. Lists shorter than two tokens add nothing.
    """
    adjacent_pairs = zip(tokens, tokens[1:])
    for position, (first, second) in enumerate(adjacent_pairs):
        pair_key = first + " " + second
        postings = bi_word_inverted_index.setdefault(pair_key, {})
        postings.setdefault(index, []).append(position)

In [9]:
def save(inverted_index,filename):
    """Pickle an index object to ``filename + '.pkl'``.

    Args:
        inverted_index: The object to serialise (shadows the module-level
            name inside this function).
        filename: Output path without the ``.pkl`` extension.

    The file is written in binary mode using ``pickle.HIGHEST_PROTOCOL``.
    """
    target_path = filename + '.pkl'
    with open(target_path,'wb') as handle:
        pickle.dump(inverted_index, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [10]:
def read():
    """Load both pickled indexes and install them as the module-level indexes.

    Reads ``inverted_index.pkl`` and ``bi_word_inverted_index.pkl`` from the
    current working directory and rebinds the global ``inverted_index`` and
    ``bi_word_inverted_index`` to the loaded objects.

    Returns:
        tuple: ``(inverted_index, bi_word_inverted_index)`` as loaded.

    Raises:
        OSError: If either file cannot be opened (e.g. ``FileNotFoundError``).
    """
    # Without this declaration the assignments below would only create locals
    # and the loaded data would be thrown away when the function returns.
    global inverted_index, bi_word_inverted_index

    # SECURITY: pickle.load can execute arbitrary code from the file. Only
    # load pickles that this notebook wrote itself.
    with open("inverted_index.pkl",'rb') as file:
        loaded_index = pickle.load(file)
    with open("bi_word_inverted_index.pkl",'rb') as file1:
        loaded_bi_word_index = pickle.load(file1)

    # Rebind only after both loads succeeded so a failure on the second file
    # does not leave the two globals out of step.
    inverted_index = loaded_index
    bi_word_inverted_index = loaded_bi_word_index
    return inverted_index, bi_word_inverted_index

In [11]:
def load_and_preprocess():
    """Build both indexes from the CSV splits and pickle them to disk.

    For each ``data/data_split_<i>.csv`` with ``i`` in ``range(5, 100, 5)``
    every CSV row is reduced to letters only, tokenised, stripped of stop
    words, lemmatised and then added to ``inverted_index`` and
    ``bi_word_inverted_index`` under document id ``i``. Finally both indexes
    are written via ``save`` as ``inverted_index.pkl`` and
    ``bi_word_inverted_index.pkl``.

    Raises:
        OSError: If one of the CSV files cannot be opened.
    """
    # Start from empty indexes so that running this again (e.g. re-executing
    # the notebook cell) does not append a second copy of every position.
    inverted_index.clear()
    bi_word_inverted_index.clear()

    # NOTE(review): range(5, 100, 5) stops at 95, so data_split_100.csv (if it
    # exists) is never indexed - confirm this is intended.
    for i in range(5,100,5):
        # newline='' is what the csv module asks for so that line endings
        # inside quoted fields are handled by the reader itself.
        with open("data/data_split_" + str(i) + ".csv", newline='') as file:
            for row in csv.reader(file,delimiter=','):
                # str(row) is the list's repr; the regex turns every
                # non-letter (brackets, quotes, digits, punctuation) into a space.
                text = re.sub(r'[^a-zA-Z]', ' ', str(row))
                tokens = lemmatize(remove_stopwords(word_tokenize(text)))
                # NOTE(review): positions restart at 0 for every row, so the
                # stored positions are offsets within a row, not within the
                # whole file.
                add_to_inverted_index(tokens,i)
                add_to_bi_word_inverted_index(tokens,i)
    save(inverted_index,"inverted_index")
    save(bi_word_inverted_index,"bi_word_inverted_index")

In [12]:
def and_query(words,is_bi_word=False):
    """Return the documents that contain every query term.

    Args:
        words: Iterable of query terms; surrounding whitespace is stripped
            from each before lookup.
        is_bi_word: When true, look terms up in ``bi_word_inverted_index``
            instead of ``inverted_index``.

    Returns:
        A list of document ids. It is empty when ``words`` is empty, when any
        term is missing from the index, or when no document holds all terms.
        The ids follow the key order of the last term processed.
    """
    postings_by_term = bi_word_inverted_index if is_bi_word else inverted_index
    matches = []
    for term_number, raw_term in enumerate(words):
        term = raw_term.strip()
        if term not in postings_by_term:
            # A term that was never indexed cannot be satisfied by any document.
            return []
        documents = postings_by_term[term].keys()
        if term_number == 0:
            matches = list(documents)
            continue
        # Keep only this term's documents that survived all earlier terms.
        matches = [document for document in documents if document in matches]
        if not matches:
            return matches
    return matches

In [13]:
def or_query(words,is_bi_word=False):
    """Return the documents that contain at least one of the query terms.

    Args:
        words: Iterable of query terms; surrounding whitespace is stripped
            from each before lookup.
        is_bi_word: When true, look terms up in ``bi_word_inverted_index``
            instead of ``inverted_index``.

    Returns:
        A list of distinct document ids in first-seen order (terms in the
        order given, documents in each term's key order). Terms that are not
        in the index contribute nothing instead of raising ``KeyError``.
    """
    index = bi_word_inverted_index if is_bi_word else inverted_index
    result = []
    seen = set()  # constant-time duplicate check; `result` keeps the order
    for word in words:
        # An unknown term matches no document; .get keeps the query going.
        for document in index.get(word.strip(), {}):
            if document not in seen:
                seen.add(document)
                result.append(document)
    return result

In [14]:
# NOT
def not_query(words,is_bi_word=False):
    """Return the indexed documents that contain none of the query terms.

    Args:
        words: Iterable of query terms; surrounding whitespace is stripped
            from each before lookup.
        is_bi_word: When true, use ``bi_word_inverted_index`` instead of
            ``inverted_index``.

    Returns:
        A sorted list of document ids. The candidate set is every document id
        that appears anywhere in the chosen index, so ids that were never
        indexed are never reported. Terms missing from the index exclude
        nothing instead of raising ``KeyError``.
    """
    index = bi_word_inverted_index if is_bi_word else inverted_index

    excluded = set()
    for word in words:
        excluded.update(index.get(word.strip(), ()))

    # Derive the document universe from the index itself rather than from a
    # hard-coded id list, which can drift from what was actually loaded.
    universe = {document for postings in index.values() for document in postings}
    return sorted(universe - excluded)

In [15]:
def proximity_query(words,offset=100,is_bi_word=None):
    """Return documents where the first two query terms occur close together.

    A document qualifies when it contains every term in ``words`` (checked
    with ``and_query``) and some stored position of the first term is less
    than ``offset`` away from some stored position of the second term.

    Args:
        words: Query terms; surrounding whitespace is stripped. Only the
            first two terms take part in the distance check.
        offset: Exclusive upper bound on the absolute position difference.
        is_bi_word: ``True`` to search ``bi_word_inverted_index``, ``False``
            to search ``inverted_index``. When ``None`` (default) the bi-word
            index is chosen if every term contains a space, which is how
            bi-word keys are built; otherwise the single-term index is used.

    Returns:
        A list of matching document ids in the order ``and_query`` yields
        them; empty when fewer than two terms are given.
    """
    # Strip once so the lookups below use the same keys and_query matched.
    terms = [word.strip() for word in words]
    if len(terms) < 2:
        # A distance needs two terms; avoid indexing past the end of the list.
        return []

    if is_bi_word is None:
        is_bi_word = all(" " in term for term in terms)
    index = bi_word_inverted_index if is_bi_word else inverted_index

    first_term, second_term = terms[0], terms[1]
    result = []
    for document in and_query(terms, is_bi_word):
        first_positions = index[first_term][document]
        second_positions = index[second_term][document]
        # any() stops at the first close-enough pair for this document.
        if any(abs(pos_i - pos_j) < offset
               for pos_i in first_positions
               for pos_j in second_positions):
            result.append(document)
    return result

In [16]:
# Build both indexes from the CSV splits under data/ and pickle them to disk.
load_and_preprocess()

In [18]:
# Example proximity search for two bi-word phrases, using the default offset of 100.
proximity_query(['content washington','washington congressional'])

[5, 10, 80]