In [2]:
import csv
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
import re
import pickle
import os

In [3]:
# Build the English stop-word set used by remove_stopwords().
# The corpus reader is reached through the nltk package rather than through the
# imported name `stopwords`, because this assignment rebinds that name to a set;
# re-running the cell would otherwise fail with "set has no attribute 'words'".
stopwords = set(nltk.corpus.stopwords.words('english'))

In [4]:
# term -> {document id: [token positions]}, filled by add_to_inverted_index()
inverted_index = dict()
# "word1 word2" -> {document id: [position of the first word]}, filled by add_to_bi_word_inverted_index()
bi_word_inverted_index = dict()
# document ids registered by load_and_preprocess(); not_query() starts from this list
not_result = list()

In [5]:
def remove_stopwords(tokens):
    """Return the lower-cased tokens that are not in the module-level ``stopwords``.

    Each token is lower-cased before the membership test, and the lower-cased
    form is what ends up in the returned list. The input is not modified.
    """
    lowered = (token.lower() for token in tokens)
    return [word for word in lowered if word not in stopwords]

In [6]:
def get_pos_tag(token):
    """Map the NLTK part-of-speech tag of a single token to a WordNet POS constant.

    The token is tagged on its own (a one-element list is passed to
    ``nltk.pos_tag``). Tags starting with 'V', 'J' and 'R' map to VERB, ADJ and
    ADV respectively; everything else, including tags starting with 'N', maps
    to NOUN.
    """
    tag = nltk.pos_tag([token])[0][1]
    initial = tag[:1]
    if initial == 'V':
        return wordnet.VERB
    if initial == 'J':
        return wordnet.ADJ
    if initial == 'R':
        return wordnet.ADV
    # 'N' tags and every unrecognised tag share the noun fallback.
    return wordnet.NOUN

In [7]:
def lemmatize(tokens):
    """Lemmatize every token in place and return the same list.

    Each token is replaced by its WordNet lemma, using the part of speech
    reported by ``get_pos_tag`` for that token.
    """
    wn_lemmatizer = WordNetLemmatizer()
    for position, token in enumerate(tokens):
        wordnet_pos = str(get_pos_tag(token))
        tokens[position] = wn_lemmatizer.lemmatize(token, pos=wordnet_pos)
    return tokens

In [8]:
def add_to_inverted_index(tokens,index):
    """Record the position of every token under document ``index``.

    Updates the module-level ``inverted_index`` so that
    ``inverted_index[token][index]`` is the list of positions (0-based within
    ``tokens``) at which ``token`` occurs. Positions are appended, so repeated
    calls for the same document accumulate.
    """
    for position, token in enumerate(tokens):
        postings = inverted_index.setdefault(token, {})
        postings.setdefault(index, []).append(position)

In [9]:
def add_to_bi_word_inverted_index(tokens,index):
    """Record every adjacent token pair of ``tokens`` under document ``index``.

    The key is the two tokens joined by a single space; the stored position is
    that of the first token of the pair. Updates the module-level
    ``bi_word_inverted_index`` in place.
    """
    adjacent_pairs = zip(tokens, tokens[1:])
    for position, (left, right) in enumerate(adjacent_pairs):
        phrase = left + " " + right
        postings = bi_word_inverted_index.setdefault(phrase, {})
        postings.setdefault(index, []).append(position)

In [10]:
def save(inverted_index,filename):
    """Pickle ``inverted_index`` to ``<filename>.pkl`` using the highest pickle protocol."""
    target = filename + '.pkl'
    with open(target, mode='wb') as sink:
        pickle.dump(inverted_index, sink, protocol=pickle.HIGHEST_PROTOCOL)

In [11]:
def read():
    """Load both pickled indexes from the working directory into the module globals.

    Reads ``inverted_index.pkl`` and ``bi_word_inverted_index.pkl`` and rebinds
    the module-level ``inverted_index`` and ``bi_word_inverted_index`` to their
    contents. ``not_result`` (the document universe used by ``not_query``) is
    rebuilt from the document ids found in the loaded single-word index.

    Raises:
        OSError: if either pickle file cannot be opened.
    """
    # Without this declaration the assignments below only create locals and the
    # module-level indexes stay empty after a cached load.
    global inverted_index, bi_word_inverted_index, not_result
    # NOTE: pickle.load can execute arbitrary code; only load index files that
    # this notebook wrote itself via save().
    with open("inverted_index.pkl",'rb') as file:
        inverted_index = pickle.load(file)
    with open("bi_word_inverted_index.pkl",'rb') as file:
        bi_word_inverted_index = pickle.load(file)
    # The list of document ids is not persisted, so derive it from the postings.
    # NOTE(review): a document with no indexed token at all would be missing here.
    not_result = sorted({document
                         for postings in inverted_index.values()
                         for document in postings})

In [12]:
def load_and_preprocess():
    """Build both indexes from ``data/data_split_<i>.csv`` (i = 5, 10, ..., 95) and pickle them.

    For every CSV row the text is reduced to ASCII letters, tokenized, stripped
    of stop words and lemmatized, then added to the single-word and bi-word
    indexes under document id ``i``. Each ``i`` is also registered in
    ``not_result``. Finally both indexes are written with ``save``.

    The three module-level containers are emptied first so that re-running the
    function rebuilds the indexes instead of duplicating postings and ids.

    Raises:
        OSError: if one of the CSV files cannot be opened.
    """
    # In-place resets keep the existing objects, so other references stay valid.
    inverted_index.clear()
    bi_word_inverted_index.clear()
    not_result.clear()
    for i in range(5,100,5):
        # newline='' is what the csv module expects so it can handle line
        # endings (including those inside quoted fields) itself.
        with open("data/data_split_" + str(i) + ".csv", newline='') as file:
            not_result.append(i)
            csv_reader = csv.reader(file,delimiter=',')
            for row in csv_reader:
                # str(row) is the list repr; every non-letter (brackets, quotes,
                # commas, digits) is then replaced by a space.
                text = re.sub(r'[^a-zA-Z]', ' ', str(row))
                tokens = word_tokenize(text)
                tokens = remove_stopwords(tokens)
                tokens = lemmatize(tokens)
                # NOTE(review): the index helpers are called once per row, so the
                # stored positions restart at 0 for every row of the same file.
                add_to_inverted_index(tokens,i)
                add_to_bi_word_inverted_index(tokens,i)
    save(inverted_index,"inverted_index")
    save(bi_word_inverted_index,"bi_word_inverted_index")

In [23]:
def and_query(words,is_bi_word=False):
    """Return the documents that contain every word in ``words``.

    Args:
        words: iterable of query terms; each is stripped of surrounding
            whitespace before lookup.
        is_bi_word: look the terms up in ``bi_word_inverted_index`` instead of
            ``inverted_index``.

    Returns:
        A list of document ids. For a single word this is the order of that
        word's postings; after each further word the survivors are listed in
        the order of that word's postings. An empty list is returned when
        ``words`` is empty or any word is not in the index.
    """
    index = bi_word_inverted_index if is_bi_word else inverted_index
    result = []
    for ordinal, word in enumerate(words):
        postings = index.get(word.strip())
        if postings is None:
            # An unknown term makes the conjunction unsatisfiable.
            return []
        if ordinal == 0:
            result = list(postings)
            continue
        # Set membership avoids rescanning the running result for every
        # posting; iterating the postings keeps the original output order.
        survivors = set(result)
        result = [document for document in postings if document in survivors]
        if not result:
            return result
    return result

In [24]:
def or_query(words,is_bi_word=False):
    """Return the documents that contain at least one word in ``words``.

    Args:
        words: iterable of query terms; each is stripped of surrounding
            whitespace before lookup.
        is_bi_word: look the terms up in ``bi_word_inverted_index`` instead of
            ``inverted_index``.

    Returns:
        A list of document ids without duplicates, in order of first
        appearance. Words that are not in the index contribute nothing
        (previously they raised ``KeyError``).
    """
    index = bi_word_inverted_index if is_bi_word else inverted_index
    result = []
    seen = set()
    for word in words:
        postings = index.get(word.strip())
        if postings is None:
            # Unknown term: matches no document, so skip it.
            continue
        for document in postings:
            if document not in seen:
                seen.add(document)
                result.append(document)
    return result

In [25]:
# NOT query: several words are combined with AND (NOT w1 AND NOT w2 ...)
def not_query(words,is_bi_word=False):
    """Return the documents that contain none of the words in ``words``.

    Args:
        words: iterable of query terms; each is stripped of surrounding
            whitespace before lookup.
        is_bi_word: look the terms up in ``bi_word_inverted_index`` instead of
            ``inverted_index``.

    Returns:
        A new list with the ids from ``not_result`` that do not appear in the
        postings of any given word, in ``not_result`` order. Words that are
        not in the index exclude nothing.
    """
    index = bi_word_inverted_index if is_bi_word else inverted_index
    excluded = set()
    for word in words:
        # .get() tolerates unknown terms instead of raising KeyError.
        excluded.update(index.get(word.strip(), ()))
    # Build a fresh list: removing items from not_result itself would shrink
    # the shared document universe and corrupt every later NOT query.
    return [document for document in not_result if document not in excluded]

In [27]:
# Could have used set()
def positional_query(words,offset=10):
    """Return the documents in which the first two words occur close together.

    A document qualifies when it contains every word in ``words`` (per
    ``and_query``) and some position of ``words[0]`` is fewer than ``offset``
    positions away from some position of ``words[1]``. Only the first two
    words take part in the proximity test.

    Args:
        words: sequence of query terms; the first two are stripped of
            surrounding whitespace, matching what ``and_query`` does.
        offset: exclusive upper bound on the absolute position difference.

    Returns:
        A list of matching document ids in ``and_query`` order; an empty list
        when fewer than two words are given.
    """
    if len(words) < 2:
        # No pair of terms to compare (a single word used to raise IndexError).
        return []
    # and_query() looks terms up stripped, so the position lookups must too;
    # otherwise padded input raises KeyError here.
    first = words[0].strip()
    second = words[1].strip()
    result = []
    for document in and_query(words):
        first_positions = inverted_index[first][document]
        second_positions = inverted_index[second][document]
        # any() stops at the first close pair instead of scanning all pairs.
        if any(abs(pos_i - pos_j) < offset
               for pos_i in first_positions
               for pos_j in second_positions):
            result.append(document)
    return result

In [20]:
# Reuse the pickled indexes when both files exist; otherwise build them from the CSV data.
if all(map(os.path.isfile, ("inverted_index.pkl", "bi_word_inverted_index.pkl"))):
    read()
else:
    load_and_preprocess()

In [29]:
# Display the whole single-word index: term -> {document id: [token positions]}
inverted_index

{'content': {5: [0],
  10: [0],
  35: [1246],
  40: [1246],
  55: [2871],
  60: [2871],
  75: [3556],
  80: [3556, 5750, 6209],
  90: [4151],
  95: [3624, 3922]},
 'washington': {5: [1, 1925],
  10: [1, 1925, 6156, 6175, 6250, 6664],
  15: [1903],
  20: [1903, 5181, 5188, 6510],
  25: [444],
  30: [444, 2961, 3331, 3689, 3934, 4264, 4609],
  35: [464, 1476, 1576],
  40: [464, 1476, 1576],
  45: [347, 478, 652, 971, 1071, 1560, 1596, 1602, 1674, 2315, 2324, 2550],
  50: [347,
   478,
   652,
   971,
   1071,
   1560,
   1596,
   1602,
   1674,
   2315,
   2324,
   2550,
   4073,
   4087,
   4133,
   4137,
   4274],
  55: [2744],
  60: [2744, 4476, 5150, 5713],
  65: [0, 635, 646, 1100, 1109, 1124, 1236, 1248, 1263, 1390, 1460],
  70: [0,
   635,
   646,
   1100,
   1109,
   1124,
   1236,
   1248,
   1263,
   1390,
   1460,
   3000,
   3490],
  75: [4505],
  80: [4505, 6306],
  90: [3214, 3795],
  95: [0, 499, 818, 1275, 2070, 2575, 2758, 3105, 5490, 6139, 6478, 6680]},
 'congressional'

In [28]:
# Documents where 'content' and 'washington' occur fewer than 10 positions apart (default offset)
positional_query(['content','washington'])

[5, 10]