### Imports

In [1]:
import pickle

In [2]:
import os

In [3]:
import re

In [4]:
import nltk

In [5]:
# nltk.download('all')

In [6]:
from nltk.tokenize import word_tokenize

In [7]:
from nltk.corpus import stopwords

In [8]:
# English stop words from NLTK, stored as a set so the `token not in stop_words`
# filter in preprocess() is a constant-time lookup.
stop_words = set(stopwords.words('english'))

## (iii) Compare and comment on your results using (i) and (ii). [5 Marks]
1. Input Format:

a. The first line contains N denoting the
number of queries to execute

b. The next N lines contain phrase queries

#### Load the doc map

In [9]:
# Directory whose entries are enumerated to build doc_map (doc id -> file name).
PATH_TO_PREPROCESSED_DATA= "./preprocessed_data"

In [10]:
# doc id -> file name. The id is simply the enumeration index of os.listdir(),
# whose ordering is arbitrary (filesystem dependent).
# NOTE(review): presumably the saved indexes were built with this same
# ordering, otherwise ids would resolve to the wrong file names — confirm.
doc_map = dict(enumerate(os.listdir(PATH_TO_PREPROCESSED_DATA)))

#### Load the inverted bigram index

In [11]:
# Directory holding the pickled index files that are loaded below.
PATH_TO_SAVED = "./Saved"

In [12]:
# Load the pickled bigram index; main() queries it with (token, next_token) tuples.
# No "" placeholder is assigned first: a string stand-in would survive a failed
# load and only surface later as a confusing error far from the cause.
# SECURITY: pickle.load can execute arbitrary code, so only load index files
# that you produced yourself.
with open(os.path.join(PATH_TO_SAVED,'inverted_bigram_index.pickle'),'rb') as f:
    inverted_bigram_index = pickle.load(f)

In [13]:
# Sanity check: number of entries in the loaded bigram index.
len(inverted_bigram_index)

84208

#### Load the positional index

In [14]:
# Load the pickled positional index; the query helpers below look tokens up in it.
# No "" placeholder is assigned first: a string stand-in would survive a failed
# load and only surface later as a confusing error far from the cause.
# SECURITY: pickle.load can execute arbitrary code, so only load index files
# that you produced yourself.
with open(os.path.join(PATH_TO_SAVED,'pos_index.pickle'),'rb') as f:
    pos_index = pickle.load(f)

#### Executing the query for positional index


In [15]:
def preprocess(text):
    """Turn raw text into a list of lowercase tokens without stop words.

    Steps, in order: lowercase the text, replace each listed punctuation
    character with a space, tokenize with ``word_tokenize``, then drop every
    token that is in ``stop_words`` or is a single space.

    Args:
        text: the raw string to process.

    Returns:
        list of the surviving tokens, in their original order.
    """
    # Punctuation is replaced by a space (not removed) so "a-b" yields two tokens.
    cleaned = re.sub(r'[+*/\\\-?.>,<\"\';:!@#$%^&()_`~]', ' ', text.lower())

    # Both filters are applied in a single pass over the tokens.
    return [
        word
        for word in word_tokenize(cleaned)
        if word not in stop_words and word != " "
    ]


In [16]:
def get_inp():
    """Prompt for the number of phrase queries, then read that many queries.

    Returns:
        list[str]: the raw query strings in the order they were entered.
        Empty when the user asks for zero (or a negative number of) queries.

    Raises:
        ValueError: if the query count typed by the user is not an integer.
    """
    print("Execute Query ------>\n")

    N = int(input("Enter the number of queries to execute -->"))

    list_of_ip_seq = []

    for _ in range(N):
        # Append inside the loop so that every query is kept, not only the
        # last one (and so that N == 0 never touches an unassigned variable).
        list_of_ip_seq.append(input("\nEnter the phrase query ---->"))

    return list_of_ip_seq

In [17]:
def get_doc_id_for_token(token):
    """Return element 1 of the ``pos_index`` entry stored for ``token``.

    NOTE(review): the original comment called this "the list of docids";
    main() hands the same element to pos_intersect as a list of
    ``{doc_id: positions}`` dicts — confirm against the index builder.

    Raises:
        KeyError: if ``token`` is not a key of ``pos_index``.
    """
    entry = pos_index[token]
    return entry[1]
    

In [18]:
def check(list_of_tok):
    """Report whether every token of ``list_of_tok`` is a key of ``pos_index``.

    Returns:
        True when no token is missing (which includes the empty list),
        False as soon as one token is not in the index.
    """
    return all(tok in pos_index.keys() for tok in list_of_tok)

In [19]:
def get_list_for_token(token):
    """Look up ``token`` in ``pos_index`` and return the stored entry unchanged.

    Raises:
        KeyError: if ``token`` is not a key of ``pos_index``.
    """
    entry = pos_index[token]
    return entry

In [20]:
def convert_to_dict(list_of_dict):
    """Merge a list of dicts into one dict, taking only each dict's first item.

    Args:
        list_of_dict: iterable of dicts; only the first key/value pair of each
            dict is used, and a later dict overrides an earlier one that has
            the same first key.

    Returns:
        dict mapping each first key to its value.

    Raises:
        IndexError: if any dict in the list is empty.
    """
    merged = {}

    for entry in list_of_dict:
        first_key = list(entry)[0]
        merged[first_key] = entry[first_key]

    return merged

In [21]:
def pos_intersect(pos_list_1, pos_list_2, k):
    """Find the documents where term 2 occurs exactly ``k`` positions after term 1.

    Args:
        pos_list_1: list of ``{doc_id: [positions]}`` dicts for the first term
            (converted with ``convert_to_dict``).
        pos_list_2: the same structure for the second term.
        k: required difference ``pos2 - pos1`` between a position of the
            second term and a position of the first term.

    Returns:
        list of doc ids, each at most once, in the order in which they appear
        in ``pos_list_1``.
    """
    postings_1 = convert_to_dict(pos_list_1)
    postings_2 = convert_to_dict(pos_list_2)

    ans = []

    for doc_id, positions_1 in postings_1.items():
        if doc_id not in postings_2:
            continue

        # A set makes each "is pos1 + k a position of term 2?" test O(1)
        # instead of a scan over every position of the second term.
        positions_2 = set(postings_2[doc_id])

        # any() stops at the first matching pair; the former `break` only left
        # the inner loop and kept scanning the remaining positions.
        if any(pos1 + k in positions_2 for pos1 in positions_1):
            ans.append(doc_id)

    return ans
                        
                    
            

#### Executing the query for bigram index


In [22]:
def AND_opr(x,y):
    """Intersect two ascending-sorted lists with a two-pointer merge.

    Args:
        x: first list, sorted in ascending order.
        y: second list, sorted in ascending order.

    Returns:
        list of the values present in both inputs, in ascending order; a value
        repeated in both lists is emitted once per matched pair.
    """
    common = []
    i = j = 0
    len_x, len_y = len(x), len(y)

    while i < len_x and j < len_y:
        left, right = x[i], y[j]

        if left < right:
            # The smaller head cannot match anything later in the other list.
            i += 1
        elif left == right:
            common.append(left)
            i += 1
            j += 1
        else:
            j += 1

    return common
        



In [32]:
def _doc_names(doc_ids):
    """Translate doc ids to file names via ``doc_map``, in ascending id order."""
    return [doc_map[doc_id] for doc_id in sorted(doc_ids)]


def _positional_phrase_docs(list_of_tokens):
    """Return the set of doc ids that match the phrase per the positional index.

    ``list_of_tokens`` must be non-empty and contain only indexed tokens.
    """
    if len(list_of_tokens) == 1:
        # A one-token phrase matches every document that has a posting for it.
        return set(convert_to_dict(get_list_for_token(list_of_tokens[0])[1]))

    pairwise = []  # one doc-id set per token pair (i, j) with i < j

    for i in range(len(list_of_tokens) - 1):
        postings_i = get_list_for_token(list_of_tokens[i])[1]

        # Expected gap between token i and token j: the lengths of tokens
        # i .. j-1 plus one per token step.
        # NOTE(review): this presumes positions are character offsets in
        # single-space-joined text — confirm against the index builder.
        dis = 0
        for j in range(i + 1, len(list_of_tokens)):
            dis += len(list_of_tokens[j - 1]) + 1
            postings_j = get_list_for_token(list_of_tokens[j])[1]
            pairwise.append(set(pos_intersect(postings_i, postings_j, dis)))

    # NOTE(review): the pairs are combined per document only, so the pairwise
    # matches are not forced to belong to one and the same occurrence.
    return set.intersection(*pairwise)


def _bigram_phrase_docs(list_of_tokens):
    """Return the doc ids common to every consecutive bigram of the phrase.

    ``list_of_tokens`` must hold at least two tokens. A bigram that is absent
    from ``inverted_bigram_index`` contributes an empty posting list.
    """
    bigrams = list(zip(list_of_tokens, list_of_tokens[1:]))

    print(bigrams)

    res_doc_ids = inverted_bigram_index.get(bigrams[0]) or []

    for bigram in bigrams[1:]:
        res_doc_ids = AND_opr(res_doc_ids, inverted_bigram_index.get(bigram) or [])

    return res_doc_ids


def main():
    """Read phrase queries and answer each with both indexes.

    For every query returned by ``get_inp`` the text is preprocessed, then the
    number and names of matching documents are printed, first for the
    positional index and then for the bigram inverted index. Queries that are
    empty after preprocessing, contain a token missing from ``pos_index``, or
    have more than 5 tokens are reported and skipped. A single-token query is
    answered from the positional index only.
    """
    list_of_queries = get_inp()

    for index,query in enumerate(list_of_queries):

        list_of_tokens = preprocess(query)

        # check() accepts an empty token list, so catch that case first.
        if not list_of_tokens:
            print(f"The Query {index + 1} has no tokens left after preprocessing and hence cannot be processed")
            continue

        #check if the tokens in query is present in our corpus
        if not check(list_of_tokens):
            print(f"The Query {index + 1} cannot be computed as the tokens in the query do not exist in the corpus")
            continue #hence moving onto the next query

        if len(list_of_tokens) >5:
            print(f"Phrase query {index + 1} has more than 5 tokens and hence cannot be processed")
            continue #hence continue to the next query

        ## Extraction from positional index
        final_res = _positional_phrase_docs(list_of_tokens)

        print(f"Number of documents retrieved for query {index + 1} using positional index: {len(final_res)}")
        print(f"Names of documents retrieved for query {index + 1} using positional index: {_doc_names(final_res)}")

        ## Extraction from bigram index
        if len(list_of_tokens) < 2:
            # A single token forms no bigram, so this index cannot answer it.
            print(f"Query {index + 1} has a single token and hence cannot be run on the bigram inverted index")
            continue

        res_doc_ids = _bigram_phrase_docs(list_of_tokens)

        print(f"Number of documents retrieved for query {index + 1} using bigram inverted index: {len(res_doc_ids)}")

        # Names come from the bigram result itself, not from the positional result.
        # NOTE(review): presumes both indexes share the doc ids of doc_map — confirm.
        print(f"Names of documents retrieved for query {index + 1} using bigram inverted index: {_doc_names(res_doc_ids)}")
        
                    
            
            

In [33]:
# Scratch check: character length of the token 'extend'.
len('extend')

6

In [34]:
# Scratch check — presumably a start position of 463, plus len('extend') == 6,
# plus 1 for the separator, giving where the next token should start; TODO confirm.
463+6+1

470

In [26]:
# Scratch check: character length of the token 'range'.
len("range")

5

In [27]:
# Scratch check — presumably the expected start of the token two steps after
# one that starts at 463: both token lengths plus 2 separators; TODO confirm.
463+len('extend')+len('range')+2

476

In [28]:
# Scratch check: the gap main() passes to pos_intersect for a token pair that
# is two steps apart (the two token lengths plus 2).
len('extend')+len('range')+2

13

In [35]:
# Run the interactive query loop; the queries are typed in through input().
main()

Execute Query ------>

Enter the number of queries to execute -->1

Enter the phrase query ---->lines method superposition linearized
Number of documents retrieved for query 1 using positional index: 1
Names of documents retrieved for query 1 using positional index: ['cranfield1343']
[('lines', 'method'), ('method', 'superposition'), ('superposition', 'linearized')]
Number of documents retrieved for query 1 using bigram inverted index: 1
Names of documents retrieved for query 1 using bigram inverted index: ['cranfield1343']
