In [1]:
import nltk
from nltk import word_tokenize
# from nltk.corpus import stopwords -- we are not removing stop words because they also play role for postional indexing
# stop_words = set(stopwords.words('english')) 

from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

import pickle
import os


### For loading and storing Data Structures

In [2]:
def pickle_data(filename,file):
    filename = os.path.join('./pickle',filename)
    out = open(filename, 'wb')
    pickle.dump(file, out)
    out.close()
    
def load_pickle_data(file):
    file = os.path.join('./pickle',file)
    out = open(file, 'rb')
    index_dict1 = pickle.load(out)
    out.close()
    return index_dict1

### Preprocessing

In [5]:
def remove_metadata(lines):
    for i in range(len(lines)):
        if lines[i] == '\n':
            start = i + 1
            break
    return lines[start:]

def process_words(words):
    mod_words = []
    symbols = [ "'",'/','.','-','!','@','#','$','^','&','*','(',')','+']
    #words = [word.replace(sym,'') for word in words for sym in symbols if sym in word]
    removed_symbols = []
    for word in words :
        for sym in symbols:
            if sym in word:
                word = word.replace(sym,'')
        removed_symbols.append(word)
        
    words = removed_symbols
    del removed_symbols
        
    for word in words:
        word = word.lower()
        if word.isalnum():
            word = lemmatizer.lemmatize(word)
            #if word not in stop_words and len(word) > 2:
            if len(word) >= 2:
                mod_words.append(word)
    return mod_words
            
            
        
    
def process_text(lines, positional_dict,doc_ID):
    ''' This method is handle all the text in the file.
    
        This will remove meta data, pre process the file and construct the positional dictionary.
        
        positional dictionary format :
            pos_dict = {
                        token : [doc_freq,{
                            file_id:[pos1, pos2,.....]
                            }]
                        }
    '''
    lines = remove_metadata(lines)
    seperator = ' '
    file = seperator.join(lines)
    words = word_tokenize(file)
    words = process_words(words)
    for pos,word in enumerate(words):
        if word in positional_dict.keys():
            positional_dict[word][0] += 1 # doc frequency of word 
            if doc_ID in positional_dict[word][1].keys():
                positional_dict[word][1][doc_ID].append(pos)
            else:
                positional_dict[word][1][doc_ID] = [pos] # adding the new file id and its position
            
        else :
            dummy_dict = {}
            dummy_dict[doc_ID] = [pos]
            positional_dict[word] = [1, dummy_dict] # freq is 1 and its corresponding dictionary
            
    return positional_dict
        
        
    


In [6]:

file_mapper = {}

root_dirs = ['./data/rec.motorcycles', './data/comp.graphics']
doc_ID = 0
positional_dict = {}
for fold in root_dirs:
    print('Processing files in : {}'.format(fold))
    for file in os.listdir(fold):
        path = os.path.join(fold,file)
        
        with open(path , 'r') as f:
            lines = f.readlines()
            positional_dict = process_text(lines,positional_dict,doc_ID)
            file_mapper[doc_ID] = path
            doc_ID += 1
            if doc_ID % 100 == 0:
                print('Processing of {} files in {} is completed '.format(doc_ID,fold))

# pickling the dictionary
pickle_data('positional_dict.pkl',positional_dict)
pickle_data('pos_file_mapper.pkl',file_mapper)

Processing files in : ./data/rec.motorcycles
Processing of 100 files in ./data/rec.motorcycles is completed 
Processing of 200 files in ./data/rec.motorcycles is completed 
Processing of 300 files in ./data/rec.motorcycles is completed 
Processing of 400 files in ./data/rec.motorcycles is completed 
Processing of 500 files in ./data/rec.motorcycles is completed 
Processing of 600 files in ./data/rec.motorcycles is completed 
Processing of 700 files in ./data/rec.motorcycles is completed 
Processing of 800 files in ./data/rec.motorcycles is completed 
Processing of 900 files in ./data/rec.motorcycles is completed 
Processing of 1000 files in ./data/rec.motorcycles is completed 
Processing files in : ./data/comp.graphics
Processing of 1100 files in ./data/comp.graphics is completed 
Processing of 1200 files in ./data/comp.graphics is completed 
Processing of 1300 files in ./data/comp.graphics is completed 
Processing of 1400 files in ./data/comp.graphics is completed 
Processing of 1500 

#### Loading the dictionary

In [7]:
positional_dict = load_pickle_data('positional_dict.pkl')
file_mapper = load_pickle_data('pos_file_mapper.pkl')
len(positional_dict.keys())

26205

In [10]:
def retrieve_list(word):
    '''
    This will retrieve postings list of given token if exists
    '''
    ans = []
    if word in positional_dict.keys():
        #print('Term {} is present in the dictionary'.format(word))
        ans =  positional_dict[word]
    else:
        print('Term : {} not present in dictionary'.format(word))
    return ans


def positional_intersect(pos_list_1,pos_list_2,k):
    ans = []
    for file_id in pos_list_1.keys():
        if file_id in pos_list_2.keys():
            list_1 = pos_list_1[file_id]
            list_2 = pos_list_2[file_id]
            
            
            for pos1 in list_1:
                for pos2 in list_2 :
                    if pos2 - pos1 == k : #or pos1 - pos2 == k :
                        if file_id not in ans:
                            ans.append(file_id)
                            #print('file found.')
                        break
    return ans  
    

def process_query(query):
    results = []
    query = process_words(query)
    print('final query after preprocessing :')
    print(query)
    for i in range(len(query)):
        j = i + 1
        pos_list_1 = retrieve_list(query[i])
        while j < len(query):
            pos_list_2 = retrieve_list(query[j])
            
            swap = False
            if pos_list_1[0] > pos_list_2[0]:
                #print('Swapping')
                pos_list_1, pos_list_2 = pos_list_2, pos_list_1
                swap = True
            if swap :
                k = i - j
            else :
                k = j - i
            # Small postings list is always first one for optimisation
            ans = positional_intersect(pos_list_1[1], pos_list_2[1], k)
            results.append(ans)
            j += 1
    return results
    
def construct_dict(lists):
    results = {}
    for lis in lists:
        for ele in lis:
            if ele in results.keys():
                results[ele] += 1
            else:
                results[ele] = 1
    return results

def print_output(results):
    
    print('-----------------------')
    print('The most probable files for given phrasal query in descending order is : ')
    i  = 1
    total = sum(results.values())
    while len(results) > 0:
        count = max(results.values())
        prob = (count / total)
        for key in results.keys():
            if results[key] == count:
                print('file {} is {} with probability : {} '.format(i,file_mapper[key], prob))
                i += 1
                break
        del results[key]
        if i == 5 :
            break
        
def read_query():
    query = input().split()
    results = process_query(query)
    results = construct_dict(results)
    print_output(results)

In [12]:
read_query()

there is no larger tank available for the Hawk
final query after preprocessing :
['there', 'is', 'no', 'larger', 'tank', 'available', 'for', 'the', 'hawk']
-----------------------
The most probable files for given phrasal query in descending order is : 
file 1 is ./data/rec.motorcycles\103128 with probability : 0.02447980416156671 
file 2 is ./data/comp.graphics\38403 with probability : 0.009791921664626682 
file 3 is ./data/comp.graphics\39078 with probability : 0.009791921664626682 
file 4 is ./data/comp.graphics\39638 with probability : 0.009791921664626682 
file 5 is ./data/comp.graphics\38376 with probability : 0.0073439412484700125 
file 6 is ./data/comp.graphics\38377 with probability : 0.0073439412484700125 
file 7 is ./data/comp.graphics\38851 with probability : 0.0073439412484700125 
file 8 is ./data/comp.graphics\38853 with probability : 0.0073439412484700125 
file 9 is ./data/rec.motorcycles\102616 with probability : 0.006119951040391677 
