In [2]:
import os
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import copy
# Fetch the NLTK resources used below; packages that are already installed are simply reported as up-to-date.
nltk.download('stopwords')  # English stop-word list read through stopwords.words('english')
nltk.download('punkt')  # tokenizer models that word_tokenize relies on
nltk.download('wordnet')  # NOTE(review): no code visible in this notebook uses WordNet -- confirm it is still needed
nltk.download('omw-1.4')  # NOTE(review): companion data for WordNet; same question as above
import string


[nltk_data] Downloading package stopwords to C:\Users\Samyak
[nltk_data]     Jain\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Samyak
[nltk_data]     Jain\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Samyak
[nltk_data]     Jain\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\Samyak
[nltk_data]     Jain\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [3]:
data_dir = '../Humor,Hist,Media,Food'
# os.listdir returns the entries in arbitrary order; sorting them keeps the
# docID assignment below stable across runs and machines.
file_names = sorted(os.listdir(data_dir))
file_paths = [(data_dir + '/' + fname) for fname in file_names] #path of every file, aligned index-by-index with file_names
# docID -> file name; a document's ID is its index in file_names (and therefore in file_paths)
docID_to_doc_mapping = dict(enumerate(file_names))

Preprocessing Part (for file texts and query text)

In [4]:
def remove_punct(tok):
    '''
        Strip every character listed in string.punctuation from a token.
        Input: tok - a single token string
        Returns: the token without those characters (an empty string if nothing else was in it)
    '''
    drop_table = str.maketrans('', '', string.punctuation)
    return tok.translate(drop_table)
def remove_space(tok):
    '''
        Delete every space character (' ') from a token; other whitespace is left untouched.
        Input: tok - a single token string
        Returns: the token without spaces (an empty string if it consisted only of spaces)
    '''
    return tok.replace(' ', '')

def preprocess_file(file_text):
    '''
        This function preprocesses the file text.
        Input: file_text in string form representing the text of a file
        Returns: cleaned_toks, the list of word tokens left after preprocessing, in their original order

        Steps: lowercase -> word_tokenize -> drop English stop words -> strip punctuation -> strip spaces.
        Tokens that end up empty are dropped.
        Note: stop words are filtered before punctuation is stripped, so a token that only
        turns into a stop word once its punctuation is removed is kept.
    '''

    #a set gives constant-time membership tests (a list would be scanned once per token)
    stop_words = set(stopwords.words('english'))

    #converting the text to lowercase and performing word tokenization
    file_toks = word_tokenize(file_text.lower())

    cleaned_toks = []
    for tok in file_toks:
        #removing the stopwords from tokens
        if tok in stop_words:
            continue

        #removing punctuation marks; a token made only of punctuation disappears here
        ctok = remove_punct(tok)
        if ctok == "":
            continue

        #removing blank spaces; keep the token only if something is left
        ctok = remove_space(ctok)
        if ctok != "":
            cleaned_toks.append(ctok)

    return cleaned_toks

def cleanQuery(query_text):
    '''
        Preprocessing the query text
        Input: query_text, string of the phrase query text
        Returns: cleaned_toks, a list containing the preprocessed query tokens
    '''

    #The query must go through exactly the same steps as the file text (lowercasing, tokenization,
    #stop-word removal, punctuation and space stripping), otherwise query tokens would not line up
    #with the index terms. Delegating to preprocess_file keeps the two pipelines from drifting apart.
    return preprocess_file(query_text)

Reading the files and preprocessing

In [5]:
def read_file(fpaths):
    '''
        Reads the files and preprocesses every file's text to form word tokens for every file.
        Input: fpaths - iterable of file paths
        Returns: a 2-D list; entry i holds the word tokens of the i-th path in fpaths
        Files are decoded as UTF-8 and undecodable bytes are silently skipped (errors='ignore').
    '''
    file_tokens = []
    for fpath in fpaths:
        #the with-block closes the handle as soon as the text is read, even if reading raises
        with open(fpath, 'r', encoding='utf-8', errors='ignore') as f:
            ftxt_unprocessed = f.read() #read the text of the file
        ftoks = preprocess_file(ftxt_unprocessed) #preprocessing the text to form word tokens
        file_tokens.append(ftoks)
    return file_tokens


In [6]:
def getDocsFromID(docID_to_doc, doc_IDs):
    '''
        Translate document IDs into document names.
        Input: docID_to_doc - mapping docID -> doc_name, doc_IDs - list of document IDs to look up
        Returns: list of document names, in the same order as doc_IDs
        An ID that is missing from the mapping raises KeyError.
    '''
    return [docID_to_doc[doc_ID] for doc_ID in doc_IDs]

In [7]:
file_toks = read_file(file_paths) #one token list per file; file_toks[i] belongs to the document whose docID is i

Creating Positional Index

In [8]:
def create_positional_index(file_toks):
    '''
        This function builds the positional index.
        Input: file_toks - 2-D list; file_toks[i] is the token list of the document with ID i
        Returns: (positional_index, pidx_freq)
            positional_index - dict, keys in sorted term order; each value is a list of
                               [docID, positions] entries ordered by docID, where positions is the
                               ascending list of token offsets of the term inside that document
            pidx_freq        - dict term -> document frequency (number of documents containing the term)
    '''

    #term -> {docID -> [positions]}
    postings_by_term = {}
    for doc_id, doc_tokens in enumerate(file_toks):
        for position, tok in enumerate(doc_tokens):
            #documents and positions are visited in increasing order, so the docIDs of a term and the
            #positions inside each document come out already sorted - no extra sort or copy is needed
            postings_by_term.setdefault(tok, {}).setdefault(doc_id, []).append(position)

    #Converting the internal per-term dictionary to a nested list, with the terms in sorted order
    positional_index = {
        term: [[doc_id, positions] for doc_id, positions in postings_by_term[term].items()]
        for term in sorted(postings_by_term)
    }

    #building the document frequency dictionary: one posting entry per document containing the term
    pidx_freq = {term: len(postings) for term, postings in positional_index.items()}

    return positional_index, pidx_freq


In [9]:
def get_postings(pidx, term):
    '''
        Look up the posting list of a term.
        Input: pidx - positional index, term
        Returns: the posting list stored for the term, or [] when the term is not in the index
    '''
    return pidx.get(term, [])

In [10]:
def get_frequency(pidx_freq, term):
    '''
        Look up the frequency value of a term.
        Input: pidx_freq - dictionary term -> frequency value, term
        Returns: the stored frequency value, or 0 when the term is not in the dictionary
    '''
    return pidx_freq.get(term, 0)

In [11]:
def get_term_index_info(pidx, pidx_freq, term):
    '''
        Fetch both the posting list and the frequency value of a term.
        Input: pidx - positional index, pidx_freq - dictionary term -> frequency value, term
        Returns: (posting list, frequency value) when the term is in pidx;
                 otherwise prints a notice and returns ([], 0)
    '''
    if term in pidx:
        return pidx[term], pidx_freq[term]

    print("Term not found in index")
    return [], 0

In [12]:
positional_index, idx_frequency_array = create_positional_index(file_toks) #build the positional index over all documents
#positional_index: term -> list of [docID, positions] entries, i.e. which documents contain the term and where it occurs in each
#idx_frequency_array: term -> number of documents containing the term (it is a dictionary, despite the name)


Phrase Query Searching

In [13]:
def check_all_in_range(ptrs, ranges):
    '''
        Tell whether every pointer is still strictly below its own upper bound.
        Input: ptrs - list of pointer values, ranges - list of exclusive upper bounds (ranges[i] bounds ptrs[i])
        Returns: True when ptrs[i] < ranges[i] holds for every i (also for an empty ptrs), False otherwise
    '''
    return not any(ptrs[idx] >= ranges[idx] for idx in range(len(ptrs)))

def process_phrase_query(positional_index, query_toks):
    '''
        This function processes the phrase query.
        Input: positional_index - term -> list of [docID, positions] entries,
               query_toks - preprocessed query tokens, in phrase order
        Returns: num_docs - number of documents retrieved, doc_names - names of retrieved documents

        Returns 0, [] when the query has no tokens or when any token is missing from the index
        (a notice is printed in the latter case). A single-token query is answered directly from
        that term's posting list; longer queries are handed to perform_phrase_query.
        Document names are resolved through the module-level docID_to_doc_mapping.
    '''
    #Nothing to search for; without this guard the phrase search would index query_toks[0] and fail
    if(len(query_toks) == 0):
        return 0, []

    #Every query token has to be an index term, otherwise no document can contain the phrase
    if(any(tok not in positional_index for tok in query_toks)):
        print("A query term is not present in index.")
        return 0, []

    #A one-word "phrase" matches every document in that term's posting list
    if(len(query_toks) == 1):
        doc_IDs = [posting[0] for posting in positional_index[query_toks[0]]]
        doc_names = getDocsFromID(docID_to_doc_mapping, doc_IDs)
        return len(doc_IDs), doc_names

    #All query tokens are present in the index terms: perform phrase query searching
    num_docs, doc_names = perform_phrase_query(positional_index, query_toks)
    return num_docs, doc_names

def perform_phrase_query(positional_index, query_toks):
    '''
        This function performs phrase query searching and has the algorithm for it.
        Input: positional_index - term -> list of [docID, positions] entries,
               query_toks - query tokens, in phrase order
        Returns: num_docs - number of documents retrieved, doc_names - names of those documents,
                 looked up in the module-level docID_to_doc_mapping

        Outline:
          1. Keep one "outer" pointer per query token into that token's posting list and advance the
             pointer sitting on the smallest docID until all pointers agree on one docID.
          2. For such a common document, keep one "inner" pointer per token into its positions list and
             look for positions p, p+1, ..., p+n_toks-1 (one per token, in query order).
          3. Record the docID at the first such occurrence, then advance all outer pointers.

        Every token must be a key of positional_index (it is indexed directly, so a missing term raises KeyError).
        query_toks must not be empty: with no pointers check_all_in_range reports True, all_equal stays True,
        and query_toks[0] is then indexed.
        NOTE(review): the pointer walk only moves forward, so it presumably relies on posting lists being
        ordered by docID and positions being ascending -- confirm against the index builder.
    '''
    n_toks = len(query_toks) #number of tokens in the phrase query
    outer_pointers = [0] * n_toks #one outer pointer per query token, indexing into that token's posting list
    outer_ranges = [len(positional_index[query_toks[i]]) for i in range(n_toks)] #length of each token's posting list (exclusive bound for its outer pointer)
    answer = [] #docIDs of the documents found to contain the phrase
    while(check_all_in_range(outer_pointers, outer_ranges)): #stop as soon as any posting list is exhausted

        all_equal = True
        #Compare neighbouring outer pointers: if any adjacent pair points at different docIDs, the pointers do not agree yet.
        for i in range(n_toks - 1):
            if(positional_index[query_toks[i]][outer_pointers[i]][0] != positional_index[query_toks[i + 1]][outer_pointers[i + 1]][0]):
                all_equal = False
                break

        if(all_equal == False): #the outer pointers point at different documents
            #Find the outer pointer on the smallest docID (the first one wins on ties) and advance only that one.
            least_ptr = 0
            for i in range(1, n_toks):
                if(positional_index[query_toks[least_ptr]][outer_pointers[least_ptr]][0] > positional_index[query_toks[i]][outer_pointers[i]][0]):
                    least_ptr = i
            outer_pointers[least_ptr] += 1

        if(all_equal == True): #every query token occurs in the same document
            doc_ID = positional_index[query_toks[0]][outer_pointers[0]][0] #the docID shared by all outer pointers
            posting_positions_lists = [] #posting_positions_lists[i] = positions of the i_th query token inside doc_ID
            for i in range(n_toks):
                posting_positions_lists.append(positional_index[query_toks[i]][outer_pointers[i]][1])

            j = 1 #index of the query token currently being aligned with its predecessor (token j - 1)
            inner_pointers = [0] * n_toks #one inner pointer per query token, indexing into its positions list
            inner_ranges = [len(posting_positions_lists[i]) for i in range(n_toks)] #length of each positions list (exclusive bound for its inner pointer)
            flag = True #set to False once the phrase has been found in this document
            while(flag == True and check_all_in_range(inner_pointers, inner_ranges)): #stop as soon as any positions list is exhausted

                #Count the adjacent token pairs whose current positions are consecutive (position of token i == position of token i-1 plus one).
                cntr = 0
                for i in range(1, n_toks):
                    if(posting_positions_lists[i][inner_pointers[i]] == posting_positions_lists[i - 1][inner_pointers[i - 1]] + 1):
                        cntr += 1

                if(cntr == n_toks - 1): #all pairs are consecutive: the inner pointers spell out the phrase
                    flag = False
                    answer.append(doc_ID) #one occurrence is enough; record the document and stop scanning it
                    break
                else:
                    #Move the j_th inner pointer forward past every position that is not after the position of token j - 1.
                    while(inner_pointers[j] < inner_ranges[j] and (posting_positions_lists[j][inner_pointers[j]] <= posting_positions_lists[j - 1][inner_pointers[j - 1]])):
                        inner_pointers[j] += 1
                    #If the j_th pointer ran off its list nothing more is done here and the enclosing loop condition ends the scan.
                    if(inner_pointers[j] < inner_ranges[j]):

                        if(posting_positions_lists[j][inner_pointers[j]] == posting_positions_lists[j - 1][inner_pointers[j - 1]] + 1):
                            #tokens j - 1 and j are adjacent: move on to aligning the next query token
                            j += 1
                        else:
                            #token j comes later than "previous + 1": give up on the current start position,
                            #restart the alignment from the second token and try the next position of the first token
                            j = 1
                            inner_pointers[0] += 1

            #This document has been handled (match or not): move every outer pointer to its next posting.
            for i in range(n_toks):
                outer_pointers[i] += 1

    #Now we have our answer
    num_docs = len(answer) #number of documents retrieved
    doc_names = getDocsFromID(docID_to_doc_mapping, answer) #names of documents retrieved

    return num_docs, doc_names

In [27]:
MAX_QUERY_TOKENS = 5 #longest phrase, counted in tokens left after preprocessing, that this cell accepts

query_input = input("Enter the phrase query : ")
query_toks = cleanQuery(query_input) #same preprocessing as the indexed documents, so tokens line up with index terms
print(f"\nInput Phrase Query: {query_input}")
print(f"Query tokens : {query_toks}\n")
if(len(query_toks) > MAX_QUERY_TOKENS):
    print("Please enter query within length limit.")
elif(len(query_toks) == 0):
    #e.g. the query consisted only of stop words and/or punctuation
    print("No query tokens to search after preprocessing")
else:
    num_docs_retreived, doc_names_retreived = process_phrase_query(positional_index, query_toks)
    print(f"Number of documents retreived : {num_docs_retreived}")
    print(f"Names of documents retreived : {doc_names_retreived}\n")


Input Phrase Query: different countries in europe
Query tokens : ['different', 'countries', 'europe']

Number of documents retreived : 1
Names of documents retreived : ['aboutada.txt']

