In [None]:
import pandas as pd
import numpy as np
import os
import shutil
from pathlib import Path
import nltk
import string
import json
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
nltk.download('stopwords')
import pickle

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
def process_query(query):
    """
    The function [process_query] processes the query as follows:
    1. lowercase the query
    2. replace every punctuation character with a space
    3. tokenize the query into runs of word characters
    4. stopword removal (english stopword removal)

    Arguments : Query (String)
    Returns : Query Tokens (List)
    """
    query = query.lower()
    # Punctuation is swapped for spaces (not deleted) so "hello,world" still
    # splits into two tokens.
    query = query.translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation)))
    tokens = RegexpTokenizer(r'\w+').tokenize(query)
    # Fetch the stopword list once per call and keep it as a set; calling
    # stopwords.words() inside the filter rebuilt the list for every token.
    stop_words = set(stopwords.words('english'))
    return [word for word in tokens if word not in stop_words]

#......

In [None]:
"""
The following code reads the positional_index that was already created and
stored before. It also reads the mapping from document ids to document name.
The data is stored in the form of bytes by the pickle library.
"""

# NOTE(review): unpickling can run arbitrary code, so only load index files
# that were produced by a trusted run of the indexing notebook.
# The ".txt" files hold pickled bytes, hence binary mode; the `with` blocks
# close each file even if reading or unpickling fails.
with open("positional_index.txt", "rb") as f:
    positional_index = pickle.load(f)

with open("id_to_name.txt", "rb") as f:
    id_to_name = pickle.load(f)

#......

In [None]:
def doc_contains_phrase(postion_term1, position_term2):
    """
    The function [doc_contains_phrase] checks whether term2 occurs directly
    after term1 somewhere in a document.

    Arguments : The positions at which term1 occurs in document (List)
                The positions at which term2 occurs in document (List)
    Returns : True if some position p of term1 has p + 1 among the positions
              of term2, False if not (Boolean)
    """
    # A set gives constant-time membership tests; scanning the term2 list for
    # every term1 position was quadratic on long postings.
    following = set(position_term2)
    return any((pos + 1) in following for pos in postion_term1)

#......

In [None]:
def phrase_retrieval(query,positional_index):
    """
    The function [phrase_retrieval] is the core function for phrase retrieval.

    Arguments : Query (String)
                Positional_Index (Dictionary: token -> {document id -> list of
                positions of that token in the document})
    Returns : List of documents (names) containing the query tokens as one
              contiguous phrase (List)

    Document ids are translated to names through the module-level
    [id_to_name] mapping.

    The function works the following way:
    1. processes the query with the help of [process_query] function to get tokens.
    2. for every document containing the first token, remembers the positions
       at which that token occurs (the positions where the phrase "ends" so far).
    3. for each following token, keeps only the documents in which the token
       occurs exactly one position after one of the remembered end positions,
       and replaces the remembered positions by those new end positions.
    4. documents that survive all tokens contain the whole phrase.

    Carrying the end positions forward is what makes queries of three or more
    tokens correct: checking each neighbouring pair of tokens on its own would
    accept "a b x b c" for the query "a b c".
    """
    query_tokens = process_query(query)
    if not query_tokens:
        return []

    # doc id -> positions at which the phrase matched so far ends
    first_postings = positional_index.get(query_tokens[0], {})
    phrase_ends = {doc: set(positions) for doc, positions in first_postings.items()}

    for token in query_tokens[1:]:
        postings = positional_index.get(token, {})
        extended = {}
        for doc, ends in phrase_ends.items():
            if doc not in postings:
                continue
            # Keep only occurrences of this token that directly continue the phrase
            new_ends = {pos for pos in postings[doc] if (pos - 1) in ends}
            if new_ends:
                extended[doc] = new_ends
        phrase_ends = extended
        if not phrase_ends:
            break  # no document can match any more

    matching_docs = list(phrase_ends)
    # Multi-token results are returned in ascending document id order;
    # a single-token query keeps the order of the index entry.
    if len(query_tokens) > 1:
        matching_docs.sort()
    return [id_to_name[doc] for doc in matching_docs]

#.....

In [None]:
# Interactive query loop. A blank line, end-of-input, or a kernel interrupt
# ends the loop cleanly instead of leaving the cell to die with a traceback.
while True:
    try:
        query = input("Enter Phrase Query (blank to quit): ")
    except (EOFError, KeyboardInterrupt):
        break
    if not query.strip():
        break
    documents = phrase_retrieval(query,positional_index)
    print("Number of Documents Retrieved :",len(documents))
    print("The Documents containing ("+query+") are:")
    for name in documents:
        print(name)


Number of Documents Retrieved : 13
The Documents containing (michael jackson) are:
aboutada.txt
bnbeg2.4.txt
bugbreak.hum
bugs.txt
episimp2.txt
filmgoof.txt
hotnnot.hum
how.bugs.breakd
interv.hum
jokes
pepsideg.txt
popconc.hum
talebeat.hum


KeyboardInterrupt: Interrupted by user

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=04d84d63-ff1d-4672-9f78-d034a2868658' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>