In [24]:
import pandas as pd
import numpy as np
import glob
import os
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

In [25]:
# Mount Google Drive inside the Colab runtime at /content/drive; the corpus
# directory used below (files_path) lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [26]:
# Fetch the NLTK data packages this notebook relies on.
nltk.download('stopwords')  # English stop-word list read by remove_stop_words()
nltk.download('punkt')      # presumably the tokenizer data behind word_tokenize() - TODO confirm for the installed NLTK version
nltk.download('wordnet')    # NOTE(review): no code visible in this notebook section uses WordNet - confirm it is still needed

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [27]:
files_path = "/content/drive/MyDrive/Humor,Hist,Media,Food"

# Collect the full path of every file found (recursively) under files_path.
# os.path.join builds the path with the platform separator instead of a
# hand-written "/" concatenation.
paths = [
    os.path.join(dirpath, filename)
    for dirpath, _dirnames, filenames in os.walk(files_path)
    for filename in filenames
]

print(len(paths))

1133


In [28]:
######################################   part a   ############################################

#Convert the text to lower case
def convert_lower_case(text):
    """Lower-case ``text`` using NumPy's element-wise string routine.

    Args:
        text: A string (or array-like of strings).

    Returns:
        The NumPy array produced by ``np.char.lower``; callers that need a
        plain string convert it with ``str(...)``.
    """
    lowered = np.char.lower(text)
    return lowered

#Perform word tokenization
def word_tokenization(text):
    """Split ``text`` into word tokens.

    Args:
        text: Any object; it is converted with ``str()`` before tokenizing.

    Returns:
        Whatever ``word_tokenize`` returns for the stringified text.
    """
    return word_tokenize(str(text))

#Remove stopwords from tokens
def remove_stop_words(tokens):
    """Drop English stop words from ``tokens``, preserving order.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        A new list with every token that is not in NLTK's English
        stop-word list.
    """
    # A set gives constant-time membership tests; the original list lookup
    # rescanned the whole stop-word list for every token.
    stop_words = set(stopwords.words('english'))
    return [word for word in tokens if word not in stop_words]

#Remove punctuation marks from tokens
def remove_punctuation(words):
    """Blank out punctuation inside each token.

    Every character that is neither a word character (``\\w``) nor
    whitespace (``\\s``) is replaced by a single space. Tokens that are
    the empty string after substitution are discarded; a token made only
    of punctuation becomes spaces and is therefore kept.

    Args:
        words: Iterable of string tokens.

    Returns:
        List of substituted tokens, in the original order.
    """
    substituted = (re.sub(r'[^\w\s]', ' ', token) for token in words)
    return [token for token in substituted if token != '']

#Remove blank space tokens
def remove_blank_space_tokens(words):
    """Re-split tokens on whitespace so no token is blank or contains spaces.

    Args:
        words: Iterable of string tokens, possibly containing or consisting
            of whitespace.

    Returns:
        List of the whitespace-separated pieces of all tokens, in order.
    """
    joined = ' '.join(words)
    return joined.split()

#Perform preprocessing
def preprocess_data(text):
    """Run the full preprocessing pipeline on ``text``.

    Steps, in order: lower-casing, word tokenization, stop-word removal,
    punctuation replacement, and removal of blank/space-containing tokens.

    Args:
        text: Raw document or query text.

    Returns:
        List of cleaned tokens.
    """
    result = word_tokenization(convert_lower_case(text))
    # The remaining stages each take a token list and return a token list.
    for stage in (remove_stop_words, remove_punctuation, remove_blank_space_tokens):
        result = stage(result)
    return result

In [29]:
######################################   part b   ############################################

def find_pos_of_list_having_doc(posting_list, doc):
    """Locate the posting for ``doc`` inside ``posting_list``.

    Args:
        posting_list: Sequence of postings whose element 0 is a document id.
        doc: Document id to look for.

    Returns:
        Index of the first posting whose element 0 equals ``doc``, or the
        empty string ``""`` when there is none (callers test ``!= ""``).
    """
    matches = (index for index, entry in enumerate(posting_list) if entry[0] == doc)
    return next(matches, "")

# Build the positional index over every file in `paths`.
#   positional_index[term] = [postings, total_occurrences]
#   postings               = [[doc_no, {token positions in that doc}], ...]
#   doc_map[doc_no]        = file name of that document
doc_no = 0
positional_index = {}
doc_map = {}

for path in paths:
    # The context manager closes the handle even if read() raises.
    with open(path, 'r', encoding='ISO-8859-1') as file:
        text = file.read().strip()
    tokens = preprocess_data(text)

    for position, term in enumerate(tokens):
        entry = positional_index.get(term)
        if entry is None:
            # First time the term is seen anywhere: one posting, count of 1.
            positional_index[term] = [[[doc_no, {position}]], 1]
            continue

        entry[1] += 1                      # total occurrences of the term
        postings = entry[0]
        # Documents are processed in increasing doc_no order, so a posting
        # for the current document, if present, is always the last one;
        # no scan of the whole posting list is needed.
        if postings[-1][0] == doc_no:
            postings[-1][1].add(position)
        else:
            postings.append([doc_no, {position}])

    filename = os.path.basename(path)
    doc_map[doc_no] = filename
    doc_no += 1

# Two-row frame: row 0 holds the postings, row 1 the occurrence counts.
df = pd.DataFrame(positional_index)
display(df)

Unnamed: 0,newsgroups,talk,bizarre,rigler,dao,nrc,ca,michael,subject,t,b,boxed,edition,message,id,1992dec11,033233,26164,sol,uvic,reply,to,organization,cadc,date,fri,11,dec,92,03,32,33,gmt,lines,372,board,game,copyright,1992,labs,...,couters,couterfeiting,niven,10megz,06601030305800,f0110030,kvm,stacc,halk,cided,ungrateful,cin,acheing,recommande,rb,natuurlijk,nemen,goeie,belgische,yeeaaah,dispensaries,samurais,dogfood,sprit,practises,bodys,hairdryer,yeeaaahhhhh,wechselstr,lotsa,niki,mmmhh,halfwit,butterman,snotface,robosig,errorfree,sigmaker,dowdy,laver
0,"[[0, {0, 1416}], [15, {0, 2039}], [23, {0, 232...","[[0, {1, 42, 103}], [3, {374}], [6, {2667, 101...","[[0, {104, 2, 43}], [4, {349, 53}], [15, {1715...","[[0, {1121, 3, 8, 48, 1588, 1589, 24}]]","[[0, {25, 4, 29, 1590}], [639, {1530}], [648, ...","[[0, {26, 5, 1591}], [450, {6594, 6598}], [593...","[[0, {1592, 27, 21, 6}], [1, {272, 97, 105}], ...","[[0, {1586, 7}], [17, {396}], [25, {645}], [29...","[[0, {9}], [10, {6}], [15, {129, 9, 4434, 4793...","[[0, {10, 1422, 145, 177, 1266, 212, 1298, 120...","[[0, {96, 1152, 11, 178, 146, 1267, 213, 1299,...","[[0, {171, 12}], [968, {668}]]","[[0, {13}], [12, {73}], [13, {255}], [39, {325...","[[0, {496, 14}], [15, {5377, 899, 4553, 16, 44...","[[0, {15}], [15, {17, 92, 118}], [23, {17}], [...","[[0, {16}]]","[[0, {17}]]","[[0, {18}]]","[[0, {19}], [23, {1795}], [210, {10}], [286, {...","[[0, {20}], [84, {10206}], [100, {716}], [843,...","[[0, {22}], [10, {531}], [22, {1360}], [25, {8...","[[0, {23}], [28, {22}], [35, {3560, 3534, 1233...","[[0, {28}], [15, {23}], [18, {281}], [23, {23}...","[[0, {30}]]","[[0, {31}], [10, {0}], [15, {2440, 3211, 2324,...","[[0, {32}], [39, {4}], [50, {761, 1299, 2020}]...","[[0, {33}], [1, {174}], [7, {93, 87}], [10, {2...","[[0, {34}], [15, {4463, 4850, 4211, 5206, 4983...","[[0, {35}], [15, {97, 74, 106}], [17, {712, 72...","[[0, {36}], [15, {3490, 4213}], [59, {18}], [7...","[[0, {37}], [1, {411}], [4, {414}], [15, {3715...","[[0, {38}], [1, {428}], [5, {124}], [7, {32, 1...","[[0, {39}], [15, {36}], [23, {36}], [25, {25}]...","[[0, {40}], [15, {4067, 37, 3889, 2482, 1234, ...","[[0, {41}]]","[[0, {97, 66, 452, 837, 231, 44, 556, 147, 53,...","[[0, {98, 1527, 356, 440, 1510, 776, 330, 491,...","[[0, {46}], [6, {108}], [12, {29}], [24, {912,...","[[0, {47}], [15, {2049, 3713, 2443, 3214, 2327...","[[0, {49}], [15, {5283}], [25, {700}], [29, {4...",...,"[[1130, {240}]]","[[1130, {262}]]","[[1130, {499}]]","[[1130, {512}]]","[[1131, {0}]]","[[1131, {1}]]","[[1131, {9}]]","[[1131, 
{10}]]","[[1131, {24}]]","[[1131, {28}]]","[[1131, {182}]]","[[1131, {274, 275}]]","[[1131, {289}]]","[[1131, {313}]]","[[1131, {318}]]","[[1131, {339}]]","[[1131, {342}]]","[[1131, {344}]]","[[1131, {345}]]","[[1131, {347}]]","[[1131, {374}]]","[[1131, {427}]]","[[1131, {443}]]","[[1131, {476}]]","[[1131, {572}]]","[[1131, {601}]]","[[1131, {604}]]","[[1131, {614}]]","[[1131, {672}]]","[[1131, {727}]]","[[1131, {748}]]","[[1131, {757}]]","[[1132, {50}]]","[[1132, {267, 61, 534}]]","[[1132, {268}]]","[[1132, {576}]]","[[1132, {577}]]","[[1132, {578}]]","[[1132, {615}]]","[[1132, {636}]]"
1,145,412,65,7,6,7,1176,405,567,8326,1222,3,75,500,147,1,1,1,16,5,172,209,199,1,836,71,1045,171,352,348,189,191,187,406,1,623,591,137,395,40,...,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,1,1,1,1,1


In [20]:
######################################   part c   ############################################

def check_tokens_in_index(tokens):
    """Tell whether every token is a key of the global ``positional_index``.

    Args:
        tokens: Iterable of query tokens.

    Returns:
        True if all tokens are present (also for an empty iterable),
        False as soon as one is missing.
    """
    return all(token in positional_index for token in tokens)

def find_word_positions_in_doc(posting_list, doc):
    """Return the positions stored for ``doc`` in ``posting_list``.

    Args:
        posting_list: Sequence of postings ``[doc_id, positions]``.
        doc: Document id to look up.

    Returns:
        The positions object (element 1) of the first posting whose
        element 0 equals ``doc``; an empty set when ``doc`` has no posting.
    """
    for posting in posting_list:
        if posting[0] == doc:
            return posting[1]
    # Return an empty *set* (the original returned `{}`, an empty dict) so
    # the result has the same type on hit and miss.
    return set()

def find_doc_position_pair_list(word):
    """Flatten the postings of ``word`` into ``(doc, position)`` tuples.

    Args:
        word: A term that must be a key of the global ``positional_index``.

    Returns:
        List with one ``(doc_id, position)`` tuple per recorded occurrence,
        grouped by posting in posting-list order.
    """
    postings = positional_index[word][0]
    return [
        (posting[0], position)
        for posting in postings
        for position in posting[1]
    ]

def positional_func(first_word_doc_position_set, query_tokens):
    """Find documents in which the remaining query tokens follow the first word.

    For each ``(doc, position)`` occurrence of the first query word, the
    i-th token of ``query_tokens`` (1-based) must occur in the same document
    at ``position + i``.

    Args:
        first_word_doc_position_set: Iterable of ``(doc, position)`` pairs
            for the first word of the phrase.
        query_tokens: Tokens of the phrase after the first word; each must
            be a key of the global ``positional_index``.

    Returns:
        Set of document ids containing the full phrase. An empty
        ``query_tokens`` yields an empty set, as before.
    """
    if not query_tokens:
        return set()

    # Build one doc -> positions lookup per remaining token up front, instead
    # of rebuilding a doc list for every candidate occurrence.
    positions_by_doc = [
        {posting[0]: posting[1] for posting in positional_index[token][0]}
        for token in query_tokens
    ]

    matched_docs = set()
    for pair in first_word_doc_position_set:
        doc, start = pair[0], pair[1]
        if doc in matched_docs:
            continue                       # already confirmed for this doc
        # all() stops at the first token that is missing or out of place.
        if all(
            start + offset in lookup.get(doc, ())
            for offset, lookup in enumerate(positions_by_doc, start=1)
        ):
            matched_docs.add(doc)
    return matched_docs

def starter_method():
    """Prompt for a phrase query and print the matching documents.

    Reads one line from the user, preprocesses it with ``preprocess_data``
    and looks the tokens up in the global ``positional_index``. Prints the
    number of matching documents and their names (via ``doc_map``), or a
    message when the query is unusable or has no results.
    """
    query = input("Enter phrase query: ")
    if not query.strip():
        print("Enter valid phrase query!")
        return

    query_tokens = preprocess_data(query)
    if not query_tokens:
        # The query held only stop words / punctuation. Without this guard
        # the code below would index query_tokens[0] and raise IndexError.
        print("Enter valid phrase query!")
        return

    if not check_tokens_in_index(query_tokens):   # every token must be indexed
        print("No result available!")
        return

    if len(query_tokens) == 1:
        # Single word: every document in its posting list matches.
        matched_docs = [posting[0] for posting in positional_index[query_tokens[0]][0]]
    else:
        # Phrase: start from each occurrence of the first word and require
        # the remaining words at the following positions.
        first_word_doc_position_list = find_doc_position_pair_list(query_tokens[0])
        matched_docs = positional_func(first_word_doc_position_list, query_tokens[1:])

    retrieved_doc_list = [doc_map[doc] for doc in matched_docs]
    print("The number of documents retrieved:", len(retrieved_doc_list))
    print("The list of document names retrieved:", retrieved_doc_list)
  

In [32]:
# Run one interactive phrase query: prompts for input and prints the matches.
starter_method()

Enter phrase query: welcome
The number of documents retrieved: 94
The list of document names retrieved: ['st_silic.txt', 'top10st2.txt', 'top10st1.txt', 'top10.txt', 'texican.dic', 'texican.lex', 'quack26.txt', 'quotes.jok', 'pracjoke.txt', 'renorthr.txt', 'nigel.6', 'nukewar.txt', 'psilaine.hum', 'nysucks.hum', 'passage.hum', 'onetoone.hum', 'onetotwo.hum', 'oldeng.hum', 'dieter.txt', 'dover.poem', 'feggaqui.txt', 'goforth.hum', 'hotel.txt', 'grail.txt', 'insults1.txt', 'jac&tuu.hum', 'insult.lst', 'jokes1.txt', 'humor9.txt', 'letter_f.sch', 'lawyer.jok', 'looser.hum', 'lawskool.txt', 'math.1', 'making_y.wel', 'luvstory.txt', 'luzerzo2.hum', 'misery.hum', 'manners.txt', 'miami.hum', 'adcopy.hum', 'myheart.hum', 'moose.txt', 'boneles2.txt', 'bmdn01.txt', 'bbq.txt', 'dead5.txt', 'conan.txt', 'modest.hum', 'mlverb.hum', 'comrevi1.hum', 'dead2.txt', 'bhb.ill', 'bread.rcp', 'beginn.ers', 'butcher.txt', 'candy.txt', 'get.drunk.cheap', 'homebrew.txt', 'stuf10.txt', 'films_gl.txt', 'bnbeg2.4.