In [1]:
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import warnings
import math

## Loading preprocessed document wise vocab

In [2]:
vocab_doc_wise_tokenization = np.load('vocab_doc_wise_tokenization.npy', allow_pickle='TRUE').item()
vocab_doc_wise_stemming = np.load('vocab_doc_wise_stemming.npy', allow_pickle='TRUE').item()
print(vocab_doc_wise_tokenization)
print(vocab_doc_wise_stemming)

{'P_386': ['vrihadaswa', 'said', 'after', 'long', 'time', 'had', 'passed', 'away', 'brahmana', 'named', 'parnada', 'returned', 'to', 'the', 'city', 'of', 'the', 'vidarbhas', 'and', 'said', 'unto', 'the', 'daughter', 'of', 'bhima', 'damayanti', 'seeking', 'nala', 'the', 'king', 'of', 'nishadhas', 'came', 'to', 'the', 'city', 'of', 'ayodhya', 'and', 'appeared', 'before', 'the', 'son', 'of', 'bhangasura', 'and', 'best', 'of', 'women', 'repeated', 'those', 'words', 'of', 'thine', 'in', 'the', 'presence', 'of', 'the', 'blessed', 'rituparna', 'but', 'hearing', 'them', 'neither', 'that', 'ruler', 'of', 'men', 'nor', 'his', 'courtiers', 'answered', 'anything', 'although', 'uttered', 'them', 'repeatedly', 'then', 'after', 'had', 'been', 'dismissed', 'by', 'the', 'monarch', 'was', 'accosted', 'by', 'person', 'in', 'the', 'service', 'of', 'rituparna', 'named', 'vahuka', 'and', 'vahuka', 'is', 'the', 'charioteer', 'of', 'that', 'king', 'of', 'unsightly', 'appearance', 'and', 'possessed', 'of', 'sh

## Boolean Retrieval System

In [3]:
#Creating Node which has three sub-nodes containing document ID, freq of word in that docID 
#and next to link with next docID
class Node:
    def __init__(self, docID, freq=None):
        self.docID = docID
        self.freq = freq
        self.next = None

In [4]:
#Creating word freq for each doc
def get_word_freq(vocab):
    word_freq={}
    for word in vocab:
        if word in word_freq.keys():
            word_freq[word]+=1
        else: word_freq[word]=1
    return word_freq

### Creating Postings list

In [5]:
postings_list = {}
doc_index = {}
ind=0
for doc_id, vocab in vocab_doc_wise_stemming.items():
    word_freq = get_word_freq(vocab)
    for word, freq in word_freq.items():
        if word in postings_list.keys():
            firstNode = postings_list[word]
            while firstNode.next is not None:
                firstNode = firstNode.next
            firstNode.next = Node(ind, freq)
        else:
            postings_list[word] = Node(ind, freq)
    doc_index[ind] = doc_id
    ind+=1

In [26]:
for word, node in postings_list.items():
    print(word, end='->')
    while node is not None:
        print(node.docID, end='->')
        print(node.freq, end=' ')
        node=node.next
    print('\n')

vrihadaswa->0->1 

said->0->5 1->1 4->2 

after->0->5 1->9 2->18 3->9 4->2 

long->0->1 1->6 2->1 3->4 

time->0->2 1->5 2->21 3->12 

had->0->6 1->8 2->8 3->11 4->1 

pass->0->1 2->1 3->1 

away->0->2 1->1 4->1 

brahmana->0->4 

name->0->2 1->9 2->1 3->5 4->2 

parnada->0->3 

return->0->1 1->1 2->3 3->2 

to->0->10 1->58 2->262 3->48 4->31 

the->0->28 1->333 2->538 3->151 4->136 

citi->0->4 1->1 2->1 3->6 4->5 

of->0->33 1->150 2->421 3->96 4->60 

vidarbha->0->2 

and->0->28 1->71 2->334 3->83 4->20 

unto->0->3 

daughter->0->2 4->1 

bhima->0->4 

damayanti->0->5 

seek->0->1 3->1 

nala->0->4 

king->0->8 1->8 4->1 

nishadha->0->1 

came->0->2 1->2 

ayodhya->0->3 

appear->0->2 1->2 2->6 3->1 

befor->0->1 1->2 2->6 3->1 4->2 

son->0->1 1->6 2->1 3->1 

bhangasura->0->1 

best->0->3 3->24 

women->0->3 2->22 4->1 

repeat->0->2 1->4 2->5 3->4 4->4 

those->0->1 1->1 2->8 3->3 

word->0->7 1->8 2->4 3->4 4->4 

thine->0->2 

in->0->11 1->89 2->305 3->51 4->34 

presenc->0->

other->1->7 2->14 3->6 4->4 

galleri->1->2 

refer->1->4 2->6 3->3 4->3 

bibliographi->1->2 2->2 

extern->1->2 2->3 3->2 4->2 

link->1->4 2->8 3->2 4->2 

edit->1->23 2->40 3->15 4->12 

fortress->1->2 

deriv->1->1 

gwalipa->1->3 

accord->1->5 3->1 4->1 

legend->1->2 

cure->1->2 

local->1->3 2->1 

chieftain->1->2 

suraj->1->4 

sena->1->2 

leprosi->1->2 

gratitud->1->1 

seen->1->3 2->5 

resid->1->2 

decemb->1->4 2->10 

click->1->1 

see->1->1 2->2 3->3 

detail->1->1 2->2 3->1 

an->1->10 2->34 3->9 4->6 

outcrop->1->1 

vindhyan->1->1 

solitari->1->1 

rocki->1->1 

gopach->1->5 

this->1->9 2->29 3->2 4->12 

featur->1->1 3->1 4->4 

thin->1->1 

steep->1->1 

geolog->1->2 

rang->1->2 2->8 

rock->1->9 

format->1->2 2->3 3->1 4->1 

ochr->1->1 

colour->1->1 4->1 

cover->1->2 4->1 

basalt->1->1 

horizont->1->1 

stratum->1->2 

feet->1->10 4->5 

highest->1->2 2->4 

point->1->2 2->2 3->9 

length->1->1 3->1 

mile->1->1 

km->1->1 

averag->1->1 

yard->1->1

co->1->1 3->1 

loui->1->1 2->1 

fenech->1->1 

martyrdom->1->1 

oxford->1->2 

hs->1->2 

singha->1->1 

studi->1->1 2->31 

hemkunt->1->1 

syan->1->1 

milit->1->1 

seventeenth->1->1 

ib->1->1 

tauri->1->1 

mahajan->1->1 

chand->1->1 

new->1->2 2->9 3->25 

phylli->1->1 

jestic->1->1 

holi->1->1 

peopl->1->1 2->22 3->1 4->2 

cross->1->1 2->1 

cultur->1->4 2->5 3->1 

encyclopedia->1->1 2->1 

abc->1->1 

clio->1->1 

arvind->1->1 

mandair->1->1 

sikhism->1->3 

guid->1->3 

perplex->1->1 

black->1->1 2->2 3->1 

fauja->1->1 

harban->1->1 

ed->1->4 2->5 

encyclopaedia->1->1 

punjabi->1->2 

retriev->1->1 2->23 3->2 4->3 

review->1->1 2->19 3->2 

issu->1->1 

centr->1->1 4->1 

eleanor->1->1 

nesbitt->1->1 

introduct->1->2 

toni->1->2 3->15 

mcclenaghan->1->2 

archiv->1->1 2->42 3->9 

ghostarch->1->1 

wayback->1->1 2->4 3->4 

machin->1->1 2->4 3->4 

gvaaliyr->1->1 

kaa->1->1 

itihaa->1->1 

youtub->1->1 

kamat->1->1 

potpourri->1->1 

webpag->1->1 



preval->2->11 

undercook->2->4 

contact->2->4 3->1 

handl->2->2 

knive->2->1 

utensil->2->1 

board->2->1 3->2 

contamin->2->3 

unwash->2->1 

fruit->2->1 

veget->2->1 

soil->2->3 

garden->2->3 4->2 

sandpit->2->1 

environ->2->4 

untreat->2->3 

unfilt->2->1 

consumpt->2->7 

util->2->2 

unpasteur->2->1 

milk->2->5 

product->2->3 3->4 

particular->2->6 3->1 

goat->2->6 

seafood->2->1 

excret->2->2 

contract->2->5 

intermedi->2->4 

shed->2->6 

undergo->2->2 

sporul->2->2 

potenti->2->6 

involv->2->3 3->1 

vari->2->10 4->2 

speci->2->6 

mode->2->1 

seroneg->2->5 

recipi->2->6 

seroposit->2->15 

donor->2->2 

reactiv->2->2 

immunosuppress->2->2 

striat->2->1 

make->2->4 3->1 

screen->2->5 3->3 

prior->2->4 4->1 

procedur->2->1 

specif->2->18 

unborn->2->2 

placenta->2->2 

fetal->2->5 

miscarriag->2->1 

hydrocephalus->2->3 

cerebr->2->3 

calcif->2->1 

encephalopathi->2->1 

blind->2->1 

titer->2->8 

previous->2->5 

ensur->2->1 

safeti->

pt->2->5 

jcs->2->1 

weiss->2->5 

lm->2->3 

orlofski->2->1 

growth->2->1 

chemistri->2->1 

lalibert->2->1 

carruth->2->2 

vb->2->3 

perspect->2->2 

elsevieracadem->2->1 

derouin->2->1 

pelloux->2->2 

escmid->2->1 

khurana->2->1 

sumeeta->2->1 

batra->2->1 

nitya->2->1 

cidci->2->1 

veterinari->2->8 

divis->2->1 3->1 

signori->2->1 

pereira->2->1 

karen->2->2 

franco->2->1 

regina->2->1 

leal->2->1 

diego->2->1 

advanc->2->3 

assadi->2->1 

rad->2->1 

john->2->1 3->13 

patton->2->2 

sharon->2->2 

sow->2->1 

tennesse->2->1 

coster->2->1 

lo->2->1 

sterker->2->1 

ribot->2->1 

albaba->2->1 

issert->2->1 

bastien->2->1 

pratlong->2->1 

peripher->2->1 

diagmicrobio->2->1 

di->2->1 

mario->2->1 

basevi->2->1 

gagliotti->2->1 

spettoli->2->1 

gori->2->1 

damico->2->1 

magrini->2->1 

cochran->2->1 

databas->2->2 3->3 

normativa->2->1 

sobr->2->1 

cuidado->2->1 

pre->2->1 

concepcionai->2->1 

direccao->2->1 

geral->2->1 

saud->2->1 


jewish->3->6 

lawyer->3->1 

schoolteach->3->1 

gave->3->2 

marri->3->1 

flatbush->3->1 

borough->3->1 

attend->3->2 

erasmus->3->1 

hall->3->1 4->2 

sister->3->1 

edith->3->1 

chorea->3->1 

patern->3->1 

grandpar->3->1 

orthodox->3->1 

jew->3->1 

atheist->3->1 

kosher->3->1 

sake->3->1 

lax->3->1 

synagogu->3->1 

holiday->3->1 

mitzvah->3->1 

religi->3->1 4->1 

reject->3->1 

fundamentalist->3->1 

himself->3->2 

admit->3->1 

sound->3->1 

job->3->1 

graduat->3->2 

robson->3->1 

instructor->3->1 

cbs->3->2 

directorproduc->3->1 

submit->3->3 

comed->3->1 

fantasi->3->1 

clairvoy->3->1 

columbia->3->1 

workshop->3->1 

shirley->3->1 

booth->3->1 

led->3->1 

hire->3->3 

lux->3->1 

theater->3->2 

interrupt->3->1 

draft->3->1 

cleric->3->1 

saw->3->1 

instead->3->1 

assign->3->3 

studio->3->3 

astoria->3->1 

wrote->3->5 

cukor->3->2 

holden->3->1 

reassign->3->1 

harvey->3->2 

fierstein->3->1 

jerri->3->3 

herman->3->1 

creator->3

### Query preprocessing

Steps followed
1. Tokenize the query
2. Convert infix query expression to postfix query expression using stack approach
        a. Check if the given expression is balanced or not
        b. Check is there any extra parenthesis in the expression
3. Processing two operator only in the query **\&**(and) , **\|** (or) and **\~**(negation) and giving higeher precedence to the former
4. Using **snowball_stemmer** as a stemmer algorithm to find the stem word in the given query
5. Generate binary vector based on document size and consider negation sign as well while processing
6. Find document which contains the query word using **find_matched_doc** function and return a binary vector that shows which document contains that word
7. Remove stop words from query

In [7]:
def is_operator(token):
    if token in ['&' , '|']:
        return True
    return False

#Precedence of operators
def precedence_oper(token):
    if token=='&': return 2
    elif token=='|': return 1
    else: return -1

def get_postfix_list(tokens):
    stack = []
    postfix_list = []
    for token in tokens:
        #If token is left small bracket '('
        if token == '(': stack.append(token)
        elif token == ')':
            while(len(stack)>0 and stack[-1]!='('):
                postfix_list.append(stack.pop())
            if len(stack)==0 and token==')':
                raise ValueError('Either unnecessary parenthesis or Not a balanced query')
            stack.pop()
            if len(stack)>0 and stack[-1] == '(':
                raise ValueError('Either unnecessary parenthesis or Not a balanced query')
        elif is_operator(token):
            while(len(stack)>0 and precedence_oper(token) <= precedence_oper(stack[-1])):
                postfix_list.append(stack.pop())
            stack.append(token)
        else: 
            postfix_list.append(token)
    while len(stack)>0:
        postfix_list.append(stack.pop())
    return postfix_list

In [8]:
def query_preprocessing(q):
    #Remove stop words from query
    stop_words = set(stopwords.words('english'))
    #Tokenize query first
    q_tokens = word_tokenize(q)
    new_q_tokens = [word for word in q_tokens if word not in stop_words]
    #Convert this infix list into postfix list to process operator in right way
    q_tokens = get_postfix_list(new_q_tokens)
    return q_tokens

In [9]:
def get_stemmer(stemmer_type):
    if(stemmer_type=='porter_stemmer'): stemmer = nltk.PorterStemmer()
    elif(stemmer_type=='snowball_stemmer'): stemmer = nltk.SnowballStemmer(language = 'english')
    return stemmer

In [10]:
def get_binary_vec(token, postings_list, doc_size):
    word_embedd = np.zeros(doc_size, dtype=int)
    vocab = postings_list.keys()
    negation = False
    if token[0]=='~':
        negation=True
        token=token[1:]
    if token not in vocab:
        print(token + " was not found in the corpus")
        return word_embedd
    node = postings_list[token]
    while node is not None:
        word_embedd[node.docID] = 1
        node=node.next
    if negation:
        word_embedd = np.invert(word_embedd)
    return word_embedd

In [20]:
def find_matched_doc(query_tokens, postings_list, doc_index, top_k):
    
    word_embedd_stack = []
    doc_size = len(doc_index)
    
    for token in query_tokens:
        if is_operator(token):
            if(len(word_embedd_stack)<2): 
                raise ValueError("Query is not correct or use more stopping words")
            first_operand = word_embedd_stack.pop()
            second_operand = word_embedd_stack.pop()
            
            if token=='&': word_embedd_stack.append(first_operand & second_operand)
            elif token=='|': word_embedd_stack.append(first_operand | second_operand)
            else:
                raise ValueError('Can\'t process this operator: ', token)
        else:
            stemmer = get_stemmer('snowball_stemmer')
            st = stemmer.stem(token)
            
            token_embedd = get_binary_vec(token, postings_list, doc_size)
            word_embedd_stack.append(token_embedd)
    matched_doc = [doc_index[docID] for docID in np.where(word_embedd_stack[-1])[0]]
    return matched_doc[:top_k]

In [21]:
queries_list = ['person | technology', 'man | indiashow']
top_k=3
print("Top {} documents retrieved".format(top_k))
for query in queries_list:
    query_tokens = query_preprocessing(query)
    matched_doc = find_matched_doc(query_tokens, postings_list, doc_index, top_k)
    print(matched_doc)

Top 3 documents retrieved
technology was not found in the corpus
['P_386', 'D00585']
['T00921', 'D00585']


In [13]:
vocab_doc_wise_stemming.keys()

dict_keys(['P_386', 'T00921', 'D00585', 'L00119', 'T00755'])

In [32]:
list(postings_list.keys()).index('return')

11

## Tf-Idf Retrieval System

In [63]:
def get_doc_tf(doc, query_tokens, postings_list, doc_doc_index):
    doc_vector = np.zeros(len(postings_list.keys()), dtype=int)
    for token in doc:
        if token not in postings_list.keys():
            print(token + ' was not found in corpus')
        elif token not in query_tokens:
            continue
        else:
            node = postings_list[token]
            while node is not None:
                if node.docID == doc_doc_index: break
                node=node.next
            idx = list(postings_list.keys()).index(token)
            doc_vector[idx]=node.freq
    return doc_vector

def get_doc_idf(doc, query_tokens, postings_list, doc_size):
    inverse_freq={}
    for token in doc:
        if (token not in postings_list.keys()) or (token not in query_tokens):
            inverse_freq[token]=0
        else:
            node = postings_list[token]
            dft=0
            while node is not None:
                dft+=1
                node=node.next
            inverse_freq[token] = math.log(doc_size / dft)
    return inverse_freq

def get_doc_tf_idf(doc_vector, postings_list, inv_doc_freq):
    for token, token_freq in inv_doc_freq.items():
        idx = list(postings_list.keys()).index(token)
        doc_vector[idx] = doc_vector[idx]*token_freq
    doc_len = np.linalg.norm(doc_vector)
    doc_vector = doc_vector/doc_len
    return doc_vector

def get_query_vector(query_tokens, postings_list):
    query_vector = np.zeros(len(postings_list.keys()), dtype=int)
    for token in query_tokens:
        if token not in postings_list.keys():
            print(token + ' was not found in corpus')
        else:
            idx = list(postings_list.keys()).index(token)
            query_vector[idx] += 1
    return query_vector

def get_scoring_vec(tf_idf_matrix, doc_size):
    score_vector = np.zeros(doc_size, dtype=float)
    for _, vec in tf_idf_matrix.items():
        score_vector += vec
    return score_vector

def get_mapped_doc(score_vec, doc_index, top_k):
    doc_idx_mapping = np.arange(len(doc_index))
    get_matched_doc = [doc_index[docID] for score, docID in sorted(zip(score_vec, doc_idx_mapping), reverse=True)]
    return get_matched_doc[:top_k]

In [69]:
def get_docs_tf_idf_vector(query_tokens, vocab_doc_wise_stemming, postings_list, doc_index):
    doc_tf_idf_vector={}
    for doc, doc_vocab in vocab_doc_wise_stemming.items():
#         print(doc)
        idx = list(doc_index.values()).index(doc)
        doc_vector = get_doc_tf(doc_vocab, query_tokens, postings_list, idx)
    #     print("Doc vector: ", len(doc_vector))
        doc_inv_term_freq = get_doc_idf(doc_vocab, query_tokens, postings_list, len(doc_index))
    #     print("Doc inv freq: ", doc_inv_term_freq)
        doc_tf_idf = get_doc_tf_idf(doc_vector,postings_list, doc_inv_term_freq)
        doc_tf_idf_vector[doc] = doc_tf_idf
    #     print("doc_tf_idf: ", len(doc_tf_idf))
    return doc_tf_idf_vector

In [73]:
queries_list = ['person and technology and brahmana but not movie', 'man or indiashow']
top_k=3
print("Top {} documents retrieved".format(top_k))
for query in queries_list:
    #Tokenize query first
    q_tokens = word_tokenize(query)
    #Remove stop words from query
    stop_words = set(stopwords.words('english'))
    new_q_tokens = [word for word in q_tokens if word not in stop_words]
    V_q = get_query_vector(new_q_tokens, postings_list)
#     print(new_q_tokens)
#     print(V_q)
    q_sim_score=[]
    q_sim_score_map_doc_name=[]
    doc_tf_idf_vector = get_docs_tf_idf_vector(new_q_tokens, vocab_doc_wise_stemming, postings_list, doc_index)
    for doc, v_d in doc_tf_idf_vector.items():
        cos_sim = np.dot(V_q, v_d)/(np.linalg.norm(V_q) * np.linalg.norm(v_d))
        q_sim_score.append(cos_sim)
        q_sim_score_map_doc_name.append(doc)
        print(cos_sim)
    print(q_sim_score_map_doc_name)
    sim_score = [docName for score, docName in sorted(zip(q_sim_score, q_sim_score_map_doc_name), reverse=True)]
    print(sim_score[:top_k])

Top 3 documents retrieved
technology was not found in corpus
movie was not found in corpus




0.7071067811865475
nan
0.7071067811865475
nan
nan
['P_386', 'T00921', 'D00585', 'L00119', 'T00755']
['P_386', 'T00921', 'D00585']
nan
0.7556890827898176
nan
nan
nan
['P_386', 'T00921', 'D00585', 'L00119', 'T00755']
['P_386', 'T00921', 'D00585']


In [23]:
len(doc_index)

5