In [1]:
import os
import math
import numpy as np
import pandas as pd
from tqdm import tqdm 
import nltk
import itertools
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from tabulate import tabulate

In [2]:
def getDocs(path):
    numberOfDocs = 0
    Docs = {}
    for file in tqdm(os.listdir(path)):
          f_path = os.path.join(path, file)
          f_open = open(f_path, "r")
          index = file.split("/")[-1].split(".")[0] 
          Docs[int(index)] = f_open.read()
          numberOfDocs += 1  
    Docs = dict(sorted(Docs.items()))
    return Docs,numberOfDocs

### Tokenizer

In [3]:
def getWordsTokenize(sentences):
    words = []
    for word_data in sentences:
      words += word_data.split()
    return words


### Remove StopWords

In [4]:
def getWordsStopWords(words):
    en_stops = set(stopwords.words('english'))
    en_stops.remove('in')
    en_stops.remove('to')
    en_stops.remove('where')
    stopWords = []
    for word in words:
        if word not in en_stops:
          stopWords.append(word)
    return stopWords



### Remove Duplicates Words

In [5]:
def removeDuplicates(x):
  return list(dict.fromkeys(x))

### builds the positional index matrix

In [6]:
def positionalIndex(words,d_Docs):    
    positional_index = {}
    
    for word in words:
        for doc,words in d_Docs.items():
            index = 0
            for word_in_file in words:
                index += 1
                # index is the position of word in file
                if word == word_in_file:
                    if (len(positional_index) == 0):
                        positional_index[word] = {'Doc'+str(doc): [index]}
                    else:
                        if word in positional_index:
                            if 'Doc'+str(doc) in positional_index[word]:
                                positional_index[word]['Doc'+str(doc)].append(index)
                            else:
                                positional_index[word]['Doc'+str(doc)] = [index]
                                
                        else:
                            positional_index[word] = {'Doc'+str(doc): [index]}
                            
    for key,values in positional_index.items():
        positional_index.update({key : [len(values) , values]})
    return positional_index

### search index

In [7]:
def search_index(query, positional_index):

    matched_docs_sets = []

    if len(query) == 1:
        
        if query[0] in positional_index:
            result = [key for key in positional_index[query[0]][1]]
        else:
            result = []

        return result
    
    else:

        for i in range(len(query)-1):
            matched_docs = []
            if query[i] in positional_index:
                first_docs = positional_index[query[i]][1]
                if query[i+1] in positional_index:
                    second_docs = positional_index[query[i+1]][1]
                    for doc in first_docs:
                        if doc in second_docs:
                            for pos in first_docs[doc]:
                                if pos + 1 in second_docs[doc]:
                                    matched_docs.append(doc)
                else:
                    break
            else:
                break

            if matched_docs:
                matched_docs_sets.append(set(matched_docs))
            else:
                result = []
                return result
    if matched_docs_sets:
        result = set.intersection(*matched_docs_sets)
    else:
        return []
    return list(result)

### builds the TF and w tf(1+ log tf) matrix

In [8]:

def calc_wtf(tf):
    tf = 1 + math.log10(tf)
    return tf

In [9]:
def build_tf_wtf_matrix(positional_index,N):
    tf_= {} 
    wtf_ = {}
    for term in positional_index:
        for i in range(N):
            if 'Doc'+str(i+1) in positional_index[term][1]:
                if term not in tf_:
                    tf_[term] = {'Doc'+str(i+1) : len(positional_index[term][1]['Doc'+str(i+1)])}
                    wtf_[term] = {'Doc'+str(i+1) : calc_wtf(tf_[term]['Doc'+str(i+1)])}
                else :
                    tf_[term]['Doc'+str(i+1)] = len(positional_index[term][1]['Doc'+str(i+1)])
                    wtf_[term]['Doc'+str(i+1)] = calc_wtf(tf_[term]['Doc'+str(i+1)])
            else :
                if term not in tf_:
                    tf_[term] = {'Doc'+str(i+1) : 0}
                    wtf_[term] = {'Doc'+str(i+1) : 0.0}
                else :
                    tf_[term]['Doc'+str(i+1)] = 0
                    wtf_[term]['Doc'+str(i+1)] = 0.0

    return tf_,wtf_

### builds the df and idf matrix

In [10]:
def build_df_idf_matrix(positional_index, N):
    df_idf = {}
    for term , value in positional_index.items():
        df_idf[term] = {'df' : value[0]}
        df_idf[term]['idf'] = round(math.log10(N / value[0]),10)
    
    return df_idf

### builds the tf.idf matrix

In [11]:
def calc_tfidf(tf_, idf):

    tfidf = [round(element * idf, 10) for element in tf_.values()]
    
    return tfidf

In [12]:
def build_tfidf_matrix(positional_index,wtf,idf, N):
    
    tfidf_matrix = {}
    for term in positional_index:
        tfidf = calc_tfidf(wtf[term], idf[term]['idf'])
        tfidf_matrix[term] = tfidf

    return tfidf_matrix

### calculating documents lengths to be used for normalization 

In [13]:
def calc_doc_length(tfidf_matrix, N):

    doc_length = [0.0] * N
    for term in tfidf_matrix:
        doc_length = np.add(doc_length, [math.pow(element, 2) for element in tfidf_matrix[term]])
        
    
    doc_length = [round(math.sqrt(element),9) for element in doc_length]

    return doc_length

In [14]:
def normalize_terms(tfidf_matrix, doc_length):
    normalized_matrix = {}
    for term in tfidf_matrix:
        normalized_matrix[term] = []
        i = 0
        for element in tfidf_matrix[term]:
            normalized_matrix[term].append(round((element / doc_length[i]), 10))
            i = i + 1
    return normalized_matrix

### prints the positional index, the tf.idf matrix normalized matrix, TF matrix, WTF matrix, df_idf and doc_lengths

In [15]:
def prepareToPrint(Dict):
    listof = []
    if type(Dict) is dict:
        listofkey = list(Dict.keys())
        listofvalue = list(Dict.values())
        l = [0]
        l2 = [0]
        for i in range(len(listofkey)):
            l [0] = listofkey[i]
            if type(listofvalue[0]) is dict:
                listof.append(l+ list(listofvalue[i].values()))
            elif type(listofvalue[0]) is list:
                listof.append(l+ listofvalue[i])
            else:
                l2[0] = listofvalue[i]
                listof.append(l+ l2)
    else :
        l1 = [0]
        l2 = [0]
        for i in range(len(Dict)):
            l1[0] = 'Doc'+str(i+1)
            l2[0] = Dict[i]
            listof.append(l1+ l2)
    return listof

In [16]:
def show_data(positionalindex, tfidf_matrix, normalized_terms, TF, WTF, df_idf, doc_lengths, N):
    print("Positional Index")
    print("")
    for term in positionalindex:
        print(term, positionalindex[term])
    print("----------------------------------------")
    print("TF Matrix")
    print("")
    print(tabulate(prepareToPrint(TF), headers=[" ","Doc1","Doc2","Doc3","Doc4","Doc5","Doc6","Doc7","Doc8","Doc9","Doc10"]))
    print("----------------------------------------")
    print("w tf(1+ log tf) Matrix")
    print("")
    print(tabulate(prepareToPrint(WTF), headers=[" ","Doc1","Doc2","Doc3","Doc4","Doc5","Doc6","Doc7","Doc8","Doc9","Doc10"]))
    print("----------------------------------------")
    print("df & idf Matrix")
    print("")
    print(tabulate(prepareToPrint(df_idf), headers=[" ","df","idf"]))
    print("----------------------------------------")
    print("TF.IDF Matrix")
    print("")
    print(tabulate(prepareToPrint(tfidf_matrix), headers=[" ","Doc1","Doc2","Doc3","Doc4","Doc5","Doc6","Doc7","Doc8","Doc9","Doc10"]))
    print("----------------------------------------")
    print("Docs length Matrix")
    print("")
    print(tabulate(prepareToPrint(doc_lengths), headers=[" ","length"]))
    print("----------------------------------------")
    print("Normalized tf.id")
    print("")
    print(tabulate(prepareToPrint(normalized_terms), headers=[" ","Doc1","Doc2","Doc3","Doc4","Doc5","Doc6","Doc7","Doc8","Doc9","Doc10"]))
    print("----------------------------------------")

### query calculations

In [17]:
def query_rtf(token_list):

    rtf = {}
    for token in token_list:
        if token in rtf:
            rtf[token] += 1
        else:
            rtf[token] = 1
    return rtf

In [18]:
def query_tf(rtf):

    tf = {}
    for token in rtf:
        tf[token] = calc_wtf(rtf[token])
    return tf

In [19]:
def query_idf(df, N):

    idf = math.log10(N / df)
    return idf

In [20]:
def query_tfidf(qtf, pos_index, N):

    tfidf = {}
    idf = {}
    for term in qtf:
        if term in pos_index:
            idf[term] = query_idf(pos_index[term][0], N)
        else:
            idf[term] = 0
        tfidf[term] = round(idf[term] * qtf[term], 10)
    return tfidf,idf

In [21]:
def query_length(q_tfidf):

    length = 0.0
    for term in q_tfidf:
        length += math.pow(q_tfidf[term], 2)
    length = math.sqrt(length)
    return length

In [22]:
def query_normalize(length, q_tfidf):

    q_normalized = {}
    for term in q_tfidf:
        if length == 0:
            q_normalized[term] = 0
        else:
            q_normalized[term] = q_tfidf[term] / length
    return q_normalized

In [23]:
def doc_normalize(doc_no, doc_lengths, tfidf_matrix, token_list):

    doc_normalized = {}
    for token in token_list:
        if token in tfidf_matrix:
            doc_normalized[token] = tfidf_matrix[token][int(doc_no[-1])-1] / doc_lengths[int(doc_no[-1])-1]
        else:
            doc_normalized[token] = 0
    return doc_normalized

### applying our calculations to the query, positional index needed for idf

In [24]:
def query_processing(query, positional_index, N):

    token_list = getWordsStopWords(getWordsTokenize([query.lower()]))
    rtf = query_rtf(token_list)
    tf = query_tf(rtf)
    tfidf,idf = query_tfidf(tf, positional_index, N)
    length = query_length(tfidf)
    query_normalized = query_normalize(length, tfidf)
    return token_list, query_normalized,rtf,tf,length,tfidf,idf

### calculates document score using normalized weights for terms in query and document collection

In [25]:
def doc_score(q_normalized, d_normalized):

    score = 0.0
    product ={}
    i = 1
    for term in q_normalized:
        score += q_normalized[term] * d_normalized[term]
        product[term] = q_normalized[term] * d_normalized[term]
        if i == len(q_normalized):
            product['SUM'] = score
        i += 1
    return score, product

### utilizing everything we built to return the matched documents, ranked

In [26]:
def engine(query, positional_index, tfidf_matrix, doc_lengths, N):
    docs = {}
    product = {}
    token_list, query_normalized,rtf,tf,length,tfidf,idf= query_processing(query, positional_index, N)
    matched_docs = search_index(token_list, positional_index)
    for i in range(len(matched_docs)):
        doc_normalized = doc_normalize(matched_docs[i], doc_lengths, tfidf_matrix, token_list)
        docs[matched_docs[i]],product[matched_docs[i]] = doc_score(query_normalized, doc_normalized)
    
    ranked_results = {}
    product_results = {}
    sorted_keys_docs = dict(sorted(docs.items(), key=lambda x:x[1], reverse=True))
    sorted_keys_prod = sorted(product)
    
    for key in sorted_keys_docs:
        ranked_results[key] = docs[key]
        
    for key in sorted_keys_prod:
        product_results[key] = product[key]
        
    if ranked_results == {}:
        return {}
    else:
        return ranked_results, query_normalized, rtf,tf, length, tfidf, idf, product_results

In [27]:
path = 'E:/level 4/Material 4th Level/IR/project/project_test'
d_doc,numberOfDocs = getDocs(path)
d_Doc = {}
for key,val in d_doc.items():
    value = []
    value.append(val)
    d_Doc[key] = getWordsTokenize(value)
d_Doc
words = list(d_doc.values())
words = getWordsTokenize(words)
words = getWordsStopWords(words)
words = removeDuplicates(words)
numberOfDocs

100%|████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 1667.91it/s]


10

### initializes the necessary structures to perform multiple searches later

In [28]:
positional_index = positionalIndex(words,d_Doc)
tf_,wtf = build_tf_wtf_matrix(positional_index,numberOfDocs) 
df_idf = build_df_idf_matrix(positional_index,numberOfDocs)
tfidf_matrix = build_tfidf_matrix(positional_index,wtf,df_idf, numberOfDocs)
doc_lengths = calc_doc_length(tfidf_matrix, numberOfDocs)
normalized_terms = normalize_terms(tfidf_matrix, doc_lengths)

In [29]:
positional_index

{'antony': [3, {'Doc1': [1], 'Doc2': [1], 'Doc6': [1]}],
 'brutus': [3, {'Doc1': [2], 'Doc2': [2], 'Doc4': [1]}],
 'caeser': [5,
  {'Doc1': [3], 'Doc2': [3], 'Doc4': [2], 'Doc5': [1], 'Doc6': [2]}],
 'cleopatra': [1, {'Doc1': [4]}],
 'mercy': [5,
  {'Doc1': [5], 'Doc3': [1], 'Doc4': [3], 'Doc5': [2], 'Doc6': [3]}],
 'worser': [4, {'Doc1': [6], 'Doc3': [2], 'Doc4': [4], 'Doc5': [3]}],
 'calpurnia': [1, {'Doc2': [4]}],
 'angels': [3, {'Doc7': [1], 'Doc8': [1], 'Doc9': [1]}],
 'fools': [4, {'Doc7': [2], 'Doc8': [2], 'Doc9': [2], 'Doc10': [1]}],
 'fear': [3, {'Doc7': [3], 'Doc8': [3], 'Doc10': [2]}],
 'in': [4, {'Doc7': [4], 'Doc8': [4], 'Doc9': [3], 'Doc10': [3]}],
 'rush': [4, {'Doc7': [5], 'Doc8': [5], 'Doc9': [4], 'Doc10': [4]}],
 'to': [4, {'Doc7': [6], 'Doc8': [6], 'Doc9': [5], 'Doc10': [5]}],
 'tread': [4, {'Doc7': [7], 'Doc8': [7], 'Doc9': [6], 'Doc10': [6]}],
 'where': [4, {'Doc7': [8], 'Doc8': [8], 'Doc9': [7], 'Doc10': [7]}]}

### prints the positional index, the tf.idf matrix normalized matrix, TF matrix, WTF matrix, df_idf and doc_lengths

In [30]:
show_data(positional_index, tfidf_matrix, normalized_terms, tf_, wtf, df_idf, doc_lengths, numberOfDocs)

Positional Index

antony [3, {'Doc1': [1], 'Doc2': [1], 'Doc6': [1]}]
brutus [3, {'Doc1': [2], 'Doc2': [2], 'Doc4': [1]}]
caeser [5, {'Doc1': [3], 'Doc2': [3], 'Doc4': [2], 'Doc5': [1], 'Doc6': [2]}]
cleopatra [1, {'Doc1': [4]}]
mercy [5, {'Doc1': [5], 'Doc3': [1], 'Doc4': [3], 'Doc5': [2], 'Doc6': [3]}]
worser [4, {'Doc1': [6], 'Doc3': [2], 'Doc4': [4], 'Doc5': [3]}]
calpurnia [1, {'Doc2': [4]}]
angels [3, {'Doc7': [1], 'Doc8': [1], 'Doc9': [1]}]
fools [4, {'Doc7': [2], 'Doc8': [2], 'Doc9': [2], 'Doc10': [1]}]
fear [3, {'Doc7': [3], 'Doc8': [3], 'Doc10': [2]}]
in [4, {'Doc7': [4], 'Doc8': [4], 'Doc9': [3], 'Doc10': [3]}]
rush [4, {'Doc7': [5], 'Doc8': [5], 'Doc9': [4], 'Doc10': [4]}]
to [4, {'Doc7': [6], 'Doc8': [6], 'Doc9': [5], 'Doc10': [5]}]
tread [4, {'Doc7': [7], 'Doc8': [7], 'Doc9': [6], 'Doc10': [6]}]
where [4, {'Doc7': [8], 'Doc8': [8], 'Doc9': [7], 'Doc10': [7]}]
----------------------------------------
TF Matrix

             Doc1    Doc2    Doc3    Doc4    Doc5    Doc6    D

In [31]:
print("TF Matrix")
print()
print(tf_)
print()
print("w tf(1+ log tf) Matrix")
print()
print(wtf)
print()
print("df & idf Matrix")
print()
print(df_idf)
print()
print("TF.IDF Matrix")
print()
print(tfidf_matrix)
print()
print("Docs length Matrix")
print()
print(doc_lengths)
print()
print("Normalized tf.id")
print()
print(normalized_terms)

TF Matrix

{'antony': {'Doc1': 1, 'Doc2': 1, 'Doc3': 0, 'Doc4': 0, 'Doc5': 0, 'Doc6': 1, 'Doc7': 0, 'Doc8': 0, 'Doc9': 0, 'Doc10': 0}, 'brutus': {'Doc1': 1, 'Doc2': 1, 'Doc3': 0, 'Doc4': 1, 'Doc5': 0, 'Doc6': 0, 'Doc7': 0, 'Doc8': 0, 'Doc9': 0, 'Doc10': 0}, 'caeser': {'Doc1': 1, 'Doc2': 1, 'Doc3': 0, 'Doc4': 1, 'Doc5': 1, 'Doc6': 1, 'Doc7': 0, 'Doc8': 0, 'Doc9': 0, 'Doc10': 0}, 'cleopatra': {'Doc1': 1, 'Doc2': 0, 'Doc3': 0, 'Doc4': 0, 'Doc5': 0, 'Doc6': 0, 'Doc7': 0, 'Doc8': 0, 'Doc9': 0, 'Doc10': 0}, 'mercy': {'Doc1': 1, 'Doc2': 0, 'Doc3': 1, 'Doc4': 1, 'Doc5': 1, 'Doc6': 1, 'Doc7': 0, 'Doc8': 0, 'Doc9': 0, 'Doc10': 0}, 'worser': {'Doc1': 1, 'Doc2': 0, 'Doc3': 1, 'Doc4': 1, 'Doc5': 1, 'Doc6': 0, 'Doc7': 0, 'Doc8': 0, 'Doc9': 0, 'Doc10': 0}, 'calpurnia': {'Doc1': 0, 'Doc2': 1, 'Doc3': 0, 'Doc4': 0, 'Doc5': 0, 'Doc6': 0, 'Doc7': 0, 'Doc8': 0, 'Doc9': 0, 'Doc10': 0}, 'angels': {'Doc1': 0, 'Doc2': 0, 'Doc3': 0, 'Doc4': 0, 'Doc5': 0, 'Doc6': 0, 'Doc7': 1, 'Doc8': 1, 'Doc9': 1, 'Doc10': 0},

In [32]:
query = input("Enter your search query:\n")
ranked_results, query_normalized, rtf,tf, length, tfidf, idf, product = engine(query, positional_index, tfidf_matrix, doc_lengths, numberOfDocs)
if ranked_results == {}:
    print("There are no matching documents for your search query")
else:
    print("")
    print("")
    print(tabulate(prepareToPrint(rtf), headers=[" ","tf-raw"]))
    print("")
    print("----------------------------------------")
    print("")
    print(tabulate(prepareToPrint(tf), headers=[" ","w tf(1+ log tf)"]))
    print("")
    print("----------------------------------------")
    print("")
    print(tabulate(prepareToPrint(idf), headers=[" ","idf"]))
    print("")
    print("----------------------------------------")
    print("")
    print(tabulate(prepareToPrint(tfidf), headers=[" ","tf*idf"]))
    print("")
    print("----------------------------------------")
    print("")
    print(tabulate(prepareToPrint(query_normalized), headers=[" ","normalized"]))
    print("")
    print("----------------------------------------")
    print("")
    print(tabulate(prepareToPrint(product), headers=list(list(product.values())[0].keys())))
    print("")
    print("----------------------------------------")
    print("")
    print('query length', length)
    print("")
    print("----------------------------------------")
    print("")
    print("  Search results ranked\n")
    i = 1
    for result in ranked_results:
        print(str(i) + "." + " Document name: " + str(result) + "\n   Cosine similarity: "
              + str(round(ranked_results[result], 3)) + "\n")
        i += 1

Enter your search query:
antony brutus


          tf-raw
------  --------
antony         1
brutus         1

----------------------------------------

          w tf(1+ log tf)
------  -----------------
antony                  1
brutus                  1

----------------------------------------

             idf
------  --------
antony  0.522879
brutus  0.522879

----------------------------------------

          tf*idf
------  --------
antony  0.522879
brutus  0.522879

----------------------------------------

          normalized
------  ------------
antony      0.707107
brutus      0.707107

----------------------------------------

        antony    brutus       SUM
----  --------  --------  --------
Doc1  0.269196  0.269196  0.538393
Doc2  0.288939  0.288939  0.577877

----------------------------------------

query length 0.7394622130798872

----------------------------------------

  Search results ranked

1. Document name: Doc2
   Cosine similarity: 0.578

2. Document name:

In [33]:
antony brutus

SyntaxError: invalid syntax (1741790972.py, line 1)

In [None]:
#antony brutus caeser cleopatra mercy worser