In [51]:
from os import walk
import json
import numpy as np
from math import sqrt
from tkinter import *
from tkinter import ttk
from tkinter import filedialog
import tkinter as tk
import sys, os
from math import sqrt
from nltk.stem import WordNetLemmatizer
# Directory that gets indexed until the user browses to a different one.
dirname = "Trump Speechs"
# Flag initialised to 0; its value is not read anywhere in the visible code.
r_flag = 0

In [52]:
def read_cache():
    """Load the persisted index from ``TrumpSpeechs.npz`` in the working directory.

    Returns:
        tuple: ``(1, arrays)`` on success, where ``arrays`` is a plain dict
        mapping every name stored in the archive to its numpy array, or
        ``(0, None)`` when the archive does not exist or cannot be read.
        Read errors (other than a missing file) are printed, not raised.
    """
    cache_path = 'TrumpSpeechs.npz'
    # A missing archive is the normal first-run situation, not an error.
    if not os.path.isfile(cache_path):
        return 0, None
    try:
        # np.load() hands back a lazy NpzFile that keeps the file open; copy the
        # arrays out and close it so no file handle outlives this call.
        with np.load(cache_path) as archive:
            loaded = {name: archive[name] for name in archive.files}
        return 1, loaded
    except Exception as e:
        print("in read cache")
        print(e)
        exc_type, exc_obj, exc_tb = sys.exc_info()
        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        print(exc_type, fname, exc_tb.tb_lineno)
        return 0, None
def write_cache(to_save):
    """Persist the index arrays to the compressed archive ``TrumpSpeechs.npz``.

    Args:
        to_save: mapping that provides the keys ``terms``, ``weight``, ``tf``,
            ``idf``, ``df``, ``doc_norm`` and ``doc_length``.

    Any failure (missing key, I/O error, ...) is printed and swallowed;
    nothing is returned.
    """
    try:
        # 'terms' is converted with np.array(); the other entries are stored as given
        arrays = {'terms': np.array(to_save['terms'])}
        for key in ('weight', 'tf', 'idf', 'df', 'doc_norm', 'doc_length'):
            arrays[key] = to_save[key]
        np.savez_compressed('TrumpSpeechs', **arrays)
    except Exception as e:
        print("in write cache")
        print(e)
        exc_type, exc_obj, exc_tb = sys.exc_info()
        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        print(exc_type, fname, exc_tb.tb_lineno)

In [53]:
def simplify(sentence):
    """Return ``sentence`` with each of ``; , ? [ - ] ( ) " . :`` turned into a space."""
    # a single translation table does the work of the chained replace() calls
    to_space = str.maketrans({symbol: ' ' for symbol in ';,?[-]()".:'})
    return sentence.translate(to_space)

In [54]:
def extract_stopwords():
    """Read the stopwords listed in ``Stopword-List.txt`` (working directory).

    The first whitespace-separated token of every line is taken as a stopword;
    blank and whitespace-only lines are skipped.

    Returns:
        numpy.ndarray: the stopwords in file order. If the file cannot be read
        the error is printed and whatever was collected so far (possibly an
        empty array) is returned.
    """
    stopword_list=[]
    try:
        # the with-block closes the file even if reading fails part-way
        with open("Stopword-List.txt","r") as stopword_file:
            for line in stopword_file:
                tokens=line.split()
                # split() yields nothing for blank lines, so no '' entry is stored
                if tokens:
                    stopword_list.append(tokens[0])
    except Exception as e:
        print("in extract stopwords")
        print(e)
        exc_type, exc_obj, exc_tb = sys.exc_info()
        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        print(exc_type, fname, exc_tb.tb_lineno)
    return np.array(stopword_list)

In [55]:
def preprocessing(word,stopword_list):
    """Normalise one token for indexing or querying.

    The token is lower-cased, everything from the first apostrophe onwards is
    dropped, and the remainder is lemmatized with WordNet first as a verb and
    then as a noun.

    Args:
        word: raw token (string).
        stopword_list: container of stopwords supporting ``in``.

    Returns:
        The normalised term, or ``None`` when the token (before or after the
        apostrophe is stripped) is a stopword, when nothing is left of it, or
        when an error occurred (the error is printed, not raised).
    """
    try:
        #convert every word to lower
        new_word=word.lower()
        #a stopword is dropped straight away
        if new_word in stopword_list:
            return None
        new_word=new_word.split("'")[0]
        #the stem left after removing a contraction suffix ("it's" -> "it") can
        #itself be a stopword, or be empty ("'em" -> ""), so check again
        if not new_word or new_word in stopword_list:
            return None
        wordnet_lemmatizer = WordNetLemmatizer()
        #lemmatize verbs
        new_word=wordnet_lemmatizer.lemmatize(new_word, pos="v")
        #lemmatize nouns
        new_word=wordnet_lemmatizer.lemmatize(new_word, pos="n")
        return new_word
    except Exception as e:
        print("in preprocessing")
        print(e)
        exc_type, exc_obj, exc_tb = sys.exc_info()
        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        print(exc_type, fname, exc_tb.tb_lineno)
        return None

In [56]:
# The on-disk cache (TrumpSpeechs.npz) is only valid for this corpus.
_DEFAULT_INDEX_DIR = "Trump Speechs"


def _collect_document_paths(root):
    """Return the path of every file found under ``root`` (recursively).

    When the first file name contains ``_`` the paths are sorted by the integer
    between the first ``_`` and the following ``.`` of each file name.
    """
    paths=[]
    for dirpath,_subdirs,filenames in walk(root):
        # keep the directory part so files in sub-directories can be opened
        paths.extend(os.path.join(dirpath,name) for name in filenames)
    if paths and "_" in os.path.basename(paths[0]):
        paths.sort(key=lambda path: int(os.path.basename(path).split('_')[1].split('.')[0]))
    return paths


def _document_terms(path,stopword_list):
    """Return the preprocessed terms (length >= 2) of one document file."""
    with open(path,"r") as handle:
        lines=handle.readlines()
    if not lines:
        return []
    # Only one line is indexed: the second one when the file has several lines
    # (the first line is skipped), otherwise the single line.
    # NOTE(review): lines after the second are never read - presumably every
    # file is "title line + one line of text"; confirm against the corpus.
    body=lines[1] if len(lines)>1 else lines[0]
    doc_terms=[]
    for word in simplify(body).split():
        new_word=preprocessing(word,stopword_list)
        if new_word is None or len(new_word)<2:
            continue
        doc_terms.append(new_word)
    return doc_terms


def _build_index(terms2d):
    """Build the vocabulary and the tf / df / idf / weight / norm arrays."""
    N=len(terms2d)
    # sorted so the column order is the same on every run
    vocabulary=sorted({term for doc_terms in terms2d for term in doc_terms})
    position={term:column for column,term in enumerate(vocabulary)}

    #term frequency for every document and term
    tf=np.zeros((N,len(vocabulary)),dtype=int)
    for doc_no,doc_terms in enumerate(terms2d):
        for term in doc_terms:
            tf[doc_no,position[term]]+=1

    #number of indexed terms per document
    doc_length=np.sum(tf,axis=1)
    #number of documents each term appears in
    df=np.sum(tf.astype(bool),axis=0)
    idf=np.log10(N/df)
    #a document without terms has an all-zero tf row; divide by 1 instead of 0
    safe_length=np.where(doc_length==0,1,doc_length)
    #weight = idf * length-normalised tf
    weight=(tf/np.reshape(safe_length,(-1,1)))*idf
    doc_norm=np.linalg.norm(weight,axis=1)

    return {
        'terms': np.array(vocabulary,dtype=str),
        'tf': tf,
        'df': df,
        'idf': idf,
        'doc_length': doc_length,
        'doc_norm': doc_norm,
        'weight': weight,
    }


def indexing():
    """Return the tf-idf index for the directory named by the global ``dirname``.

    An empty ``dirname`` or the value ``"Null"`` is replaced by the default
    directory. For the default directory the index is read from / written to
    the on-disk cache via ``read_cache`` / ``write_cache``; any other directory
    is indexed from its files on every call, so that a directory chosen by the
    user is never answered from another corpus' cache.

    Returns:
        tuple: ``(index, N)`` where ``index`` maps ``terms``, ``tf``, ``df``,
        ``idf``, ``doc_length``, ``doc_norm`` and ``weight`` to numpy arrays and
        ``N`` is the number of documents. On any error the error is printed and
        ``None`` is returned.
    """
    global dirname
    try:
        print("Processing...")
        if not dirname or dirname=="Null":
            dirname=_DEFAULT_INDEX_DIR
        use_cache=(dirname==_DEFAULT_INDEX_DIR)

        #load saved dictionary
        if use_cache:
            read_flag,cached=read_cache()
            if read_flag==1:
                return cached,len(cached['doc_norm'])

        #load files in chosen directory
        paths=_collect_document_paths(dirname)
        if not paths:
            raise ValueError("no documents found in "+repr(dirname))

        #a set gives constant-time stopword membership tests
        stopword_list=set(extract_stopwords().tolist())

        #save every term from every document
        terms2d=[_document_terms(path,stopword_list) for path in paths]
        to_save=_build_index(terms2d)

        #save the dictionary of numpy arrays
        if use_cache:
            write_cache(to_save)
        return to_save,len(terms2d)

    except Exception as e:
        print("in indexing")
        print(e)
        exc_type, exc_obj, exc_tb = sys.exc_info()
        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        print(exc_type, fname, exc_tb.tb_lineno)
        return None

In [57]:
def fileDialog():
    """Let the user pick the directory to index and show it in the GUI.

    The chosen path is stored in the global ``dirname`` and written into the
    module-level ``label`` widget. If the dialog is cancelled nothing changes.
    On an unexpected error ``dirname`` falls back to "Trump Speechs" and the
    error is printed.
    """
    global dirname
    try:
        selected=filedialog.askdirectory()
        # an empty value means the dialog was cancelled: keep the current directory
        if not selected:
            return
        dirname=selected
        # update the existing label rather than stacking a new widget per call
        label.configure(text=dirname)
    except Exception as e:
        dirname="Trump Speechs"
        print("in file dialog")
        print(e)
        exc_type, exc_obj, exc_tb = sys.exc_info()
        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        print(exc_type, fname, exc_tb.tb_lineno)

In [58]:
def cosine_similarity(x,y):
    """Cosine similarity of two equally shaped vectors.

    sim(x, y) = x.y / (|x| * |y|)

    Args:
        x, y: array-likes of the same shape.

    Returns:
        float: the similarity, or ``0.0`` when either vector has zero norm
        (instead of dividing by zero and producing NaN).
    """
    x=np.asarray(x,dtype=float)
    y=np.asarray(y,dtype=float)
    norm_product=np.linalg.norm(x)*np.linalg.norm(y)
    # an all-zero vector has no direction; treat it as "not similar"
    if norm_product==0:
        return 0.0
    return np.sum(x*y)/norm_product

In [59]:
def process_query():
    """Rank the indexed documents against the query typed into entry ``e1``.

    The query is preprocessed like the documents, turned into a tf-idf vector
    and compared with every document vector via ``cosine_similarity``.
    Documents scoring below ``alpha`` are trimmed. The number of trimmed
    documents, the number of results and the ranked document numbers are
    printed and shown in rows 4, 6 and 8 (column 2) of ``labelFrame``.

    Nothing is returned; errors are printed, not raised.
    """
    try:
        #to filter documents
        alpha=0.0005
        #get query from user; nothing to do (and nothing to index) if it is empty
        query=str(e1.get())
        if(len(query)==0):
            return
        #index the files in chosen directory
        indexed=indexing()
        if indexed is None:
            # indexing() has already printed the cause
            print("in process query")
            print("indexing failed; no results to show")
            return
        saved,N=indexed
        terms=saved['terms']
        idf=saved['idf']
        weight=saved['weight']

        #term -> column lookup, built once instead of scanning per query word
        term_position={str(term):column for column,term in enumerate(terms)}

        #create vector of term frequency of query words
        tfq=np.zeros(len(terms))
        stopword_list=extract_stopwords()
        for word in simplify(query).split():
            new_word=preprocessing(word,stopword_list)
            if new_word is None:
                continue
            index=term_position.get(new_word)
            if index is not None:
                tfq[index]=tfq[index]+1

        scores=np.zeros(N)
        matched=np.sum(tfq)
        #without a single known query term every score stays 0 (avoids 0/0)
        if matched>0:
            #calculate weight of query
            weight_q=np.reshape((tfq/matched)*idf,(1,-1))
            for i in range(N):
                #call similarity function once per document to get score
                similarity=cosine_similarity(weight_q,np.reshape(weight[i],(1,-1)))
                # NaN compares False here, so such a score is left at 0
                if(similarity>0):
                    scores[i]=similarity

        print("before trimmed")
        print(scores)
        print("after trimmed")
        score_dict={i:scores[i] for i in range(N) if scores[i]>=alpha}
        t_count=N-len(score_dict)
        #sort the scores and get the corresponding documents
        ranked=sorted(score_dict, key=score_dict.get, reverse=True)
        for j in ranked:
            print(j, score_dict[j])

        for row,value in ((4,t_count),(6,len(score_dict)),(8,str(ranked))):
            #remove the label left by the previous search instead of stacking a new one on it
            for stale in labelFrame.grid_slaves(row=row,column=2):
                stale.destroy()
            result_label=ttk.Label(labelFrame,wraplength=300,anchor=tk.E,text=value)
            result_label.grid(column=2,row=row,sticky='nesw')

    except Exception as e:
        print("in process query")
        print(e)
        exc_type, exc_obj, exc_tb = sys.exc_info()
        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        print(exc_type, fname, exc_tb.tb_lineno)

In [60]:
# Build the search window and hand control to the Tk event loop.
try:
    master = tk.Tk()
    master.title("Vector Space Model")
    master.minsize(640, 340)

    # single frame that holds every widget
    labelFrame = ttk.LabelFrame(master, text="K173850 Saad Amin")
    labelFrame.grid(row=1, column=0, padx=30, pady=70)

    # shows which directory is indexed
    label = ttk.Label(labelFrame, text="Default Index Directory is Trump Speechs")
    label.grid(row=1, column=2, padx=10, pady=10)

    # query input
    ttk.Label(labelFrame, text="Input").grid(row=2, column=1)
    e1 = ttk.Entry(labelFrame, width=50)
    e1.grid(row=2, column=2)

    # actions: choose a directory / run the search
    b1 = ttk.Button(labelFrame, text="Browse Index Directory", command=fileDialog)
    b2 = ttk.Button(labelFrame, text="Search", command=process_query)
    b1.grid(row=1, column=1, sticky='nesw', padx=20, pady=20)
    b2.grid(row=2, column=5, padx=20, pady=20)

    # captions for the three result rows
    for _row, _caption in (
        (4, "Document Trimmed"),
        (6, "Result Length"),
        (8, "Result (1st,2nd,3rd...)"),
    ):
        ttk.Label(labelFrame, text=_caption).grid(row=_row, column=1, padx=10, pady=10)

    master.mainloop()
except Exception as e:
    print("in Main")
    print(e)
    exc_type, exc_obj, exc_tb = sys.exc_info()
    fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
    print(exc_type, fname, exc_tb.tb_lineno)

Processing...
before trimmed
[0.         0.         0.04558693 0.12792798 0.08699376 0.
 0.0140561  0.02328538 0.         0.05117938 0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.0355178  0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.        ]
after trimmed
3 0.12792798068586325
4 0.0869937587191838
9 0.05117937810809804
2 0.045586928674328744
20 0.03551779766753212
7 0.023285379904168704
6 0.014056102076463758
Processing...
before trimmed
[0.00047964 0.00263465 0.006219   0.00466632 0.00678761 0.00027529
 0.00072658 0.00665413 0.00077826 0.00497908 0.04573785 0.05013626
 0.04044699 0.01739904 0.03933707 0.01387046 0.01956538 0.01132667
 0.01614468 0.0

Processing...
before trimmed
[2.34427119e-03 2.08289289e-02 1.29573511e-03 9.82700468e-05
 3.37112766e-03 1.07918622e-02 8.50032055e-04 1.36841657e-03
 4.20948109e-03 3.26420289e-03 7.43559201e-03 8.27357915e-03
 6.13934292e-05 6.33120128e-05 3.33362908e-05 7.33962339e-03
 2.29635339e-03 2.79788187e-02 3.13868579e-03 1.40651684e-02
 5.30514412e-05 1.14834306e-04 5.83454415e-05 4.44173716e-05
 1.11960501e-02 0.00000000e+00 1.56568186e-03 8.33293301e-05
 1.11251412e-02 4.93118100e-05 4.45981819e-03 2.32815146e-05
 5.73208017e-05 1.18440802e-04 1.30891327e-02 3.50658333e-03
 2.12756143e-03 3.11864579e-03 0.00000000e+00 1.21775781e-02
 1.50355093e-02 1.63944724e-02 2.71558040e-03 1.79572664e-03
 2.87921905e-03 1.76823786e-03 5.27345090e-03 2.07146172e-03
 2.26995298e-03 3.20695042e-03 2.56262093e-03 2.33575849e-03
 5.18135890e-02 3.50787231e-03 4.03694218e-03 4.55132214e-03]
after trimmed
52 0.05181358904028377
17 0.027978818698692277
1 0.020828928931246608
41 0.016394472375354803
40 0.015

Processing...
before trimmed
[0.         0.         0.00720936 0.00291329 0.00440245 0.08097099
 0.0005335  0.00147299 0.00013853 0.00080938 0.003327   0.0419493
 0.00040446 0.00062564 0.00065885 0.         0.00153898 0.00283314
 0.03758536 0.00043246 0.00269615 0.00247589 0.00296519 0.00188113
 0.00446796 0.00517441 0.00242099 0.00352909 0.         0.00409329
 0.         0.0013804  0.00056644 0.00383048 0.00064856 0.
 0.00138922 0.00068635 0.         0.0014887  0.00155361 0.
 0.0611585  0.0003952  0.00064199 0.00079003 0.         0.
 0.00099544 0.00142212 0.         0.00154215 0.00182704 0.
 0.00135275 0.        ]
after trimmed
5 0.08097098843890078
42 0.061158498266828495
11 0.04194930478207058
18 0.03758536475387861
2 0.007209356816256279
25 0.00517440601006347
24 0.0044679573997583755
4 0.004402448267568089
29 0.00409328548970319
33 0.00383047790176252
27 0.003529091443286909
10 0.003327003033171943
22 0.002965194579412411
3 0.002913293394332841
17 0.002833138844202646
20 0.0026961