# Vector Space Model

## From Scratch

### Algorithm:

* inverted_index = {}
* *For each doc in document*:
        for each unique term:
            add doc number in posting list ==> inverted index
        make doc vector for each doc // normalized vector
        
        for eg : India COEP ===> (1,1) ===> (0.707, 0.707)
*  *for each term in query*:
        calculate t.f
        calculate d.f
        calculate i.d.f
        calculate w (t,q)  // do not normalize
        
        do fetch postings list for term
        for each pair(d, tf(t,d)) in postings list
            do Scores[d] += wf(t,d) × w(t,q)
*   *for each d*:
        do Scores[d] = Scores[d]/Length[d]
*     return Top K components of Scores[]

In [1]:
import re
import string
import numpy as np
import pandas as pd
from collections import deque
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import SpaceTokenizer
from nltk.tokenize import TweetTokenizer

Import the required libraries.

In [76]:
import os
from collections import Counter

*process_line()* preprocesses a single line of text. It is used for every line of a document and for the query, which is itself a single line.

In [38]:
def process_line(line):
    """Tokenize, filter, and stem one line of text.

    The line is tokenized with NLTK's ``TweetTokenizer`` (configured with
    ``preserve_case=False``, ``strip_handles=True`` and ``reduce_len=True``),
    English stopwords and punctuation tokens are dropped, and every remaining
    token is replaced by its Porter stem.

    Input:
        line: a string holding one line of a document, or a whole query
    Output:
        line_clean: a list of stemmed tokens, in their original order

    """
    stemmer = PorterStemmer()
    # A set gives O(1) membership tests; the NLTK stopword list is a plain
    # list that would otherwise be scanned once for every token.
    stopwords_english = set(stopwords.words('english'))
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)

    # NOTE: string.punctuation is a str, so the second test is a substring
    # check rather than a per-character one. It is kept unchanged so that
    # exactly the same tokens are filtered as before.
    line_clean = [
        stemmer.stem(word)
        for word in tokenizer.tokenize(line)
        if word not in stopwords_english and word not in string.punctuation
    ]

    return line_clean

*process_doc()* preprocesses a document line by line, adds each term to the inverted index, and stores the document's term counts (term frequencies) in the model.

In [138]:
def process_doc(docdata, file):
    """Index one document in the module-level ``vsm`` model.

    Input:
        docdata: iterable of raw text lines belonging to the document
        file: name under which the document is registered
    Output:
        None; ``vsm`` receives the postings and the term counts
    """
    term_counts = Counter()
    for raw_line in docdata:
        for token in process_line(raw_line):
            # Register the document in the token's posting list and tally
            # the occurrence in the same pass.
            vsm.PostingList(token, file)
            term_counts[token] += 1
    vsm.AddAllTerms(term_counts, file)

*GetData()* reads every file of the corpus, processes it with *process_doc()*, and then builds the document vectors.

In [94]:
def GetData(filelist, path):
    """Index every corpus file and then build the document vectors.

    Input:
        filelist: iterable of file names located inside ``path``
        path: directory that contains the corpus files
    Output:
        None; the module-level ``vsm`` model is updated in place
    """
    for file in filelist:
        filename = os.path.join(path, file)
        # The context manager closes the handle even if reading raises;
        # previously every corpus file was left open.
        with open(filename, 'r') as f:
            docdata = f.readlines()
        process_doc(docdata, file)
    # MakeDocVector() walks the whole vocabulary, so it runs only once all
    # documents have been indexed.
    vsm.MakeDocVector()

helper snippet to call *GetData()* function so that Corpus processing can take place

In [139]:
# Index every file found in the 'Data' directory under the current working
# directory.
dirname = os.getcwd()
path = os.path.join(dirname, 'Data')
filelist = os.listdir(path)
# NOTE(review): GetData() relies on the module-level `vsm` object, so the
# cell that creates it must have been executed before this call.
GetData(filelist, path)

This is the core class of the Vector Space Model. It stores the inverted index (from which the document frequencies are derived), the per-document term counts, and the document vectors.

In [122]:
class VSM():
    """In-memory vector space model (inverted index plus tf-idf vectors).

    Attributes:
        posting_list: dict mapping each term to the list of document names
            that contain it (the inverted index).
        docinfo: dict mapping each document name to its term counts.
        docvectors: dict mapping each document name to its list of tf-idf
            weights, ordered like the keys of ``posting_list``.
        N: number of entries in the corpus directory; used as the corpus
            size in the idf formula.
        df: pandas DataFrame of weights (rows = terms, columns = documents);
            only available after ``MakeDocVector()`` has run.
    """

    def __init__(self, data_dir=None):
        """Create an empty model.

        Input:
            data_dir: directory holding the corpus files; defaults to the
                'Data' directory under the current working directory.
        """
        self.posting_list = {}
        self.docvectors = {}
        self.docinfo = {}

        if data_dir is None:
            data_dir = os.path.join(os.getcwd(), 'Data')
        self.N = len(os.listdir(data_dir))

    def MakeDocVector(self):
        """Build the tf-idf vector of every document and the ``df`` table."""
        vocabulary = list(self.posting_list)
        # The idf of a term does not depend on the document, so compute it
        # once per term instead of once per (document, term) pair.
        idf_by_term = [
            self.InverseDocumentFrequency(len(self.posting_list[term]))
            for term in vocabulary
        ]
        for doc, counts in self.docinfo.items():
            self.docvectors[doc] = [
                self.TermFrequency(counts.get(term, 0)) * idf
                for term, idf in zip(vocabulary, idf_by_term)
            ]
        self.df = pd.DataFrame(data=self.docvectors, index=vocabulary)

    def PostingList(self, term, docname):
        """Record that ``docname`` contains ``term`` (duplicates ignored)."""
        postings = self.posting_list.setdefault(term, [])
        if docname not in postings:
            postings.append(docname)

    def AddAllTerms(self, terms, docname):
        """Store the term counts ``terms`` (e.g. a Counter) for ``docname``."""
        self.docinfo[docname] = terms

    def TermFrequency(self, termfrequency):
        """Return the log-scaled weight 1 + log10(tf), or 0 when tf is 0."""
        if termfrequency == 0:
            return 0
        return 1 + np.log10(termfrequency)

    def InverseDocumentFrequency(self, documentfrequency):
        """Return log10(N / df), or 0.0 when the term is in no document."""
        # A term without postings cannot contribute to any score; returning
        # 0.0 also avoids a ZeroDivisionError.
        if documentfrequency == 0:
            return 0.0
        return np.log10(self.N / documentfrequency)
# Module-level model instance used by process_doc(), GetData() and the
# query cell.
vsm = VSM()

In [124]:
# Display the term-by-document weight table built by VSM.MakeDocVector().
vsm.df

Unnamed: 0,hillary_diane_rodham_clinton.txt,foreign_investement _to_gujrat.txt,obama.txt,barack_hussein_obama.txt,united_state_presidential_election_2016.txt,president_of_the_united_states.txt,narendra_damodardas_modi.txt,modi_visit_us.txt,united_states_of_america.txt
hillari,0.477121,0.0,0.0,0.477121,0.477121,0.0,0.0,0.0,0.000000
dian,0.954243,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000
rodham,0.954243,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000
clinton,0.954243,0.0,0.0,0.477121,0.620749,0.0,0.0,0.0,0.000000
ˈhɪləri,0.954243,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...
scientif,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.954243
research,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.954243
technolog,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.954243
innov,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.954243


Query Processing Unit and Result :

In [137]:
# Read a free-text query and preprocess it exactly like a document line.
query = input('Enter Your Query:')
query = process_line(query)
c = Counter(query)

# Query vector over the indexed vocabulary, in the same term order as the
# rows of vsm.df. It is deliberately left un-normalized.
vector = []
for term in vsm.posting_list:
    tf = vsm.TermFrequency(c[term])
    idf = vsm.InverseDocumentFrequency(len(vsm.posting_list[term]))
    vector.append(tf * idf)

# DataFrame.iteritems() was removed in pandas 2.0; items() yields the same
# (column name, column Series) pairs on old and new versions.
for key, docvector in vsm.df.items():
    length = np.linalg.norm(docvector)
    # A document whose weights are all zero has length 0; score it 0.0
    # instead of printing the nan produced by 0/0.
    score = np.dot(docvector, vector) / length if length != 0 else 0.0
    print(score, key)

Enter Your Query:Narendra Modi
0.0 hillary_diane_rodham_clinton.txt
0.0795117096900166 foreign_investement _to_gujrat.txt
0.0 obama.txt
0.0 barack_hussein_obama.txt
0.0 united_state_presidential_election_2016.txt
0.0 president_of_the_united_states.txt
0.06978501195246557 narendra_damodardas_modi.txt
0.06983458798985998 modi_visit_us.txt
0.0 united_states_of_america.txt


### Sample Input and its result
Enter Your Query:   Narendra Modi  <br> 


Score               Document Name <br> 
    
    0.0                 hillary_diane_rodham_clinton.txt <br>
    0.0795117096900166  foreign_investement _to_gujrat.txt <br>
    0.0                 obama.txt<br>
    0.0                 barack_hussein_obama.txt<br>
    0.0                 united_state_presidential_election_2016.txt<br>
    0.0                 president_of_the_united_states.txt<br>
    0.06978501195246557 narendra_damodardas_modi.txt<br>
    0.06983458798985998 modi_visit_us.txt<br>
    0.0                 united_states_of_america.txt<br>

