# TF-IDF + Cosine Score

In [1]:
# -*- coding: utf-8 -*-
"""
Created on Mon Jan 21, 2019
@about: compute cosine similarity of docs/keywords etc.
@author: praveentn
"""

'\nCreated on Mon Jan 21, 2019\n@about: compute cosine similarity of docs/keywords etc.\n@author: praveentn\n'

In [2]:
import math
from nltk.stem.porter import *

def irange(sequence):
    return zip(range(len(sequence)), sequence)

class CosineScore(object):
    def __init__(self,all_docs):
        self.documents = all_docs #list all docs [doc1,doc2..]
        self.ndocs = len(all_docs)
        self.posting_list = {} #term frequency list, don't care about term position
        #term => {docId => freq}
        self.pstemmer = PorterStemmer()

        self._term_indexer()

    def _term_indexer(self):
        #Create term frequency dict
        #Run each word through stemmer
        for doc_id,document in irange(self.documents):
            for word in document.split(' '):
                s_word = self.pstemmer.stem(word)
                if s_word in self.posting_list:
                    doc_id_mapping = self.posting_list[s_word]
                    if doc_id in doc_id_mapping:
                        doc_id_mapping[doc_id] += 1
                    else:
                        doc_id_mapping[doc_id] = 1
                else:
                    self.posting_list[s_word] = {doc_id: 1}

    def _term_frequency(self,term):
        if term in self.posting_list:
            return self.posting_list[term]
        else:
            return -1

    def _listToString(self,arg):
        if isinstance(arg,str):
            return arg.split(' ')

    def __qTermFrequency(self,term,bWords):
        count =0
        for i,bWordsObj in irange(bWords):
            if bWordsObj == term:
                count = count +1
        return count

    def _docListWeights(self) :

        all_terms = self.posting_list.keys()
        doclist_weights = [0.0] * self.ndocs

        #for all terms in the corpus
        for i,term in irange(all_terms):
            #for all docs in corpus that contain this term
            docs = self.posting_list[term].keys()
            for j,doc_id in irange(docs):
                tf = self.posting_list[term][doc_id]
                tfSquared = (tf * tf)
                doclist_weights[doc_id] += tfSquared

            for k in range(self.ndocs):
                doclist_weights[k] = math.sqrt(doclist_weights[k])
        return doclist_weights

    def compute(self,query,mIDF=0):
        '''
        dft - document term frequency
        idf - inverse document frequency
        wTQ - weights for each query term
        mIDF - max tf normalization
        '''

        scores = [0.0] * self.ndocs
        bWords = self._listToString(query)
        normalizationFactor = self._docListWeights()

        for qterm in bWords:
            term = self.pstemmer.stem(qterm)
            #calculate WT
            #dft = 	__qTermFrequency(queryTerm,bWords)
            #wTQ = math.log10(int(N)/dft)

            term_posting_doclist = []
            if self._term_frequency(term) != -1:
                #Find all documents with this query term

                term_posting_doclist = self.posting_list[term].keys()
                #total_term_frequency_in_corpus = sum(self.posting_list[term].values())

                if(mIDF!=0):
                    dft = mIDF
                else:
                    dft = len(term_posting_doclist)

                _wTQ = float(self.ndocs)/float(dft)
                wTQ = math.log10(float(_wTQ)) #idf

            #cosinescore algorithm
            for doc_id in term_posting_doclist:
                if normalizationFactor[doc_id] != 0:
                    #wFTD = termDocFrequencyList/ normalizationFactor(doc_id)
                    wFTD = self.posting_list[term][doc_id] / float(normalizationFactor[doc_id])
                else:
                    wFTD = 0.0

                scores[doc_id] +=  (wTQ * wFTD)
        return scores

if __name__ == "__main__":
    docs = ["cloud storage", "flash usb storage", "backup", "retail" ]
    q = "cloud based online storage for providing automated backups"
    cs = CosineScore(docs)
    print (cs.compute(q))

[0.8837386308214606, 0.26962664743151177, 0.6020599913279624, 0.0]
