
0. Intersection (all docIds that contain all query terms)  
1. Calculate tf-idf for each query term within each doc from step 0  
2. For each doc, add up the tf-idf of all query terms in that doc  
3. Rank highest to lowest, return doc ids  

In [1]:
from HelperClass import InvertedIndex
from nltk.stem import PorterStemmer
from collections import defaultdict
stemmer = PorterStemmer()
import math
import re

def calculate_idf(N, df):
    """Return the smoothed inverse document frequency, log((N + 0.1) / (df + 0.1)).

    N is the value used in the numerator and df the value used in the
    denominator. Adding 0.1 to both keeps the ratio defined when df is 0.
    """
    smoothed_ratio = (N + 0.1) / (df + 0.1)
    return math.log(smoothed_ratio)

def getTfIdf(tf, df, N):
    """Return the tf-idf weight: tf multiplied by calculate_idf(N, df)."""
    idf = calculate_idf(N, df)
    return tf * idf

def stemQuery(query:str) -> list:
    """Turn a raw query string into a list of stemmed tokens.

    The query is stripped and lowercased, split on every character that is
    not a-z or 0-9, and each non-empty piece is passed through the
    module-level stemmer. A blank query yields an empty list.
    """
    cleaned = query.strip()
    if not cleaned:
        return []

    pieces = re.split('[^a-z0-9]', cleaned.lower())
    # Adjacent separators produce empty strings; those are dropped.
    return [stemmer.stem(piece) for piece in pieces if piece]

def getDfDict(query:set, invIndex):
    """Map each query token to its document frequency (df).

    For a token that `invIndex` reports as present, df is the number of
    entries stored under invIndex['pos'][token]. A token that is not present
    is recorded with df 0.

    query: iterable of (already stemmed) tokens.
    invIndex: index object supporting `token in invIndex` and
        invIndex['pos'][token].

    Returns a defaultdict whose default is 0, so looking up a token that was
    never part of the query reads as "appears in no document" instead of
    returning a non-numeric value that would break the idf arithmetic.
    """
    dfByToken = defaultdict(int)
    for token in query:
        if token in invIndex:
            dfByToken[token] = len(invIndex['pos'][token])
        else:
            # Store the zero explicitly so the token is still a key of the result.
            dfByToken[token] = 0
    return dfByToken

def getDocIds(qDict:dict, invIndex) -> set:
    """Return the ids of the documents that contain every term in qDict.

    qDict: mapping whose keys are the query terms (values are not used here).
    invIndex: index where invIndex['pos'][term] is a mapping keyed by doc id.

    Returns the intersection of the doc-id sets of all terms, or an empty set
    when qDict is empty or no document contains all terms.
    """
    # None marks "no term processed yet". An empty set cannot be used for
    # that, because an empty running intersection is a legitimate result and
    # must not be reset by the next term.
    commonDocs = None
    for word in qDict:
        docsForWord = set(invIndex['pos'][word].keys())
        if commonDocs is None:
            commonDocs = docsForWord
        else:
            commonDocs &= docsForWord
        if not commonDocs:
            # Nothing can re-enter an empty intersection; stop early.
            break
    return commonDocs if commonDocs is not None else set()

def getTfIdfDict(qDict:dict, invIndex, N) -> dict:
    """Score every document returned by getDocIds and rank the scores.

    For each such document, the tf-idf of every term in qDict is added up,
    where tf is the number of entries in invIndex['pos'][term][docId] and df
    is the value stored for the term in qDict.

    Returns a plain dict of docId -> score, ordered from highest to lowest
    score.
    """
    scores = {}
    for doc in getDocIds(qDict, invIndex):
        total = 0
        for term, df in qDict.items():
            termFreq = len(invIndex['pos'][term][doc])
            total += getTfIdf(termFreq, df, N)
        scores[doc] = total

    # Rank by score, best first.
    ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
    return dict(ranked)

# Driver: rank the indexed documents for one hard-coded query.
N = 5000  # passed through to calculate_idf as N; presumably the number of documents in the corpus — TODO confirm
query = 'university of california irvine'
stemList = stemQuery(query)  # lowercased, split on non-alphanumerics, each token stemmed
invIndex = InvertedIndex()
invIndex.load(stemList)  # NOTE(review): presumably loads the postings for just these terms — confirm in HelperClass
qDict = getDfDict(stemList,invIndex)  # token -> document frequency
tfIdfDict = getTfIdfDict(qDict, invIndex, N)  # docId -> summed tf-idf, highest score first

In [2]:
# Display the ranked docId -> tf-idf score mapping built in the previous cell.
tfIdfDict

{4594: 277.4572294198993,
 4610: 277.4572294198993,
 4597: 242.87357938458828,
 4733: 242.87357938458828,
 4513: 90.78095995195508,
 4719: 90.78095995195508,
 4750: 86.92476747098024,
 4463: 80.97170975422448,
 4600: 80.97170975422448,
 4725: 80.97170975422448,
 4450: 74.93217210220696,
 4489: 74.93217210220696,
 4543: 74.93217210220696,
 4544: 74.93217210220696,
 4709: 74.93217210220696,
 4590: 68.24731795956315,
 4671: 68.24731795956315,
 4694: 68.24731795956315,
 707: 52.33236400468197,
 4875: 48.45338778292318,
 4947: 48.45338778292318,
 1113: 44.28305021787396,
 4632: 42.2464727388167,
 4669: 42.2464727388167,
 4556: 41.84114443512986,
 4599: 41.84114443512986,
 4595: 41.659465267315966,
 4628: 41.659465267315966,
 4716: 41.659465267315966,
 559: 41.24211908214913,
 1053: 41.24211908214913,
 4580: 41.07935591393184,
 4622: 41.07935591393184,
 4727: 41.07935591393184,
 4553: 40.97206589572167,
 4681: 40.97206589572167,
 4679: 40.12802085764557,
 4582: 39.802478364909994,
 4618: 39.