In [4]:
import pyarrow.feather as feather
import json

def getFpDataframe(fileName:str):
    """Load the lookup table stored at ``Data/<fileName>.feather``.

    Args:
        fileName: Base name of the feather file, without the ``Data/``
            directory prefix or the ``.feather`` extension.

    Returns:
        The object produced by ``feather.read_feather`` for that file.
    """
    featherPath = f'Data/{fileName}.feather'
    # Binary mode: the open handle is passed straight to the feather reader.
    with open(featherPath, 'rb') as handle:
        return feather.read_feather(handle)

def getTermData(term:str, fpDf):
    """Read the JSON record stored for ``term`` in ``Data/Index.txt``.

    ``fpDf['fp']`` is used as a lookup from term to the file offset at which
    that term's line starts. The line is treated as ``<prefix>><json>``:
    everything after the first '>' is parsed as JSON.

    Args:
        term: Key to look up in ``fpDf['fp']``.
        fpDf: Object whose ``'fp'`` entry supports ``in`` and item access by
            term and yields a seekable offset into ``Data/Index.txt``.

    Returns:
        The decoded JSON value, or ``None`` when ``term`` has no offset.

    Raises:
        IndexError: if the line at the offset contains no '>' separator.
        json.JSONDecodeError: if the text after the separator is not JSON.
    """
    offsets = fpDf['fp']
    if term not in offsets:
        return None
    with open('Data/Index.txt', "r") as indexFile:
        indexFile.seek(offsets[term])
        line = indexFile.readline().strip()
    # Split on the first '>' only: a '>' inside the JSON payload must stay
    # part of the payload instead of truncating it.
    payload = line.split('>', 1)[1]
    return json.loads(payload)

def getDocData(docId:int, fpDf):
    """Read the JSON record stored for ``docId`` in ``Data/docId.txt``.

    ``fpDf['fp']`` is used as a lookup from document id to the file offset at
    which that document's line starts. The line is treated as
    ``<prefix>><json>``: everything after the first '>' is parsed as JSON.

    Args:
        docId: Key to look up in ``fpDf['fp']``.
        fpDf: Object whose ``'fp'`` entry supports ``in`` and item access by
            document id and yields a seekable offset into ``Data/docId.txt``.

    Returns:
        The decoded JSON value, or ``None`` when ``docId`` has no offset.

    Raises:
        IndexError: if the line at the offset contains no '>' separator.
        json.JSONDecodeError: if the text after the separator is not JSON.
    """
    offsets = fpDf['fp']
    if docId not in offsets:
        return None
    with open('Data/docId.txt', "r") as indexFile:
        indexFile.seek(offsets[docId])
        line = indexFile.readline().strip()
    # Split on the first '>' only: values in the payload (e.g. a URL) may
    # themselves contain '>' and must not be cut short.
    payload = line.split('>', 1)[1]
    return json.loads(payload)

In [73]:
from nltk.stem import PorterStemmer
from heapq import heapify, heappush, heappop
import re

def stemQuery(query:str, termFpDf) -> tuple[dict, list]:
    """Tokenize and stem ``query``, then load the index record of each stem.

    The query is lower-cased and split on every character outside ``a-z0-9``.
    Each non-empty token is Porter-stemmed and looked up with ``getTermData``;
    stems with no (or an empty) record are dropped. Every distinct stem is
    looked up at most once, so a repeated query word costs no extra file read
    and adds no duplicate heap entry.

    Args:
        query: Raw query text.
        termFpDf: Offset table forwarded unchanged to ``getTermData``.

    Returns:
        A ``(queryDict, intersectOrder)`` pair. ``queryDict`` maps each found
        stem to its record. ``intersectOrder`` is a heap-ordered list of
        ``(-idf, stem)`` tuples, so its first element has the highest ``idf``.
    """
    stemmer = PorterStemmer()
    queryDict = {}
    intersectOrder = []
    seenStems = set()
    for rawToken in re.split('[^a-z0-9]', query.strip().lower()):
        if rawToken == '':
            continue
        token = stemmer.stem(rawToken)
        if token in seenStems:
            # Already resolved (found or not): skip the redundant lookup.
            continue
        seenStems.add(token)
        termData = getTermData(token, termFpDf)
        if termData:
            queryDict[token] = termData
            # Negated idf turns heapq's min-heap into "largest idf first".
            heappush(intersectOrder, (termData['idf']*-1, token))
    return queryDict, intersectOrder

def getDocIds(queryDict:dict[str, dict], order:list[tuple]) -> set:
    """Return the record keys shared by every term listed in ``order``.

    The keys of the first term's record seed the result, minus the
    bookkeeping keys ``'cList'`` and ``'idf'``; the result is then
    intersected with the keys of each further term's record.

    A ``None`` sentinel marks "not seeded yet". Testing for an empty set
    instead would re-seed the result after an intersection had legitimately
    become empty, and so return keys that are missing from earlier terms.

    Args:
        queryDict: Maps each term to its record (a dict).
        order: Sequence of tuples whose second element is a key of
            ``queryDict``; terms are processed in this sequence's order.

    Returns:
        The set of common keys; empty when ``order`` is empty or the terms
        share no key.
    """
    commonKeys = None
    for entry in order:
        postings = queryDict[entry[1]].keys()
        if commonKeys is None:
            commonKeys = set(postings)
            # Bookkeeping fields live beside the per-document entries.
            commonKeys.discard('cList')
            commonKeys.discard('idf')
        else:
            commonKeys.intersection_update(postings)
        if not commonKeys:
            # Nothing left to intersect; remaining terms cannot add keys.
            break
    return commonKeys if commonKeys is not None else set()

def getTf_IdfRank(docIds:set[str], qDict:dict[str, dict], topK:int=10) -> list[tuple]:
    """Score documents by summed per-term weights and keep the best ``topK``.

    A document's score is the sum of ``qDict[term][docId]`` over every term
    in ``qDict``, rounded to 5 decimals. A document whose rounded score equals
    one already accepted is skipped, so each returned score is unique.

    Args:
        docIds: Document ids to score; each must be a key of every term
            record in ``qDict``.
        qDict: Maps each query term to a record holding a numeric weight per
            document id.
        topK: Maximum number of results to keep (default 10, the previous
            fixed limit).

    Returns:
        A heap-ordered list of ``(score, docId)`` tuples with at most
        ``topK`` entries; element 0 is the lowest kept score. Sort it to get
        a full ranking. Empty when ``topK`` is not positive.

    Raises:
        KeyError: if a document id is missing from some term's record.
    """
    if topK <= 0:
        return []
    docRanks = []
    acceptedScores = set()
    for docIdStr in docIds:
        score = 0
        for term in qDict:
            score += qDict[term][docIdStr]
        score = round(score, 5)
        if score in acceptedScores:
            continue
        if len(docRanks) < topK:
            heappush(docRanks, (score, docIdStr))
        elif score > docRanks[0][0]:
            # Evict the current minimum to make room for a better score.
            heappop(docRanks)
            heappush(docRanks, (score, docIdStr))
        else:
            continue
        acceptedScores.add(score)
    return docRanks

In [5]:
# Load both offset tables once; the query cell reuses them for every lookup.
termFpDf = getFpDataframe(fileName='indexFp')    # offsets used with Data/Index.txt
docIdFpDf = getFpDataframe(fileName='docIdFp')   # offsets used with Data/docId.txt

In [83]:
import time
# perf_counter is the clock intended for measuring short elapsed intervals;
# time.time() follows the wall clock and can jump if the system time changes.
start = time.perf_counter()

query = 'University of California Irvine Computer Science good or bad?'
termDict, intersectOrder = stemQuery(query, termFpDf)
docIds = getDocIds(termDict, intersectOrder)
docRanks = getTf_IdfRank(docIds, termDict)
# docRanks is heap-ordered, so sort it to print the best score first.
for tup in sorted(docRanks, reverse=True):
    docData = getDocData(int(tup[1]), docIdFpDf)
    # getDocData returns None for an unknown id; keep printing the remaining
    # results instead of failing on None['url'].
    url = docData['url'] if docData else None
    print(tup[1], url, tup[0])

end = time.perf_counter()
print(round((end - start) * 1000),'ms') #Time in milliseconds

19585 http://sli.ics.uci.edu/pmwiki/pmwiki.php?n=AIStats%2FPostings 83.24155
54414 https://www.informatics.uci.edu/2018/10/ 79.0712
54084 https://www.informatics.uci.edu/very-top-footer-menu-items/news/page/11/ 78.66034
54098 https://www.informatics.uci.edu/2017/11/#content 75.2595
32894 https://www.ics.uci.edu/~eppstein/pubs/pubs.ff 74.24377
49861 https://www.ics.uci.edu/~eppstein/pubs/all.html 74.166
4610 https://cml.ics.uci.edu/category/aiml/page/2/ 74.05639
52479 https://www.ics.uci.edu/~eppstein/pubs/geom-all.html 71.71979
23029 https://www.ics.uci.edu/community/news/view_news.php?id=665 68.54787
54337 https://www.informatics.uci.edu/2018/12/#content 67.17466
73 ms
