In [1]:
import pyarrow.feather as feather
import json

def getFpDataframe(fileName:str):
    """Load the feather table stored at ``Data/<fileName>.feather``.

    Args:
        fileName: Base name of the feather file, without the ``Data/``
            directory prefix or the ``.feather`` extension.

    Returns:
        The object produced by ``feather.read_feather`` for that file.
    """
    featherPath = f'Data/{fileName}.feather'
    # Binary mode: the file object is handed straight to the feather reader.
    with open(featherPath, 'rb') as handle:
        return feather.read_feather(handle)

def getTermData(term:str, fpDf):
    """Fetch the JSON record stored for ``term`` in ``Data/Index.txt``.

    Args:
        term: Key to look up in ``fpDf['fp']``.
        fpDf: Mapping-like object whose ``'fp'`` entry maps a term to the
            position of its line inside ``Data/Index.txt``.

    Returns:
        The decoded JSON payload of the term's line, or ``None`` when the
        term has no entry in ``fpDf['fp']``.

    Raises:
        IndexError: If the line at the stored position contains no ``'>'``.
        json.JSONDecodeError: If the payload is not valid JSON.
    """
    positions = fpDf['fp']
    if term not in positions:
        return None
    with open('Data/Index.txt', "r") as indexFile:
        indexFile.seek(positions[term])
        record = indexFile.readline().strip()
    # Split on the FIRST '>' only: everything after it is the JSON payload,
    # which may itself contain '>' characters that must not be cut off.
    return json.loads(record.split('>', 1)[1])

def getDocData(docId:int, fpDf):
    """Fetch the JSON record stored for ``docId`` in ``Data/docId.txt``.

    Args:
        docId: Key to look up in ``fpDf['fp']``.
        fpDf: Mapping-like object whose ``'fp'`` entry maps a document id to
            the position of its line inside ``Data/docId.txt``.

    Returns:
        The decoded JSON payload of the document's line, or ``None`` when the
        id has no entry in ``fpDf['fp']``.

    Raises:
        IndexError: If the line at the stored position contains no ``'>'``.
        json.JSONDecodeError: If the payload is not valid JSON.
    """
    positions = fpDf['fp']
    if docId not in positions:
        return None
    with open('Data/docId.txt', "r") as indexFile:
        indexFile.seek(positions[docId])
        record = indexFile.readline().strip()
    # Split on the FIRST '>' only: the payload (e.g. a URL inside the JSON)
    # may legitimately contain '>' and must be kept whole.
    return json.loads(record.split('>', 1)[1])

In [2]:
from nltk.stem import PorterStemmer
from heapq import heapify, heappush, heappop
import re
import math

def stemQuery(query:str, termFpDf:dict, countDict:dict) -> tuple[dict, list]:
    """Tokenise and stem ``query``, then look each distinct stem up in the index.

    The query is lower-cased and split on every character outside ``[a-z0-9]``;
    each non-empty piece is stemmed with ``PorterStemmer``.

    Args:
        query: Raw query text.
        termFpDf: Lookup table passed through to ``getTermData``.
        countDict: Mutated in place. ``countDict['terms'][stem]`` receives the
            number of occurrences of each stem in the query and
            ``countDict['len']`` is incremented once per token.

    Returns:
        A ``(queryDict, intersectOrder)`` pair. ``queryDict`` maps each stem
        for which ``getTermData`` returned a truthy record to that record.
        ``intersectOrder`` is a heap of ``(-idf, stem)`` tuples for the same
        stems, so the stem with the largest ``idf`` sits at the heap root.
    """
    stemmer = PorterStemmer()
    queryDict = {}
    intersectOrder = []
    # Track every stem already handled, whether or not it is in the index.
    # Relying on queryDict alone would re-read the index file and reset the
    # count to 1 each time a stem that is missing from the index repeats.
    seenTokens = set()
    for rawToken in re.split('[^a-z0-9]', query.strip().lower()):
        if rawToken == '':
            continue
        token = stemmer.stem(rawToken)
        if token in seenTokens:
            countDict['terms'][token] += 1
        else:
            seenTokens.add(token)
            termData = getTermData(token, termFpDf)
            countDict['terms'][token] = 1
            if termData:
                queryDict[token] = termData
                # Negate idf so the min-heap orders stems by descending idf.
                heappush(intersectOrder, (termData['idf']*-1, token))
        countDict['len'] += 1
    return queryDict, intersectOrder

def getDocIds(queryDict:dict[str], order:list[tuple]) -> set:
    """Return the keys shared by the records of every term listed in ``order``.

    Args:
        queryDict: Maps a term to its record (a dict). The ``'cList'`` and
            ``'idf'`` keys of a record are metadata and are never returned;
            the remaining keys are presumably document ids (verify against
            the index format).
        order: Sequence of tuples whose second element is a term present in
            ``queryDict``.

    Returns:
        The set of non-metadata keys present in the record of every listed
        term; an empty set when ``order`` is empty or nothing is shared.
    """
    metaKeys = {'cList', 'idf'}
    # None marks "no term processed yet". An empty set cannot serve as that
    # marker: an intersection that legitimately becomes empty would be
    # re-seeded from the next term and yield documents lacking earlier terms.
    commonKeys = None
    for entry in order:
        recordKeys = queryDict[entry[1]].keys()
        if commonKeys is None:
            commonKeys = set(recordKeys) - metaKeys
        else:
            commonKeys.intersection_update(recordKeys)
        if not commonKeys:
            # Nothing can be added back by further intersections.
            break
    return commonKeys if commonKeys is not None else set()

def getTf_IdfRank(docIds:set[str], qDict:dict[str], topK:int = 10) -> list[tuple]:
    """Keep the ``topK`` highest-scoring documents among ``docIds``.

    A document's score is the sum of ``qDict[term][docId]`` over every term in
    ``qDict``, rounded to 5 decimals.  A document whose rounded score equals
    that of a document already admitted is skipped.

    Args:
        docIds: Document ids to score; each must be a key of every record in
            ``qDict``.
        qDict: Maps a term to a record holding a numeric weight per doc id.
        topK: Maximum number of results to keep (default 10).

    Returns:
        A list of ``(score, docId)`` tuples in heap order (smallest score at
        index 0), NOT sorted; callers should sort it for display.  Empty when
        ``topK`` is not positive.

    Raises:
        KeyError: If some record in ``qDict`` lacks one of the ``docIds``.
    """
    docRanks = []
    if topK <= 0:
        return docRanks
    seenScores = set()
    for docIdStr in docIds:
        score = round(sum(qDict[term][docIdStr] for term in qDict), 5)
        if score in seenScores:
            continue
        if len(docRanks) < topK:
            heappush(docRanks, (score, docIdStr))
        elif score > docRanks[0][0]:
            # Heap is full: evict the current minimum to make room.
            heappop(docRanks)
            heappush(docRanks, (score, docIdStr))
        else:
            continue
        seenScores.add(score)
    return docRanks

def calculate_idf(df, N = 55382):
    """Compute ``log(N / df)`` using the natural logarithm.

    Args:
        df: Divisor of the ratio (the document frequency).
        N: Numerator of the ratio; defaults to 55382.

    Returns:
        ``math.log(N / df)``, or ``0`` when the division raises
        ``ZeroDivisionError``.
    """
    try:
        ratio = (N) / (df)
    except ZeroDivisionError:
        return 0
    return math.log(ratio)

def calculate_tf_idf(tf, idf):
    """Return the log-scaled term frequency multiplied by ``idf``.

    Args:
        tf: Term frequency; passed to ``math.log``.
        idf: Inverse document frequency weight.

    Returns:
        ``(1 + ln(tf)) * idf``.
    """
    logTf = 1 + math.log(tf)
    return logTf * idf

In [3]:
# Load both lookup tables up front: one is passed to the term lookups, the
# other to the document lookups.
termFpDf, docIdFpDf = (
    getFpDataframe(tableName) for tableName in ('indexFp', 'docIdFp')
)

In [4]:
def getQVector(termDict:dict[str], countDict:dict[dict]) -> list:
    """Build the list of tf-idf weights for the query terms.

    Args:
        termDict: Maps each query term to a record containing an ``'idf'`` key.
        countDict: ``countDict['terms'][term]`` is the term's count, used as
            the term frequency.

    Returns:
        One ``calculate_tf_idf`` value per term, in ``termDict`` iteration
        order.
    """
    def _weight(term):
        # Weight of a single term: its query count combined with its idf.
        idf = termDict[term]['idf']
        return calculate_tf_idf(countDict['terms'][term], idf)

    return [_weight(term) for term in termDict]

In [24]:
import time
# Time one end-to-end query: stem it, intersect the matching records, rank the
# surviving documents, print them best-first, then build the query vector.
start = time.time_ns()

query = 'master of software engineer'
countDict = {'terms':{}, 'len': 0}
termDict, intersectOrder = stemQuery(query, termFpDf, countDict)
docIds = getDocIds(termDict, intersectOrder)
docRanks = getTf_IdfRank(docIds, termDict)

# docRanks comes back in heap order, so sort descending before printing.
for tup in sorted(docRanks, reverse=True):
    score, docIdStr = tup[0], tup[1]
    docInfo = getDocData(int(docIdStr), docIdFpDf)
    print(docIdStr, docInfo['url'], score)

qVector = getQVector(termDict, countDict)
end = time.time_ns()

elapsedNs = end - start
print(round(elapsedNs) / (10**6),'ms') #Time in milliseconds

29716 https://www.ics.uci.edu/~neno/vita/vita.html 46.98415
17001 https://mswe.ics.uci.edu/ 45.9336
54680 https://www.informatics.uci.edu/grad/mswe/#content 44.21775
24257 https://www.ics.uci.edu/grad/degrees/index 43.71712
51294 https://www.ics.uci.edu/grad/admissions/index.php 43.66838
16995 https://mswe.ics.uci.edu/program/curriculum/ 42.92395
34271 https://www.ics.uci.edu/~mrrahimi/Education.html 40.82172
53900 https://www.informatics.uci.edu/grad/ms-software-engineering/ 40.80404
21729 https://www.ics.uci.edu/~epaikari/#portfolioModal3 40.38412
54391 https://www.informatics.uci.edu/alumni-spotlights-ics-grads-help-autogravity-revolutionize-digital-car-buying/ 40.07312
41.2932 ms
