In [1]:
import pyarrow.feather as feather
import json

def getFpDataframe(fileName:str):
    """Load the file-pointer table stored at ``Data/<fileName>.feather``.

    The file is opened in binary mode and handed to ``feather.read_feather``;
    whatever that call returns is passed back to the caller unchanged.

    Args:
        fileName: base name of the feather file, without directory or extension.
    """
    featherPath = f'Data/{fileName}.feather'
    with open(featherPath, 'rb') as handle:
        return feather.read_feather(handle)

def getTermData(term:str, fpDf):
    """Fetch the record stored for ``term`` in ``Data/Index.txt``.

    ``fpDf['fp']`` supplies the position of the term's line inside the index
    file. The line is expected to have the form ``<key>><json>``; everything
    after the first ``>`` is decoded as JSON.

    Args:
        term: key to look up in ``fpDf['fp']``.
        fpDf: object whose ``'fp'`` entry supports ``in`` and ``[]`` by term
            and yields a seek position for the index file.

    Returns:
        The decoded JSON payload, or ``None`` when ``term`` is not present in
        ``fpDf['fp']``.
    """
    filePointers = fpDf['fp']
    if term not in filePointers:
        return None
    with open('Data/Index.txt', "r") as indexFile:
        # NOTE(review): in text mode seek() is only defined for positions that
        # came from tell(); confirm the stored offsets were recorded that way.
        indexFile.seek(filePointers[term])
        # Split on the first '>' only, so a '>' inside the JSON payload does
        # not truncate it.
        payload = indexFile.readline().strip().split('>', 1)[1]
    return json.loads(payload)

def getDocData(docId:int, fpDf):
    """Fetch the record stored for ``docId`` in ``Data/docId.txt``.

    ``fpDf['fp']`` supplies the position of the document's line inside the
    file. The line is expected to have the form ``<key>><json>``; everything
    after the first ``>`` is decoded as JSON.

    Args:
        docId: key to look up in ``fpDf['fp']``.
        fpDf: object whose ``'fp'`` entry supports ``in`` and ``[]`` by docId
            and yields a seek position for the docId file.

    Returns:
        The decoded JSON payload, or ``None`` when ``docId`` is not present in
        ``fpDf['fp']``.
    """
    filePointers = fpDf['fp']
    if docId not in filePointers:
        return None
    with open('Data/docId.txt', "r") as docFile:
        # NOTE(review): in text mode seek() is only defined for positions that
        # came from tell(); confirm the stored offsets were recorded that way.
        docFile.seek(filePointers[docId])
        # Split on the first '>' only, so a '>' inside the JSON payload (for
        # example in a URL or title) does not truncate it.
        payload = docFile.readline().strip().split('>', 1)[1]
    return json.loads(payload)

In [87]:
from nltk.stem import PorterStemmer
from heapq import heapify, heappush, heappop
import re

def stemQuery(query:str, termFpDf:dict) -> tuple[dict, list]:
    """Tokenize and stem ``query`` and load the index record of each term.

    The query is lower-cased and split on every character outside ``[a-z0-9]``.
    Each non-empty piece is stemmed with ``PorterStemmer`` and looked up via
    ``getTermData``. Stems with no (or empty) record are dropped.

    Args:
        query: raw query text.
        termFpDf: file-pointer table forwarded to ``getTermData``.

    Returns:
        A ``(queryDict, intersectOrder)`` pair. ``queryDict`` maps each found
        stem to its record. ``intersectOrder`` is a heap of ``(-idf, stem)``
        tuples, so its smallest entry is the stem with the largest ``idf``.
    """
    stemmer = PorterStemmer()
    queryDict = {}
    intersectOrder = []  # an empty list is already a valid heap
    seenTokens = set()
    for aToken in re.split('[^a-z0-9]', query.strip().lower()):
        if aToken == '':
            continue
        token = stemmer.stem(aToken)
        # A stem that repeats in the query would cost another disk lookup and
        # add a duplicate heap entry without changing the result.
        if token in seenTokens:
            continue
        seenTokens.add(token)
        termData = getTermData(token, termFpDf)
        if termData:
            queryDict[token] = termData
            heappush(intersectOrder, (termData['idf']*-1, token))
    return queryDict, intersectOrder

def getDocIds(queryDict:dict[str, dict], order:list[tuple]) -> set:
    """Return the keys shared by the records of every term listed in ``order``.

    Each tuple in ``order`` names a term at index 1; that term's record is
    taken from ``queryDict``. The keys ``'cList'`` and ``'idf'`` are treated as
    metadata and never appear in the result.

    Args:
        queryDict: maps a term to its record (a dict keyed by document id plus
            the metadata keys).
        order: tuples whose second element is a key of ``queryDict``.

    Returns:
        The set of keys present in every listed record; an empty set when
        ``order`` is empty or the terms share no key.
    """
    metaKeys = {'cList', 'idf'}
    # None marks "no term processed yet". An empty set cannot play that role:
    # it is also a legitimate intermediate result and must stay empty.
    intersection_keys = None
    for tup in order:
        docKeys = queryDict[tup[1]].keys() - metaKeys
        if intersection_keys is None:
            intersection_keys = set(docKeys)
        else:
            intersection_keys &= docKeys
        if not intersection_keys:
            # Nothing can be added back by further intersections.
            break
    return intersection_keys if intersection_keys is not None else set()

In [32]:
# Load the file-pointer tables from Data/indexFp.feather and Data/docIdFp.feather.
# termFpDf is the table handed to stemQuery for term lookups; docIdFpDf is
# presumably the one meant for getDocData (it is not used in the cells shown here).
termFpDf = getFpDataframe('indexFp')
docIdFpDf = getFpDataframe('docIdFp')

In [110]:
import time
# Time one end-to-end query: stem the terms, load their records, intersect them.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() follows the wall clock, which can be adjusted mid-run.
start = time.perf_counter()

query = 'University of California Irvine'
termDict, intersectOrder = stemQuery(query, termFpDf)
docIds = getDocIds(termDict, intersectOrder)

end = time.perf_counter()
print(round((end - start) * 1000),'ms') #Time in milliseconds

108 ms
