In [1]:
import os
import json
import math
from heapq import heapify, heappush, heappop

def getFpDict(count):
    """Build a term -> {posting file number: line offset} lookup table.

    Scans ``Data/Posting1.txt`` through ``Data/Posting{count}.txt``. For every
    line, the text before the first ``-`` is taken as the term, and the
    ``tell()`` position of the start of that line is recorded so the line can
    later be reached with ``seek()`` on the same file opened in text mode.

    Args:
        count: Number of posting files to scan (files are numbered from 1).

    Returns:
        dict mapping each term to a dict of ``{file number: offset}``.
    """
    fpDict = {}
    for i in range(1, count + 1):
        with open(f'Data/Posting{i}.txt', "r") as fp:
            while True:
                # Capture the position before reading so it marks the line start.
                fpPos = fp.tell()
                line = fp.readline()
                if not line:
                    # readline() returns '' only at end of file. This avoids
                    # comparing the text-mode tell() cookie with the byte size
                    # and calling fstat() once per line.
                    break
                term = line.split('-')[0]
                fpDict.setdefault(term, {})[i] = fpPos
    return fpDict

def getTermInfo(term:str, fpDict:dict):
    """Load and merge a term's partial posting records from all posting files.

    Args:
        term: Term to look up.
        fpDict: Mapping ``term -> {posting file number: line offset}`` as
            produced by ``getFpDict``.

    Returns:
        The merged record (a dict with at least ``'df'``, ``'wTf'`` and
        ``'docIds'`` when more than one file contributes), or ``None`` if the
        term is not in ``fpDict``.
    """
    if term not in fpDict:
        return None

    termInfo = None
    for count, offset in fpDict[term].items():
        with open(f'Data/Posting{count}.txt', "r") as fp:
            fp.seek(offset)
            # Split only on the first '-': the JSON payload may itself contain
            # '-' (e.g. a float serialised as 1.5e-05) and must stay intact.
            payload = fp.readline().strip().split('-', 1)[1]
        tInfo = json.loads(payload)
        if not termInfo:
            # First record seen becomes the accumulator.
            termInfo = tInfo
        else:
            # Merge this file's partial record into the accumulator.
            termInfo['df'] += tInfo['df']
            termInfo['wTf'].update(tInfo['wTf'])
            termInfo['docIds'].extend(tInfo['docIds'])
    return termInfo

def getFpIdDict():
    """Build a docId -> line offset lookup table for ``Data/docId.txt``.

    For every line, the text before the first ``>`` is parsed as an integer
    document id and mapped to the ``tell()`` position of the start of that
    line, so the line can later be reached with ``seek()``.

    Returns:
        dict mapping ``int`` document ids to line offsets.

    Raises:
        ValueError: If the text before ``>`` on a line is not an integer.
    """
    fpDict = {}
    with open(f'Data/docId.txt', "r") as fp:
        while True:
            # Capture the position before reading so it marks the line start.
            fpPos = fp.tell()
            line = fp.readline()
            if not line:
                # readline() returns '' only at end of file. This avoids
                # comparing the text-mode tell() cookie with the byte size
                # and calling fstat() once per line.
                break
            docId = line.split('>')[0]
            fpDict[int(docId)] = fpPos
    return fpDict

def getUrl(docId, fpDict) -> str:
    """Return the URL stored for ``docId`` in ``Data/docId.txt``.

    Args:
        docId: Document id; must be a key of ``fpDict``.
        fpDict: Mapping ``docId -> line offset`` as produced by ``getFpIdDict``.

    Returns:
        The ``'url'`` field of the document's JSON record.

    Raises:
        KeyError: If ``docId`` is not in ``fpDict``.
    """
    with open(f'Data/docId.txt', "r") as fp:
        fp.seek(fpDict[docId])
        # Split only on the first '>': the JSON payload (notably the URL) may
        # itself contain '>' and must not be truncated.
        payload = fp.readline().split('>', 1)[1]
    return json.loads(payload)['url']
    
def getDocLen(docId:int, fpDict) -> int:
    """Return the document length stored for ``docId`` in ``Data/docId.txt``.

    Args:
        docId: Document id; must be a key of ``fpDict``.
        fpDict: Mapping ``docId -> line offset`` as produced by ``getFpIdDict``.

    Returns:
        The ``'docLen'`` field of the document's JSON record.

    Raises:
        KeyError: If ``docId`` is not in ``fpDict``.
    """
    with open(f'Data/docId.txt', "r") as fp:
        fp.seek(fpDict[docId])
        # Split only on the first '>': the JSON payload (notably the URL) may
        # itself contain '>' and must not be truncated.
        payload = fp.readline().split('>', 1)[1]
    return json.loads(payload)['docLen']
    
def calculate_idf(df, N = 55382):
    """Return the inverse document frequency ``log(N / df)``.

    Args:
        df: Document frequency of the term.
        N: Collection size used as the numerator (defaults to 55382).

    Returns:
        The natural log of ``N / df``, or ``0`` when the division raises
        ``ZeroDivisionError``.
    """
    try:
        ratio = N / df
    except ZeroDivisionError:
        return 0
    return math.log(ratio)

def calculate_tf_idf(tf, idf):
    """Return the log-scaled tf-idf weight ``(1 + log(tf)) * idf``.

    Args:
        tf: Term frequency (or weighted term frequency) of the term in a document.
        idf: Inverse document frequency of the term.

    Returns:
        The product of the log-scaled term frequency and ``idf``.
    """
    logScaledTf = 1 + math.log(tf)
    return logScaledTf * idf
    
def writeIndex(fpDict, docIdDict):
    """Write the merged tf-idf index to ``Data/Index.txt``, one line per term.

    Each line has the form ``term>{json}``. The JSON object holds ``'idf'``,
    ``'cList'`` (doc ids of up to 10 documents with the highest tf-idf for the
    term, in heap order rather than sorted) and one ``docId: tf_idf`` entry
    per document, with tf-idf rounded to 5 decimals.

    The file is opened in append mode, so existing content is kept and
    calling this twice writes every entry twice.

    Args:
        fpDict: Mapping ``term -> {posting file number: line offset}`` as
            produced by ``getFpDict``.
        docIdDict: Currently unused; kept for interface compatibility.
    """
    # Open the output once rather than once per term.
    with open(f'Data/Index.txt', "a") as fp:
        for term in fpDict:
            termInfo = getTermInfo(term, fpDict)
            idf = calculate_idf(termInfo['df'])
            termInfoDict = {'idf': idf, 'cList': []}

            # Min-heap of (tf_idf, docId) holding the best 10 seen so far;
            # cList[0] is always the weakest of the kept entries.
            cList = []
            for docIdStr, tf in termInfo['wTf'].items():
                tf_idf = round(calculate_tf_idf(tf, idf), 5)
                termInfoDict[docIdStr] = tf_idf
                if len(cList) < 10:
                    heappush(cList, (tf_idf, docIdStr))
                elif tf_idf > cList[0][0]:
                    # Strictly better than the weakest kept entry: swap it out.
                    heappop(cList)
                    heappush(cList, (tf_idf, docIdStr))

            termInfoDict['cList'] = [keptDocId for _, keptDocId in cList]
            fp.write(f'{term}>{json.dumps(termInfoDict)}\n')

def getFpIndex():
    """Build a term -> line offset lookup table for ``Data/Index.txt``.

    For every line, the text before the first ``>`` is taken as the term and
    mapped to the ``tell()`` position of the start of that line. If a term
    occurs on several lines, the offset of the last one wins.

    Returns:
        dict mapping each term to its line offset.
    """
    fpDict = {}
    with open(f'Data/Index.txt', "r") as fp:
        while True:
            # Capture the position before reading so it marks the line start.
            fpPos = fp.tell()
            line = fp.readline()
            if not line:
                # readline() returns '' only at end of file. This avoids
                # comparing the text-mode tell() cookie with the byte size
                # and calling fstat() once per line.
                break
            term = line.split('>')[0]
            fpDict[term] = fpPos
    return fpDict

In [2]:
# Build the docId -> line-offset lookup by scanning Data/docId.txt.
idDict = getFpIdDict()

In [3]:
# Record, for every term, the line offsets in posting files Data/Posting1.txt .. Data/Posting5.txt.
fpDict = getFpDict(5)

In [4]:
# Merge the partial postings and write one line per term to Data/Index.txt.
# NOTE: Index.txt is opened in append mode, so re-running this cell appends
# the same entries again instead of replacing the file.
writeIndex(fpDict, idDict)

In [5]:
# Record where each term's line starts in Data/Index.txt.
fpIndex = getFpIndex()

In [6]:
import pandas as pd
import pyarrow.feather as feather

def writeFeather(fileName:str, colName:str, dictObj: dict):
    """Save a lookup dict as a single-column Feather file under ``Data/``.

    Args:
        fileName: Base name of the output file; written to
            ``Data/{fileName}.feather``.
        colName: Name of the single column holding the dict's values.
        dictObj: Mapping to persist; it is passed to
            ``pd.DataFrame.from_dict`` nested under ``colName``.
    """
    frame = pd.DataFrame.from_dict({colName: dictObj})
    outPath = f'Data/{fileName}.feather'
    feather.write_feather(frame, outPath)

# Persist both offset tables as Feather files (Data/indexFp.feather and
# Data/docIdFp.feather), each with a single 'fp' column.
writeFeather('indexFp', 'fp', fpIndex)
writeFeather('docIdFp', 'fp', idDict)

In [7]:
import pyarrow.feather as feather
import json

def getFpDataframe(fileName:str):
    """Load ``Data/{fileName}.feather`` and return what ``feather.read_feather`` yields.

    Args:
        fileName: Base name of the Feather file (without directory or extension).

    Returns:
        The object returned by ``feather.read_feather`` for that file.
    """
    featherPath = f'Data/{fileName}.feather'
    with open(featherPath, 'rb') as handle:
        return feather.read_feather(handle)

def getTermData(term:str, fpDf):
    """Fetch a term's index record from ``Data/Index.txt``.

    Args:
        term: Term to look up.
        fpDf: Object whose ``'fp'`` entry supports ``in`` and lookup by term
            and yields the line offset of the term in ``Data/Index.txt``
            (e.g. the table loaded by ``getFpDataframe('indexFp')``).

    Returns:
        The decoded JSON record for the term, or ``None`` if the term is not
        present in ``fpDf['fp']``.
    """
    if term not in fpDf['fp']:
        return None
    with open(f'Data/Index.txt', "r") as indexFile:
        indexFile.seek(fpDf['fp'][term])
        # Split only on the first '>' so a '>' inside the JSON payload does
        # not truncate it.
        payload = indexFile.readline().strip().split('>', 1)[1]
    return json.loads(payload)

def getDocData(docId:int, fpDf):
    """Fetch a document's record from ``Data/docId.txt``.

    Args:
        docId: Document id to look up.
        fpDf: Object whose ``'fp'`` entry supports ``in`` and lookup by
            document id and yields the line offset of that document in
            ``Data/docId.txt`` (e.g. the table loaded by
            ``getFpDataframe('docIdFp')``).

    Returns:
        The decoded JSON record for the document, or ``None`` if the id is
        not present in ``fpDf['fp']``.
    """
    if docId not in fpDf['fp']:
        return None
    with open(f'Data/docId.txt', "r") as indexFile:
        indexFile.seek(fpDf['fp'][docId])
        # Split only on the first '>': the JSON payload (notably the URL) may
        # itself contain '>' and must not be truncated.
        payload = indexFile.readline().strip().split('>', 1)[1]
    return json.loads(payload)

In [8]:
# Reload the persisted offset tables from Data/indexFp.feather and Data/docIdFp.feather.
termFpDf = getFpDataframe('indexFp')
docIdFpDf = getFpDataframe('docIdFp')

In [9]:
# Smoke check: look up the index record for the term 'hello'.
print(getTermData('hello', termFpDf))

{'idf': 4.64348848602763, 'cList': ['4425', '29326', '26035', '29803', '48088', '33471', '45286', '34577', '33210', '47491'], '60': 4.64349, '4382': 28.23893, '4383': 22.90088, '4390': 28.23893, '4396': 22.90088, '4400': 22.90088, '4402': 28.23893, '4403': 22.90088, '4406': 28.23893, '4410': 22.90088, '4411': 28.23893, '4416': 31.75092, '4420': 28.23893, '4425': 31.75092, '4426': 22.90088, '4432': 22.90088, '4996': 4.64349, '5291': 4.64349, '5296': 4.64349, '5327': 4.64349, '5345': 4.64349, '5425': 4.64349, '5461': 4.64349, '5566': 4.64349, '6099': 4.64349, '6143': 4.64349, '6362': 4.64349, '7199': 4.64349, '7464': 4.64349, '7832': 4.64349, '7946': 12.9635, '7985': 7.86211, '8009': 12.9635, '8010': 4.64349, '8024': 4.64349, '8051': 4.64349, '8053': 4.64349, '8104': 7.86211, '8194': 12.9635, '8203': 4.64349, '8245': 12.9635, '8276': 12.9635, '8298': 14.29935, '8324': 4.64349, '8335': 4.64349, '8353': 7.86211, '8362': 4.64349, '8369': 4.64349, '8371': 4.64349, '8389': 4.64349, '8456': 4.

In [10]:
# Smoke check: look up the stored record (url, docLen) for document id 1.
print(getDocData(1, docIdFpDf))

{'url': 'https://aiclub.ics.uci.edu/', 'docLen': 390}
