In [18]:
import os
import json
import math
from heapq import heapify, heappush, heappop

def getFpDict(count):
    """Build a term -> {posting file number -> line offset} lookup table.

    Scans ``Data/Posting1.txt`` .. ``Data/Posting{count}.txt`` line by line.
    Only the text before the first ``-`` on each line (the term) is inspected.

    Args:
        count: Number of posting files to scan; files are numbered from 1.

    Returns:
        dict mapping each term to ``{fileNumber: offset}``, where ``offset`` is
        the ``fp.tell()`` value taken at the start of the term's line and can be
        passed to ``fp.seek()`` on the same file later.
    """
    fpDict = {}
    for i in range(1, count + 1):
        with open(f'Data/Posting{i}.txt', "r") as fp:
            while True:
                # Remember where the line starts *before* consuming it.
                fpPos = fp.tell()
                line = fp.readline()
                if not line:
                    # readline() returns '' only at end of file; this avoids
                    # comparing the opaque text-mode tell() value to st_size.
                    break
                if not line.strip():
                    # Blank line: nothing to index (would otherwise add a '\n' term).
                    continue
                term = line.split('-', 1)[0]
                fpDict.setdefault(term, {})[i] = fpPos
    return fpDict

def getTermInfo(term:str, fpDict:dict):
    """Load and merge a term's posting records from every posting file.

    Args:
        term: The term to look up.
        fpDict: Mapping ``term -> {fileNumber: offset}`` as produced by
            ``getFpDict``; ``offset`` is where the term's line starts in
            ``Data/Posting{fileNumber}.txt``.

    Returns:
        ``None`` when ``term`` is not in ``fpDict``. Otherwise the JSON object
        parsed from the first posting line, with the ``df`` values summed, the
        ``wTf`` dicts merged and the ``docIds`` lists concatenated across all
        files listing the term.
    """
    termInfo = None
    if term not in fpDict:
        return termInfo
    for count, offset in fpDict[term].items():
        with open(f'Data/Posting{count}.txt', "r") as fp:
            fp.seek(offset)
            # Split on the FIRST '-' only: the JSON payload may itself contain
            # hyphens (e.g. negative numbers), which must not be cut off.
            payload = fp.readline().strip().split('-', 1)[1]
        tInfo = json.loads(payload)
        if not termInfo:
            termInfo = tInfo
        else:
            # Merge this file's partial posting into the accumulated record.
            termInfo['df'] += tInfo['df']
            termInfo['wTf'].update(tInfo['wTf'])
            termInfo['docIds'].extend(tInfo['docIds'])
    return termInfo

def getFpIdDict():
    """Build a docId -> line offset lookup table for ``Data/docId.txt``.

    Each non-blank line is expected to start with an integer docId followed by
    ``>``; the rest of the line is not inspected here.

    Returns:
        dict mapping ``int`` docId to the ``fp.tell()`` value at the start of
        that document's line.

    Raises:
        ValueError: if the text before the first ``>`` on a non-blank line is
            not an integer.
    """
    fpDict = {}
    with open(f'Data/docId.txt', "r") as fp:
        while True:
            # Remember where the line starts *before* consuming it.
            fpPos = fp.tell()
            line = fp.readline()
            if not line:
                # readline() returns '' only at end of file; this avoids
                # comparing the opaque text-mode tell() value to st_size.
                break
            if not line.strip():
                # Blank line: skip it rather than failing in int().
                continue
            docId = line.split('>', 1)[0]
            fpDict[int(docId)] = fpPos
    return fpDict

def getUrl(docId, fpDict) -> str:
    """Return the ``url`` field of a document's record in ``Data/docId.txt``.

    Args:
        docId: Key into ``fpDict``.
        fpDict: Mapping docId -> offset of that document's line in
            ``Data/docId.txt``.

    Returns:
        The ``url`` value of the JSON object stored after the first ``>`` on
        the document's line.

    Raises:
        KeyError: if ``docId`` is not in ``fpDict``.
    """
    with open(f'Data/docId.txt', "r") as fp:
        fp.seek(fpDict[docId])
        # Split on the FIRST '>' only: a URL inside the JSON may contain '>'.
        payload = fp.readline().split('>', 1)[1]
    return json.loads(payload)['url']
    
def getDocLen(docId:int, fpDict) -> int:
    """Return the ``docLen`` field of a document's record in ``Data/docId.txt``.

    Args:
        docId: Key into ``fpDict``.
        fpDict: Mapping docId -> offset of that document's line in
            ``Data/docId.txt``.

    Returns:
        The ``docLen`` value of the JSON object stored after the first ``>``
        on the document's line (a number, not a string).

    Raises:
        KeyError: if ``docId`` is not in ``fpDict``.
    """
    with open(f'Data/docId.txt', "r") as fp:
        fp.seek(fpDict[docId])
        # Split on the FIRST '>' only: a URL inside the JSON may contain '>'.
        payload = fp.readline().split('>', 1)[1]
    return json.loads(payload)['docLen']
    
def calculate_idf(df, N = 55382):
    """Return the natural log of ``N / df``, or 0 when ``df`` is zero.

    Args:
        df: Document frequency of a term.
        N: Collection size used as the numerator (defaults to 55382).
    """
    try:
        return math.log(N / df)
    except ZeroDivisionError:
        # A term with no documents gets a neutral weight instead of an error.
        return 0
    
def writeIndex(fpDict, docIdDict):
    """Append one scored index line per term to ``Data/Index.txt``.

    For every term in ``fpDict`` the merged posting record is loaded, an idf is
    computed from its ``df``, and each document's score is
    ``round(wTf / docLen * idf, 5)``. Each output line has the form
    ``term>{json}`` where the JSON object holds ``idf``, ``cList`` (the docIds
    of up to 10 highest-scoring documents, in heap order rather than sorted)
    and one ``docId: score`` entry per document.

    Args:
        fpDict: Mapping ``term -> {fileNumber: offset}`` into the posting files.
        docIdDict: Mapping ``docId -> offset`` into ``Data/docId.txt``.

    Note:
        The file is opened in append mode, so calling this twice for the same
        terms produces duplicate lines.
    """
    if not fpDict:
        # Nothing to write; do not create an empty index file.
        return

    # Document lengths are reused by many terms; read each from disk only once.
    docLenCache = {}

    # Open the output once instead of re-opening it for every term.
    with open(f'Data/Index.txt', "a") as fp:
        for term in fpDict:
            termInfo = getTermInfo(term, fpDict)
            idf = calculate_idf(termInfo['df'])
            termInfoDict = {'idf': idf, 'cList': []}
            # Min-heap of (score, docId) holding the best 10 seen so far.
            cList = []

            for docIdStr, wTf in termInfo['wTf'].items():
                docId = int(docIdStr)
                if docId not in docLenCache:
                    docLenCache[docId] = getDocLen(docId, docIdDict)
                tf_idf = round((wTf / docLenCache[docId]) * idf, 5)
                termInfoDict[docIdStr] = tf_idf
                if len(cList) < 10:
                    heappush(cList, (tf_idf, docIdStr))
                elif tf_idf > cList[0][0]:
                    # Strictly better than the current weakest entry: replace it.
                    heappop(cList)
                    heappush(cList, (tf_idf, docIdStr))

            termInfoDict['cList'].extend(tup[1] for tup in cList)
            fp.write(f'{term}>{json.dumps(termInfoDict)}\n')

def getFpIndex():
    """Build a term -> line offset lookup table for ``Data/Index.txt``.

    Only the text before the first ``>`` on each non-blank line (the term) is
    inspected.

    Returns:
        dict mapping each term to the ``fp.tell()`` value at the start of its
        line. If a term occurs on several lines, the last one wins.
    """
    fpDict = {}
    with open(f'Data/Index.txt', "r") as fp:
        while True:
            # Remember where the line starts *before* consuming it.
            fpPos = fp.tell()
            line = fp.readline()
            if not line:
                # readline() returns '' only at end of file; this avoids
                # comparing the opaque text-mode tell() value to st_size.
                break
            if not line.strip():
                # Blank line: nothing to index (would otherwise add a '\n' term).
                continue
            term = line.split('>', 1)[0]
            fpDict[term] = fpPos
    return fpDict

In [2]:
# Map every docId in Data/docId.txt to the file offset of its record.
idDict = getFpIdDict()

In [3]:
# Locate every term in Data/Posting1.txt .. Data/Posting5.txt, then merge the
# postings and write one scored line per term to Data/Index.txt.
# NOTE: writeIndex opens Data/Index.txt in append mode, so re-running this cell
# without removing the file first adds duplicate term lines.
fpDict = getFpDict(5)
writeIndex(fpDict, idDict)

In [19]:
# Map every term in Data/Index.txt to the file offset of its line.
fpIndex = getFpIndex()

In [20]:
import pandas as pd
import pyarrow.feather as feather

def writeFeather(fileName:str, colName:str, dictObj: dict):
    """Save ``dictObj`` as a one-column table in ``Data/{fileName}.feather``.

    Args:
        fileName: Output file name without directory or extension.
        colName: Name of the single column holding the dict's values.
        dictObj: Mapping to store; it is passed to ``DataFrame.from_dict`` as
            the contents of column ``colName``.
    """
    outPath = f'Data/{fileName}.feather'
    frame = pd.DataFrame.from_dict({colName: dictObj})
    feather.write_feather(frame, outPath)

# Save both offset tables as Feather files (Data/indexFp.feather and
# Data/docIdFp.feather), each with a single 'fp' column.
writeFeather('indexFp', 'fp', fpIndex)
writeFeather('docIdFp', 'fp', idDict)

In [46]:
import pyarrow.feather as feather
import json

def getFpDataframe(fileName:str):
    """Read ``Data/{fileName}.feather`` and return what ``read_feather`` yields.

    Args:
        fileName: File name without directory or extension.
    """
    featherPath = f'Data/{fileName}.feather'
    with open(featherPath, 'rb') as source:
        return feather.read_feather(source)

def getTermData(term:str, fpDf):
    """Fetch the parsed index record for ``term`` from ``Data/Index.txt``.

    Args:
        term: The term to look up.
        fpDf: Object whose ``'fp'`` entry maps terms to line offsets in
            ``Data/Index.txt``.

    Returns:
        The JSON object stored after ``>`` on the term's line, or ``None``
        when the term has no offset entry.
    """
    offsets = fpDf['fp']
    if term in offsets:
        with open(f'Data/Index.txt', "r") as handle:
            handle.seek(offsets[term])
            record = handle.readline().strip()
            fields = record.split('>')
            return json.loads(fields[1])
    return None

def getDocData(docId:int, fpDf):
    """Fetch the parsed record for ``docId`` from ``Data/docId.txt``.

    Args:
        docId: The document id to look up.
        fpDf: Object whose ``'fp'`` entry maps docIds to line offsets in
            ``Data/docId.txt``.

    Returns:
        The JSON object stored after the first ``>`` on the document's line,
        or ``None`` when the docId has no offset entry.
    """
    offsets = fpDf['fp']
    if docId not in offsets:
        return None
    with open(f'Data/docId.txt', "r") as docFile:
        docFile.seek(offsets[docId])
        # Split on the FIRST '>' only: a URL inside the JSON may contain '>'.
        payload = docFile.readline().strip().split('>', 1)[1]
    return json.loads(payload)

In [22]:
# Load the saved offset tables back from Data/indexFp.feather and
# Data/docIdFp.feather.
termFpDf = getFpDataframe('indexFp')
docIdFpDf = getFpDataframe('docIdFp')

In [42]:
# Look up and print the stored index record for the term 'hello'.
print(getTermData('hello', termFpDf))

{'idf': 4.64348848602763, 'cList': ['4390', '4382', '4402', '4406', '43349', '4425', '4420', '4416', '34548', '53021'], '60': 0.03656, '4382': 9.58464, '4383': 1.57879, '4390': 6.50088, '4396': 1.52786, '4400': 1.47092, '4402': 6.55791, '4403': 1.31566, '4406': 9.96802, '4410': 1.56833, '4411': 3.701, '4416': 11.88594, '4420': 10.10272, '4425': 11.62567, '4426': 1.48011, '4432': 1.63323, '4996': 0.01386, '5291': 0.00392, '5296': 0.01036, '5327': 0.00392, '5345': 0.01039, '5425': 0.00093, '5461': 0.00555, '5566': 0.00539, '6099': 0.00092, '6143': 0.01407, '6362': 0.00555, '7199': 0.00093, '7464': 0.00539, '7832': 0.01407, '7946': 0.02379, '7985': 0.00235, '8009': 0.02324, '8010': 0.00218, '8024': 0.00356, '8051': 0.00169, '8053': 0.0015, '8104': 0.00231, '8194': 0.02408, '8203': 0.00171, '8245': 0.03518, '8276': 0.02347, '8298': 0.15478, '8324': 0.00152, '8335': 0.0032, '8353': 0.00425, '8362': 0.00361, '8369': 0.00165, '8371': 0.00215, '8389': 0.00185, '8456': 0.0015, '8489': 0.05495, 

In [50]:
# Look up and print the stored record for docId 1.
print(getDocData(1, docIdFpDf))

{'url': 'https://aiclub.ics.uci.edu/', 'docLen': 390}
