In [1]:
import os
import json
import math
from heapq import heapify, heappush, heappop

def getFpDict(count):
    """Map every term to the offsets of its lines in the partial posting files.

    Scans ``Data/Posting1.txt`` .. ``Data/Posting{count}.txt``. The text
    before the first '-' on each line is taken as the term.

    Args:
        count: Number of partial posting files to scan (numbering is 1-based).

    Returns:
        dict: ``{term: {fileNumber: offset}}`` where ``offset`` is the value
        of ``fp.tell()`` at the start of the term's line, usable with
        ``fp.seek`` on the same file opened in text mode.
    """
    fpDict = {}
    for i in range(1, count + 1):
        with open(f'Data/Posting{i}.txt', "r") as fp:
            while True:
                # The offset has to be sampled before the line is consumed.
                fpPos = fp.tell()
                line = fp.readline()
                if not line:
                    # readline() returns '' only at EOF, so no per-line
                    # os.fstat() call is needed to detect the end of file.
                    break
                term = line.split('-', 1)[0]
                if not term.strip():
                    # Blank / term-less lines would otherwise register a
                    # bogus term that has no payload to load later.
                    continue
                fpDict.setdefault(term, {})[i] = fpPos
    return fpDict

def getTermInfo(term:str, fpDict:dict):
    """Load and merge a term's posting data from every partial posting file.

    Args:
        term: Term to look up.
        fpDict: ``{term: {fileNumber: offset}}`` mapping; each offset is passed
            to ``fp.seek`` on ``Data/Posting{fileNumber}.txt``.

    Returns:
        The merged posting dict (keys ``'df'``, ``'wTf'``, ``'docIds'`` are
        combined across files), or ``None`` when ``term`` is not in ``fpDict``.
    """
    termInfo = None
    if term not in fpDict:
        return termInfo
    for count, offset in fpDict[term].items():
        with open(f'Data/Posting{count}.txt', "r") as fp:
            fp.seek(offset)
            # Split on the FIRST '-' only: the JSON payload may itself contain
            # '-' (negative numbers, hyphenated strings) and must stay intact.
            payload = fp.readline().strip().split('-', 1)[1]
        tInfo = json.loads(payload)
        if not termInfo:
            termInfo = tInfo
        else:
            # Merge this file's partial posting into the accumulated one.
            termInfo['df'] += tInfo['df']
            termInfo['wTf'].update(tInfo['wTf'])
            termInfo['docIds'].extend(tInfo['docIds'])
    return termInfo

def getFpIdDict():
    """Map every document id to the offset of its line in ``Data/docId.txt``.

    The text before the first '>' on each line is parsed as the integer
    document id.

    Returns:
        dict: ``{docId (int): offset}`` where ``offset`` is the ``fp.tell()``
        value at the start of that document's line.

    Raises:
        ValueError: If a non-blank line does not start with an integer id.
    """
    fpDict = {}
    with open('Data/docId.txt', "r") as fp:
        while True:
            # The offset has to be sampled before the line is consumed.
            fpPos = fp.tell()
            line = fp.readline()
            if not line:
                # readline() returns '' only at EOF, so no per-line
                # os.fstat() call is needed to detect the end of file.
                break
            docId = line.split('>', 1)[0].strip()
            if not docId:
                # Blank lines would otherwise make int() raise ValueError.
                continue
            fpDict[int(docId)] = fpPos
    return fpDict

def getUrl(docId, fpDict) -> str:
    """Return the URL stored for ``docId`` in ``Data/docId.txt``.

    Args:
        docId: Key into ``fpDict``.
        fpDict: ``{docId: offset}`` mapping; the offset is passed to
            ``fp.seek`` to reach the document's line.

    Returns:
        The ``'url'`` field of the JSON payload on that line.

    Raises:
        KeyError: If ``docId`` is not in ``fpDict``.
    """
    with open('Data/docId.txt', "r") as fp:
        fp.seek(fpDict[docId])
        # Split on the FIRST '>' only: a URL inside the JSON payload may
        # legitimately contain '>' and must not be cut in half.
        payload = fp.readline().split('>', 1)[1]
    return json.loads(payload)['url']
    
def getDocLen(docId:int, fpDict) -> int:
    """Return the document length stored for ``docId`` in ``Data/docId.txt``.

    Args:
        docId: Key into ``fpDict``.
        fpDict: ``{docId: offset}`` mapping; the offset is passed to
            ``fp.seek`` to reach the document's line.

    Returns:
        The ``'docLen'`` field of the JSON payload on that line.

    Raises:
        KeyError: If ``docId`` is not in ``fpDict``.
    """
    with open('Data/docId.txt', "r") as fp:
        fp.seek(fpDict[docId])
        # Split on the FIRST '>' only: the JSON payload (it also carries the
        # URL) may contain '>' and must stay intact for json.loads.
        payload = fp.readline().split('>', 1)[1]
    return json.loads(payload)['docLen']
    
def calculate_idf(df, N = 55382):
    """Return the inverse document frequency ``log10(N / df)``.

    Args:
        df: Document frequency of the term.
        N: Collection size used as the numerator (defaults to 55382).

    Returns:
        ``log10(N / df)``, or ``0`` when the division raises
        ``ZeroDivisionError``.
    """
    try:
        return math.log10(N / df)
    except ZeroDivisionError:
        # A zero document frequency falls back to a zero idf.
        return 0

def calculate_tf_idf(tf, idf):
    """Return the log-scaled tf-idf weight ``(1 + log10(tf)) * idf``.

    Args:
        tf: Term frequency (or weighted term frequency) of the term in a
            document.
        idf: Inverse document frequency of the term.

    Returns:
        The tf-idf weight; ``0.0`` when ``tf`` is not positive, since the
        logarithm is undefined there and a term with no frequency should
        carry no weight.
    """
    if tf <= 0:
        # math.log10 raises ValueError for tf <= 0; treat it as "no weight"
        # rather than aborting a whole index build on a single posting.
        return 0.0
    return (1 + math.log10(tf)) * idf
    
def writeIndex(fpDict, docIdDict, championSize=50):
    """Merge the partial postings and write the scored index to ``Data/Index.txt``.

    For every term in ``fpDict`` one line ``term>{json}`` is written. The JSON
    object holds ``'idf'``, ``'cList'`` (doc ids of the highest-scoring
    documents, at most ``championSize`` of them, in heap order) and one
    ``docId: tf_idf`` entry per document containing the term.

    The file is opened once in write mode, so calling this function again
    rebuilds the index instead of appending a duplicate copy of every term.

    Args:
        fpDict: ``{term: {fileNumber: offset}}`` mapping consumed by
            ``getTermInfo``.
        docIdDict: Currently unused; kept so existing calls keep working.
            NOTE(review): presumably meant to supply the collection size for
            the idf computation -- confirm.
        championSize: Maximum number of doc ids kept in each term's
            ``'cList'``.
    """
    with open('Data/Index.txt', "w") as indexFile:
        for term in fpDict:
            termInfo = getTermInfo(term, fpDict)
            idf = calculate_idf(termInfo['df'])
            termInfoDict = {'idf': idf, 'cList': []}
            # Min-heap of (tf_idf, docId): the root is always the weakest
            # candidate currently kept in the champion list.
            cList = []

            for docIdStr, tf in termInfo['wTf'].items():
                tf_idf = calculate_tf_idf(tf, idf)
                termInfoDict[docIdStr] = tf_idf
                if len(cList) < championSize:
                    heappush(cList, (tf_idf, docIdStr))
                elif cList and tf_idf > cList[0][0]:
                    # Strictly better than the weakest kept entry: swap it out.
                    heappop(cList)
                    heappush(cList, (tf_idf, docIdStr))

            termInfoDict['cList'] = [docIdStr for _, docIdStr in cList]
            indexFile.write(f'{term}>{json.dumps(termInfoDict)}\n')

def getFpIndex():
    """Map every term in ``Data/Index.txt`` to the offset of its line.

    Returns:
        dict: ``{term: offset}`` where ``term`` is the text before the first
        '>' on a line and ``offset`` is the ``tell()`` value at the start of
        that line.
    """
    offsets = {}
    with open(f'Data/Index.txt', "r") as indexFile:
        while True:
            # Remember where the line starts before consuming it.
            start = indexFile.tell()
            row = indexFile.readline()
            if row == '':
                # An empty string from readline() means end of file.
                break
            offsets[row.split('>')[0]] = start
    return offsets

In [2]:
# Build the docId -> line-offset lookup for Data/docId.txt.
idDict = getFpIdDict()

In [3]:
# Collect term offsets across the five partial posting files
# Data/Posting1.txt .. Data/Posting5.txt.
fpDict = getFpDict(5)

In [4]:
# Merge the partial postings per term and write the scored index lines
# to Data/Index.txt.
writeIndex(fpDict, idDict)

In [11]:
# Map each indexed term to the offset of its line in Data/Index.txt.
fpIndex = getFpIndex()

In [12]:
import pandas as pd
import pyarrow.feather as feather

def writeFeather(fileName:str, colName:str, dictObj: dict):
    """Store a dict as a one-column feather file under ``Data/``.

    Args:
        fileName: Base name of the output; the file is written to
            ``Data/{fileName}.feather``.
        colName: Name of the single column that holds the dict's values
            (the dict's keys become the frame's index).
        dictObj: Mapping to persist.
    """
    target = f'Data/{fileName}.feather'
    frame = pd.DataFrame({colName: dictObj})
    feather.write_feather(frame, target)

# Persist both offset lookups as single-column ('fp') feather files:
# Data/indexFp.feather (term offsets) and Data/docIdFp.feather (docId offsets).
writeFeather('indexFp', 'fp', fpIndex)
writeFeather('docIdFp', 'fp', idDict)

In [13]:
import pyarrow.feather as feather
import json

def getFpDataframe(fileName:str):
    """Read ``Data/{fileName}.feather`` and return its contents.

    Args:
        fileName: Base name of the feather file (without directory or
            extension).

    Returns:
        Whatever ``feather.read_feather`` yields for that file.
    """
    with open(f'Data/{fileName}.feather', 'rb') as featherFile:
        return feather.read_feather(featherFile)

def getTermData(term:str, fpDf):
    """Fetch the index record of ``term`` from ``Data/Index.txt``.

    Args:
        term: Term to look up.
        fpDf: Frame whose ``'fp'`` column maps terms (as index labels) to
            line offsets in ``Data/Index.txt``.

    Returns:
        The decoded JSON object stored after '>' on the term's line, or
        ``None`` when the term is not among the ``'fp'`` labels.
    """
    offsets = fpDf['fp']
    if term not in offsets:
        return None
    with open(f'Data/Index.txt', "r") as indexFile:
        indexFile.seek(offsets[term])
        record = indexFile.readline().strip()
    return json.loads(record.split('>')[1])

def getDocData(docId:int, fpDf):
    """Fetch the stored record of ``docId`` from ``Data/docId.txt``.

    Args:
        docId: Document id to look up.
        fpDf: Frame whose ``'fp'`` column maps document ids (as index labels)
            to line offsets in ``Data/docId.txt``.

    Returns:
        The decoded JSON object stored after the first '>' on the document's
        line, or ``None`` when ``docId`` is not among the ``'fp'`` labels.
    """
    offsets = fpDf['fp']
    if docId not in offsets:
        return None
    with open('Data/docId.txt', "r") as docFile:
        docFile.seek(offsets[docId])
        # Split on the FIRST '>' only: the URL inside the JSON payload may
        # itself contain '>' and must stay intact for json.loads.
        payload = docFile.readline().strip().split('>', 1)[1]
    return json.loads(payload)

In [14]:
# Reload the persisted offset tables from Data/indexFp.feather and
# Data/docIdFp.feather.
termFpDf = getFpDataframe('indexFp')
docIdFpDf = getFpDataframe('docIdFp')

In [17]:
# Spot-check: print the full index record stored for the term 'hello'.
print(getTermData('hello', termFpDf))

{'idf': 4.64348848602763, 'cList': ['4390', '29326', '4402', '22992', '29244', '20478', '29205', '39364', '4406', '50761', '43349', '4416', '4420', '4425', '20937', '4382', '33471', '47540', '34548', '53021'], '60': -29.515312420732528, '4382': -8.74414851789642, '4383': -17.70627344266, '4390': -12.074397164848401, '4396': -17.754095579923824, '4400': -18.30297120542436, '4402': -12.010140127278483, '4403': -19.418575248747825, '4406': -8.15466228568343, '4410': -16.767800863452177, '4411': -16.461525416121905, '4416': -7.079827201035462, '4420': -7.690600971067065, '4425': -7.242810725466941, '4426': -16.900748706085473, '4432': -17.364940537929982, '4996': -30.81992146341055, '5291': -39.049124357722754, '5296': -32.37175956927809, '5327': -39.04950486237213, '5345': -32.37015643151453, '5425': -34.90785705560096, '5461': -26.600821005587246, '5566': -26.73764553276914, '6099': -34.92915962316669, '6143': -22.270438944002578, '6362': -26.600821005587246, '7199': -34.90785705560096, 

In [18]:
# Spot-check: print the record stored for document id 1.
print(getDocData(1, docIdFpDf))

{'url': 'https://aiclub.ics.uci.edu/', 'docLen': 9719}
