In [1]:
import os
import json
import math
from heapq import heapify, heappush, heappop

def getFpDict(count):
    """Build a term -> {posting file number: line offset} lookup table.

    Scans ``Data/Posting1.txt`` through ``Data/Posting{count}.txt``. The text
    before the first ``-`` on each non-blank line is taken as the term, and
    the value of ``fp.tell()`` at the start of that line is stored so the
    line can be revisited later with ``fp.seek()``.

    Args:
        count: Number of posting files to scan (files are numbered from 1).

    Returns:
        dict mapping each term to a dict ``{file number: offset}``.

    Raises:
        OSError: if one of the posting files cannot be opened.
    """
    fpDict = {}
    for i in range(1, count + 1):
        with open(f'Data/Posting{i}.txt', "r") as fp:
            while True:
                # Take the offset *before* reading the line. Using readline()
                # (not file iteration) keeps tell() usable in text mode.
                fpPos = fp.tell()
                line = fp.readline()
                if not line:
                    # readline() returns '' only at end of file; this avoids
                    # comparing tell() with the byte size on every line.
                    break
                if not line.strip():
                    # A blank line carries no term; do not register '\n'.
                    continue
                term = line.split('-', 1)[0]
                fpDict.setdefault(term, {})[i] = fpPos
    return fpDict

def getTermInfo(term:str, fpDict:dict):
    """Load and merge a term's posting records from every posting file.

    Args:
        term: Term to look up.
        fpDict: Mapping ``term -> {posting file number: line offset}`` as
            produced by ``getFpDict``.

    Returns:
        The decoded JSON record for ``term``, or ``None`` when the term is not
        in ``fpDict``. When the term occurs in several posting files, the
        ``df`` values are summed and the ``wTf`` dicts are merged.

    Raises:
        OSError: if a posting file cannot be opened.
        IndexError: if the line at the stored offset contains no ``-``.
        json.JSONDecodeError: if the text after the first ``-`` is not JSON.
    """
    if term not in fpDict:
        return None

    termInfo = None
    for count, fpPos in fpDict[term].items():
        with open(f'Data/Posting{count}.txt', "r") as fp:
            fp.seek(fpPos)
            # Split on the FIRST '-' only: the JSON payload itself can contain
            # '-' (json.dumps writes small floats as e.g. 1e-05), and a plain
            # split('-') would cut the payload short.
            payload = fp.readline().strip().split('-', 1)[1]
        tInfo = json.loads(payload)
        if not termInfo:
            termInfo = tInfo
        else:
            # Merge this file's partial record into the accumulated one.
            termInfo['df'] += tInfo['df']
            termInfo['wTf'].update(tInfo['wTf'])
    return termInfo

def getFpIdDict():
    """Build a docId -> line offset lookup table for ``Data/docId.txt``.

    The text before the first ``>`` on each non-blank line is converted to an
    int and used as the key; the value is ``fp.tell()`` at the start of that
    line, suitable for a later ``fp.seek()``.

    Returns:
        dict mapping int docId to the offset of its line.

    Raises:
        OSError: if ``Data/docId.txt`` cannot be opened.
        ValueError: if a non-blank line does not start with an integer id.
    """
    fpDict = {}
    with open('Data/docId.txt', "r") as fp:
        while True:
            # Take the offset before consuming the line; readline() keeps
            # tell() usable in text mode.
            fpPos = fp.tell()
            line = fp.readline()
            if not line:
                # '' means end of file; no per-line fstat() comparison needed.
                break
            if not line.strip():
                # Skip blank lines instead of failing in int('\n').
                continue
            docId = line.split('>', 1)[0]
            fpDict[int(docId)] = fpPos
    return fpDict

def getUrl(docId, fpDict) -> str:
    """Return the ``url`` field stored for ``docId`` in ``Data/docId.txt``.

    Args:
        docId: Key into ``fpDict``.
        fpDict: Mapping of docId to the offset of its line in
            ``Data/docId.txt``.

    Returns:
        The value of the ``url`` key of the JSON record on that line.

    Raises:
        KeyError: if ``docId`` is not in ``fpDict`` or the record has no
            ``url`` key.
        IndexError: if the line contains no ``>``.
        json.JSONDecodeError: if the text after the first ``>`` is not JSON.
    """
    with open('Data/docId.txt', "r") as fp:
        fp.seek(fpDict[docId])
        # Split on the FIRST '>' only: a URL may itself contain '>', and a
        # plain split('>') would truncate the JSON payload.
        payload = fp.readline().split('>', 1)[1]
    return json.loads(payload)['url']
    
def getDocLen(docId:int, fpDict) -> int:
    """Return the ``docLen`` field stored for ``docId`` in ``Data/docId.txt``.

    Args:
        docId: Key into ``fpDict``.
        fpDict: Mapping of docId to the offset of its line in
            ``Data/docId.txt``.

    Returns:
        The value of the ``docLen`` key of the JSON record on that line
        (a JSON number, not a string).

    Raises:
        KeyError: if ``docId`` is not in ``fpDict`` or the record has no
            ``docLen`` key.
        IndexError: if the line contains no ``>``.
        json.JSONDecodeError: if the text after the first ``>`` is not JSON.
    """
    with open('Data/docId.txt', "r") as fp:
        fp.seek(fpDict[docId])
        # Split on the FIRST '>' only: the record's url may contain '>', and a
        # plain split('>') would truncate the JSON payload.
        payload = fp.readline().split('>', 1)[1]
    return json.loads(payload)['docLen']
    
def calculate_idf(df, N = 55382):
    """Return ``log10(N / df)``, or ``0`` when the division by ``df`` fails.

    Args:
        df: Document frequency of a term.
        N: Numerator of the ratio; presumably the number of documents in the
            corpus (TODO confirm where 55382 comes from).

    Returns:
        The base-10 logarithm of ``N / df``; ``0`` if ``df`` is zero.
    """
    try:
        return math.log10(N / df)
    except ZeroDivisionError:
        # A zero document frequency yields a zero weight rather than an error.
        return 0

def calculate_tf_idf(tf, idf):
    """Return the log-scaled tf-idf weight ``(1 + log10(tf)) * idf``.

    Args:
        tf: Term frequency; must be positive, since ``math.log10`` raises
            ``ValueError`` for zero or negative input.
        idf: Inverse document frequency to scale by.
    """
    logTf = math.log10(tf)
    return (1 + logTf) * idf
    
def writeIndex(fpDict, docIdDict):
    """Append one ``term>json`` line per term to ``Data/Index.txt``.

    For every term in ``fpDict`` the merged posting record is loaded with
    ``getTermInfo``. An idf is computed from its ``df``, and a tf-idf score is
    computed for every docId in its ``wTf`` dict. The JSON written for the
    term contains:

    * ``idf``   - the term's idf,
    * ``cList`` - up to 50 docIds with the highest tf-idf scores, in heap
      order (not sorted by score),
    * one key per docId holding that document's tf-idf score.

    The file is opened in append mode, so calling this twice leaves two
    copies of every line in the file.
    NOTE(review): confirm whether append, rather than overwrite, is intended.

    Args:
        fpDict: Mapping ``term -> {posting file number: line offset}``.
        docIdDict: Not used by this function; kept for interface
            compatibility.
    """
    championLimit = 50

    # Open the output once for the whole run instead of once per term.
    with open('Data/Index.txt', "a") as indexFile:
        for term in fpDict:
            termInfo = getTermInfo(term, fpDict)
            idf = calculate_idf(termInfo['df'])
            termInfoDict = {'idf': idf, 'cList': []}

            # Min-heap of (tf_idf, docId): the root is always the weakest of
            # the candidates kept so far. An empty list is already a heap.
            cList = []
            for docIdStr, tf in termInfo['wTf'].items():
                tf_idf = calculate_tf_idf(tf, idf)
                termInfoDict[docIdStr] = tf_idf
                if len(cList) < championLimit:
                    heappush(cList, (tf_idf, docIdStr))
                elif tf_idf > cList[0][0]:
                    # Strictly better than the weakest kept entry: swap it.
                    heappop(cList)
                    heappush(cList, (tf_idf, docIdStr))

            termInfoDict['cList'].extend(docIdStr for _, docIdStr in cList)
            indexFile.write(f'{term}>{json.dumps(termInfoDict)}\n')

def getFpIndex():
    """Build a term -> line offset lookup table for ``Data/Index.txt``.

    The text before the first ``>`` on each non-blank line is the key; the
    value is ``fp.tell()`` at the start of that line, suitable for a later
    ``fp.seek()``. If a term appears on several lines, the last one wins.

    Returns:
        dict mapping each term to the offset of its line.

    Raises:
        OSError: if ``Data/Index.txt`` cannot be opened.
    """
    fpDict = {}
    with open('Data/Index.txt', "r") as fp:
        while True:
            # Take the offset before consuming the line; readline() keeps
            # tell() usable in text mode.
            fpPos = fp.tell()
            line = fp.readline()
            if not line:
                # '' means end of file; no per-line fstat() comparison needed.
                break
            if not line.strip():
                # A blank line carries no term; do not register '\n'.
                continue
            term = line.split('>', 1)[0]
            fpDict[term] = fpPos
    return fpDict

In [2]:
# Map every docId found in Data/docId.txt to the offset of its line.
idDict = getFpIdDict()

In [3]:
# Scan Data/Posting1.txt .. Data/Posting5.txt and record, per term, the offset
# of its line in each posting file.
fpDict = getFpDict(5)

In [4]:
# Write one line per term to Data/Index.txt. The file is opened in append
# mode, so re-running this cell adds another copy of every line.
writeIndex(fpDict, idDict)

In [5]:
# Map every term in Data/Index.txt to the offset of its line.
fpIndex = getFpIndex()

In [6]:
import pandas as pd
import pyarrow.feather as feather

def writeFeather(fileName:str, colName:str, dictObj: dict):
    """Save a dict as a one-column Feather file at ``Data/{fileName}.feather``.

    The keys of ``dictObj`` become the DataFrame index and its values fill the
    single column named ``colName``.

    Args:
        fileName: File name without directory or ``.feather`` extension.
        colName: Name of the column that holds the dict values.
        dictObj: Mapping to persist.
    """
    targetPath = f'Data/{fileName}.feather'
    frame = pd.DataFrame({colName: dictObj})
    feather.write_feather(frame, targetPath)

# Persist both offset tables so later cells can reload them from disk:
# Data/indexFp.feather holds term -> offset, Data/docIdFp.feather holds
# docId -> offset, each in a column named 'fp'.
writeFeather('indexFp', 'fp', fpIndex)
writeFeather('docIdFp', 'fp', idDict)

In [7]:
import pyarrow.feather as feather
import json

def getFpDataframe(fileName:str):
    """Load ``Data/{fileName}.feather`` and return it as a DataFrame.

    Args:
        fileName: File name without directory or ``.feather`` extension.

    Returns:
        Whatever ``feather.read_feather`` produces for that file.
    """
    with open(f'Data/{fileName}.feather', 'rb') as featherFile:
        return feather.read_feather(featherFile)

def getTermData(term:str, fpDf):
    """Fetch the decoded JSON record for ``term`` from ``Data/Index.txt``.

    Args:
        term: Term to look up.
        fpDf: DataFrame whose ``fp`` column is indexed by term and holds the
            offset of each term's line in ``Data/Index.txt``.

    Returns:
        The parsed JSON found after ``>`` on the term's line, or ``None`` when
        the term is not in the index of ``fpDf['fp']``.
    """
    offsets = fpDf['fp']
    # `in` on a pandas Series tests membership in its index (the terms).
    if term not in offsets:
        return None
    with open('Data/Index.txt', "r") as indexFile:
        indexFile.seek(offsets[term])
        record = indexFile.readline().strip()
    return json.loads(record.split('>')[1])

def getDocData(docId:int, fpDf):
    """Fetch the decoded JSON record for ``docId`` from ``Data/docId.txt``.

    Args:
        docId: Document id to look up.
        fpDf: DataFrame whose ``fp`` column is indexed by docId and holds the
            offset of each document's line in ``Data/docId.txt``.

    Returns:
        The parsed JSON found after the first ``>`` on the document's line,
        or ``None`` when ``docId`` is not in the index of ``fpDf['fp']``.

    Raises:
        OSError: if ``Data/docId.txt`` cannot be opened.
        IndexError: if the line contains no ``>``.
        json.JSONDecodeError: if the text after the first ``>`` is not JSON.
    """
    offsets = fpDf['fp']
    # `in` on a pandas Series tests membership in its index (the docIds).
    if docId not in offsets:
        return None
    with open('Data/docId.txt', "r") as docFile:
        # int() turns the numpy integer from the Series into a plain offset.
        docFile.seek(int(offsets[docId]))
        # Split on the FIRST '>' only: the record's url may contain '>', and a
        # plain split('>') would truncate the JSON payload.
        payload = docFile.readline().strip().split('>', 1)[1]
    return json.loads(payload)

In [8]:
# Reload the two offset tables from Data/indexFp.feather and
# Data/docIdFp.feather.
termFpDf = getFpDataframe('indexFp')
docIdFpDf = getFpDataframe('docIdFp')

In [9]:
# Spot check: look up one term's record through the reloaded offsets.
print(getTermData('hello', termFpDf))

{'idf': 2.0166414262630847, 'cList': ['20525', '20614', '20803', '21729', '21103', '43472', '40613', '45166', '32699', '32546', '24539', '44858', '4420', '20937', '4402', '4390', '43349', '25587', '42529', '30421', '47305', '22736', '34292', '41982', '50462', '46602', '29803', '4425', '47491', '44051', '27306', '48088', '33471', '29326', '26029', '4406', '38471', '29046', '45936', '4382', '45286', '22138', '34548', '4411', '34577', '20478', '33210', '4416', '26035', '53021'], '60': 2.0166414262630847, '4382': 6.467017908418235, '4383': 5.460198181633534, '4390': 6.467017908418235, '4396': 5.460198181633534, '4400': 5.460198181633534, '4402': 6.467017908418235, '4403': 5.460198181633534, '4406': 6.467017908418235, '4410': 5.460198181633534, '4411': 6.467017908418235, '4416': 7.129420576502549, '4420': 6.467017908418235, '4425': 7.129420576502549, '4426': 5.460198181633534, '4432': 5.460198181633534, '4996': 2.0166414262630847, '5291': 2.0166414262630847, '5296': 2.0166414262630847, '532

In [10]:
# Spot check: look up one document's record through the reloaded offsets.
print(getDocData(1, docIdFpDf))

{'url': 'https://aiclub.ics.uci.edu/', 'docLen': 9719}
