In [20]:
import os
import json
import math
from heapq import heapify, heappush, heappop

def getFpDict(count):
    """Map every term to the offsets of its lines in the partial posting files.

    Scans ``Data/Posting1.txt`` through ``Data/Posting{count}.txt``. Each line
    is expected to look like ``term-<payload>``; the text before the first
    ``-`` is taken as the term. Lines without a ``-`` (e.g. a trailing blank
    line) are skipped.

    Args:
        count: Number of posting files to scan; files are numbered from 1.

    Returns:
        ``{term: {file_number: offset}}`` where ``offset`` is the ``tell()``
        position of the start of that term's line in that file. If a term
        occurs more than once in one file, the last occurrence wins.
    """
    fpDict = {}
    for i in range(1, count + 1):
        # Read-only access is sufficient; "r+" needlessly required the file
        # to be writable.
        with open(f'Data/Posting{i}.txt', "r") as fp:
            while True:
                # Record the position *before* consuming the line so it can
                # later be handed to seek().
                fpPos = fp.tell()
                line = fp.readline()
                if not line:
                    # readline() returns '' only at end of file; this avoids
                    # comparing a text-mode tell() value with a byte size.
                    break
                term, sep, _ = line.partition('-')
                if not sep:
                    continue
                fpDict.setdefault(term, {})[i] = fpPos
    return fpDict

def getTermInfo(term:str, fpDict:dict):
    """Load and merge a term's posting records from all partial posting files.

    Args:
        term: The term to look up.
        fpDict: Mapping ``{term: {file_number: offset}}`` giving, for each
            posting file that contains the term, the seek offset of its line.

    Returns:
        ``None`` if ``term`` is not in ``fpDict``. Otherwise the JSON object
        stored after the first ``-`` on the term's line, merged across files:
        ``'df'`` values are summed, ``'wTf'`` dicts are merged with
        ``dict.update`` and ``'docIds'`` lists are concatenated.
    """
    termInfo = None
    if term not in fpDict:
        return termInfo
    for count, offset in fpDict[term].items():
        # Read-only access is sufficient for a lookup.
        with open(f'Data/Posting{count}.txt', "r") as fp:
            fp.seek(offset)
            # Split only on the first '-': the JSON payload itself may
            # contain hyphens (negative numbers, hyphenated strings) and
            # must not be truncated.
            payload = fp.readline().strip().split('-', 1)[1]
        tInfo = json.loads(payload)
        if not termInfo:
            termInfo = tInfo
        else:
            # Merge this file's partial record into the accumulated one.
            termInfo['df'] += tInfo['df']
            termInfo['wTf'].update(tInfo['wTf'])
            termInfo['docIds'].extend(tInfo['docIds'])
    return termInfo

def getFpIdDict():
    """Map each document id to the offset of its line in ``Data/docId.txt``.

    Each line is expected to look like ``<docId>><payload>``; the text before
    the first ``>`` is parsed as an integer document id. Blank lines are
    skipped.

    Returns:
        ``{docId: offset}`` where ``offset`` is the ``tell()`` position of the
        start of that document's line.

    Raises:
        ValueError: If the text before ``>`` on a non-blank line is not an
            integer.
    """
    fpDict = {}
    # Read-only access is sufficient; "r+" needlessly required write access.
    with open('Data/docId.txt', "r") as fp:
        while True:
            # Capture the position before the line is consumed.
            fpPos = fp.tell()
            line = fp.readline()
            if not line:
                # '' means end of file; no need to compare tell() with the
                # file's byte size.
                break
            if not line.strip():
                # A stray blank line would otherwise crash int().
                continue
            docId = line.split('>')[0]
            fpDict[int(docId)] = fpPos
    return fpDict

def getUrl(docId, fpDict) -> str:
    """Return the ``'url'`` field of a document's record in ``Data/docId.txt``.

    Args:
        docId: Key into ``fpDict`` identifying the document.
        fpDict: Mapping from document id to the seek offset of its line.

    Returns:
        The value stored under ``'url'`` in the JSON object that follows the
        first ``>`` on the document's line.

    Raises:
        KeyError: If ``docId`` is not in ``fpDict`` or the record has no
            ``'url'`` key.
    """
    # Read-only access is sufficient for a lookup.
    with open('Data/docId.txt', "r") as fp:
        fp.seek(fpDict[docId])
        # Split only on the first '>': the JSON payload (notably the URL)
        # may itself contain '>' characters.
        docInfo = json.loads(fp.readline().split('>', 1)[1])
        return docInfo['url']
    
def getDocLen(docId:int, fpDict) -> float:
    """Return the ``'docLen'`` field of a document's record in ``Data/docId.txt``.

    Args:
        docId: Key into ``fpDict`` identifying the document.
        fpDict: Mapping from document id to the seek offset of its line.

    Returns:
        The value stored under ``'docLen'`` in the JSON object that follows
        the first ``>`` on the document's line. ``writeIndex`` divides by
        this value, so it is expected to be numeric (the previous ``-> str``
        annotation was incorrect).

    Raises:
        KeyError: If ``docId`` is not in ``fpDict`` or the record has no
            ``'docLen'`` key.
    """
    # Read-only access is sufficient for a lookup.
    with open('Data/docId.txt', "r") as fp:
        fp.seek(fpDict[docId])
        # Split only on the first '>': other fields of the JSON payload
        # (e.g. a URL) may themselves contain '>' characters.
        docInfo = json.loads(fp.readline().split('>', 1)[1])
        return docInfo['docLen']
    
def calculate_idf(df, N = 55382):
    """Return the smoothed inverse document frequency of a term.

    Computes ``ln((N + 0.1) / (df + 0.1))``; the 0.1 added to both sides
    keeps the ratio defined when ``df`` is 0.

    Args:
        df: Number of documents containing the term.
        N: Total number of documents in the collection.
    """
    smoothed_total = N + 0.1
    smoothed_df = df + 0.1
    return math.log(smoothed_total / smoothed_df)
    
def writeIndex(fpDict, docIdDict):
    """Append one tf-idf index line per term to ``Data/Index.txt``.

    For every term in ``fpDict`` the merged posting record is loaded with
    ``getTermInfo``, its idf is computed with ``calculate_idf`` and a line
    ``term-<json>`` is appended to the index file. The JSON object holds:

    * ``'idf'``: the term's idf,
    * ``'cList'``: ids of up to 10 documents with the highest tf-idf for the
      term (in heap order, not sorted by score),
    * one entry per document id string giving that document's tf-idf,
      i.e. ``round(wTf / docLen * idf, 5)``.

    The file is opened in append mode, so existing content is kept.

    Args:
        fpDict: Mapping ``{term: {file_number: offset}}`` as accepted by
            ``getTermInfo``.
        docIdDict: Mapping from document id to seek offset as accepted by
            ``getDocLen``.
    """
    if not fpDict:
        # Nothing to write; avoid creating an empty index file.
        return

    # Document lengths are looked up once per document instead of re-reading
    # docId.txt for every (term, document) pair.
    docLenCache = {}

    # Open the output once rather than re-opening it for every term.
    with open('Data/Index.txt', "a") as fp:
        for term in fpDict:
            termInfo = getTermInfo(term, fpDict)
            idf = calculate_idf(termInfo['df'])
            termInfoDict = {'idf': idf, 'cList': []}
            # Min-heap of (tf_idf, docId) holding the 10 best documents seen
            # so far; the root is the weakest of them.
            cList = []

            for docIdStr, wTf in termInfo['wTf'].items():
                docId = int(docIdStr)
                if docId not in docLenCache:
                    docLenCache[docId] = getDocLen(docId, docIdDict)
                tf_idf = round((wTf / docLenCache[docId]) * idf, 5)
                termInfoDict[docIdStr] = tf_idf
                if len(cList) < 10:
                    heappush(cList, (tf_idf, docIdStr))
                elif tf_idf > cList[0][0]:
                    # Strictly better than the current weakest champion:
                    # evict it and insert the new document.
                    heappop(cList)
                    heappush(cList, (tf_idf, docIdStr))

            for tup in cList:
                termInfoDict['cList'].append(tup[1])

            fp.write(f'{term}-{json.dumps(termInfoDict)}\n')

In [9]:
# Build the docId -> line-offset lookup for Data/docId.txt.
idDict = getFpIdDict()

In [10]:
# Build the term -> {file number: line offset} lookup over Data/Posting1.txt .. Data/Posting5.txt.
fpDict = getFpDict(5)

In [21]:
# Write the tf-idf index. NOTE: Data/Index.txt is opened in append mode, so
# re-running this cell adds the entries again instead of replacing them.
writeIndex(fpDict, idDict)