In [1]:
import os
import json

def getFpDict(count):
    """Build an in-memory "index of the index" for the posting files.

    Scans ``Data/Posting1.txt`` .. ``Data/Posting{count}.txt`` line by line
    and records, for every term, the offset at which that term's line starts
    in each posting file that contains it. Only the text before the first
    ``-`` of a line is used as the term.

    Args:
        count: Number of posting files to scan (files are numbered from 1).

    Returns:
        dict mapping ``term -> {posting_file_number: offset}`` where
        ``offset`` is the ``fp.tell()`` value taken at the start of the line.

    Raises:
        OSError: If a posting file cannot be opened.
    """
    fpDict = {}
    for i in range(1, count + 1):
        # Read-only access is sufficient; "r+" needlessly required write
        # permission on the index files.
        with open(f'Data/Posting{i}.txt', "r") as fp:
            while True:
                # Record the position *before* reading so it points at the
                # start of the line. readline() is used instead of iterating
                # the file because iteration disables tell() on text files.
                fpPos = fp.tell()
                line = fp.readline()
                if not line:
                    # readline() returns '' only at EOF; this replaces the
                    # per-line os.fstat() size comparison.
                    break
                if not line.strip():
                    # A blank line carries no term; do not index it.
                    continue
                term = line.split('-', 1)[0]
                fpDict.setdefault(term, {})[i] = fpPos
    return fpDict

def getTermInfo(term:str, fpDict:dict):
    """Load and merge the posting information of ``term`` from disk.

    For every posting file listed in ``fpDict[term]`` the line starting at the
    recorded offset is read. The text after the first ``-`` is parsed as JSON
    and the per-file results are merged into one dictionary.

    Args:
        term: The (already normalised) term to look up.
        fpDict: Mapping ``term -> {posting_file_number: offset}``.

    Returns:
        ``None`` if ``term`` is not in ``fpDict``; otherwise the merged dict,
        in which ``'df'`` values are summed, ``'tf'`` dicts are merged and
        ``'docIds'`` lists are concatenated. Any other keys keep the value
        from the first posting file read.

    Raises:
        OSError: If a posting file cannot be opened.
        json.JSONDecodeError: If a payload is not valid JSON.
    """
    if term not in fpDict:
        return None

    termInfo = None
    for count, offset in fpDict[term].items():
        # Read-only access is sufficient for a lookup.
        with open(f'Data/Posting{count}.txt', "r") as fp:
            fp.seek(offset)
            # Split on the FIRST '-' only: the JSON payload itself may contain
            # hyphens, which a plain split('-') would cut off.
            payload = fp.readline().strip().split('-', 1)[1]
        tInfo = json.loads(payload)
        if not termInfo:
            termInfo = tInfo
        else:
            # Merge this partition's statistics into the accumulated result.
            termInfo['df'] += tInfo['df']
            termInfo['tf'].update(tInfo['tf'])
            termInfo['docIds'].extend(tInfo['docIds'])
    return termInfo


def getFpIdDict():
    """Build a ``docId -> file offset`` lookup table for ``Data/docId.txt``.

    Each non-blank line is split on whitespace and its first field is parsed
    as the integer document id. The offset stored for that id is the
    ``fp.tell()`` value at the start of the line.

    Returns:
        dict mapping ``int docId -> offset``.

    Raises:
        OSError: If ``Data/docId.txt`` cannot be opened.
        ValueError: If the first field of a line is not an integer.
    """
    fpDict = {}
    # Read-only access is sufficient; "r+" needlessly required write permission.
    with open('Data/docId.txt', "r") as fp:
        while True:
            # Take the position before reading so it marks the line start.
            fpPos = fp.tell()
            line = fp.readline()
            if not line:
                # readline() returns '' only at EOF.
                break
            fields = line.split()
            if not fields:
                # Blank line (e.g. a trailing newline): previously this raised
                # IndexError on line[0].
                continue
            fpDict[int(fields[0])] = fpPos
    return fpDict

def getUrl(docId, fpDict) -> str:
    """Return the URL stored for ``docId`` in ``Data/docId.txt``.

    Args:
        docId: Document id; must be a key of ``fpDict``.
        fpDict: Mapping ``docId -> offset`` of that document's line.

    Returns:
        The second whitespace-separated field of the document's line.

    Raises:
        KeyError: If ``docId`` is not in ``fpDict``.
        OSError: If ``Data/docId.txt`` cannot be opened.
    """
    # Read-only access is sufficient for a lookup; "r+" required write permission.
    with open('Data/docId.txt', "r") as fp:
        fp.seek(fpDict[docId])
        return fp.readline().strip().split()[1]
    
def getDocLen(docId, fpDict) -> int:
    """Return the length stored for ``docId`` in ``Data/docId.txt``.

    Args:
        docId: Document id; must be a key of ``fpDict``.
        fpDict: Mapping ``docId -> offset`` of that document's line.

    Returns:
        The third whitespace-separated field of the document's line, parsed
        as an ``int`` (the annotation previously claimed ``str``).

    Raises:
        KeyError: If ``docId`` is not in ``fpDict``.
        OSError: If ``Data/docId.txt`` cannot be opened.
        ValueError: If the third field is not an integer.
    """
    # Read-only access is sufficient for a lookup; "r+" required write permission.
    with open('Data/docId.txt', "r") as fp:
        fp.seek(fpDict[docId])
        return int(fp.readline().strip().split()[2])

In [2]:
import math
# Compute the IDF of a token. The IDF is the same wherever the token occurs: it depends only on the token's document frequency (df) and the number of documents (N).
def calculate_idf(N, df):
    """Return the smoothed inverse document frequency ``ln((N + 0.1) / (df + 0.1))``.

    Args:
        N: Number of documents.
        df: Document frequency of the token.

    The 0.1 added to numerator and denominator keeps the ratio defined when
    ``df`` is 0.
    """
    smoothed_ratio = (N + 0.1) / (df + 0.1)
    return math.log(smoothed_ratio)

# Compute the TF-IDF weight from tf and idf.
def calculate_tfidf(tf, idf):
    """Return the TF-IDF weight, i.e. the product of ``tf`` and ``idf``."""
    weight = tf * idf
    return weight

# Compute the cosine similarity between the query vector and a document vector; supports vectors of any length.
def calculate_cosine_similarity(vector_query,vector_doc):
    """Return a damped cosine similarity of the two vectors.

    The result is ``dot(q, d) / (|q| * |d| + 0.1)``. Components are paired
    with ``zip``, so the dot product stops at the shorter vector.

    NOTE(review): the constant 0.1 in the denominator avoids division by zero
    for all-zero vectors, but it also means the value is not a true cosine
    (identical vectors score below 1). Confirm this damping is intended.
    """
    def _euclidean_norm(vector):
        return math.sqrt(sum(component * component for component in vector))

    numerator = sum(q * d for q, d in zip(vector_query, vector_doc))
    denominator = (_euclidean_norm(vector_query) * _euclidean_norm(vector_doc)) + 0.1
    return numerator / denominator

# Create the query vector. qWords is the list of query tokens, e.g. ["fox", "dog"]; idf_dict maps each token to its IDF, e.g. {"fox": <idf of fox>, "dog": <idf of dog>}.
def create_query_vector(qWords, idf_dict):
    """Return the TF-IDF vector of the query, one entry per query token.

    Args:
        qWords: List of query tokens (repeated tokens produce repeated entries).
        idf_dict: Mapping ``token -> idf``; must contain every query token.

    Returns:
        List with ``(occurrences of token / number of tokens) * idf`` for each
        position of ``qWords``.

    Raises:
        KeyError: If a token is missing from ``idf_dict``.
    """
    return [
        qWords.count(term) / len(qWords) * idf_dict[term]
        for term in qWords
    ]

#create doc vectors, like {'doc1': [0.2556430078148932, 0.10177675964835226], 'doc2': [0.0, 0.30533027894505677]}
def create_doc_vector(qWords, query_vector, inverted_index_tfs, idf_dict):
    """Build one TF-IDF vector per document, aligned with the query vector.

    Args:
        qWords: List of query tokens; position ``i`` corresponds to
            ``query_vector[i]``.
        query_vector: Query TF-IDF vector; only its length is used here.
        inverted_index_tfs: Mapping ``token -> {doc_id: tf}``.
        idf_dict: Mapping ``token -> idf``.

    Returns:
        dict ``doc_id -> list`` of length ``len(query_vector)``. Slot ``i``
        holds ``tf * idf`` of ``qWords[i]`` in that document, or 0 when the
        document has no posting for that token.
    """
    doc_vectors = {}
    dimension = len(query_vector)

    # enumerate() gives the slot of *this* occurrence. The previous
    # qWords.index(token) always returned the first occurrence, so repeated
    # query tokens left their later slots at 0 even though the query vector
    # carries a weight there.
    for query_index, token in enumerate(qWords):
        if token not in inverted_index_tfs:
            continue

        for doc_id, tf in inverted_index_tfs[token].items():
            if doc_id not in doc_vectors:
                doc_vectors[doc_id] = [0] * dimension
            doc_vectors[doc_id][query_index] = tf * idf_dict[token]

    return doc_vectors

# Compute the cosine similarity of every document and return {doc_id: cosine_similarity}, sorted by descending similarity (could be truncated to, e.g., the top 10).
def create_cs_doc(query_vector,doc_vectors):
    """Score every document against the query and rank the results.

    Args:
        query_vector: The query's vector.
        doc_vectors: Mapping ``doc_id -> document vector``.

    Returns:
        dict ``doc_id -> similarity`` whose insertion order is descending
        similarity; documents with equal scores keep their original order.
    """
    scored = [
        (doc_id, calculate_cosine_similarity(query_vector, doc_vector))
        for doc_id, doc_vector in doc_vectors.items()
    ]
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return dict(scored)


In [3]:
from nltk.stem import PorterStemmer
import re

def stemQuery(query:str) -> list:
    """Tokenise and stem a free-text query.

    The query is stripped and lower-cased, split on every character that is
    not ``a-z`` or ``0-9``, and each non-empty piece is passed through a
    ``PorterStemmer``.

    Args:
        query: Raw query string.

    Returns:
        List of stemmed tokens in query order (empty for a blank query).
    """
    stemmer = PorterStemmer()
    normalized = query.strip().lower()
    # Splitting an empty string yields [''], which the filter below drops, so
    # blank queries need no special case.
    return [
        stemmer.stem(piece)
        for piece in re.split('[^a-z0-9]', normalized)
        if piece != ''
    ]

def getDocIds(queryWords:list, fpDict) -> set[int]:
    """Return the ids of documents that contain every indexed query word.

    The posting ``docIds`` of each query word are intersected. Words for which
    ``getTermInfo`` returns nothing are skipped rather than forcing an empty
    result.

    Args:
        queryWords: List of (stemmed) query tokens.
        fpDict: Term offset index passed through to ``getTermInfo``.

    Returns:
        Set of document ids; empty if no document matches or no query word is
        indexed.
    """
    # None means "no indexed word seen yet". The previous code used an empty
    # set for that, so an intersection that had legitimately become empty was
    # overwritten by the next word's documents, returning documents that do
    # not contain all query words.
    matching = None
    for word in queryWords:
        termInfo = getTermInfo(word, fpDict)
        if not termInfo:
            continue
        docSet = set(termInfo['docIds'])
        matching = docSet if matching is None else matching & docSet
        if not matching:
            # An empty intersection can never grow again.
            break
    return matching if matching is not None else set()

def getIdfDict(queryList:list[str], fpDict, N = 55382) -> dict:
    """Compute the IDF of every distinct query word.

    Args:
        queryList: List of (stemmed) query tokens.
        fpDict: Term offset index passed through to ``getTermInfo``.
        N: Document count used in the IDF formula (default 55382 —
            presumably the size of the indexed corpus; confirm when the
            index is rebuilt).

    Returns:
        dict ``word -> idf``. A word that is not in the index is treated as
        having a document frequency of 0 instead of raising ``TypeError``,
        matching ``getDocIds``, which also tolerates unknown words.
    """
    idfDict = {}
    for word in queryList:
        if word in idfDict:
            # Repeated query word: avoid re-reading its postings from disk.
            continue
        termInfo = getTermInfo(word, fpDict)
        wordDf = termInfo['df'] if termInfo else 0
        idfDict[word] = calculate_idf(N, wordDf)
    return idfDict

def getInvertedTf(term:str, docIdList:list[int], fpDict, idDict) -> dict:
    """Return length-normalised term frequencies of ``term`` for given documents.

    Args:
        term: The (stemmed) term to look up.
        docIdList: Iterable of integer document ids to report on.
        fpDict: Term offset index passed through to ``getTermInfo``.
        idDict: Document offset index passed through to ``getDocLen``.

    Returns:
        ``{term: {'doc<ID>': tf / doc_length}}`` with one entry per document
        in ``docIdList`` that has a posting for ``term``. A term missing from
        the index yields ``{term: {}}`` instead of raising ``TypeError``.
    """
    termInfo = getTermInfo(term, fpDict)
    tfDict = termInfo['tf'] if termInfo else {}

    normalizedTf = {}
    for docId in docIdList:
        # The 'tf' dict comes from JSON, so its keys are strings.
        docIdStr = str(docId)
        if docIdStr in tfDict:
            normalizedTf[f'doc{docId}'] = tfDict[docIdStr] / getDocLen(docId, idDict)
    return {term: normalizedTf}

def getInvertedTfDict(queryList:list, docList:list[int], fpDict, idDict) -> dict:
    """Collect the per-term results of ``getInvertedTf`` for a whole query.

    Args:
        queryList: List of (stemmed) query tokens.
        docList: Document ids forwarded to ``getInvertedTf``.
        fpDict: Term offset index forwarded to ``getInvertedTf``.
        idDict: Document offset index forwarded to ``getInvertedTf``.

    Returns:
        A single dict holding every key/value pair returned by
        ``getInvertedTf`` for each query token; a later token overwrites an
        earlier entry with the same key.
    """
    combined = {}
    for queryTerm in queryList:
        perTerm = getInvertedTf(queryTerm, docList, fpDict, idDict)
        for key, postings in perTerm.items():
            combined[key] = postings
    return combined

In [4]:
# Build the "index of the index" once, before answering any query:
#   fpDict: term  -> {posting file number: offset of the term's line}
#   idDict: docId -> offset of the document's line in Data/docId.txt
POSTING_FILE_COUNT = 8  # number of Data/Posting{i}.txt files to scan
fpDict = getFpDict(POSTING_FILE_COUNT)
idDict = getFpIdDict()

In [5]:
import time
# End-to-end demo: rank the documents that match a sample query and time it.
# The sample values in the comments below come from an earlier run and may not
# match the current index.
# perf_counter() is monotonic and high-resolution; time.time() follows the
# wall clock and can jump if the system clock is adjusted.
start = time.perf_counter()
# NOTE: stemQuery("machine learning") was reported to be a very slow query.
qWords = stemQuery("fox dog and") # ['fox', 'dog', 'and']
docIds = getDocIds(qWords,fpDict) # e.g. {6047, 7852, 22732, ...}
idf_dict = getIdfDict(qWords,fpDict) # e.g. {'fox': 6.2297..., 'dog': 4.9454..., 'and': 0.5274...}
inverted_index_tfs = getInvertedTfDict(qWords, docIds, fpDict, idDict)
query_vector = create_query_vector(qWords,idf_dict) # e.g. [2.0766..., 1.6485..., 0.1758...]
doc_vectors = create_doc_vector(qWords, query_vector, inverted_index_tfs, idf_dict)
cs = create_cs_doc(query_vector,doc_vectors)
# Stop the clock before printing so console I/O is not part of the query time.
end = time.perf_counter()
print(cs)

print(round((end - start) * 1000),'ms') #Time in milliseconds

{'doc22705': 0.37207005090957934, 'doc25247': 0.10212813324619774, 'doc42161': 0.10212813324619774, 'doc32919': 0.10071046555461621, 'doc25210': 0.10071046555461621, 'doc37700': 0.0669805272287074, 'doc7852': 0.044648980187927165, 'doc6047': 0.044648980187927165, 'doc46604': 0.03909948276934318, 'doc49931': 0.004073502983292416}
146 ms
