In [1]:
import math

# compute idf of a token  *idfs are the same for the same token, only depend on the df and N in docs
def calculate_idf(N, df):
    idf = math.log((N + 0.1) / (df + 0.1))
    return idf

#comupte tfidf by tf and idf    
def calculate_tfidf(tf, idf):   
    return tf*idf

#compute cosine_similarity bewteen query vector and doc vector,support any lenth
def calculate_cosine_similarity(vector_query,vector_doc):
    dot_product = sum(q * d for q, d in zip(vector_query, vector_doc))
    norm_query = math.sqrt(sum(q * q for q in vector_query))
    norm_doc = math.sqrt(sum(d * d for d in vector_doc))

    return dot_product / ((norm_query * norm_doc)+0.1)

#create query vector, query is like "fox dog", idf_dict is like {"fox":idf of fox,"dog":idf of dog}
def create_query_vector(query, idf_dict):
    #tokenize the query using our tokenizor!!!
    words = query.split()
    query_vector = []

    # Calculate TF-IDF for each word in the query
    for word in words:
        tf = words.count(word) / len(words)
        idf = idf_dict.get(word, 0)# Get IDF from the IDF dictionary (assuming it's already computed, you can just store df and N, and use calculate_idf to compute idf)
        tf_idf = tf * idf
        query_vector.append(tf_idf)

    return query_vector

#create doc vectors, like {'doc1': [0.2556430078148932, 0.10177675964835226], 'doc2': [0.0, 0.30533027894505677]}
def create_doc_vector(query, query_vector, inverted_index_tfs, idf_dict):
    doc_vectors = {}

    # Iterate over tokens in the query
    # also replace with our tokenizors!!!
    for token in query.split():
        # Check if the token exists in the inverted index postings
        if token in inverted_index_tfs:
            postings = inverted_index_tfs[token]

            # Iterate over the document IDs and TF values in the postings
            for doc_id, tf in postings.items():
                # Check if the document ID already has a vector
                if doc_id not in doc_vectors:
                    doc_vectors[doc_id] = [0] * len(query_vector)

                # Set the TF-IDF value in the document vector based on the query vector index and IDF
                query_index = query.split().index(token)
                tfidf = tf * idf_dict[token]
                doc_vectors[doc_id][query_index] = tfidf

    return doc_vectors

#compute cosine_similarity and return dictionary of {docid:cosine_similarity},already sorted, can make changes for like top 10
def create_cs_doc(query_vector,doc_vectors):
    doc_similarities={}
    for docid,vetcor in doc_vectors.items():
        doc_similarities[docid]=calculate_cosine_similarity(query_vector,vetcor)
    doc_similarities = {k: v for k, v in sorted(doc_similarities.items(), key=lambda item: item[1], reverse=True)}
    return doc_similarities

In [2]:
import pandas as pd
import json
from nltk.stem import PorterStemmer
import re

def stemQuery(query:str) -> list:
    stemmer = PorterStemmer()
    queryList = list()
    line = query.strip()
    if line != '':
        for aToken in re.split('[^a-z0-9]', line.lower()):
            if (aToken != ''):
                token = stemmer.stem(aToken)
                queryList.append(token)
    return queryList

def getDf(term:str) -> int:
    df = pd.read_hdf('DevHDF5/TermMeta.hdf5', term[0])
    if term in df:
        return df[term]['df']
    return 0

def getTermDocId(term:str) -> int:
    df = pd.read_hdf('DevHDF5/TermMeta.hdf5', term[0])
    if term in df:
        return df[term]['docIds']
    return 0

def getDocLen(docId:int) -> int:
    def getDocKey(docId:int, N = 60000):
        for doc in range(0,N+1,5000):
            if docId <= doc:
                return str(doc)
        
    df = pd.read_hdf('DevHDF5/DocId.hdf5', getDocKey(docId))
    return df['docLen'][docId]

def getTf(term:str):
    df = pd.read_hdf('DevHDF5/Terms.hdf5', 'terms')
    if term in df['Terms']:
        return json.loads(df['Terms'][term])
    return {}

def getDocIds(queryList:list) -> set:
    unionSet = set()
    for word in queryList:
        docSet = getTermDocId(word)
        if len(unionSet) == 0:
            unionSet = docSet
        else:
            unionSet = unionSet.intersection(docSet)
    return unionSet

def getInvertedTf(term:str, docIdList:list[int]) -> dict:
    resultTf = {}
    resultTf[term] = {}
    tfDict = getTf(term)
    for docId in docIdList:
        docIdStr = str(docId)
        if docIdStr in tfDict:
            resultTf[term][f'doc{docId}'] = tfDict[docIdStr] / getDocLen(docId)
    return resultTf

def getIdfDict(queryList:list[str], N = 55382) -> dict:
    idfDict = {}
    for word in queryList:
        wordDf = getDf(word)
        idfDict[word] = calculate_idf(N, wordDf)
    return idfDict

def getInvertedTfDict(queryList:list, docList:list[int]) -> dict:
    resultTf = {}
    for word in queryList:
        resultTf.update(getInvertedTf(word,docList))
    return resultTf

In [4]:
qWords = stemQuery("fox dog and") # ['fox', 'dog', 'and']
docIds = getDocIds(qWords) #{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 ... }
idf_dict = getIdfDict(qWords) #{'fox': 7.007990707741848, 'dog': 5.623194474242979, 'and': 4.930297199969477}
inverted_index_tfs = getInvertedTfDict(qWords, docIds) #{'fox': {'doc1': 0.1111111111111111, 'doc2': 0, 'doc3': 0.3333333333333333}, 'dog': {'doc1': 0.1111111111111111, .... }

In [5]:
Q = "fox dog and"
N = 55382 #total docs
df_fox = 50 
df_dog = 200 
df_and = 400
idf_dict = {"fox":calculate_idf(N, df_fox),"dog":calculate_idf(N, df_dog),"and":calculate_idf(N, df_and)}#idf of each word

inverted_index_tfs = {"fox":{"doc1":(1/9),"doc2":0,"doc3":1/3},"dog":{"doc1":(1/9),"doc2":(1/3),"doc3":1/3},"and":{"doc1":(0/9),"doc2":(1/9),"doc3":1/3}}#tf of each word in different docs

query_vector = create_query_vector(Q,idf_dict)
doc_vectors = create_doc_vector(Q, query_vector, inverted_index_tfs, idf_dict)
cs = create_cs_doc(query_vector,doc_vectors)
#RESULT:{'doc3': 0.8729249902928188, 'doc1': 0.6923695754433984, 'doc2': 0.26863855424377914} the reason that score of the doc3 is not 1 becuase I add 0.1 to each for avoiding divide 0 errors
print(cs)

{'doc3': 0.9915046317016729, 'doc1': 0.8517176154476556, 'doc2': 0.6518100708682779}
