In [1]:
import pandas as pd
from pymongo import MongoClient
import math
import numpy as np

In [2]:
# MongoClient() is called with no arguments, i.e. with the driver's default
# connection settings.
client = MongoClient()

# Handle to the "search_engine" database. The functions in this notebook read
# their collections through this module-level `db` name.
db = client.search_engine

In [3]:
# Execute utilities.py inside this notebook's own namespace (-i), so the names
# it defines become available to the cells below.
# NOTE(review): `stem_file`, called by run() further down, is not defined in
# this notebook -- presumably it is provided by this script; confirm.
%run -i "utilities.py"

In [4]:
def _tf_idf_by_term(collection, field, terms, no_docs):
    """Score every record of `collection` whose `field` value is in `terms`.

    Records are read with ``_id`` projected out. The code relies on each
    record having a ``term_frequency`` value; callers additionally group the
    result by ``filename``.

    Returns a DataFrame with `field`, the remaining record columns and a
    ``tf_idf`` column. When nothing matches, an empty frame with the columns
    ``[field, "filename", "tf_idf"]`` is returned instead of raising.
    """
    records = pd.DataFrame(list(collection.find({field: {"$in": list(terms)}}, {"_id": 0})))
    if records.empty:
        # Explicit dtypes keep tf_idf numeric so downstream sums/merges behave.
        return pd.DataFrame({
            field: pd.Series(dtype="object"),
            "filename": pd.Series(dtype="object"),
            "tf_idf": pd.Series(dtype="float64"),
        })

    # Number of matching records per term, used as the IDF denominator.
    doc_freq = records[field].value_counts().rename_axis(field).reset_index(name="no_data")
    # math.log (rather than np.log) keeps raising on a zero document count
    # instead of silently producing -inf.
    doc_freq["idf"] = doc_freq["no_data"].map(lambda n: math.log(no_docs / n))

    scored = doc_freq.merge(records, on=field)
    scored["tf_idf"] = scored["term_frequency"] * scored["idf"]
    return scored.drop(columns=["idf", "term_frequency", "no_data"])


def get_matching_docs(words, stems):
    """Fetch tf-idf scores for the query's stems and surface words.

    Parameters
    ----------
    words : list
        Values matched against the ``word`` field of ``db.word_frequency_logs``.
    stems : list
        Values matched against the ``stem`` field of ``db.stem_frequency_logs``.

    Returns
    -------
    (stem_scores, word_scores) : tuple of DataFrame
        One row per matching record, with the term column (``stem`` / ``word``),
        the record's other stored columns and ``tf_idf``
        (``term_frequency * log(no_docs / records_for_term)``). Either frame is
        empty (but has its columns) when nothing matches.
    """
    # Ask the server for the document count instead of loading every document
    # into a DataFrame just to count rows.
    no_docs = db.updated_logs.count_documents({})

    stem_scores = _tf_idf_by_term(db.stem_frequency_logs, "stem", stems, no_docs)
    word_scores = _tf_idf_by_term(db.word_frequency_logs, "word", words, no_docs)
    return stem_scores, word_scores

In [11]:
def get_scores(words, stems):
    """Combine stem-level and word-level tf-idf into one score per file.

    Calls ``get_matching_docs(words, stems)``, sums ``tf_idf`` per
    ``filename`` in each of the two returned frames, and adds the two totals.
    A file that appears in only one frame keeps that frame's total (the
    missing side counts as 0).

    Returns a DataFrame with the columns ``filename`` and ``tf_idf``.
    """
    stem_scores, word_scores = get_matching_docs(words, stems)

    # `.sum()` replaces `.agg(sum)`: handing the builtin `sum` to a groupby
    # aggregation is deprecated in recent pandas.
    stem_totals = stem_scores.groupby("filename")["tf_idf"].sum().reset_index()
    word_totals = word_scores.groupby("filename")["tf_idf"].sum().reset_index()

    # The outer join keeps files found by only one of the two lookups; their
    # missing total shows up as NaN and is treated as zero. The shared column
    # name gets pandas' default _x / _y suffixes.
    combined = stem_totals.merge(word_totals, how="outer", on="filename").fillna(0)
    combined["tf_idf"] = combined["tf_idf_x"] + combined["tf_idf_y"]
    return combined.drop(columns=["tf_idf_x", "tf_idf_y"])

In [18]:
def run(text):
    """Return the filenames matching `text`, highest combined tf-idf first.

    `stem_file` (defined outside this cell) is called on the query and yields
    three values, of which only the first -- the token sequence -- is used.
    Element 0 of each token is passed to `get_scores` as a word and element 1
    as a stem.
    """
    token_pairs, _unused_a, _unused_b = stem_file(text)

    # Two separate passes over the tokens, mirroring how words and stems are
    # extracted independently.
    surface_forms = list(map(lambda pair: pair[0], token_pairs))
    stemmed_forms = list(map(lambda pair: pair[1], token_pairs))

    ranked = get_scores(surface_forms, stemmed_forms).sort_values("tf_idf", ascending=False)
    return list(ranked["filename"])

In [19]:
# Example query (Devanagari text) pushed through the whole pipeline via run().
text = "विख्यात हैमिग्व आत्म"
files = run(text)
# Bare last expression so the notebook displays the ranked filename list.
files

['40.txt',
 '12.txt',
 '123.txt',
 '37.txt',
 '144.txt',
 '94.txt',
 '60.txt',
 '61.txt',
 '135.txt',
 '122.txt',
 '63.txt',
 '112.txt',
 '38.txt',
 '120.txt',
 '117.txt',
 '121.txt',
 '34.txt',
 '124.txt',
 '155.txt',
 '52.txt',
 '36.txt',
 '31.txt',
 '154.txt',
 '134.txt']