# IR - HW2 BM model
* This project works with a collection of:
    * 50 Queries  
    * 4191 Documents
* Our goal is to implement the BM (Best Match) family of ranking models:
    * BM1
    * BM15
    * BM11
    * BM25
    * BM25L

https://www.kaggle.com/t/7f84706b7b074267ae314582825fb725


In [1]:
import numpy as np
import pandas as pd
import math
import os

## Open files

In [2]:
def _read_term_files(list_path, directory, extension):
    """Map every name listed in ``list_path`` to the terms of its file.

    ``list_path`` holds one file name (without extension) per line. For each
    name, ``directory/name + extension`` is read and every line is split on
    single spaces; the resulting tokens are concatenated in file order.
    Because the split is on a single space, blank lines and repeated or
    trailing spaces produce empty-string tokens, which are kept.
    """
    terms_by_name = {}
    # Context managers guarantee the handles are released even when a listed
    # file is missing or unreadable.
    with open(list_path) as list_file:
        for line in list_file:
            name = line.strip("\n")
            if not name:
                # A blank line (typically a trailing newline) names no file.
                continue
            terms = []
            with open(os.path.join(directory, name + extension)) as term_file:
                for text_line in term_file:
                    terms.extend(text_line.strip("\n").split(" "))
            terms_by_name[name] = terms
    return terms_by_name


def open_files(path_query = "data/queries", path_docs = "data/docs", extension = ".txt",
               query_list = "data/query_list.txt", doc_list = "data/doc_list.txt"):
    """Load all queries and documents as lists of terms.

    Args:
        path_query: Directory containing one file per query.
        path_docs: Directory containing one file per document.
        extension: Suffix appended to each listed name to form the file name.
        query_list: File listing the query names, one per line.
        doc_list: File listing the document names, one per line.

    Returns:
        A ``(querys, docs)`` tuple of dicts, each mapping a name from the
        corresponding list file to the list of terms read from its file.

    Raises:
        OSError: If a list file or any listed file cannot be opened.
    """
    querys = _read_term_files(query_list, path_query, extension)
    docs = _read_term_files(doc_list, path_docs, extension)
    return querys, docs

## TF-IDF

In [3]:
def tf_idf(all_dict, sublinear=True):
    """Compute per-text term frequencies and collection-wide IDF weights.

    Args:
        all_dict: Mapping of text name -> list of terms.
        sublinear: When true, each raw count ``tf`` is replaced by
            ``1 + log(tf)``.

    Returns:
        A ``(tf, idf)`` tuple. ``tf[name][term]`` is the (optionally
        sublinear) frequency of ``term`` in text ``name``. ``idf[term]`` is
        ``log((N - n + 0.5) / (n + 0.5))`` where ``N`` is ``len(all_dict)``
        and ``n`` is the number of texts containing ``term``; it is negative
        for terms that occur in more than half of the texts.
    """
    tf = {}
    ni = {}
    for name, terms in all_dict.items():
        counts = {}
        for term in terms:
            counts[term] = counts.get(term, 0) + 1
        tf[name] = counts
        # Count each text at most once per term, however often the term
        # repeats inside it, so ni can never exceed the number of texts.
        for term in counts:
            ni[term] = ni.get(term, 0) + 1

    # IDF
    n_texts = len(all_dict)
    idf = {
        term: math.log((n_texts - times + 0.5) / (0.5 + times))
        for term, times in ni.items()
    }

    if sublinear:
        # sublinear_tf: replace tf with 1 + log(tf).
        for counts in tf.values():
            for term, raw_count in counts.items():
                counts[term] = 1 + math.log(raw_count)

    return tf, idf

## Load the data and compute TF / IDF

In [4]:
# Read the tokenised queries and documents from their default locations.
querys_dict, docs_dict = open_files()
# print(docs_dict["FBIS3-23"])

# Pool queries and documents into one name -> terms mapping. Query entries
# come first; if a name occurs in both, the document's terms replace the
# query's.
all_dict = {**querys_dict, **docs_dict}

In [5]:
# start = time.time()
# Keep raw term counts: sublinear scaling is switched off here.
tf_dict, idf_dict = tf_idf(all_dict, sublinear=False)
# print(f"Cost time: {time.time() - start}")

## BM25 model


In [6]:
# BM25 scoring function
def bm25(tf_q, tf_d, idf, avgDL, doc, b=0.75, k3=1000, k1=1.2):
    """Return the BM25 score of one document for one query.

    ``tf_q``, ``tf_d`` and ``idf`` are parallel sequences indexed by query
    term position: the term's frequency in the query, its frequency in the
    document, and its IDF weight.

    Args:
        tf_q: Query-side term frequencies; its length drives the loop.
        tf_d: Document-side term frequencies.
        idf: IDF weights.
        avgDL: Average document length used for length normalisation.
        doc: The document; only ``len(doc)`` is used.
        b: Strength of the document-length normalisation.
        k3: Saturation constant for the query term frequency.
        k1: Saturation constant for the document term frequency.

    Returns:
        The summed score as a float (``0.0`` when ``tf_q`` is empty).
    """
    score = 0.0
    for pos, q_freq in enumerate(tf_q):
        d_freq = tf_d[pos]
        # Length-normalised part of the denominator of the document factor.
        length_norm = k1 * ((1 - b) + b * len(doc) / avgDL)
        doc_factor = (k1 + 1) * d_freq / (length_norm + d_freq)
        # Same left-to-right evaluation order as the one-line formula.
        score += doc_factor * (k3 + 1) * q_freq / (k3 + q_freq) * idf[pos]

    return score

In [7]:
# BM25L scoring function
def bm25L(tf_q, tf_d, idf, avgDL, doc, b=0.75, k3=0.5, k1=1.2):
    """Return the BM25L score of one document for one query.

    Args:
        tf_q: Query-side term frequencies; only its length is used, to
            decide how many term positions are scored.
        tf_d: Document-side term frequencies, one per query term position.
        idf: IDF weights, one per query term position.
        avgDL: Average document length used for length normalisation.
        doc: The document; only ``len(doc)`` is used.
        b: Strength of the document-length normalisation.
        k3: Constant added to the length-normalised term frequency
            (presumably the delta shift of BM25L -- confirm against the
            reference formulation).
        k1: Saturation constant for the document term frequency.

    Returns:
        The summed score as a float (``0.0`` when ``tf_q`` is empty).
    """
    score = 0.0
    for pos, _ in enumerate(tf_q):
        # Term frequency divided by the document-length normaliser.
        normalised_tf = tf_d[pos] / (1 - b + b * len(doc) / avgDL)
        shifted_tf = normalised_tf + k3
        score += idf[pos] * (k1 + 1) * shifted_tf / (k1 + normalised_tf + k3)
    return score

In [8]:
# average docs length
def avgDocLength(docs):
    """Return the mean number of terms per document.

    Args:
        docs: Mapping of document name -> list of terms.

    Returns:
        The average length as a float, or ``0.0`` when ``docs`` is empty
        (there is nothing to average, and no document to score against it).
    """
    if not docs:
        # Avoid ZeroDivisionError on an empty collection.
        return 0.0
    return sum(len(terms) for terms in docs.values()) / len(docs)

In [9]:
avgdoclen = avgDocLength(docs_dict)

# sim_dict[query][doc] holds the BM25 score; sorted_sim_dict[query] holds the
# document names ordered from highest to lowest score.
sim_dict = {}
sorted_sim_dict = {}
for fqname, query_term in querys_dict.items():
    query_counts = tf_dict[fqname]
    # One IDF weight and one query-side frequency per query term position.
    idf = [idf_dict[term] for term in query_term]
    query_tf = [query_counts.get(term, 0) for term in query_term]

    scores = {}
    for fdname, terms_d in docs_dict.items():
        doc_counts = tf_dict[fdname]
        # Frequency of each query term in this document (0 when absent).
        doc_tf = [doc_counts.get(term, 0) for term in query_term]
        scores[fdname] = bm25(query_tf, doc_tf, idf, avgdoclen, terms_d, b=0.85, k1=3)

    sim_dict[fqname] = scores
    sorted_sim_dict[fqname] = sorted(scores, key=scores.get, reverse=True)

## output file

In [10]:
# Remove any result file left over from a previous run before writing.
if os.path.exists("result.txt"):
    os.remove("result.txt")

with open("result.txt", "w") as ofile:
    ofile.write("Query,RetrievedDocuments\n")
    for query_name, ranked_docs in sorted_sim_dict.items():
        # Every document name is followed by one space, the last included.
        row = "".join(doc_name + " " for doc_name in ranked_docs)
        ofile.write(query_name + "," + row + "\n")