This notebook contains an implementation of the BM25 ranking function.

## 1. BM25 Computer

https://github.com/badriadhikari/AI/blob/main/Chapter-homeworks.md#chapter-22---implement-bm25-function
    
The objective in this activity is to search for 'relevant' document/s in a document corpus (database) by implementing the BM25 scoring function. Task: A search query “Word1 Word2” is being scored against 40 documents. The number of times the words “Word1” and “Word2” appear in each of the documents is given in the table. Write a Python program to calculate the BM25 score for the query against all the documents and rank the documents by their BM25 score. You will need to compute IDF, DF, TF, N, L, etc. by reading the table. Assume k = 1.2 and b = 0.75. The code block below suggests the structure for your implementation.

**Coding Approach:**

- Object-oriented
- Readable
- Optimized
- Broken down into small functions
- Pandas-intensive
- Assumption: a word is a string delimited by spaces in the given document
- Application: Search Engine Optimization (SEO)
- Commented code

In [82]:
import math
import pandas as pd

class BM25Computer:
    """Score and rank the documents of a term-frequency table with BM25.

    The table is read from a CSV file and must provide the columns
    ``DocumentID``, ``DocumentLength`` and one ``FrequencyOf<word>`` column
    per searchable word (for example ``FrequencyOfWord1``).

    For a query ``q1 ... qn`` and a document ``d`` the score is::

        sum_i  IDF(qi) * TF(qi, d) * (k + 1)
                         / (TF(qi, d) + k * (1 - b + b * len(d) / L))

    with ``IDF(qi) = ln((N - DF(qi) + 0.5) / (DF(qi) + 0.5))``, where ``N`` is
    ``n_documents`` and ``L`` is the average document length.
    """

    def __init__(self, k=1.2, b=0.75, n_documents=40,
                 csv_path="bm25_frequency_table.csv"):
        """Load the frequency table and pre-compute the average length ``L``.

        Args:
            k: BM25 term-frequency saturation parameter.
            b: BM25 document-length normalisation parameter.
            n_documents: Corpus size ``N`` used by the IDF and by the average
                document length. It is NOT derived from the table, so it must
                match the number of rows of the CSV file.
            csv_path: Location of the frequency table. Defaults to the file
                the notebook was written against.
        """
        self.k = k
        self.b = b
        self.n_documents = n_documents
        self.bm25_table = pd.read_csv(csv_path)
        self.L = self._compute_L()

    def _frequency_column(self, word: str) -> str:
        """Return the name of the term-frequency column for ``word``.

        Raises:
            KeyError: if the table has no ``FrequencyOf<word>`` column.
        """
        column = f"FrequencyOf{word}"
        if column not in self.bm25_table.columns:
            raise KeyError(
                f"No term-frequency column {column!r} for query word {word!r}"
            )
        return column

    def _get_document_row(self, document_id: int) -> pd.DataFrame:
        """Return the table rows whose ``DocumentID`` equals ``document_id``."""
        return self.bm25_table[self.bm25_table['DocumentID'] == document_id]

    def _get_tf_of_word_in_doc(self, word: str, document_id: int):
        """Return TF(word, document): occurrences of ``word`` in the document.

        Raises:
            KeyError: if ``word`` has no frequency column.
            ValueError: if ``document_id`` does not match exactly one row
                (raised by ``Series.item()``).
        """
        column = self._frequency_column(word)
        return self._get_document_row(document_id)[column].item()

    def _compute_L(self):
        """Return the average document length: sum of lengths / ``n_documents``.

        The value is also printed, as the notebook relies on seeing it.
        """
        # Sum first and divide once: avoids a Python-level row loop and the
        # rounding error of accumulating many small quotients.
        L_value = float(self.bm25_table['DocumentLength'].sum() / self.n_documents)
        print(f"L value: {L_value}")
        return L_value

    def _get_document_frequency_of_word(self, word: str) -> int:
        """Return DF(word): the number of documents that contain ``word``."""
        contains_word = self.bm25_table[self._frequency_column(word)] > 0
        return int(contains_word.sum())

    def _compute_idf(self, word: str):
        """Return IDF(word) = ln((N - DF + 0.5) / (DF + 0.5)).

        Uses Python's built-in ``math.log`` (natural logarithm). The result is
        negative for words that occur in more than half of the documents.

        Raises:
            ValueError: if DF exceeds ``n_documents`` by more than 0.5, which
                makes the ratio non-positive (``n_documents`` does not match
                the table).
        """
        doc_freq_word = self._get_document_frequency_of_word(word)
        idf_numerator = self.n_documents - doc_freq_word + 0.5
        idf_denominator = doc_freq_word + 0.5
        return math.log(idf_numerator / idf_denominator)

    def _compute_numerator(self, tf_word_in_doc, k=None):
        """Return ``TF * (k + 1)``.

        Works element-wise when ``tf_word_in_doc`` is a pandas Series.
        ``k`` defaults to the instance value.
        """
        k = self.k if k is None else k
        return tf_word_in_doc * (k + 1)

    def _compute_denomintor(self, tf_word_in_doc, document_length, k=None, b=None):
        """Return ``TF + k * (1 - b + b * document_length / L)``.

        Works element-wise when the arguments are pandas Series. ``k`` and
        ``b`` default to the instance values. (The misspelt method name is
        kept so existing callers keep working.)
        """
        k = self.k if k is None else k
        b = self.b if b is None else b
        return tf_word_in_doc + k * (1 - b + (b * (document_length / self.L)))

    def _compute_bm_for_word(self, word: str, document_id: int):
        """Return the BM25 contribution of one query word for one document."""
        word_idf = self._compute_idf(word)
        tf_word_in_doc = self._get_tf_of_word_in_doc(word, document_id)
        document_length = self._get_document_row(document_id)['DocumentLength'].item()

        ratio = (self._compute_numerator(tf_word_in_doc)
                 / self._compute_denomintor(tf_word_in_doc, document_length))
        return word_idf * ratio

    def _compute_bm25_of_query_given_document(self, search_query_words: list,
                                              document_id: int):
        """Return the BM25 score of one document: the sum over the query words."""
        return sum(self._compute_bm_for_word(word, document_id)
                   for word in search_query_words)

    def bm25_search_query_main(self, search_query: str, k: float = None,
                               b: float = None, ascending: bool = True):
        """Score every document against ``search_query`` and rank the results.

        Args:
            search_query: Whitespace-separated query words; each word needs a
                ``FrequencyOf<word>`` column in the table.
            k: Optional per-call override of the saturation parameter;
                ``None`` uses the value given to the constructor.
            b: Optional per-call override of the length-normalisation
                parameter; ``None`` uses the value given to the constructor.
            ascending: Sort direction. ``True`` (default) lists the lowest
                score first; pass ``False`` for a best-match-first ranking.

        Returns:
            A ``(scores, ranked_table)`` tuple: ``scores`` maps ``DocumentID``
            to its BM25 score in ranked order, and ``ranked_table`` is a sorted
            copy of the frequency table with an added ``bm25_score`` column.
            The instance's own table is left untouched, so repeated queries
            cannot alter previously returned results.

        Raises:
            KeyError: if a query word has no frequency column.
        """
        k = self.k if k is None else k
        b = self.b if b is None else b

        doc_lengths = self.bm25_table['DocumentLength']
        scores = pd.Series(0.0, index=self.bm25_table.index)

        for word in search_query.split():
            tf = self.bm25_table[self._frequency_column(word)]
            # IDF depends on the word only, so it is computed once per word
            # and applied to the whole term-frequency column at once.
            ratio = (self._compute_numerator(tf, k)
                     / self._compute_denomintor(tf, doc_lengths, k, b))
            scores = scores + self._compute_idf(word) * ratio

        ranked_table = (
            self.bm25_table
            .assign(bm25_score=scores)
            .sort_values(by=['bm25_score'], ascending=ascending)
        )
        return ranked_table.set_index('DocumentID')['bm25_score'].to_dict(), ranked_table
            

        
# Build the scorer with its default settings (k=1.2, b=0.75, 40 documents);
# construction reads bm25_frequency_table.csv and prints the average length L.
obj = BM25Computer()

# Score every document against the two-word query and unpack the
# (DocumentID -> score mapping, ranked table) pair that comes back.
sorted_result_dict, sorted_csv_dataframe = obj.bm25_search_query_main(
    search_query="Word1 Word2"
)


L value: 18.6


In [83]:
sorted_result_dict # DocumentID -> BM25 score mapping, ordered by ascending score

{20: 0.0,
 32: 0.0,
 31: 0.0,
 28: 0.0,
 27: 0.0,
 25: 0.0,
 24: 0.0,
 23: 0.0,
 22: 0.0,
 21: 0.0,
 36: 0.0,
 18: 0.0,
 14: 0.0,
 34: 0.0,
 3: 0.0,
 37: 0.0,
 11: 0.0,
 6: 0.0,
 39: 0.49074382934578986,
 7: 0.5823770292609272,
 33: 0.5993394670063911,
 17: 0.7132895127292447,
 8: 0.7259370326421394,
 30: 0.7364984990436453,
 9: 0.7478542027733933,
 38: 0.7877628957288179,
 12: 0.8151880145477083,
 10: 0.8976227575918585,
 26: 0.943218644476541,
 35: 0.9630991311282713,
 15: 0.9653599272107319,
 4: 1.01893788154552,
 16: 1.0421777217226857,
 13: 1.1163452102812879,
 1: 1.4778920020686857,
 40: 1.5818313393786263,
 2: 1.7049050352350674,
 19: 1.7171983278629792,
 5: 1.7941613631686775,
 29: 1.8495904714897815}

In [87]:
# `drop` is a boolean flag: True discards the old row labels and renumbers
# the ranked rows 0..n-1 (the previous list argument only worked by being truthy).
sorted_csv_dataframe.reset_index(drop=True)

Unnamed: 0,DocumentID,DocumentLength,FrequencyOfWord1,FrequencyOfWord2,bm25_score
0,20,15,0,0,0.0
1,32,23,0,0,0.0
2,31,27,0,0,0.0
3,28,16,0,0,0.0
4,27,25,0,0,0.0
5,25,15,0,0,0.0
6,24,26,0,0,0.0
7,23,7,0,0,0.0
8,22,27,0,0,0.0
9,21,19,0,0,0.0
