## LOAD

This notebook assumes that you have already downloaded the required CSV files.

Download link: https://drive.google.com/drive/folders/1zQp9VdhdXmG7_DK7O_j5CP2SEHuM6VTe?usp=drive_link

In [None]:
import pandas as pd
# Language code of the dataset to work on; it is interpolated into both paths below.
lang = "fr"

# When the tokenized data has already been saved to disk, this is the only
# loading cell that needs to be executed.
corpus_path = f"Data/test/bm25_{lang}_corpus.csv"
query_path = f"Data/test/bm25_{lang}_query.csv"
corpus_file = pd.read_csv(corpus_path)
query_file = pd.read_csv(query_path)

### BM25 Probabilistic Language Model Implementation

For this notebook, we define our model as follows: we use a bag-of-words retrieval function named BM25. For a query Q (whose tokens after tokenization are $\{q_i\}_{i=1, …, n}$) and a document D, the score is:

$$score(Q, D) = \sum_{i=1}^n IDF(q_i) * \frac{f(q_i, D) \cdot (k + 1)}{f(q_i, D) + k \cdot (1 - b + b \cdot \frac{|D|}{avglength})}$$

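where $|D|$ is the number of tokens in document D, $f(q_i, D)$ is the number of times $q_i$ occurs in document D, and avglength is the average document length (in tokens) over the whole text collection. In the code below, $IDF(q_i) = \ln\left(1 + \frac{N - df(q_i) + 0.5}{df(q_i) + 0.5}\right)$, where N is the number of documents and $df(q_i)$ is the number of documents that contain $q_i$.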

Moreover, k and b are hyperparameters to tune (k is called `k1` in the code).

### Implementation

In [None]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

def _count_tokens(doc):
    """Return the number of tokens in a single ``text_token`` entry.

    Entries that are strings (as they are after a CSV round-trip) are split on
    whitespace, because ``len`` on a string would count characters rather than
    tokens. Any other sized container, e.g. a list of tokens, is measured with
    ``len`` directly.
    """
    if isinstance(doc, str):
        return len(doc.split())
    return len(doc)


def avg_doc_length(corpus):
    """Compute the average length (in tokens) of the documents of the whole corpus.

    Args:
        corpus: table exposing a ``text_token`` column with one entry per document.

    Returns:
        The mean token count over all documents, as returned by ``np.mean``.
    """
    documents = corpus["text_token"]
    return np.mean([_count_tokens(doc) for doc in documents])


def doc_lengths(corpus):
    """Compute the length (in tokens) of every document of the corpus.

    Uses the same token-counting rule as ``avg_doc_length`` so that the ratio
    ``doc_lengths / avg_doc_length`` used by BM25 compares like with like.

    Args:
        corpus: table exposing a ``text_token`` column with one entry per document.

    Returns:
        A 1-D numpy array with one token count per document, in corpus order.
    """
    documents = corpus["text_token"]
    return np.array([_count_tokens(doc) for doc in documents])

def scores_bm25(queries, corpus, N, max_features, doc_lengths=None, avgLength=None, k1=1.25, b=0.75):
    """Score every query against every document with BM25.

    Args:
        queries: table with ``query_id`` and ``query_token`` columns.
        corpus: table with ``docid`` and ``text_token`` columns.
        N: number of documents, used in the IDF formula.
        max_features: vocabulary size cap passed to ``CountVectorizer``.
        doc_lengths: optional per-document lengths, one per corpus row. When
            omitted, the row sums of the document-term count matrix are used.
        avgLength: optional average document length. When omitted, the mean of
            ``doc_lengths`` is used.
        k1: term-frequency saturation parameter of BM25.
        b: length-normalisation parameter of BM25.

    Returns:
        A DataFrame with columns ``query_id``, ``doc_id`` and ``bm25_score``
        holding one row per (query, document) pair, grouped by query in the
        order of ``queries`` and, within a query, in the order of ``corpus``.
    """
    vectorizer = CountVectorizer(max_features=max_features)
    doc_term_matrix = vectorizer.fit_transform(corpus['text_token'])
    query_term_matrix = vectorizer.transform(queries['query_token'])

    # Dense term-frequency matrix: one row per document, one column per vocabulary term.
    F = doc_term_matrix.toarray()
    if doc_lengths is None:
        doc_lengths = F.sum(axis=1)
    else:
        # Accept lists / Series as well; the 2-D indexing below needs an ndarray.
        doc_lengths = np.asarray(doc_lengths)
    if avgLength is None:
        mean_length = np.mean(doc_lengths)
    else:
        mean_length = avgLength

    # Document frequency of each term, then the smoothed IDF.
    df = np.count_nonzero(F, axis=0)
    idf = np.log(1 + (N - df + 0.5) / (df + 0.5))

    # Saturated, length-normalised term frequency.
    numerator = F * (k1 + 1)
    denominator = F + k1 * (1 - b + b * (doc_lengths[:, None] / mean_length))
    F_adjusted = numerator / denominator
    idf_times_F_adjusted = idf * F_adjusted

    # (n_queries, n_docs): query term counts weight the per-term document scores.
    BM25_scores = query_term_matrix.toarray() @ idf_times_F_adjusted.T

    # Read the ids positionally so that a non-default DataFrame index (e.g.
    # after filtering) cannot cause wrong or failing label lookups, and build
    # the long table with array operations instead of a Python loop per pair.
    query_ids = np.asarray(queries['query_id'])
    doc_ids = np.asarray(corpus['docid'])

    return pd.DataFrame({
        'query_id': np.repeat(query_ids, len(doc_ids)),
        'doc_id': np.tile(doc_ids, len(query_ids)),
        'bm25_score': BM25_scores.ravel(),
    })

In [None]:
def test_acc(pos_docs, query):
    acc = 0
    for i, id in enumerate(query["query_id"]):
        if query["positive_docs"][i] in pos_docs[id]:
            acc += 1
    return acc/len(query)

## Run

Change `max_features` (the vocabulary size limit) to experiment and look for better results.

In [None]:
# Vocabulary size cap handed to the vectorizer inside scores_bm25.
max_features = 25000

# Length statistics are computed once from the loaded corpus and passed in explicitly.
doc_len = doc_lengths(corpus_file)
avg_doc_len = avg_doc_length(corpus_file)

scores = scores_bm25(
    queries=query_file,
    corpus=corpus_file,
    N=corpus_file.shape[0],
    max_features=max_features,
    doc_lengths=doc_len,
    avgLength=avg_doc_len,
)

In [None]:
# Get the top 10 results for every query.
# The score table is ranked once and split by query, instead of re-filtering the
# whole table for each query. Mergesort is stable, so documents with equal
# scores keep their original corpus order.
ranked_scores = scores.sort_values(by='bm25_score', ascending=False, kind='mergesort')
top10_by_query = {
    query_id: group['doc_id'].head(10).tolist()
    for query_id, group in ranked_scores.groupby('query_id', sort=False)
}
# One entry per evaluated query, in query_file order; a query without any
# scored document gets an empty list.
pos_docs = {
    query_id: top10_by_query.get(query_id, [])
    for query_id in query_file["query_id"]
}

In [None]:
# Share of queries whose positive document is among the retrieved top-10 lists.
test_acc(pos_docs=pos_docs, query=query_file)

### Save the BM25 corpus as a pickle (.pkl) file

In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import joblib

# calculate the average document length
def _avg_doc_length(corpus):
    documents = corpus["text_token"]
    return np.mean([len(doc) for doc in documents])

# build the BM25-weighted document-term matrix of a corpus and save it
def save_bm25_corpus(corpus,lang, max_features=25000, k1=1.25, b=0.75):
    """Precompute the BM25 document weights of ``corpus`` and dump them to disk.

    Args:
        corpus: table with ``docid`` and ``text_token`` columns.
        lang: language code, used only in the output file name.
        max_features: vocabulary size cap passed to ``CountVectorizer``.
        k1: term-frequency saturation parameter of BM25.
        b: length-normalisation parameter of BM25.

    Side effects:
        Writes ``bm25_corpus_{lang}.pkl`` in the current working directory. It
        contains the tuple ``(weights, vectorizer, doc_ids)``, where ``weights``
        is the dense (n_docs, n_terms) matrix of IDF-weighted, length-normalised
        term frequencies. Prints a confirmation message.
    """
    vectorizer = CountVectorizer(max_features=max_features)
    doc_term_matrix = vectorizer.fit_transform(corpus['text_token'])

    # Dense term-frequency matrix: one row per document, one column per term.
    F = doc_term_matrix.toarray()

    # Document length and average length must be measured in the same unit.
    # Both are derived from the term-count matrix, so their ratio is a proper
    # relative length (previously the average was a character count).
    doc_token_counts = F.sum(axis=1)
    mean_token_count = doc_token_counts.mean()

    # Smoothed IDF from the document frequency of each term.
    N = len(corpus)
    df = np.count_nonzero(F, axis=0)
    idf = np.log(1 + (N - df + 0.5) / (df + 0.5))

    # Saturated, length-normalised term frequency, weighted by IDF.
    numerator = F * (k1 + 1)
    denominator = F + k1 * (1 - b + b * (doc_token_counts[:, None] / mean_token_count))
    F_adjusted = numerator / denominator
    idf_times_F_adjusted = idf * F_adjusted

    # Persist the weights together with the fitted vectorizer and the doc ids
    # that label the rows of the matrix.
    doc_ids = corpus['docid'].tolist()
    joblib.dump((idf_times_F_adjusted, vectorizer, doc_ids), f"bm25_corpus_{lang}.pkl")
    print("BM25 corpus saved successfully!")

# Language of the corpus to precompute. A dedicated name is used here so that
# the notebook-wide `lang` (which must keep matching the already loaded
# `corpus_file`) is not silently overwritten for the cells that follow.
pkl_lang = "es"
corpus = pd.read_csv(f'Data/preprocess_corpus/bm25corpus_{pkl_lang}.csv')
save_bm25_corpus(corpus, pkl_lang)

## Test

Compute the retrieval results for the test queries (test.csv).


In [None]:
# Test queries for the currently selected language.
test_file = pd.read_csv(f"Data/preprocess_test/bm25query_{lang}_test.csv")

# Length statistics of the loaded corpus, passed explicitly to the scorer.
doc_len = doc_lengths(corpus_file)
avg_doc_len = avg_doc_length(corpus_file)

# NOTE(review): the vocabulary cap used here (35000) is hard-coded and is not
# tied to the `max_features` variable -- confirm this is intended.
scores = scores_bm25(
    queries=test_file,
    corpus=corpus_file,
    N=corpus_file.shape[0],
    max_features=35000,
    doc_lengths=doc_len,
    avgLength=avg_doc_len,
)


# Get the top 10 results for every test query.
# The score table is ranked once and split by query, instead of re-filtering the
# whole table for each query. Mergesort is stable, so documents with equal
# scores keep their original corpus order.
ranked_scores = scores.sort_values(by='bm25_score', ascending=False, kind='mergesort')
top10_by_query = {
    query_id: group['doc_id'].head(10).tolist()
    for query_id, group in ranked_scores.groupby('query_id', sort=False)
}
# One entry per test query, in test_file order; a query without any scored
# document gets an empty list.
pos_docs = {
    query_id: top10_by_query.get(query_id, [])
    for query_id in test_file["query_id"]
}


In [None]:
# save the results as csv

# One line per query: the query id, a comma, then its retrieved doc ids
# separated by single spaces.
with open(f"Data/bm25_results_{lang}_new.csv", "w") as f:
    f.write("id,docids\n")
    for query_id, doc_ids in pos_docs.items():
        # str() lets non-string ids (e.g. integers parsed by read_csv) be
        # joined too; str.join alone raises TypeError on them.
        f.write(f"{query_id},{' '.join(map(str, doc_ids))}\n")