## LOAD

Load the preprocessed BM25 corpus (`bm25_{lang}_corpus.pkl`) and the tokenized query file (`bm25_{lang}_query.csv`) for the selected language.


In [None]:
import numpy as np
import pandas as pd
import joblib

# bm25_query
def bm25_score(queries: pd.DataFrame, corpus_file: str, top_k: int = 10) -> dict:
    """Rank corpus documents for every query using precomputed BM25 weights.

    Args:
        queries: Table with a ``query_id`` column and a ``query_token``
            column. ``query_token`` is handed unchanged to
            ``vectorizer.transform``.
        corpus_file: Path to a joblib dump holding the 3-tuple
            ``(idf_times_F_adjusted, vectorizer, doc_ids)``.
        top_k: Maximum number of documents to keep per query. Defaults to
            10; values <= 0 yield empty result lists.

    Returns:
        A dict mapping each query id to a list of at most ``top_k`` document
        ids, ordered from highest to lowest score.
    """
    # NOTE(review): joblib.load unpickles the file, which can execute
    # arbitrary code -- only load corpus files from a trusted source.
    idf_times_F_adjusted, vectorizer, doc_ids = joblib.load(corpus_file)

    # Project the queries into the vocabulary space of the stored vectorizer.
    query_term_matrix = vectorizer.transform(queries['query_token'])

    # One row of scores per query, one column per document.
    # presumably idf_times_F_adjusted is (documents x terms) -- TODO confirm
    BM25_scores = query_term_matrix @ idf_times_F_adjusted.T

    keep = max(int(top_k), 0)
    pos_docs = {}
    for query_idx, query_id in enumerate(queries['query_id']):
        scores = BM25_scores[query_idx]
        # The product may be a scipy sparse matrix or an np.matrix, whose
        # rows are 1 x N rather than 1-D; np.argsort would then rank the
        # wrong axis or fail. Normalise each row to a flat dense array.
        if hasattr(scores, "toarray"):
            scores = scores.toarray()
        scores = np.asarray(scores).ravel()

        # Reverse first, then truncate: unlike `[-keep:]`, this is also
        # correct for keep == 0 (where `[-0:]` would select everything).
        top_doc_indices = np.argsort(scores)[::-1][:keep]
        pos_docs[query_id] = [doc_ids[idx] for idx in top_doc_indices]

    return pos_docs

# Retrieve documents for one language: pick the language code, locate the
# preprocessed corpus dump, read the tokenized queries and score them.
lang = 'fr'
corpus_path = f"Data/test/bm25_{lang}_corpus.pkl"  # joblib dump of the corpus
query_file = pd.read_csv(
    f"Data/test/bm25_{lang}_query.csv"  # tokenized queries for this language
)
pos_docs = bm25_score(queries=query_file, corpus_file=corpus_path)

## Evaluation

In [None]:
# Evaluate the retrieval results against the dev labels: a query counts as a
# hit when its labelled positive document is among the retrieved documents.
query = pd.read_csv('Data/dev.csv')

# Map query_id -> positive_docs label, pairing the two columns row by row.
query_dict = dict(zip(query['query_id'], query['positive_docs']))

# Direct indexing is kept on purpose: a retrieved query id that is missing
# from the dev labels raises KeyError instead of being silently skipped.
acc = sum(query_dict[key] in docs for key, docs in pos_docs.items())

# With no retrieved queries the ratio is undefined; report NaN rather than
# failing with ZeroDivisionError.
accuracy = acc / len(pos_docs) if pos_docs else float("nan")
print("Accuracy: ", accuracy)

## Test

In [18]:
# Score the test queries of the current language and export the rankings.
test_file = pd.read_csv(f"Data/bm25query_{lang}_test.csv")

# NOTE: this rebinds `pos_docs`; it now holds the test-set rankings.
pos_docs = bm25_score(test_file, corpus_path)

# Save the results as CSV: one row per query, with the ranked document ids
# separated by spaces in the second column.
# An explicit encoding keeps the output identical across platforms, and
# str() lets non-string document ids be joined without a TypeError.
with open(f"Data/bm25_results_{lang}.csv", "w", encoding="utf-8") as f:
    f.write("id,doc_id\n")
    f.writelines(
        f"{key},{' '.join(map(str, docs))}\n" for key, docs in pos_docs.items()
    )