In [22]:
# -*- coding: utf-8 -*-
# -*- authors : Vincent Roduit -*-
# -*- date : 2024-09-30 -*-
# -*- Last revision: 2024-09-30 by Vincent Roduit -*-
# -*- python version : 3.9.19 -*-
# -*- Description: Main notebook for Project 1 (document retrieval with Word2Vec and BM25) -*-

# <center> CS - 423: Distributed Information Systems </center>
## <center> Ecole Polytechnique Fédérale de Lausanne </center>
### <center>Project 1: Document Retrieval </center>
---

In [43]:
#import libraries
import os

import numpy as np
import pandas as pd

#import files
from corpus_word2vec import CorpusWord2Vec
from constants import *
from corpus_bm25 import CorpusBm25
from scores import recall_at_k

# automatically reload  the module
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# 0. Data exploration
This first section explores the corpus and the queries to gain a better understanding of the data we are working with.

# 1. Retrieve documents using Word2Vec as word embedding method

In [None]:
# Word2Vec run: build the corpus object from CORPUS and QUERIES, then ask it
# to produce the submission file under SUBMISSIONS_FOLDER.
documents = CorpusWord2Vec(CORPUS, QUERIES)
word2vec_submission_path = os.path.join(SUBMISSIONS_FOLDER, 'submission_test.csv')
documents.create_submission(output_path=word2vec_submission_path)

# 2. Retrieve documents using TF-IDF method and BM25 ranking score

In [2]:
# BM25 run on the training queries. NOTE: according to the output recorded
# for this cell, scoring the 21875 train queries takes roughly 33 minutes.
documents = CorpusBm25(CORPUS, QUERIES_TRAIN)
bm25_submission_path = os.path.join(SUBMISSIONS_FOLDER, 'submission_train.csv')
documents.create_submission(output_path=bm25_submission_path)

Computing idf, tf, avg_doc_len, doc_len
Loading corpus from pickle


100%|██████████| 268022/268022 [01:09<00:00, 3835.64it/s]


Computing df
Computing idf
Computing tf
Computing doc_len
Computing length_norm


Computing inverted index: 100%|██████████| 268022/268022 [01:34<00:00, 2843.31it/s] 


Loading query from ../data/train.csv
Tokenizing corpus


100%|██████████| 21875/21875 [00:02<00:00, 9524.85it/s] 
Calculating BM25 scores: 100%|██████████| 21875/21875 [32:59<00:00, 11.05it/s]   


Time taken to calculate BM25 scores: 1983.80 seconds


## 2.1 Evaluate performance
In this section we evaluate the performance of the BM25 ranking on the train set. The evaluation uses recall@10, the same metric as the one used in the Kaggle competition.

### 2.1.1 Overall performance
In this section, the performance is measured over all queries, regardless of their language.

In [17]:
# Wrap the BM25 outputs held by `documents` (its results and its queries)
# in DataFrames so they can be joined and evaluated below.
df_results, df_queries = (
    pd.DataFrame(documents.results),
    pd.DataFrame(documents.query),
)

In [18]:
# Join each result row with its query row (aligned on the index).
# The merged frame is rebuilt from the raw `documents.results` instead of from
# `df_results` itself so that this cell is idempotent: re-running
# `df_results = df_results.merge(df_queries, ...)` would merge the query
# columns a second time, pandas would suffix the overlapping columns with
# `_x` / `_y`, and the column lookups that follow would no longer find them.
df_results = pd.DataFrame(documents.results).merge(
    df_queries, left_index=True, right_index=True
)

# Plain Python lists consumed by the recall computation.
predictions = df_results['docids'].tolist()
ground_truth = df_results['positive_docs'].tolist()

In [44]:
# Recall@10 of every query, pairing each prediction list with its ground truth.
recalls = [
    recall_at_k(predicted_docs, relevant_docs, 10)
    for predicted_docs, relevant_docs in zip(predictions, ground_truth)
]

# Average over all queries, regardless of language.
mean_recall = np.mean(recalls)
print(f"Recall@10: {mean_recall}")

Recall@10: 0.26765714285714287


### 2.1.2 Performance per language
This section looks at the performance of BM25 separately for each language, to highlight any differences between them.

In [45]:
# Languages present in the query set, in order of first appearance.
langs = df_queries['lang'].unique()

# Maps each language to the list of per-query recall@10 values.
recalls = {}
for lang in langs:
    # Restrict the merged results to the queries written in this language.
    df_lang = df_results[df_results['lang'] == lang]

    # Use names local to this cell: the overall-performance cell reads the
    # globals `predictions` and `ground_truth`, so overwriting them here would
    # make a later re-run of that cell silently report the recall of the last
    # language only.
    lang_predictions = df_lang['docids'].tolist()
    lang_ground_truth = df_lang['positive_docs'].tolist()

    recalls[lang] = [
        recall_at_k(pred, gt, 10)
        for pred, gt in zip(lang_predictions, lang_ground_truth)
    ]

    print(f"Recall@10 for {lang}: {np.mean(recalls[lang])}")

Recall@10 for en: 0.0559
Recall@10 for fr: 0.22139303482587064
Recall@10 for de: 0.37357877639415266
Recall@10 for es: 0.5248447204968945
Recall@10 for it: 0.33844723384472336
Recall@10 for ko: 0.6014558689717925
Recall@10 for ar: 0.5597138139790864
