In [None]:
# -*- coding: utf-8 -*-
# -*- authors : Vincent Roduit, Yann Cretton, Fabio Palmisano -*-
# -*- date : 2024-09-30 -*-
# -*- Last revision: 2024-09-30 by Vincent Roduit -*-
# -*- python version : 3.9.19 -*-
# -*- Description: Notebook that summarizes the results -*-

# <center> CS - 423: Distributed Information Systems </center>
## <center> Ecole Polytechnique Fédérale de Lausanne </center>
### <center>Project 1: Document Retrieval </center>
---

In [5]:
#import libraries
import os

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

#import files
from corpus_word2vec import CorpusWord2Vec
from constants import *
from corpus_bm25 import CorpusBm25
from scores import recall_at_k, evaluate_recall_at_k, evaluate_recall_at_k_per_lang

# automatically reload  the module
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# 0. Data exploration
This first section explores the corpus and the queries to get a better understanding of the data we have to deal with.

## 0.1 corpus.json

In [45]:
# Load the document corpus from the JSON file located at CORPUS into a DataFrame.
corpus = pd.read_json(CORPUS)

In [None]:
# Summary statistics of the corpus DataFrame (displayed as the cell output).
corpus.describe()

In [47]:
# Group the corpus rows by the value of their 'lang' column.
corpus_per_lang = corpus.groupby('lang')

In [None]:
# Per-language summary statistics of the 'docid' column.
corpus_per_lang['docid'].describe()

## 0.2 train.csv

In [51]:
# Load the training queries from the CSV file located at QUERIES_TRAIN.
df_train = pd.read_csv(QUERIES_TRAIN)

In [None]:
# Summary statistics of the training queries (displayed as the cell output).
df_train.describe()

In [53]:
# Group the training queries by the value of their 'lang' column.
df_train_per_lang = df_train.groupby('lang')

In [None]:
# Per-language summary statistics of the 'query_id' column.
df_train_per_lang['query_id'].describe()

## 0.3 dev.csv

In [None]:
# Load the dev queries from the CSV file located at DEV and show their summary statistics.
df_dev = pd.read_csv(DEV)
df_dev.describe()

In [None]:
# Group the dev queries by 'lang' and show per-language statistics of 'query_id'.
df_dev_per_lang = df_dev.groupby('lang')
df_dev_per_lang['query_id'].describe()

## 0.4 test.csv

In [None]:
# Load the test queries from the CSV file located at QUERIES and show their summary statistics.
df_test = pd.read_csv(QUERIES)
df_test.describe()

In [None]:
# Group the test queries by 'lang' and show per-language statistics of 'query_id'.
df_test_per_lang = df_test.groupby('lang')
df_test_per_lang['query_id'].describe()

# 1. Retrieve documents using Word2Vec as word embedding method

In [None]:
# Train queries: instantiate CorpusWord2Vec on the corpus and the training
# queries, then create the submission file inside SUBMISSIONS_FOLDER.
documents = CorpusWord2Vec(CORPUS, QUERIES_TRAIN)

word2vec_train_path = os.path.join(SUBMISSIONS_FOLDER, 'submission_word2vec.csv')
documents.create_submission(output_path=word2vec_train_path)

In [None]:
# Dev queries: instantiate CorpusWord2Vec on the corpus and the dev queries,
# then create the submission file inside SUBMISSIONS_FOLDER.
documents = CorpusWord2Vec(CORPUS, DEV)
word2vec_dev_path = os.path.join(SUBMISSIONS_FOLDER, 'submission_word2vec_dev.csv')
documents.create_submission(output_path=word2vec_dev_path)

## 1.1 Evaluate performances
In this section we evaluate the performance of the Word2Vec ranking on the train and dev sets. The evaluation uses Recall@10, which is the same metric as the one used in the Kaggle competition.

In [18]:
# Overall recall of the Word2Vec submissions, one split after the other.
# Each entry: (banner to print, submission file name, path of the queries file).
evaluation_runs = (
    ('recall@k for train -------------------', 'submission_word2vec.csv', QUERIES_TRAIN),
    ('recall@k for dev -------------------', 'submission_word2vec_dev.csv', DEV),
)
for banner, submission_file, queries_file in evaluation_runs:
    print(banner)
    # The score is printed by the function itself; the returned value is discarded into `_`.
    _ = evaluate_recall_at_k(submission_name=submission_file, queries_path=queries_file)

recall@k for train -------------------
Recall@10: 0.20
recall@k for dev -------------------
Recall@10: 0.28


In [19]:
# Per-language recall of the Word2Vec submissions, one split after the other.
# Each entry: (banner to print, submission file name, path of the queries file).
evaluation_runs = (
    ('recall@k for train -------------------', 'submission_word2vec.csv', QUERIES_TRAIN),
    ('recall@k for dev -------------------', 'submission_word2vec_dev.csv', DEV),
)
for banner, submission_file, queries_file in evaluation_runs:
    print(banner)
    # The scores are printed by the function itself; the returned value is discarded into `_`.
    _ = evaluate_recall_at_k_per_lang(submission_name=submission_file, queries_path=queries_file)

recall@k for train -------------------
Recall@10 for en: 0.19
Recall@10 for fr: 0.20
Recall@10 for de: 0.15
Recall@10 for es: 0.29
Recall@10 for it: 0.21
Recall@10 for ko: 0.17
Recall@10 for ar: 0.27
recall@k for dev -------------------
Recall@10 for en: 0.17
Recall@10 for fr: 0.39
Recall@10 for de: 0.22
Recall@10 for es: 0.39
Recall@10 for it: 0.33
Recall@10 for ko: 0.14
Recall@10 for ar: 0.29


# 2. Retrieve documents using TF-IDF method and BM25 ranking score

## 2.1 Normal BM25

In [None]:
# Train queries: BM25 with filtering disabled (filter=False), then create
# the submission file inside SUBMISSIONS_FOLDER.
documents = CorpusBm25(CORPUS, QUERIES_TRAIN, filter=False)
bm25_train_path = os.path.join(SUBMISSIONS_FOLDER, 'submission_train_bm25_normal.csv')
documents.create_submission(output_path=bm25_train_path)

In [None]:
# Dev queries: BM25 with filtering disabled (filter=False), then create
# the submission file inside SUBMISSIONS_FOLDER.
documents = CorpusBm25(CORPUS, DEV, filter=False)
bm25_dev_path = os.path.join(SUBMISSIONS_FOLDER, 'submission_dev_bm25_normal.csv')
documents.create_submission(output_path=bm25_dev_path)

### 2.1.1 Evaluate performances
In this section we evaluate the performance of the BM25 ranking on the train and dev sets. The evaluation uses Recall@10, which is the same metric as the one used in the Kaggle competition.

#### Overall performance
In this section, the performance is evaluated over all queries, regardless of their language.

In [20]:
# Overall recall of the unfiltered BM25 submissions, one split after the other.
# Each entry: (banner to print, submission file name, path of the queries file).
evaluation_runs = (
    ('recall@k for train -------------------', 'submission_train_bm25_normal.csv', QUERIES_TRAIN),
    ('recall@k for dev -------------------', 'submission_dev_bm25_normal.csv', DEV),
)
for banner, submission_file, queries_file in evaluation_runs:
    print(banner)
    # The score is printed by the function itself; the returned value is discarded into `_`.
    _ = evaluate_recall_at_k(submission_name=submission_file, queries_path=queries_file)

recall@k for train -------------------
Recall@10: 0.67
recall@k for dev -------------------
Recall@10: 0.78


#### Performance per language
This section looks at the performance of BM25 for each language separately, to reveal any differences between languages.

In [21]:
# Per-language recall of the unfiltered BM25 submissions, one split after the other.
# Each entry: (banner to print, submission file name, path of the queries file).
evaluation_runs = (
    ('recall@k for train -------------------', 'submission_train_bm25_normal.csv', QUERIES_TRAIN),
    ('recall@k for dev -------------------', 'submission_dev_bm25_normal.csv', DEV),
)
for banner, submission_file, queries_file in evaluation_runs:
    print(banner)
    # The scores are printed by the function itself; the returned value is discarded into `_`.
    _ = evaluate_recall_at_k_per_lang(submission_name=submission_file, queries_path=queries_file)

recall@k for train -------------------
Recall@10 for en: 0.84
Recall@10 for fr: 0.41
Recall@10 for de: 0.39
Recall@10 for es: 0.65
Recall@10 for it: 0.49
Recall@10 for ko: 0.60
Recall@10 for ar: 0.56
recall@k for dev -------------------
Recall@10 for en: 0.76
Recall@10 for fr: 0.90
Recall@10 for de: 0.69
Recall@10 for es: 0.93
Recall@10 for it: 0.80
Recall@10 for ko: 0.63
Recall@10 for ar: 0.74


## 2.2 Filtered BM25

In [None]:
#train
# Filtered BM25 on the training queries.
# filt_docs is passed as the integer 10_000 (the former literal 10e3 is the float
# 10000.0), consistent with the integer values used by the sweeps in section 3.
# NOTE(review): the '2e5' suffix of the file name does not match filt_docs; the
# name is kept unchanged because the evaluation cells read this exact file.
documents = CorpusBm25(CORPUS, QUERIES_TRAIN, filter=True, filt_docs=10_000)
documents.create_submission(output_path=os.path.join(SUBMISSIONS_FOLDER, 'submission_train_bm25_filt_2e5.csv'))

In [None]:
#dev
# Filtered BM25 on the dev queries.
# filt_docs is passed as the integer 10_000 (the former literal 10e3 is the float
# 10000.0), consistent with the integer values used by the sweeps in section 3.
# NOTE(review): the '2e5' suffix of the file name does not match filt_docs; the
# name is kept unchanged because the evaluation cells read this exact file.
documents = CorpusBm25(CORPUS, DEV, filter=True, filt_docs=10_000)
documents.create_submission(output_path=os.path.join(SUBMISSIONS_FOLDER, 'submission_dev_bm25_filt_2e5.csv'))

### 2.2.1 Evaluate performances
In this section we evaluate the performance of the BM25 ranking with filtered documents on the train and dev sets. The evaluation uses Recall@10, which is the same metric as the one used in the Kaggle competition.

#### Overall performance
In this section, the performance is evaluated over all queries, regardless of their language.

In [23]:
# Overall recall of the filtered BM25 submissions, one split after the other.
# Each entry: (banner to print, submission file name, path of the queries file).
evaluation_runs = (
    ('recall@k for train -------------------', 'submission_train_bm25_filt_2e5.csv', QUERIES_TRAIN),
    ('recall@k for dev -------------------', 'submission_dev_bm25_filt_2e5.csv', DEV),
)
for banner, submission_file, queries_file in evaluation_runs:
    print(banner)
    # The score is printed by the function itself; the returned value is discarded into `_`.
    _ = evaluate_recall_at_k(submission_name=submission_file, queries_path=queries_file)

recall@k for train -------------------
Recall@10: 0.27
recall@k for dev -------------------
Recall@10: 0.80


#### Performance per language
This section looks at the performance of the filtered BM25 for each language separately, to reveal any differences between languages.

In [24]:
# Per-language recall of the filtered BM25 submissions, one split after the other.
# Each entry: (banner to print, submission file name, path of the queries file).
evaluation_runs = (
    ('recall@k for train -------------------', 'submission_train_bm25_filt_2e5.csv', QUERIES_TRAIN),
    ('recall@k for dev -------------------', 'submission_dev_bm25_filt_2e5.csv', DEV),
)
for banner, submission_file, queries_file in evaluation_runs:
    print(banner)
    # The scores are printed by the function itself; the returned value is discarded into `_`.
    _ = evaluate_recall_at_k_per_lang(submission_name=submission_file, queries_path=queries_file)

recall@k for train -------------------
Recall@10 for en: 0.06
Recall@10 for fr: 0.22
Recall@10 for de: 0.38
Recall@10 for es: 0.52
Recall@10 for it: 0.34
Recall@10 for ko: 0.60
Recall@10 for ar: 0.56
recall@k for dev -------------------
Recall@10 for en: 0.88
Recall@10 for fr: 0.90
Recall@10 for de: 0.69
Recall@10 for es: 0.93
Recall@10 for it: 0.80
Recall@10 for ko: 0.63
Recall@10 for ar: 0.74


# 3. Fine-tuning model

## 3.1 Compare filtering effect

In [None]:
# Sweep the filt_docs setting of the filtered BM25 on the dev queries and record,
# for every value: the overall recall, the per-language recall and the run time.
filt_docs = [100, 500, 1000, 5000, 10000, 15000]
recalls = []            # overall recall, one entry per filt_docs value
recalls_per_lang = []   # per-language recall (indexed by language), one entry per value
timestamps = []         # documents.time of each run

for filt in filt_docs:
    print(f'Filt: {filt}')
    # The sweep runs on the dev queries, so the file is labelled 'dev'
    # (it was previously labelled 'train'). The name is built once and reused.
    submission_name = f'submission_dev_bm25_filt_{filt}.csv'
    documents = CorpusBm25(CORPUS, DEV, filter=True, filt_docs=filt, verbose=False)
    documents.create_submission(output_path=os.path.join(SUBMISSIONS_FOLDER, submission_name))
    overall_recall = evaluate_recall_at_k(submission_name=submission_name, queries_path=DEV)
    recalls.append(overall_recall)
    per_lang_recall = evaluate_recall_at_k_per_lang(submission_name=submission_name, queries_path=DEV)
    recalls_per_lang.append(per_lang_recall)
    timestamps.append(documents.time)

In [None]:
# Overall Recall@10 against the filt_docs values of the sweep.
fig, ax = plt.subplots()
ax.plot(filt_docs, recalls)
ax.set_xlabel('Number of documents')
ax.set_ylabel('Recall@10')
ax.set_title('Recall@10 vs Number of documents')
plt.show()

In [None]:
# One recall curve per language against the filt_docs values of the sweep.
# The languages are taken from the keys of the first per-language result.
for lang in recalls_per_lang[0]:
    lang_curve = []
    for run_idx in range(len(filt_docs)):
        lang_curve.append(recalls_per_lang[run_idx][lang])
    plt.plot(filt_docs, lang_curve, label=lang)

plt.title('Recall@10 vs Number of documents per language')
plt.xlabel('Number of documents')
plt.ylabel('Recall@10')
plt.legend()
plt.show()

In [None]:
# Plot recall (left y-axis) and run time (right y-axis) against the filt_docs values
fig, ax1 = plt.subplots()

recall_color = 'tab:red'
ax1.set_xlabel('Number of documents')
ax1.set_ylabel('Recall@10', color=recall_color)
ax1.plot(filt_docs, recalls, color=recall_color)
ax1.tick_params(axis='y', labelcolor=recall_color)

# Second y-axis that shares the x-axis of ax1
ax2 = ax1.twinx()
time_color = 'tab:blue'
ax2.set_ylabel('Time', color=time_color)
ax2.plot(filt_docs, timestamps, color=time_color)
ax2.tick_params(axis='y', labelcolor=time_color)

# Set the title on the primary axes BEFORE tight_layout so that the layout
# computation reserves room for it (it was previously added afterwards).
ax1.set_title('Recall@10 and Time vs Number of documents')
fig.tight_layout()
plt.show()


## 3.2 Fine-tuning b and k1

In [None]:
# study the effect of k1 and b
# Grid search over the BM25 parameters k1 and b on the dev queries, with
# filt_docs fixed to 10000. Rows of the result arrays follow k1_values,
# columns follow b_values.
k1_values = [0.1, 0.5, 1, 1.5, 2, 2.5, 3]
b_values = [0.1, 0.5, 0.75, 0.9, 0.95, 0.99]
# NOTE: `recalls`, `recalls_per_lang` and `timestamps` reuse the names of the
# filter sweep (section 3.1) with different types; re-run that sweep before
# re-running its plotting cells.
recalls = np.zeros((len(k1_values), len(b_values)))
# NOTE(review): allocated per language but never filled by the loop below.
recalls_per_lang = {lang: np.zeros((len(k1_values), len(b_values))) for lang in STOP_WORDS.keys()}
timestamps = np.zeros((len(k1_values), len(b_values)))


for i, k1 in enumerate(k1_values):
    for j, b in enumerate(b_values):
        print(f'k1: {k1}, b: {b}')
        # The grid search runs on the dev queries, so the file is labelled 'dev'
        # (it was previously labelled 'train'); the 'grid' tag keeps these files
        # distinct from the final dev submission written in section 4.
        submission_name = f'submission_dev_bm25_grid_k1_{k1}_b_{b}.csv'
        documents = CorpusBm25(CORPUS, DEV, filter=True, filt_docs=10000, k1=k1, b=b, verbose=False)
        documents.create_submission(output_path=os.path.join(SUBMISSIONS_FOLDER, submission_name))
        recalls[i, j] = evaluate_recall_at_k(submission_name=submission_name, queries_path=DEV)
        timestamps[i, j] = documents.time

In [None]:
# Heatmap of the recall for every (k1, b) pair, with the value written inside each cell
plt.figure(figsize=(10, 10))
plt.imshow(recalls, cmap='hot', interpolation='nearest')
plt.xticks(range(len(b_values)), b_values)
plt.yticks(range(len(k1_values)), k1_values)
plt.xlabel('b')
plt.ylabel('k1')
# The 'hot' colormap runs from black (lowest value) to white (highest value):
# use white text on the dark half of the range and black text on the bright
# half so that every annotation stays readable.
mid_recall = (recalls.min() + recalls.max()) / 2
for i in range(len(k1_values)):
    for j in range(len(b_values)):
        text_color = 'white' if recalls[i, j] < mid_recall else 'black'
        plt.text(j, i, f'{recalls[i, j]:.3f}', ha='center', va='center', color=text_color)
plt.colorbar()
plt.title('Recall@10 vs k1 and b')
plt.show()

#find the best k1 and b
# Locate the maximum once: row index -> k1, column index -> b.
best_row, best_col = np.unravel_index(np.argmax(recalls), recalls.shape)
best_k1 = k1_values[best_row]
best_b = b_values[best_col]

print(f'Best k1: {best_k1}, Best b: {best_b}')
print(f'Best recall: {np.max(recalls)}')

# 4. Compute the best prediction

In [7]:
#Compute the predictions with the best k1 and b for the dev set
# NOTE: best_k1 and best_b are hard-coded here, so they replace whatever values
# the grid-search cell stored under the same names.
best_k1 = 2
best_b = 0.9
# NOTE(review): filt_docs is not passed here, whereas the grid search used
# filt_docs=10000 -- confirm that the CorpusBm25 default matches the setting
# the parameters were tuned with.
documents = CorpusBm25(CORPUS, DEV, filter=True, k1=best_k1, b=best_b)
# The file name embeds k1 and b; the evaluation cells rebuild the same name
# from best_k1 / best_b.
documents.create_submission(output_path=os.path.join(SUBMISSIONS_FOLDER, f'submission_dev_bm25_k1_{best_k1}_b_{best_b}.csv'))

Computing idf, tf, avg_doc_len, doc_len
Loading df from pickle
Loading idf from pickle
Loading tf from pickle
Loading doc_len from pickle
Computing length_norm
Loading inverted index from pickle
Loading docid from pickle
Loading lang from pickle
Loading query from ../data/dev.csv
Loading tokenized corpus from pickle


Calculating BM25 scores: 100%|██████████| 1400/1400 [01:19<00:00, 17.59it/s]

Time taken to process queries and compute BM25 scores: 1 min 19 sec





In [11]:
# Overall recall on the dev queries for the submission built with best_k1 and best_b.
print('recall@k for dev -------------------')
best_dev_submission = f'submission_dev_bm25_k1_{best_k1}_b_{best_b}.csv'
# The score is printed by the function itself; the returned value is discarded into `_`.
_ = evaluate_recall_at_k(submission_name=best_dev_submission, queries_path=DEV)

recall@k for dev -------------------
Recall@10: 0.80


In [12]:
# Per-language recall on the dev queries for the submission built with best_k1 and best_b.
print('recall@k for dev -------------------')
best_dev_submission = f'submission_dev_bm25_k1_{best_k1}_b_{best_b}.csv'
# The scores are printed by the function itself; the returned value is discarded into `_`.
_ = evaluate_recall_at_k_per_lang(submission_name=best_dev_submission, queries_path=DEV)

recall@k for dev -------------------
Recall@10 for en: 0.87
Recall@10 for fr: 0.90
Recall@10 for de: 0.70
Recall@10 for es: 0.94
Recall@10 for it: 0.81
Recall@10 for ko: 0.62
Recall@10 for ar: 0.74


In [None]:
#Compute the test set with the best k1 and b
# Uses the best_k1 / best_b values currently defined in the notebook namespace.
documents = CorpusBm25(CORPUS, QUERIES, filter=True, k1=best_k1, b=best_b)
# Plain string literal: the former f-string had no placeholders.
documents.create_submission(output_path=os.path.join(SUBMISSIONS_FOLDER, 'possibly_best.csv'))

Computing idf, tf, avg_doc_len, doc_len
Loading df from pickle
Loading idf from pickle
Loading tf from pickle
Loading doc_len from pickle
Computing length_norm
Loading inverted index from pickle
Loading docid from pickle
Loading lang from pickle
Loading query from ../data/test.csv
Loading tokenized corpus from pickle


Calculating BM25 scores: 100%|██████████| 2000/2000 [02:40<00:00, 12.50it/s]

Time taken to process queries and compute BM25 scores: 2 min 40 sec



