In [22]:
# -*- coding: utf-8 -*-
# -*- authors : Vincent Roduit -*-
# -*- date : 2024-09-30 -*-
# -*- Last revision: 2024-09-30 by Vincent Roduit -*-
# -*- python version : 3.9.19 -*-
# -*- Description: Notebook running document retrieval with Word2Vec and BM25 -*-

# <center> CS - 423: Distributed Information Systems </center>
## <center> Ecole Polytechnique Fédérale de Lausanne </center>
### <center>Project 1: Document Retrieval </center>
---

In [1]:
#import files
from constants import *
from utils import *
from corpus_word2vec import CorpusWord2Vec
from corpus_bm25 import CorpusBm25

# automatically reload the imported modules
%load_ext autoreload
%autoreload 2

# 1. Retrieve documents using Word2Vec as word embedding method

In [None]:
# Build the Word2Vec corpus object from CORPUS and QUERIES and write its
# submission to SUBMISSIONS_FOLDER/submission_test.csv.
# CORPUS, QUERIES and SUBMISSIONS_FOLDER are not defined in this notebook;
# presumably they come from `from constants import *` -- confirm.
# NOTE(review): `os` is never imported explicitly here; it presumably arrives
# through one of the star imports. An explicit `import os` would be safer.
documents = CorpusWord2Vec(CORPUS, QUERIES)
documents.create_submission(output_path=os.path.join(SUBMISSIONS_FOLDER, 'submission_test.csv'))

# 2. Retrieve documents using TF-IDF method and BM25 ranking score

In [10]:
# Build the BM25 corpus object from CORPUS and QUERIES_REDUCED and write its
# submission to SUBMISSIONS_FOLDER/submission_int.csv.
# NOTE: this rebinds `documents`, so the object created in the Word2Vec cell
# is no longer reachable under that name once this cell has run.
documents = CorpusBm25(CORPUS, QUERIES_REDUCED)
documents.create_submission(output_path=os.path.join(SUBMISSIONS_FOLDER, 'submission_int.csv'))

Computing idf, tf, avg_doc_len, doc_len
Loading df from pickle
Loading idf from pickle
Loading tf from pickle
Loading doc_len from pickle
Computing length_norm
Loading inverted index from pickle
Loading docid from pickle
Loading lang from pickle
Loading query from ../data/test_reduced.csv
Loading tokenized corpus from pickle


Calculating BM25 scores: 100%|██████████| 10/10 [00:01<00:00,  5.72it/s]

Time taken to calculate BM25 scores: 1.76 seconds





In [4]:
import pandas as pd
# Read 'best_score.csv' from SUBMISSIONS_FOLDER into a DataFrame.
# NOTE(review): `df1` says nothing about the content; a descriptive name
# would make later cells easier to follow.
df1 = pd.read_csv(os.path.join(SUBMISSIONS_FOLDER, 'best_score.csv'))

In [5]:
# Read 'submission_int.csv' (the file name the BM25 cell writes to) from
# SUBMISSIONS_FOLDER into a DataFrame.
df2 = pd.read_csv(os.path.join(SUBMISSIONS_FOLDER, 'submission_int.csv'))

In [None]:
from tqdm import tqdm

In [2]:
# Load the object stored under "corpus.pkl" with the project's load_data
# helper; it is indexed by the 'tokens' key in the next cell.
df_corpus = load_data("corpus.pkl")

In [3]:
# Pull the 'tokens' column out as a plain Python list.
# Assumes df_corpus is a pandas DataFrame whose 'tokens' entries are
# per-document token sequences -- TODO confirm.
tokens_list = df_corpus['tokens'].tolist()

In [17]:
# Load the object stored under "corpus_inverted_index.pkl"; the cell below
# iterates it as a mapping whose values expose `.keys()`.
inverted_index = load_data("corpus_inverted_index.pkl")

In [20]:
#keep only doc_id
def create_inverted_index_doc_id(inverted_index):
    """Reduce an inverted index to the keys of each term's posting mapping.

    Args:
        inverted_index: mapping from term to an inner mapping; only the keys
            of the inner mapping are kept (the values are dropped).

    Returns:
        dict: term -> list of the inner mapping's keys, in the inner
        mapping's own iteration order. The input is left untouched.
    """
    return {
        term: [*postings.keys()]
        for term, postings in inverted_index.items()
    }

# Build the keys-only version of the loaded inverted index and store it
# under "corpus_inverted_index_doc_id.pkl" with the project's save_data helper.
inverted_index_doc_id = create_inverted_index_doc_id(inverted_index)
save_data(inverted_index_doc_id, "corpus_inverted_index_doc_id.pkl")

In [None]:
# Example: Create a term-to-id dictionary to map terms to integers
def create_term_to_id(tokens_list):
    """Assign a consecutive integer id to every distinct term.

    Args:
        tokens_list: iterable of documents, each an iterable of hashable terms.

    Returns:
        dict: term -> int id. Ids start at 0 and follow the order in which
        terms are first encountered while scanning the documents.
    """
    vocabulary = {}
    for doc_tokens in tqdm(tokens_list):
        for token in doc_tokens:
            # len() is evaluated before the insert, so an unseen token gets
            # the next free id; a known token keeps the id it already has.
            vocabulary.setdefault(token, len(vocabulary))
    return vocabulary

# Build the term -> id mapping over the corpus tokens and store it under
# "corpus_term_to_id.pkl" with the project's save_data helper.
term_to_id = create_term_to_id(tokens_list)
save_data(term_to_id, "corpus_term_to_id.pkl")

In [None]:
def create_tf_int(tokens_list, term_to_id):
    """Count term occurrences per document, keyed by integer term id.

    Args:
        tokens_list: iterable of documents, each an iterable of terms.
        term_to_id: mapping term -> integer id. Every term in every document
            must be present, otherwise a KeyError is raised.

    Returns:
        list[dict]: one dict per document, in input order, mapping
        term id -> number of occurrences in that document.
    """
    def count_ids(doc_tokens):
        # Raw occurrence counts for a single document.
        counts = {}
        for token in doc_tokens:
            token_id = term_to_id[token]
            counts[token_id] = counts.get(token_id, 0) + 1
        return counts

    return [count_ids(doc_tokens) for doc_tokens in tqdm(tokens_list)]
# Compute the per-document id-keyed term counts and store them under
# "tf_int.pkl" with the project's save_data helper.
tf_int = create_tf_int(tokens_list, term_to_id)
save_data(tf_int, "tf_int.pkl")

In [3]:
# Reload the id-keyed term counts from "tf_int.pkl" (the file name the
# create_tf_int cell saves to), presumably so a fresh kernel can skip the
# recomputation.
tf_int = load_data("tf_int.pkl")

In [4]:
# Reload the term -> id mapping from "corpus_term_to_id.pkl" (the file name
# the create_term_to_id cell saves to).
term_to_id = load_data("corpus_term_to_id.pkl")

In [5]:
# Load the object stored under "test_tokenized.pkl"; the cell below reads its
# 'tokens' column, so it is presumably a DataFrame of tokenized queries -- confirm.
df_test_queries = load_data("test_tokenized.pkl")

In [7]:
def transform_query_to_int(query, term_to_id):
    """Translate a tokenized query into its integer term ids.

    Args:
        query: iterable of terms.
        term_to_id: mapping term -> integer id.

    Returns:
        list: ids of the query terms found in ``term_to_id``, in query order
        and with repetitions kept. Terms missing from the mapping are
        silently skipped.
    """
    return [term_to_id[token] for token in query if token in term_to_id]

# Add a 'query_int' column holding, for each row, the integer ids of the
# entries in its 'tokens' value (terms absent from term_to_id are dropped).
df_test_queries['query_int'] = df_test_queries['tokens'].apply(
    transform_query_to_int, args=(term_to_id,)
)

In [11]:
# Load the idf table stored under "corpus_idf.pkl".
idf = load_data("corpus_idf.pkl")

# Re-key the table from term strings to integer term ids; every term in
# `idf` must exist in `term_to_id`, otherwise this raises KeyError.
idf_int = dict((term_to_id[term], weight) for term, weight in idf.items())

In [13]:
# Load the object stored under "corpus_df.pkl"; the output of the next cell
# shows it is a Counter of term -> count (presumably document frequencies --
# confirm). NOTE(review): the name `df` reads like a DataFrame, which it is not.
df = load_data("corpus_df.pkl")

In [14]:
# Display the loaded Counter.
# NOTE(review): this dumps the whole Counter into the notebook output;
# something like `df.most_common(20)` would keep the output readable.
df

Counter({'wa': 205464,
         'refer': 200635,
         'also': 187537,
         'one': 177703,
         'thi': 174979,
         'first': 174238,
         'time': 167530,
         '1': 165103,
         'new': 164919,
         'two': 163266,
         'year': 161620,
         'includ': 157652,
         '2': 157506,
         'part': 154709,
         'ha': 154569,
         'link': 152602,
         'extern': 152345,
         'hi': 147722,
         'state': 146152,
         '3': 145976,
         'dure': 143493,
         'use': 137040,
         '5': 136158,
         'onli': 134903,
         '4': 134558,
         'follow': 131091,
         '10': 129845,
         'three': 129014,
         'later': 128761,
         'work': 128445,
         'name': 127896,
         'well': 124488,
         'gener': 124022,
         'peopl': 123650,
         'may': 122951,
         'base': 122520,
         'second': 121826,
         'made': 121523,
         'origin': 120560,
         'mani': 118596,
         'pl