In [1]:
# -*- coding: utf-8 -*-
# -*- authors : Vincent Roduit -*-
# -*- date : 2024-09-30 -*-
# -*- Last revision: 2024-09-30 by Vincent Roduit -*-
# -*- python version : 3.9.19 -*-
# -*- Description: Constants used in the code *-

# <center> CS - 423: Distributed Information Systems </center>
## <center> Ecole Polytechnique Fédérale de Lausanne </center>
### <center>Project 1: Document Retrieval </center>
---

In [2]:
#import libraries
import pandas as pd
import os
from nltk.corpus import stopwords
import pickle as pkl
import nltk
from nltk.stem import PorterStemmer
import string
from tqdm import tqdm
import math
from collections import Counter
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
#from gensim.models import Word2Vec

# automatically reload the module
%load_ext autoreload
%autoreload 2

# 1. Declaring constants

In [80]:
# Path to the data folder
if os.path.exists("../data"):
    DATA_FOLDER = "../data"
else:
    # create the data folder
    os.mkdir("../data")

CORPUS = os.path.join(DATA_FOLDER, "corpus", "corpus.json")
CORPUS_PKL = os.path.join(DATA_FOLDER, "pickles", "corpus.pkl")
CORPUS_REDUCED_PKL = os.path.join(DATA_FOLDER, "pickles", "corpus_reduced.pkl")

STOP_WORDS = {
    "en": set(stopwords.words('english')),
    "fr": set(stopwords.words('french')),
    "de": set(stopwords.words('german')),
    "es": set(stopwords.words('spanish')),
    "it": set(stopwords.words('italian')),
    "ko": set(stopwords.words('korean')),
    "ar": set(stopwords.words('arabic')),
}


# 2. Loading Data

In [163]:
def save_data(data: any, file_name: str, folder: str = os.path.join(DATA_FOLDER, "pickles")):
    """
    Save the data to a file
    Args:

    * data (any): the data to save

    * file_name (str): the name of the file

    * folder (str): the folder where to save the file
    """
    if not os.path.exists(folder):
        os.makedirs(folder)

    file_path = os.path.join(folder, file_name)

    with open(file_path, 'wb') as handle:
        pkl.dump(data, handle)

def load_data(file_name: str, folder: str = os.path.join(DATA_FOLDER, "pickles")) -> any:
    """
    Load the data from a file
    Args:

    * file_name (str): the name of the file

    * folder (str): the folder where to save the file

    Returns:

    * any: the data
    """
    file_path = os.path.join(folder, file_name)

    with open(file_path, 'rb') as handle:
        data = pkl.load(handle)

    return data

In [10]:
# load the corpus
if os.path.exists(CORPUS_PKL):
    print("Loading the corpus from the pickle file")
    corpus = load_data("corpus.pkl")
else:
    print("Loading the corpus from the json file")
    corpus = pd.read_json(CORPUS)

Loading the corpus from the json file


In [11]:
division = len(corpus) // 3

#df_corpus_vincent = corpus[:division].copy()
# df_corpus_fabio = corpus[division:2*division].copy()
df_corpus_yann = corpus[2*division:].copy()

# 3. Preprocessing

In [81]:
# Initialize objects outside the function to avoid re-initialization overhead
tokenizer = RegexpTokenizer(r'\w+')
stemmer = PorterStemmer()

def tokenize(text, lang="en"):
    """
    Tokenizes and stems the input text efficiently.
    
    :param text: str, input text to process
    :return: list, tokenized and stemmed words
    """
    
    tokens = tokenizer.tokenize(text)
    
    # Combine stemming and stopword filtering into one pass for efficiency
    return [stemmer.stem(word.lower()) for word in tokens if word.lower() not in STOP_WORDS[lang] or stemmer.stem(word.lower()) not in STOP_WORDS[lang]]

In [13]:
tqdm.pandas()

# Apply the tokenization
# df_corpus_fabio['tokenized'] = df_corpus_fabio.progress_apply(lambda row: tokenize(row['text'], lang=row['lang']), axis=1)
# save_data(df_corpus_fabio, "df_corpus_fabio.pkl")

#df_corpus_vincent['tokenized'] = df_corpus_vincent.progress_apply(lambda row: tokenize(row['text'], lang=row['lang']), axis=1)
#save_data(df_corpus_vincent, "df_corpus_vincent.pkl")

df_corpus_yann['tokenized'] = df_corpus_yann.progress_apply(lambda row: tokenize(row['text'], lang=row['lang']), axis=1)
save_data(df_corpus_yann, "df_corpus_yann.pkl")

100%|██████████| 89342/89342 [31:29<00:00, 47.28it/s]  


In [14]:
# tokens_list_fabio = df_corpus_fabio['tokenized'].tolist()
# save_data(tokens_list_fabio, "tokens_list_fabio.pkl")

#tokens_list_vincent = df_corpus_vincent['tokenized'].tolist()
#save_data(tokens_list_vincent, "tokens_list_vincent.pkl")

tokens_list_yann = df_corpus_yann['tokenized'].tolist()
save_data(tokens_list_yann, "tokens_list_yann.pkl")

# 4. Create the TF-IDF matrix

In [None]:
# Functions for the TF/IDF implementation
def idf_values(vocabulary, documents):
    """
    It computes IDF scores, storing idf values in a dictionary.
    
    :param vocabulary: list of str, with the unique tokens of the vocabulary.
    :param documents: list of lists of str, with tokenized sentences.
    :return: dict with the idf values for each vocabulary word.
    """
    idf = {}
    num_documents = len(documents)
    
    # Adding tqdm progress bar for the loop
    for term in tqdm(vocabulary, desc="Calculating IDF values"):
        idf[term] = math.log(num_documents / sum([1 for d in documents if term in d]), math.e)
    
    return idf

def vectorize(document, vocabulary, idf):
    """
    It generates the vector for an input document (with normalization).
    
    :param document: list of str with the tokenized documents.
    :param vocabulary: list of str, with the unique tokens of the vocabulary.
    :param idf: dict with the idf values for each vocabulary word.
    :return: list of floats
    """
    vector = [0]*len(vocabulary)
    counts = Counter(document)
    max_count = counts.most_common(1)[0][1]
    for i,term in enumerate(vocabulary):
        vector[i] = idf[term] * counts[term] / max_count
    return vector

def cosine_similarity(v1,v2):
    """
    It computes cosine similarity.
    
    :param v1: list of floats, with the vector of a document.
    :param v2: list of floats, with the vector of a document.
    :return: float
    """
    
    sumxx, sumxy, sumyy = 0, 0, 0
    for i in range(len(v1)):
        x = v1[i]; y = v2[i]
        sumxx += x*x
        sumyy += y*y
        sumxy += x*y
    if sumxy == 0:
            result = 0
    else:
            result =  sumxy / math.sqrt(sumxx*sumyy)
    return result

def search_vec(query, topk, corpus, idf, vocabulary, document_vectors):
    """
    It computes the search result (get the topk documents).
    
    :param query: str
    :param topk: int
    """
    q = query.split()
    q = [stemmer.stem(w) for w in q]
    query_vector = vectorize(q, vocabulary, idf)
    scores = [[cosine_similarity(query_vector, document_vectors[d]), d] for d in range(len(corpus))]
    scores.sort(key=lambda x: -x[0])
    doc_ids = []
    for i in range(topk):
        doc_ids.append(scores[i][1])

    return doc_ids

In [3]:
def term_frequency(corpus):
    """
    It computes the term frequency for each term in the corpus.
    
    :param corpus: list of lists of str, with the tokenized documents.
    :return: dict with the term frequency for each term.
    """
    term_freq = {}
    for document in corpus:
        for term in document:
            if term in term_freq:
                term_freq[term] += 1
            else:
                term_freq[term] = 1
    return term_freq

# 5. BM25

## Preload some data

In [82]:
#tokenize queries from dev.csv
tqdm.pandas()
train_df = pd.read_csv("/Users/yanncretton/data/train.csv")
train_df['tokenized'] = train_df.progress_apply(lambda row: tokenize(row['query'], lang=row['lang']), axis=1)

100%|██████████| 21875/21875 [00:01<00:00, 14653.86it/s]


In [84]:
#save queries_fr
file_path = "/Users/yanncretton/tokenized/train.pkl"

with open(file_path, 'wb') as handle:
    pkl.dump(train_df, handle)


## For now only working with french corpus

In [5]:
file_path = "/Users/yanncretton/tokenized/fr_df.pkl"

# Load the pickle file
with open(file_path, "rb") as file:
    df_corpus_fr = pkl.load(file)

list_corpus_fr = df_corpus_fr["tokenized"].tolist()

In [44]:
idf_fr, tf_fr, avg_doc_len_fr = BM25_preprocessing(list_corpus_fr)

## First submission test

In [3]:
#Load data
file_path = "/Users/yanncretton/tokenized/test_queries.pkl"
with open(file_path, "rb") as file:
    test_queries = pkl.load(file)

file_path = "/Users/yanncretton/tokenized/df_corpus_total.pkl"
with open(file_path, "rb") as file:
    df_corpus = pkl.load(file)

In [None]:
#Retrieve 'docid' from the corpus
docid = df_corpus['docid'].tolist()
#Retrieve language from the corpus
lang = df_corpus['lang'].tolist()
list_corpus = df_corpus["tokenized"].tolist()

list_test_queries = test_queries["tokenized"].tolist()
list_lang_test_queries = test_queries["lang"].tolist()

In [13]:
#define a method to preprocess data for bm25 ranking

# Initialize data structures to store computed values.
def BM25_preprocessing(corpus_tokenized):
    """
    It preprocesses the data for the BM25 ranking.
    
    :param corpus_tokenized: list of lists of str, with tokenized documents.
    :return: dict with the idf values for each vocabulary word.
    """
    
    # Compute the document frequency for each term in the corpus.
    df = {}
    for document in corpus_tokenized:
        for term in set(document):
            if term in df:
                df[term] += 1
            else:
                df[term] = 1
    
    # Compute the inverse document frequency for each term in the corpus.
    idf = {}
    num_documents = len(corpus_tokenized)
    for term in df:
        idf[term] = math.log(1 + (num_documents - df[term] + 0.5) / (df[term] + 0.5))

    #Compute the term frequency for each in term in each document
    tf = {}
    for i, document in enumerate(corpus_tokenized):
        tf[i] = {}
        for term in document:
            if term in tf[i]:
                tf[i][term] += 1
            else:
                tf[i][term] = 1

    # Compute the average document length. 
    avg_doc_len = sum([len(document) for document in corpus_tokenized]) / len(corpus_tokenized)
    #Compute doc_len
    doc_len = [len(document) for document in corpus_tokenized]
    #Retrieve 'docid' from the corpus
    docid = df_corpus['docid'].tolist()
    #Retrieve language from the corpus
    lang = df_corpus['lang'].tolist()


    
    return idf, tf, avg_doc_len, doc_len, docid, lang


In [14]:
idf, tf, avg_doc_len, doc_len, docid, lang = BM25_preprocessing(list_corpus)

In [157]:
from collections import defaultdict
def BM25_score(query, document_id, idf, tf, avg_doc_len, doc_len, k1=1.5, b=0.75):
    """
    Computes the BM25 score for a given query and a given document.
    """
    score = 0
    doc_length = doc_len[document_id]
    length_norm = k1 * (1 - b + b * doc_length / avg_doc_len)  # Precompute normalization factor
    for term in set(query):  # Use set to avoid redundant term checks
        if document_id in tf and term in tf[document_id]:
            idf_term = idf.get(term, 0)  # Use .get() to handle missing terms
            tf_term = tf[document_id][term]
            score += idf_term * (tf_term * (k1 + 1) / (tf_term + length_norm))
    return score

def BM25_search(query, idf, tf, avg_doc_len, doc_len, docid, lang, target_lang, k=10):
    """
    Computes the search result (top-k documents) for documents in the specified language.
    """
    # Filter docid and doc_len by language once
    relevant_docs = [i for i in range(len(docid)) if lang[i] == target_lang]
    # Calculate scores only for relevant documents
    scores = []
    for i in relevant_docs:
        score = BM25_score(query, i, idf, tf, avg_doc_len, doc_len)
        scores.append((score, docid[i]))

    # Sort and get top-k documents by score
    scores.sort(key=lambda x: -x[0])
    top_doc_ids = [doc_id for _, doc_id in scores[:k]]
    
    return top_doc_ids

In [158]:
# Initialize list to store results
results = []

# Loop over each query
for idx, query in tqdm(enumerate(list_test_queries), total=len(list_test_queries), desc="Processing queries"):
    query_lang = list_lang_test_queries[idx]  # Get the language for the current query
    
    # Get the top 10 documents for the current query
    top_docs = BM25_search(query, idf, tf, avg_doc_len, doc_len, docid, lang, target_lang=query_lang, k=10)
    
    # Append the result as a dictionary
    results.append({
        'id': idx,  # You may replace idx with actual query ID if available
        'docids': top_docs
    })

# Convert results into a DataFrame
results_df = pd.DataFrame(results)
print(results_df)


Processing queries: 100%|██████████| 2000/2000 [1:45:19<00:00,  3.16s/it] 

      query_id                                        top_10_docs
0            0  [doc-en-0, doc-en-14117, doc-en-794977, doc-en...
1            1  [doc-en-16, doc-en-773190, doc-en-369832, doc-...
2            2  [doc-en-24920, doc-en-32, doc-en-420197, doc-e...
3            3  [doc-en-814022, doc-en-40, doc-en-535898, doc-...
4            4  [doc-en-701524, doc-en-56, doc-en-11186, doc-e...
...        ...                                                ...
1995      1995  [doc-ar-8335, doc-ar-11184, doc-ar-11005, doc-...
1996      1996  [doc-ar-12903, doc-ar-6265, doc-ar-11504, doc-...
1997      1997  [doc-ar-7659, doc-ar-1252, doc-ar-201, doc-ar-...
1998      1998  [doc-ar-8394, doc-ar-24, doc-ar-1085, doc-ar-1...
1999      1999  [doc-ar-1810, doc-ar-3677, doc-ar-11951, doc-a...

[2000 rows x 2 columns]





## Training

In [92]:
list_train_queries = train_df['tokenized'].tolist()
list_lang_train_queries = train_df['lang'].tolist()

In [93]:
# Initialize list to store results
results = []

# Loop over each query
for idx, query in tqdm(enumerate(list_train_queries), total=len(list_train_queries), desc="Processing queries"):
    query_lang = list_lang_train_queries[idx]  # Get the language for the current query
    
    # Get the top 10 documents for the current query
    top_docs = BM25_search(query, idf, tf, avg_doc_len, doc_len, docid, lang, target_lang=query_lang, k=10)
    
    # Append the result as a dictionary
    results.append({
        'query_id': idx,  # You may replace idx with actual query ID if available
        'top_10_docs': top_docs
    })

# Convert results into a DataFrame
results_df = pd.DataFrame(results)
print(results_df)

Processing queries: 100%|██████████| 21875/21875 [32:13<00:00, 11.31it/s]   

       query_id                                        top_10_docs
0             0  [doc-en-9633, doc-en-11447, doc-en-9696, doc-e...
1             1  [doc-en-9633, doc-en-11447, doc-en-9696, doc-e...
2             2  [doc-en-9633, doc-en-11447, doc-en-9696, doc-e...
3             3  [doc-en-9633, doc-en-11447, doc-en-9696, doc-e...
4             4  [doc-en-9633, doc-en-11447, doc-en-9696, doc-e...
...         ...                                                ...
21870     21870  [doc-ar-2770, doc-ar-8023, doc-ar-7067, doc-ar...
21871     21871  [doc-ar-2770, doc-ar-8023, doc-ar-7067, doc-ar...
21872     21872  [doc-ar-2770, doc-ar-8023, doc-ar-7067, doc-ar...
21873     21873  [doc-ar-2770, doc-ar-8023, doc-ar-7067, doc-ar...
21874     21874  [doc-ar-2770, doc-ar-8023, doc-ar-7067, doc-ar...

[21875 rows x 2 columns]





In [7]:
#Maybe try this with multiprocessing
from multiprocessing import Pool

# Define a function that wraps the BM25_ranking for a single query
def process_query(query):
    return BM25_ranking(query, list_corpus, tf, idf, doc_len, avg_doc_len, ktop=10)

# Use multiprocessing Pool to process queries in parallel
with Pool() as pool:
    ranked_documents = list(tqdm(pool.imap(process_query, list_test_queries), total=len(list_test_queries), desc="Ranking documents for each query"))


Process SpawnPoolWorker-1:
Traceback (most recent call last):
  File "/opt/anaconda3/envs/machinelearning/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/opt/anaconda3/envs/machinelearning/lib/python3.8/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/anaconda3/envs/machinelearning/lib/python3.8/multiprocessing/pool.py", line 114, in worker
    task = get()
  File "/opt/anaconda3/envs/machinelearning/lib/python3.8/multiprocessing/queues.py", line 358, in get
    return _ForkingPickler.loads(res)
AttributeError: Can't get attribute 'process_query' on <module '__main__' (built-in)>
Process SpawnPoolWorker-2:
Process SpawnPoolWorker-3:
Process SpawnPoolWorker-4:
Process SpawnPoolWorker-5:
Traceback (most recent call last):
  File "/opt/anaconda3/envs/machinelearning/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/opt/anaconda3/envs/machinelearning/l

KeyboardInterrupt: 