In [1]:
# -*- coding: utf-8 -*-
# -*- authors : Vincent Roduit -*-
# -*- date : 2024-09-30 -*-
# -*- Last revision: 2024-09-30 by Vincent Roduit -*-
# -*- python version : 3.9.19 -*-
# -*- Description: Document retrieval with Word2Vec embeddings and cosine similarity -*-

# <center> CS - 423: Distributed Information Systems </center>
## <center> Ecole Polytechnique Fédérale de Lausanne </center>
### <center>Project 1: Document Retrieval </center>
---

In [34]:
#import libraries
import pandas as pd
import os
from nltk.corpus import stopwords
import pickle as pkl
import nltk
from nltk.stem import PorterStemmer
import string
from tqdm import tqdm
import math
from collections import Counter
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from gensim.models import Word2Vec
import multiprocessing
import numpy as np
from time import time
from sklearn.metrics.pairwise import cosine_similarity
# Number of CPUs reported by the OS; used below to derive the default number of
# Word2Vec worker threads (cores - 1).
cores = multiprocessing.cpu_count()

#import files
from constants import *
from utils import *
from corpus import *

# automatically reload the module
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# 1. Loading Data

In [41]:
# Wrap the corpus located at CORPUS (a name brought in by the star imports above)
# and load its content.
# NOTE(review): the recorded output ("Loading corpus from pickle") suggests
# load_corpus() reads a cached pickle when one exists - confirm in corpus.py.
documents = Corpus(corpus_path=CORPUS)
documents.load_corpus()

Loading corpus from pickle


In [26]:
def vectorize(doc, stopwords, w2v_model) -> np.ndarray:
    """
    Embed a raw document as the mean of its in-vocabulary word vectors.

    The text is lower-cased, split on whitespace and stripped of stop words;
    every remaining token is looked up in the embedding model and the vectors
    that were found are averaged.

    :param doc: str, raw text of the document (or query)
    :param stopwords: container of str, lower-case tokens to ignore
    :param w2v_model: embedding model exposing ``vector_size``; either an
        object that is indexable by word (e.g. gensim ``KeyedVectors``) or a
        full ``Word2Vec`` model whose vectors live under ``.wv``
    :return: np.ndarray of shape (vector_size,); all zeros when no token of
        the document is found in the vocabulary
    """
    # A gensim >= 4 Word2Vec model cannot be indexed directly (its vectors are
    # under `.wv`), and the resulting error is not a KeyError, so it would
    # escape the handler below. Fall back to the object itself when there is
    # no `.wv`, which keeps directly indexable models working.
    keyed_vectors = getattr(w2v_model, "wv", w2v_model)

    word_vecs = []
    # split() without an argument splits on any run of whitespace, so tokens
    # separated by newlines or tabs are no longer glued into one unknown word.
    for word in doc.lower().split():
        if word in stopwords:
            continue
        try:
            word_vecs.append(keyed_vectors[word])
        except KeyError:
            # Out-of-vocabulary word: it simply does not contribute to the mean
            continue

    if not word_vecs:
        # Nothing to average - return a zero vector of the right size
        return np.zeros(w2v_model.vector_size)
    return np.mean(word_vecs, axis=0)

In [28]:
# Register tqdm's `progress_apply` on pandas objects.
tqdm.pandas()
# Embed every document text as the mean of its word vectors, filtering with the
# English stop words for all rows.
# NOTE(review): neither `corpus` nor `w2v_model` is assigned in the cells
# visible above this one - this cell appears to rely on existing kernel state;
# confirm where both come from before a Restart & Run All.
corpus['vectors'] = corpus.progress_apply(lambda x: vectorize(x['text'], STOP_WORDS['en'], w2v_model), axis=1)

100%|██████████| 268022/268022 [25:44<00:00, 173.54it/s]


In [29]:
# Stack the per-document vectors into a single NumPy array (one row per document).
vectors = np.array(corpus['vectors'].tolist())

In [30]:
# Sanity check: (number of documents, embedding dimension).
vectors.shape

(268022, 300)

# 3. Preprocessing

In [33]:
# Build the tokenizer and stemmer once at module level so tokenize() does not
# re-create them on every call.
# The pattern r'\w+' keeps runs of word characters and drops everything else.
tokenizer = RegexpTokenizer(r'\w+')
stemmer = PorterStemmer()

def tokenize(text, lang="en"):
    """
    Tokenize, lower-case, stop-word filter and stem the input text.

    :param text: str, input text to process
    :param lang: str, key selecting the stop-word collection in STOP_WORDS
        (default "en"); an unknown key propagates the lookup error
    :return: list of str, the stems of the tokens that survive filtering, in
        their original order
    """
    # Look the stop-word collection up once per call instead of once per token
    stop_words = STOP_WORDS[lang]

    kept = []
    for token in tokenizer.tokenize(text):
        word = token.lower()
        if word in stop_words:
            continue
        stem = stemmer.stem(word)
        # Also drop tokens whose stem is a stop word. The previous condition
        # joined the two checks with `or`, which kept a token unless BOTH the
        # word and its stem were stop words - so a stop word whose stem differs
        # from it (e.g. "this" -> "thi") leaked into the output.
        if stem in stop_words:
            continue
        kept.append(stem)
    return kept

def get_vectors(words, model):
    """
    Average the embeddings of the given words.

    :param words: list, tokens to look up in ``model.wv``
    :param model: Word2Vec, model providing ``wv`` (word -> vector lookup) and
        ``vector_size``
    :return: np.ndarray, element-wise mean of the vectors that were found, or a
        zero vector of length ``model.vector_size`` when none was found
    """
    found = []
    for token in words:
        try:
            embedding = model.wv[token]
        except KeyError:
            # Token missing from the vocabulary - skip it
            continue
        found.append(embedding)

    if not found:
        return np.zeros(model.vector_size)
    return np.mean(found, axis=0)

In [34]:
def create_word2vec_model(model_name, tokens_list=None, min_count=1, window=5, vector_size=100, workers=cores-1, epochs=100):
    """
    Load the Word2Vec model stored under ``model_name`` or, if it does not
    exist yet, train it from ``tokens_list`` and save it.

    The model file lives in ``DATA_FOLDER/models/<model_name>``. The same path
    is used for the existence check, the load and the save, so different
    ``model_name`` values never pick up each other's files.

    :param model_name: str, name of the model file
    :param tokens_list: list, tokenized documents (one list of tokens per
        document); only required when the model has to be trained. It is passed
        to both ``build_vocab`` and ``train``.
    :param min_count: int, minimum number of occurrences of a word to be included in the model
    :param window: int, maximum distance between the current and predicted word within a sentence
    :param vector_size: int, dimensionality of the word vectors
    :param workers: int, number of worker threads to train the model (clamped to at least 1)
    :param epochs: int, number of iterations over the corpus
    :return: Word2Vec, the loaded or freshly trained model
    :raises ValueError: if the model file does not exist and ``tokens_list`` is None
    """
    model_path = os.path.join(DATA_FOLDER, "models", model_name)

    if os.path.exists(model_path):
        print("Word2Vec model already exists, loading it...")
        return Word2Vec.load(model_path)

    if tokens_list is None:
        raise ValueError("tokens_list must be provided to create the model")

    # The default `cores-1` evaluates to 0 on a single-core machine
    workers = max(1, workers)
    # Make sure the target folder exists before the (long) training starts,
    # so the final save cannot fail on a missing directory.
    os.makedirs(os.path.dirname(model_path), exist_ok=True)

    print(f"Creating Word2Vec model with min_count={min_count}, window={window}, vector_size={vector_size}, workers={workers}, epochs={epochs}")
    w2v_model = Word2Vec(min_count=min_count,
                         window=window,
                         vector_size=vector_size,
                         workers=workers)

    t = time()
    w2v_model.build_vocab(tokens_list, progress_per=10000)
    print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

    t = time()
    w2v_model.train(tokens_list, total_examples=w2v_model.corpus_count, epochs=epochs, report_delay=1)
    print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

    # Save the model
    w2v_model.save(model_path)
    print(f"Word2Vec model saved as {model_name}")

    return w2v_model

# Load the Word2Vec model. No tokens_list is passed, so this call can only load
# an already-saved model; it raises ValueError if the model file is missing.
w2v_model = create_word2vec_model("word2vec.model")

Word2Vec model already exists, loading it...


In [50]:
# def process_df(file_name, w2v_model):
#     """
#     Process the dataframe to get the vectors of the text
    
#     :param df_name: str, the name of the dataframe
#     :return: pd.DataFrame, the dataframe with the vectors
#     """
#     if os.path.exists(os.path.join(DATA_FOLDER, "pickles", file_name)):
#         print(f"Loading {file_name} vectors from the pickle file")
#         df = load_data(file_name)
#     else:
#         if "json" in file_name:
#             df = pd.read_json(os.path.join(DATA_FOLDER, "corpus", file_name))
#         else:
#             df = pd.read_csv(os.path.join(DATA_FOLDER, file_name))
#         print(f"Processing {file_name} vectors")
#         if "text" in df.columns:
#             df['tokenized'] = df.progress_apply(lambda row: tokenize(row['text'], lang=row['lang']), axis=1)
#         elif "query" in df.columns:
#             df['tokenized'] = df.progress_apply(lambda row: tokenize(row['query'], lang=row['lang']), axis=1)
#         else:
#             raise ValueError("The dataframe must contain a 'text' or 'query' column")
#         df['vectors'] = df['tokenized'].progress_apply(lambda x: vectorize(x, w2v_model))
#         save_data(df, file_name.replace(".csv", ".pkl"))
    
#     return df

def process_df(file_name, w2v_model):
    """
    Load a corpus/query file and attach a 'vectors' column holding the mean
    word vector of each row's text, caching the result as a pickle.

    :param file_name: str, name of the input file; names containing "json" are
        read from DATA_FOLDER/corpus with read_json, anything else from
        DATA_FOLDER with read_csv
    :param w2v_model: embedding model forwarded to vectorize()
    :return: pd.DataFrame, the dataframe with the vectors
    :raises ValueError: if the dataframe has neither a 'text' nor a 'query' column
    """
    # Use the SAME name for the cache lookup, the load and the save. Previously
    # the lookup used `file_name` ("test.csv") while the save used the ".pkl"
    # name, so the cache was never hit for CSV inputs.
    # NOTE(review): assumes load_data/save_data resolve this name inside
    # DATA_FOLDER/pickles, as the existence check does - confirm in utils.
    pickle_name = file_name.replace(".csv", ".pkl")
    if os.path.exists(os.path.join(DATA_FOLDER, "pickles", pickle_name)):
        print(f"Loading {file_name} vectors from the pickle file")
        return load_data(pickle_name)

    if "json" in file_name:
        df = pd.read_json(os.path.join(DATA_FOLDER, "corpus", file_name))
    else:
        df = pd.read_csv(os.path.join(DATA_FOLDER, file_name))
    print(f"Processing {file_name} vectors")
    print(df.columns)

    # Documents carry their content in 'text', queries in 'query'
    if "text" in df.columns:
        text_column = "text"
    elif "query" in df.columns:
        text_column = "query"
    else:
        raise ValueError("The dataframe must contain a 'text' or 'query' column")

    # progress_apply only exists once tqdm has been registered with pandas;
    # do it here so the function does not depend on another cell having run.
    tqdm.pandas()
    df['vectors'] = df.progress_apply(
        lambda row: vectorize(row[text_column], STOP_WORDS[row['lang']], w2v_model),
        axis=1,
    )

    save_data(df, pickle_name)

    return df

In [51]:
# Vectorize the test queries (adds a 'vectors' column to the query dataframe).
df_queries_test = process_df("test.csv", w2v_model)

Processing test.csv vectors
Index(['id', 'query_id', 'query', 'lang'], dtype='object')


100%|██████████| 2000/2000 [00:00<00:00, 13380.35it/s]


In [11]:
# Register tqdm's `progress_apply` on pandas objects.
tqdm.pandas()

# Tokenize and stem every document using the stop words of its own language
corpus['tokenized'] = corpus.progress_apply(lambda row: tokenize(row['text'], lang=row['lang']), axis=1)
# Plain list of token lists; this is the shape create_word2vec_model() takes as
# its tokens_list argument.
tokens_list = corpus['tokenized'].tolist()
# Mean word vector per document. This overwrites any existing 'vectors' column.
# NOTE(review): `corpus` is not assigned in the cells visible above - confirm its origin.
corpus['vectors'] = corpus['tokenized'].progress_apply(lambda x: get_vectors(x, w2v_model))

In [13]:
# Load the training and test queries.
# NOTE(review): df_queries is not used again in the cells visible here.
df_queries = pd.read_csv(os.path.join(DATA_FOLDER, "train.csv"))
df_test_queries = pd.read_csv(os.path.join(DATA_FOLDER, "test.csv"))
# Tokenize each test query with its own language's stop words, then average the
# word vectors. progress_apply requires tqdm.pandas() to have been called earlier.
df_test_queries['tokenized'] = df_test_queries.progress_apply(lambda row: tokenize(row['query'], lang=row['lang']), axis=1)
df_test_queries['vectors'] = df_test_queries['tokenized'].progress_apply(lambda x: get_vectors(x, w2v_model))

# 4. Calculate similarities

In [53]:
def rank_results(queries, df, top_k=10):
    """
    Rank the corpus documents against every query by cosine similarity.

    :param queries: pd.DataFrame, the queries; needs a 'vectors' column and,
        ideally, an 'id' column identifying each query
    :param df: pd.DataFrame, the corpus; needs 'vectors' and 'docid' columns
    :param top_k: int, number of best-matching documents kept per query (default 10)

    :return: tuple (results, similarities) where ``results`` is a list with one
        ``{"id": ..., "docids": [...]}`` dict per query (docids ordered from
        most to least similar) and ``similarities`` is the full
        (n_queries, n_documents) cosine-similarity matrix
    """
    # Extract vectors from queries and corpus
    query_vectors = np.stack(queries['vectors'].values)
    doc_vectors = np.stack(df['vectors'].values)

    # Compute cosine similarities in one step for all query-document pairs
    similarities = cosine_similarity(query_vectors, doc_vectors)

    # Loop-invariant: pull the docid column out once instead of slicing the
    # whole dataframe for every query.
    docids = df['docid'].to_numpy()

    # Report the query's own 'id' value. The DataFrame index label (what was
    # used before) only coincides with it for an untouched default index; it is
    # kept as a fallback when there is no 'id' column.
    if 'id' in queries.columns:
        query_ids = queries['id'].tolist()
    else:
        query_ids = queries.index.tolist()

    results = []
    for query_id, similarity_scores in tqdm(zip(query_ids, similarities), total=len(queries)):
        # Indices of the top_k highest scores, best first
        top_indices = np.argsort(similarity_scores)[::-1][:top_k]
        results.append({
            "id": query_id,
            "docids": docids[top_indices].tolist()
        })

    return results, similarities

# Rank the corpus against the test queries. rank_results reads the 'vectors'
# column of both frames and the 'docid' column of `corpus`.
# NOTE(review): `corpus` is not assigned in the cells visible above - confirm its origin.
ranked_results, similarities = rank_results(df_queries_test, corpus)

100%|██████████| 2000/2000 [00:31<00:00, 62.97it/s] 


In [55]:
# Convert the list of per-query result dicts into a DataFrame and write it to
# DATA_FOLDER as CSV (without the index column).
# Note: this rebinds `ranked_results` from a list to a DataFrame.
ranked_results = pd.DataFrame(ranked_results)
ranked_results.to_csv(os.path.join(DATA_FOLDER, "ranked_results2.csv"), index=False)