In [1]:
# -*- coding: utf-8 -*-
# -*- authors : Vincent Roduit -*-
# -*- date : 2024-09-30 -*-
# -*- Last revision: 2024-09-30 by Vincent Roduit -*-
# -*- python version : 3.9.19 -*-
# -*- Description: Corpus preprocessing and TF-IDF document retrieval -*-

# <center> CS - 423: Distributed Information Systems </center>
## <center> Ecole Polytechnique Fédérale de Lausanne </center>
### <center>Project 1: Document Retrieval </center>
---

In [4]:
#import libraries
import pandas as pd
import os
from nltk.corpus import stopwords
import pickle as pkl
import nltk
from nltk.stem import PorterStemmer
import string
from tqdm import tqdm
import math
from collections import Counter
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from gensim.models import Word2Vec

# automatically reload the module
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# 1. Declaring constants

In [32]:
# Path to the data folder (relative to the notebook's working directory).
# It is assigned unconditionally: previously it was only set when the folder
# already existed, so a fresh checkout failed with a NameError just below.
DATA_FOLDER = "../data"

# Create the data folder on first run; exist_ok makes this a no-op otherwise.
os.makedirs(DATA_FOLDER, exist_ok=True)

# Location of the raw corpus and of its pickled copies inside the data folder.
CORPUS = os.path.join(DATA_FOLDER, "corpus", "corpus.json")
CORPUS_PKL = os.path.join(DATA_FOLDER, "pickles", "corpus.pkl")
CORPUS_REDUCED_PKL = os.path.join(DATA_FOLDER, "pickles", "corpus_reduced.pkl")

# Stop-word sets keyed by two-letter language code; each set is built from the
# NLTK stop-word list of the language named next to the code.
STOP_WORDS = {
    code: set(stopwords.words(language))
    for code, language in (
        ("en", "english"),
        ("fr", "french"),
        ("de", "german"),
        ("es", "spanish"),
        ("it", "italian"),
        ("ko", "korean"),
        ("ar", "arabic"),
    )
}


# 2. Loading Data

In [33]:
def save_data(data: any, file_name: str, folder: str = os.path.join(DATA_FOLDER, "pickles")):
    """
    Pickle `data` into the file `file_name` inside `folder`.

    The destination folder (and any missing parent) is created when needed.
    Args:

    * data (any): the object to serialize with pickle

    * file_name (str): the name of the file

    * folder (str): the folder where to save the file. The default, the
      "pickles" sub-folder of DATA_FOLDER, is evaluated once, when the
      function is defined.
    """
    # exist_ok=True creates the folder only when it is missing, without a
    # separate existence check that could race with another process.
    os.makedirs(folder, exist_ok=True)

    file_path = os.path.join(folder, file_name)

    with open(file_path, 'wb') as handle:
        pkl.dump(data, handle)

def load_data(file_name: str, folder: str = os.path.join(DATA_FOLDER, "pickles")) -> any:
    """
    Read back an object that was pickled into `folder`
    Args:

    * file_name (str): the name of the pickle file

    * folder (str): the folder that contains the file

    Returns:

    * any: the unpickled object
    """
    # NOTE: unpickling can execute arbitrary code; only load files you trust.
    with open(os.path.join(folder, file_name), 'rb') as pickle_file:
        return pkl.load(pickle_file)

In [34]:
# Load the corpus: fall back to the raw json file when no pickled copy exists.
# NOTE(review): this cell never writes corpus.pkl itself, so the pickle branch
# is only taken if the file was produced elsewhere - confirm this is intended.
if not os.path.exists(CORPUS_PKL):
    print("Loading the corpus from the json file")
    corpus = pd.read_json(CORPUS)
else:
    print("Loading the corpus from the pickle file")
    corpus = load_data("corpus.pkl")

Loading the corpus from the json file


In [36]:
# Size of one share when the corpus is split in three (integer division).
division = len(corpus) // 3

# First share of the corpus, copied so that it is an object of its own.
# The two commented-out lines below are the remaining shares: the second has
# the same size, the last one runs to the end of the corpus and therefore also
# takes the documents left over by the integer division.
df_corpus_vincent = corpus[:division].copy()
# df_corpus_fabio = corpus[division:2*division].copy()
# df_corpus_yann = corpus[2*division:].copy()

# 3. Preprocessing

In [39]:
# Module-level instances shared by every tokenize() call, so that they are
# built a single time instead of once per document.
stemmer = PorterStemmer()
tokenizer = RegexpTokenizer(r'\w+')

def tokenize(text, lang="en"):
    """
    Tokenizes the input text, removes stop words and stems the remaining words.

    A token is dropped when its lowercased form OR its stem is a stop word of
    the given language. (The previous filter joined the two tests with `or`,
    which kept every stop word whose stem is not itself a stop word, e.g.
    "this" -> "thi".)

    :param text: str, input text to process
    :param lang: str, key of STOP_WORDS selecting the stop-word set to use
    :return: list, stemmed tokens in their order of appearance
    :raises KeyError: if `lang` is not a key of STOP_WORDS
    """
    stop_words = STOP_WORDS[lang]

    stems = []
    for token in tokenizer.tokenize(text):
        lowered = token.lower()
        # Cheap test first: no need to stem a token that is already a stop word
        if lowered in stop_words:
            continue
        # Stem once and reuse the result for both the filter and the output
        stem = stemmer.stem(lowered)
        if stem in stop_words:
            continue
        stems.append(stem)

    return stems

In [40]:
# Register tqdm's progress_apply on pandas objects
tqdm.pandas()

# Tokenize every document with the stop words of its own language, then
# persist the result. The commented-out lines are the other shares of the corpus.
# df_corpus_fabio['tokenized'] = df_corpus_fabio.progress_apply(lambda row: tokenize(row['text'], lang=row['lang']), axis=1)
# save_data(df_corpus_fabio, "df_corpus_fabio.pkl")

df_corpus_vincent['tokenized'] = df_corpus_vincent.progress_apply(
    lambda doc: tokenize(doc['text'], lang=doc['lang']),
    axis=1,
)
save_data(df_corpus_vincent, "df_corpus_vincent.pkl")

# df_corpus_yann['tokenized'] = df_corpus_yann.progress_apply(lambda row: tokenize(row['text'], lang=row['lang']), axis=1)
# save_data(df_corpus_yann, "df_corpus_yann.pkl")

100%|██████████| 100/100 [00:01<00:00, 59.00it/s]


In [42]:
# Export the tokenized documents as a plain list of token lists and pickle it.
# The commented-out lines do the same for the other shares of the corpus.
# tokens_list_fabio = df_corpus_fabio['tokenized'].tolist()
# save_data(tokens_list_fabio, "tokens_list_fabio.pkl")

tokenized_column = df_corpus_vincent['tokenized']
tokens_list_vincent = tokenized_column.to_list()
save_data(tokens_list_vincent, "tokens_list_vincent.pkl")

# tokens_list_yann = df_corpus_yann['tokenized'].tolist()
# save_data(tokens_list_yann, "tokens_list_yann.pkl")

# 4. Create the TF-IDF matrix

In [27]:
# Functions for the TF/IDF implementation
def idf_values(vocabulary, documents):
    """
    It computes IDF scores, storing idf values in a dictionary.

    The idf of a term is ln(N / df), with N the number of documents and df the
    number of documents that contain the term. A term that appears in no
    document gets an idf of 0.0 instead of raising a ZeroDivisionError.

    :param vocabulary: list of str, with the unique tokens of the vocabulary.
    :param documents: list of lists of str, with tokenized sentences.
    :return: dict with the idf values for each vocabulary word.
    """
    num_documents = len(documents)

    # Count document frequencies in a single pass over the corpus. Each
    # document is reduced to a set so that a term is counted once per document.
    # This replaces a scan of every document for every vocabulary term.
    document_frequency = Counter()
    for document in documents:
        document_frequency.update(set(document))

    idf = {}
    # Adding tqdm progress bar for the loop
    for term in tqdm(vocabulary, desc="Calculating IDF values"):
        containing = document_frequency[term]
        idf[term] = math.log(num_documents / containing) if containing else 0.0

    return idf

def vectorize(document, vocabulary, idf):
    """
    It generates the vector for an input document (with normalization).

    Each component is idf[term] * count(term) / max_count, where max_count is
    the count of the most frequent token of the document. An empty document
    yields an all-zero vector instead of raising an IndexError.

    :param document: list of str with the tokenized documents.
    :param vocabulary: list of str, with the unique tokens of the vocabulary.
    :param idf: dict with the idf values for each vocabulary word.
    :return: list of floats, one per vocabulary term and in vocabulary order
    :raises KeyError: if a vocabulary term has no entry in `idf`
    """
    counts = Counter(document)

    # No token at all: there is no maximum count to normalize by
    if not counts:
        return [0.0] * len(vocabulary)

    max_count = max(counts.values())
    # Counter returns 0 for tokens that are missing from the document
    return [idf[term] * counts[term] / max_count for term in vocabulary]

def cosine_similarity(v1,v2):
    """
    Cosine of the angle between two vectors.

    The vectors are walked over the indices of v1. When the dot product is 0
    (which includes the all-zero and the empty vector) the result is 0.

    :param v1: list of floats, with the vector of a document.
    :param v2: list of floats, with the vector of a document.
    :return: float
    """
    dot_product = 0
    squared_norm_1 = 0
    squared_norm_2 = 0

    for position, left in enumerate(v1):
        right = v2[position]
        squared_norm_1 += left*left
        squared_norm_2 += right*right
        dot_product += left*right

    # A non-zero dot product implies that both norms are non-zero
    if dot_product == 0:
        return 0
    return dot_product / math.sqrt(squared_norm_1*squared_norm_2)

def search_vec(query, topk, corpus, idf, vocabulary, document_vectors, lang="en"):
    """
    It computes the search result (get the topk documents).

    The query goes through the same tokenize() pipeline as the documents, so
    that query terms can match the vocabulary built from the documents.

    :param query: str
    :param topk: int, maximum number of documents to return
    :param corpus: collection of documents; only its length is used, to
        enumerate the document indices
    :param idf: dict with the idf values for each vocabulary word.
    :param vocabulary: list of str, with the unique tokens of the vocabulary.
    :param document_vectors: vectors of the documents, indexed like `corpus`
    :param lang: str, language key passed to tokenize() for the query
    :return: list of int, indices of the best matching documents by decreasing
        cosine similarity; at most topk of them, and empty when the query
        contains no usable token
    """
    query_tokens = tokenize(query, lang=lang)
    # Nothing to match against (empty query, or only stop words)
    if not query_tokens:
        return []

    query_vector = vectorize(query_tokens, vocabulary, idf)
    scores = [
        (cosine_similarity(query_vector, document_vectors[d]), d)
        for d in range(len(corpus))
    ]
    # Stable sort: documents with equal scores keep their corpus order
    scores.sort(key=lambda pair: -pair[0])

    # Slicing copes with topk larger than the corpus; max() keeps a negative
    # topk equivalent to "no result"
    return [doc_id for _, doc_id in scores[:max(topk, 0)]]

In [9]:
def term_frequency(corpus):
    """
    Counts how many times each term occurs over the whole corpus.

    :param corpus: list of lists of str, with the tokenized documents.
    :return: dict mapping each term to its total number of occurrences,
        with the terms in order of first appearance.
    """
    occurrences = Counter(term for document in corpus for term in document)
    # Hand back a plain dict, as callers expect
    return dict(occurrences)