In [1]:
import gzip
import json
import numpy as np
import os
import pandas as pd
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

from src.embeddings import ModelRegistry
from src.embeddings import EmbeddingPipeline
from src.utils import set_all_seeds

# Seed the random number generators through the project helper before any
# experiment runs (the cell output below reports the seed as 42).
# NOTE(review): which libraries get seeded is decided inside src.utils — confirm there.
set_all_seeds()

✓ All random seeds set to 42


In [2]:
# Collect the name of every embedding model registered in the project; each
# name is later used to locate that model's embeddings file.
registered_models = ModelRegistry.list_models()
considered_models = list(registered_models.keys())
considered_models

['multilingual-e5-large',
 'bge-m3',
 'gte-multilingual-base',
 'jina-embeddings-v3',
 'snowflake-arctic-embed-l-v2.0',
 'labse',
 'use-multilingual',
 'xlm-roberta-large']

In [3]:
# Locations used throughout the notebook (relative to the notebook's folder).
data_root = "../data/multipride_data/"
figures_root = "../figures/"
embeddings_root = "../embeddings/"

# Create the output folders up front; exist_ok makes re-running the cell safe.
for output_dir in (figures_root, embeddings_root):
    os.makedirs(output_dir, exist_ok=True)

# Training CSVs found in the data folder. os.listdir returns entries in
# arbitrary order, so sort them to get the same list on every machine and run.
train_files = sorted(
    file_name
    for file_name in os.listdir(data_root)
    if file_name.endswith(".csv") and "train" in file_name
)
train_files

['train_en.csv', 'train_es.csv', 'train_it.csv']

In [4]:
# Single seed shared by the CV splitter and every stochastic estimator, so the
# scores do not depend on global RNG state, cell execution order, or on which
# worker process happens to fit a model when cross-validation runs in parallel.
SEED = 42

# Metrics collected for every cross-validation fold. zero_division=0 makes a
# fold with an undefined precision/recall/F1 score 0 instead of raising a warning.
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score, zero_division=0),
    'recall': make_scorer(recall_score, zero_division=0),
    'f1': make_scorer(f1_score, zero_division=0)
}


# Classifiers under comparison. The suffix encodes the class weighting:
# "(u)" = no class weights, "(b)" = class_weight='balanced'.
# LogisticRegression is left without random_state: per the scikit-learn docs it
# is only consulted by the sag/saga/liblinear solvers, and no solver is set here.
models = {
    "LinearSVC (u)": LinearSVC(class_weight=None, max_iter=1000, random_state=SEED),
    "LinearSVC (b)": LinearSVC(class_weight='balanced', max_iter=1000, random_state=SEED),
    "LogisticRegression (u)": LogisticRegression(max_iter=500),
    "LogisticRegression (b)": LogisticRegression(class_weight='balanced', max_iter=500),
    "RandomForest (b)": RandomForestClassifier(
        class_weight='balanced', n_estimators=200, random_state=SEED
    ),
}

# 5-fold stratified CV with a fixed shuffle, so every model sees the same splits.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

# Language-Specific Baseline

In [5]:
# Language-specific baseline: for every embedding model, cross-validate each
# classifier separately on the texts of each language and report the scores.
for considered_model in considered_models:
    print("\n============================")
    print(f"Evaluating embedding model: {considered_model}")
    print("\n============================")

    # Load this embedding model's precomputed vectors from its gzipped JSON file.
    embeddings_path = os.path.join(embeddings_root, considered_model + ".json.gz")
    with gzip.open(embeddings_path, 'rt', encoding='utf-8') as f:
        emb_data = json.load(f)

    # One row per text id; the language code is the id prefix before the first "_".
    text_ids = list(emb_data)
    langs = [text_id.split("_")[0] for text_id in text_ids]
    feature_matrix = [emb_data[text_id]["emb"] for text_id in text_ids]
    train_df = pd.DataFrame(
        {
            "ids": text_ids,
            "lang": langs,
            "feat": feature_matrix,
            "label": [emb_data[text_id]["label"] for text_id in text_ids],
        }
    )

    all_results = []

    for lang in train_df['lang'].unique():

        lang_df = train_df[train_df['lang'] == lang]
        X = np.asarray(list(lang_df['feat']))
        y = lang_df['label'].astype(int)

        for name, model in models.items():
            try:
                cv_results = cross_validate(model, X, y, cv=cv, scoring=scoring)
            except Exception as e:
                # Best effort: report the failure and keep evaluating the
                # remaining classifier/language combinations.
                print(f"{name} failed for {lang}: {e}")
                continue

            # Keep only the fold-averaged test scores.
            all_results.append({
                'Language': lang,
                'Model': name,
                'Accuracy': np.mean(cv_results['test_accuracy']),
                'Precision': np.mean(cv_results['test_precision']),
                'Recall': np.mean(cv_results['test_recall']),
                'F1': np.mean(cv_results['test_f1'])
            })

    # Without any result rows the frame has no 'Language'/'F1' columns and the
    # sort below would raise KeyError, so skip the summary for this model.
    if not all_results:
        print(f"No successful evaluations for {considered_model}; skipping summary.")
        continue

    results_df = pd.DataFrame(all_results)
    results_df = results_df.sort_values(['Language', 'F1'], ascending=[True, False])

    # Print best per language (first row of each language after the F1 sort)
    for lang in results_df['Language'].unique():
        best = results_df[results_df['Language'] == lang].iloc[0]
        print(f"\n Best for {lang.upper()}: {best['Model']} "
              f"(Acc={best['Accuracy']:.3f}, Prec={best['Precision']:.3f}, "
              f"Rec={best['Recall']:.3f}, F1={best['F1']:.3f})")

    print("\nAll Results Summary:\n", results_df.round(3))

    # --- Combined Average Score (across all languages) ---
    # Note: this is the mean over every (language, classifier) row, not only
    # over the best classifier of each language.
    combined_avg = results_df[['Accuracy', 'Precision', 'Recall', 'F1']].mean()

    print("\nCombined Average Performance Across Languages:")
    print(f" Accuracy:  {combined_avg['Accuracy']:.3f}")
    print(f" Precision: {combined_avg['Precision']:.3f}")
    print(f" Recall:    {combined_avg['Recall']:.3f}")
    print(f" F1-score:  {combined_avg['F1']:.3f}")



Evaluating embedding model: multilingual-e5-large


 Best for EN: LogisticRegression (b) (Acc=0.786, Prec=0.234, Rec=0.659, F1=0.346)

 Best for ES: LinearSVC (b) (Acc=0.826, Prec=0.454, Rec=0.736, F1=0.561)

 Best for IT: LinearSVC (u) (Acc=0.946, Prec=0.970, Rec=0.739, F1=0.837)

All Results Summary:
    Language                   Model  Accuracy  Precision  Recall     F1
13       en  LogisticRegression (b)     0.786      0.234   0.659  0.346
11       en           LinearSVC (b)     0.834      0.254   0.479  0.332
10       en           LinearSVC (u)     0.914      0.000   0.000  0.000
12       en  LogisticRegression (u)     0.914      0.000   0.000  0.000
14       en        RandomForest (b)     0.914      0.000   0.000  0.000
1        es           LinearSVC (b)     0.826      0.454   0.736  0.561
3        es  LogisticRegression (b)     0.799      0.418   0.811  0.551
0        es           LinearSVC (u)     0.871      0.720   0.255  0.374
4        es        RandomForest (b)     0.868 

# Multilingual Baseline

In [6]:
# Multilingual baseline: for every embedding model, pool the texts of all
# languages into one dataset and cross-validate each classifier on it.

# (result column, key in the cross_validate output) for each reported metric.
metric_columns = (
    ('Accuracy', 'test_accuracy'),
    ('Precision', 'test_precision'),
    ('Recall', 'test_recall'),
    ('F1', 'test_f1'),
)

for considered_model in considered_models:
    print(f"\n============================")
    print(f"Evaluating embedding model: {considered_model}")
    print(f"\n============================")

    # Read the stored embeddings of this model (gzipped JSON keyed by text id).
    embeddings_path = os.path.join(embeddings_root, considered_model + ".json.gz")
    with gzip.open(embeddings_path, 'rt', encoding='utf-8') as f:
        emb_data = json.load(f)

    # Column-wise view of the JSON; the language is the id prefix before "_".
    text_ids = list(emb_data)
    langs = [text_id.split("_")[0] for text_id in text_ids]
    feature_matrix = [emb_data[text_id]["emb"] for text_id in text_ids]
    labels = [emb_data[text_id]["label"] for text_id in text_ids]

    train_df = pd.DataFrame(
        {"ids": text_ids, "lang": langs, "feat": feature_matrix, "label": labels}
    )

    # Every language goes into the same feature matrix / target vector.
    X = np.asarray(train_df['feat'].tolist())
    labels = train_df['label'].astype(int)

    results = []

    for name, model in models.items():
        pipeline = Pipeline([('clf', model)])
        scores = cross_validate(pipeline, X, labels, cv=cv, scoring=scoring, n_jobs=-1)

        # Store the fold-averaged value of each metric for this classifier.
        row = {'Model': name}
        for column, score_key in metric_columns:
            row[column] = np.mean(scores[score_key])
        results.append(row)

    # Rank the classifiers by F1, best first, with a clean 0..n-1 index.
    results_df = pd.DataFrame(results)
    results_df = results_df.sort_values(by='F1', ascending=False)
    results_df = results_df.reset_index(drop=True)

    print("\n Model Performance (Multilingual Unified Dataset):")
    print(results_df.round(4))

    
    


Evaluating embedding model: multilingual-e5-large


 Model Performance (Multilingual Unified Dataset):
                    Model  Accuracy  Precision  Recall      F1
0           LinearSVC (b)    0.8353     0.4559  0.7548  0.5678
1           LinearSVC (u)    0.9056     0.8446  0.4161  0.5563
2  LogisticRegression (b)    0.8112     0.4152  0.7641  0.5375
3  LogisticRegression (u)    0.8715     0.9211  0.1121  0.1998
4        RandomForest (b)    0.8678     0.8961  0.0865  0.1567

Evaluating embedding model: bge-m3


 Model Performance (Multilingual Unified Dataset):
                    Model  Accuracy  Precision  Recall      F1
0           LinearSVC (b)    0.8360     0.4556  0.6989  0.5498
1  LogisticRegression (b)    0.8176     0.4239  0.7431  0.5389
2           LinearSVC (u)    0.8916     0.7014  0.4232  0.5257
3  LogisticRegression (u)    0.8889     0.8379  0.2781  0.4166
4        RandomForest (b)    0.8628     0.8129  0.0537  0.1005

Evaluating embedding model: gte-multilingual-base
