In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression, SGDClassifier, RidgeClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, ComplementNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.dummy import DummyClassifier

In [2]:
data_root = "../data/multipride_data/"
figures_root = "../figures/"
os.makedirs(figures_root, exist_ok=True)

train_files = [file for file in os.listdir(data_root) if (file.endswith(".csv") and ("train" in file))]
train_files

['train_en.csv', 'train_es.csv', 'train_it.csv']

In [3]:
train_df = pd.DataFrame()

for file in train_files:
    temp_df = pd.read_csv(os.path.join(data_root, file))
    if "en" in file:
        temp_df["bio"] = [None] * temp_df.shape[0]
    train_df = pd.concat([train_df, temp_df], ignore_index=True)

print(f"Total training samples: {train_df.shape[0]}")

Total training samples: 2988


# Language Specific - Baseline

In [4]:
stop_words_dict = {
    'en': stopwords.words('english'),
    'es': stopwords.words('spanish'),
    'it': stopwords.words('italian'),
}


scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score, zero_division=0),
    'recall': make_scorer(recall_score, zero_division=0),
    'f1': make_scorer(f1_score, zero_division=0)
}


models = {
    "LogisticRegression": LogisticRegression(max_iter=500),
    "LinearSVC": LinearSVC(),
    "RidgeClassifier": RidgeClassifier(),
    "SGDClassifier": SGDClassifier(max_iter=1000, tol=1e-3),
    "MultinomialNB": MultinomialNB(),
    "ComplementNB": ComplementNB(),
    "BernoulliNB": BernoulliNB(),
    "RandomForest": RandomForestClassifier(n_estimators=200, random_state=42),
    "GradientBoosting": GradientBoostingClassifier(random_state=42),
    "AdaBoost": AdaBoostClassifier(random_state=42),
    "DecisionTree": DecisionTreeClassifier(random_state=42),
    "KNN": KNeighborsClassifier(),
    "Dummy": DummyClassifier(strategy='most_frequent')
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [5]:
all_results = []

df = train_df

for lang in df['lang'].unique():
    
    lang_df = df[df['lang'] == lang]
    texts = lang_df['text'].astype(str)
    labels = lang_df['label'].astype(int)
    
    vectorizer = TfidfVectorizer(
        lowercase=True,
        stop_words=stop_words_dict.get(lang, None),
        ngram_range=(1, 3),
        max_features=20000
    )
    X = vectorizer.fit_transform(texts)
    
    for name, model in models.items():
        try:
            cv_results = cross_validate(model, X, labels, cv=cv, scoring=scoring)
            all_results.append({
                'Language': lang,
                'Model': name,
                'Accuracy': np.mean(cv_results['test_accuracy']),
                'Precision': np.mean(cv_results['test_precision']),
                'Recall': np.mean(cv_results['test_recall']),
                'F1': np.mean(cv_results['test_f1'])
            })
        except Exception as e:
            print(f"{name} failed for {lang}: {e}")


results_df = pd.DataFrame(all_results)
results_df = results_df.sort_values(['Language', 'F1'], ascending=[True, False])

# Print best per language
for lang in results_df['Language'].unique():
    best = results_df[results_df['Language'] == lang].iloc[0]
    print(f"\n Best for {lang.upper()}: {best['Model']} "
          f"(Acc={best['Accuracy']:.3f}, Prec={best['Precision']:.3f}, "
          f"Rec={best['Recall']:.3f}, F1={best['F1']:.3f})")

print("\nAll Results Summary:\n", results_df.round(3))

# --- Combined Average Score (across all languages) ---
combined_avg = results_df[['Accuracy', 'Precision', 'Recall', 'F1']].mean()

print("\nCombined Average Performance Across Languages:")
print(f" Accuracy:  {combined_avg['Accuracy']:.3f}")
print(f" Precision: {combined_avg['Precision']:.3f}")
print(f" Recall:    {combined_avg['Recall']:.3f}")
print(f" F1-score:  {combined_avg['F1']:.3f}")



 Best for EN: DecisionTree (Acc=0.861, Prec=0.134, Rec=0.113, F1=0.118)

 Best for ES: KNN (Acc=0.839, Prec=0.438, Rec=0.299, F1=0.349)

 Best for IT: DecisionTree (Acc=0.901, Prec=0.746, Rec=0.739, F1=0.741)

All Results Summary:
    Language               Model  Accuracy  Precision  Recall     F1
10       en        DecisionTree     0.861      0.134   0.113  0.118
11       en                 KNN     0.910      0.367   0.034  0.061
8        en    GradientBoosting     0.890      0.158   0.035  0.053
3        en       SGDClassifier     0.915      0.200   0.011  0.021
0        en  LogisticRegression     0.914      0.000   0.000  0.000
1        en           LinearSVC     0.914      0.000   0.000  0.000
2        en     RidgeClassifier     0.914      0.000   0.000  0.000
4        en       MultinomialNB     0.914      0.000   0.000  0.000
5        en        ComplementNB     0.879      0.000   0.000  0.000
6        en         BernoulliNB     0.912      0.000   0.000  0.000
7        en        

# Multi-Lingual Baseline

In [6]:
texts = train_df['text'].astype(str)
labels = train_df['label'].astype(int)
langs  = train_df['lang']

# Merge language-specific stopwords
stop_words_dict = {
    'en': stopwords.words('english'),
    'es': stopwords.words('spanish'),
    'it': stopwords.words('italian'),
}

combined_stopwords = set()
for lang in langs.unique():
    combined_stopwords.update(stop_words_dict.get(lang, []))

In [7]:

tfidf = TfidfVectorizer(
    lowercase=True,
    stop_words=list(combined_stopwords),
    ngram_range=(1, 3),
    max_features=30000
)

models = {
    "Logistic Regression": LogisticRegression(max_iter=2000, class_weight='balanced'),
    "Linear SVM": LinearSVC(class_weight='balanced'),
    "Naive Bayes": MultinomialNB(),
    "Random Forest": RandomForestClassifier(class_weight='balanced', n_estimators=300, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "Decision Tree": DecisionTreeClassifier(class_weight='balanced', random_state=42)
}

scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score, zero_division=0),
    'recall': make_scorer(recall_score, zero_division=0),
    'f1': make_scorer(f1_score, zero_division=0)
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

results = []

for name, model in models.items():
    pipeline = Pipeline([('tfidf', tfidf), ('clf', model)])
    scores = cross_validate(pipeline, texts, labels, cv=cv, scoring=scoring, n_jobs=-1)
    
    results.append({
        'Model': name,
        'Accuracy': np.mean(scores['test_accuracy']),
        'Precision': np.mean(scores['test_precision']),
        'Recall': np.mean(scores['test_recall']),
        'F1': np.mean(scores['test_f1'])
    })

results_df = pd.DataFrame(results).sort_values(by='F1', ascending=False).reset_index(drop=True)


print("\n Model Performance (Multilingual Unified Dataset):")
print(results_df.round(4))



 Model Performance (Multilingual Unified Dataset):
                 Model  Accuracy  Precision  Recall      F1
0           Linear SVM    0.8899     0.6659  0.4696  0.5497
1  Logistic Regression    0.8645     0.5253  0.5747  0.5478
2        Random Forest    0.8872     0.6900  0.4087  0.5070
3    Gradient Boosting    0.8869     0.6754  0.4039  0.5031
4                  KNN    0.8487     0.4837  0.4904  0.4791
5        Decision Tree    0.8099     0.3935  0.5937  0.4728
6          Naive Bayes    0.8584     0.6000  0.0117  0.0229


### Observations (GenAI - Curated)

* In the language-specific runs, performance differs sharply by language.
    
* For English, even though the Decision Tree yields the best accuracy (≈0.86), the extremely low precision and recall show that the classifier mostly predicts the dominant class (likely the non-reclamatory one). The imbalance in label distribution causes accuracy to be misleadingly high, while the model fails to capture reclamatory usage.

  
* For Spanish, results improve slightly while using KNN, DecisionTree, and even ensemble models (AdaBoost, GradientBoosting) show moderate precision–recall balance, suggesting that lexical patterns in Spanish are somewhat more distinctive for reclamatory contexts, though still not robustly separable.

  
* For Italian, performance is consistently high, with DecisionTree and ensemble models achieving strong F1 (>0.7). This indicates that Italian examples exhibit clearer lexical markers or less code-mixing noise, allowing the models to learn discriminative cues effectively.

* When all languages are merged into a single multilingual dataset, overall performance stabilizes but slightly compresses across models. Linear SVM and Gradient Boosting achieve the best trade-off (F1 ≈ 0.55).  This shows that shared multilingual representations help capture some general structure of reclamatory intent, but language-specific nuances are diluted. Naive Bayes and Decision Tree drop substantially, implying they cannot generalize well to the mixed distribution of lexical and stylistic patterns.

* Overall, these results imply that reclamatory intent is easier to detect when lexical cues are linguistically coherent (as in Italian), but harder when linguistic mixing or class imbalance dominates (as in English). Unified multilingual models yield moderate and balanced results, suggesting that cross-lingual lexical and stylistic overlap exists. It also suggests that more expressive models (transformers or multilingual embeddings) would likely be necessary to capture subtle sociolinguistic signals beyond surface words.