In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.metrics import (
    accuracy_score, 
    precision_score, 
    recall_score, 
    f1_score,
    classification_report,
    confusion_matrix
)


from src.genai import GenAIClassifier

✓ All random seeds set to 42


In [2]:
# Project-relative locations of the input data and of the generated artefacts.
data_root = "../data/multipride_data/"
figures_root = "../figures/"
results_root = "../results/train/"

# Create the output folders up front; exist_ok makes re-running this cell safe.
os.makedirs(figures_root, exist_ok=True)
os.makedirs(results_root, exist_ok=True)

# os.listdir() returns entries in arbitrary order. Sorting keeps the file order,
# and therefore the row order of anything built from these files, identical on
# every run and every machine.
train_files = sorted(
    file for file in os.listdir(data_root)
    if file.endswith(".csv") and "train" in file
)
train_files

['train_en.csv', 'train_es.csv', 'train_it.csv']

In [3]:
# Read every training split first and concatenate once at the end. Calling
# pd.concat inside the loop re-copies all rows accumulated so far on every
# iteration and seeds the result with an empty frame.
language_frames = []

for file in train_files:
    temp_df = pd.read_csv(os.path.join(data_root, file))
    if "en" in file:
        # Files whose name contains "en" get an all-None "bio" column
        # (presumably the English split ships without one; verify against the data).
        temp_df["bio"] = None
    language_frames.append(temp_df)

# pd.concat raises on an empty list, so fall back to an empty frame when no
# training files were found.
train_df = (
    pd.concat(language_frames, ignore_index=True)
    if language_frames
    else pd.DataFrame()
)

print(f"Total training samples: {train_df.shape[0]}")

Total training samples: 2988


In [4]:
# Preview the first rows of the combined training frame.
train_df.head()

Unnamed: 0,id,text,label,lang,bio
0,en_1021,"I've never heard anyone use the word ""faggot"" ...",0,en,
1,en_1496,So you don't see the slighest problem of someb...,0,en,
2,en_1312,"And to be fair, getting triggered by slurs is ...",1,en,
3,en_469,"I kinda feel like it's saying ""the faggot comm...",0,en,
4,en_565,"Homophobia, racism, and the resulting endless ...",0,en,


In [5]:
# Distinct language codes present in the combined training frame.
set(train_df["lang"])

{'en', 'es', 'it'}

In [6]:
# Language code (as stored in the "lang" column) -> human-readable language name.
language_mapper = dict(
    en="English",
    es="Spanish",
    it="Italian",
)

In [7]:
# Text and language code of the row labelled 0.
train_df.loc[0, "text"], train_df.loc[0, "lang"]

('I\'ve never heard anyone use the word "faggot" and not sound like an insecure 13 year old. ',
 'en')

In [8]:
def calculate_metrics(y_true, y_pred):
    """Score one set of predictions against the ground-truth labels.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.

    Returns:
        dict with the keys ``'accuracy'``, ``'precision'``, ``'recall'`` and
        ``'f1'`` (in that order). Precision, recall and F1 are computed with
        ``zero_division=0``, so an undefined score is reported as 0.
    """
    scores = {'accuracy': accuracy_score(y_true, y_pred)}

    # The remaining scorers share one call signature, so apply them in turn.
    zero_safe_scorers = (
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
    )
    for metric_key, scorer in zero_safe_scorers:
        scores[metric_key] = scorer(y_true, y_pred, zero_division=0)

    return scores

def calculate_overall_metrics(df, model_columns):
    """Compute metrics for every model over the whole frame.

    Reads the module-level ``model_names`` mapping to turn a prediction column
    into a display name; a column missing from the mapping keeps its own name.

    Args:
        df: Frame with a ``'label'`` column and one prediction column per model.
        model_columns: Iterable of prediction column names to evaluate.

    Returns:
        pandas.DataFrame with one row per model: the metrics from
        ``calculate_metrics`` plus ``'model'`` and ``'n_samples'``.
    """
    sample_count = len(df)
    rows = [
        {
            **calculate_metrics(df['label'], df[column]),
            'model': model_names.get(column, column),
            'n_samples': sample_count,
        }
        for column in model_columns
    ]
    return pd.DataFrame(rows)

def calculate_language_wise_metrics(df, model_columns):
    """Compute metrics for every model separately for each language.

    Reads the module-level ``model_names`` mapping to turn a prediction column
    into a display name; a column missing from the mapping keeps its own name.

    Args:
        df: Frame with ``'label'`` and ``'lang'`` columns and one prediction
            column per model.
        model_columns: Iterable of prediction column names to evaluate.

    Returns:
        pandas.DataFrame with one row per (language, model) pair, languages in
        sorted order: the metrics from ``calculate_metrics`` plus ``'model'``,
        ``'language'`` and ``'n_samples'`` (the row count of that language).
    """
    rows = []

    for language_code in sorted(df['lang'].unique()):
        subset = df[df['lang'] == language_code]
        subset_size = len(subset)

        for column in model_columns:
            row = calculate_metrics(subset['label'], subset[column])
            row.update(
                model=model_names.get(column, column),
                language=language_code,
                n_samples=subset_size,
            )
            rows.append(row)

    return pd.DataFrame(rows)

def print_results(overall_df, language_df):
    """Print the overall metrics table followed by one table per language.

    Args:
        overall_df: Frame printed in full (without its index) under the
            overall heading.
        language_df: Frame with a ``'language'`` column plus ``'model'``,
            ``'accuracy'``, ``'precision'``, ``'recall'``, ``'f1'`` and
            ``'n_samples'``; it is printed per language in sorted order.

    Returns:
        None. All output goes to stdout.
    """
    heavy_rule = "=" * 80
    light_rule = "-" * 80
    table_columns = ['model', 'accuracy', 'precision', 'recall', 'f1', 'n_samples']

    # Section 1: metrics over all languages together.
    print(heavy_rule, "OVERALL METRICS (All Languages)", heavy_rule, sep="\n")
    print(overall_df.to_string(index=False))
    print("\n")

    # Section 2: one table per language.
    print(heavy_rule, "LANGUAGE-WISE METRICS", heavy_rule, sep="\n")

    for language_code in sorted(language_df['language'].unique()):
        is_language = language_df['language'] == language_code
        language_rows = language_df.loc[is_language, table_columns]

        print(f"\n{language_code.upper()} Language:")
        print(light_rule)
        print(language_rows.to_string(index=False))

    print("\n")

def create_comparison_table(overall_df):
    """Rank the models by F1 score, print the ranking and return it.

    Args:
        overall_df: Frame with ``'model'``, ``'f1'``, ``'accuracy'``,
            ``'precision'``, ``'recall'`` and ``'n_samples'`` columns.

    Returns:
        A new frame sorted by ``'f1'`` in descending order with a fresh
        0-based index and an added 1-based ``'rank'`` column. The input
        frame is not modified.
    """
    ranked = overall_df.sort_values(by='f1', ascending=False, ignore_index=True)
    ranked['rank'] = list(range(1, len(ranked) + 1))

    rule = "=" * 80
    shown_columns = ['rank', 'model', 'f1', 'accuracy', 'precision', 'recall', 'n_samples']

    print(rule, "MODEL RANKING (by F1-Score)", rule, sep="\n")
    print(ranked[shown_columns].to_string(index=False))
    print("\n")

    return ranked

def calculate_class_distribution(df):
    """Print how many rows carry label 0 and label 1, overall and per language.

    Args:
        df: Frame with ``'label'`` and ``'lang'`` columns.

    Returns:
        None. All output goes to stdout.
    """
    rule = "=" * 80
    total_rows = len(df)

    print(rule)
    print("CLASS DISTRIBUTION")
    print(rule)

    label_counts = df['label'].value_counts().sort_index()
    # .get(..., 0) reports an absent class as zero instead of raising.
    negatives = label_counts.get(0, 0)
    positives = label_counts.get(1, 0)

    print("\nOverall:")
    print(f"  Class 0 (NOT_RECLAMATORY): {negatives} ({negatives/total_rows*100:.1f}%)")
    print(f"  Class 1 (RECLAMATORY): {positives} ({positives/total_rows*100:.1f}%)")
    print(f"  Total: {total_rows}")

    print("\nPer Language:")
    for language_code in sorted(df['lang'].unique()):
        subset = df[df['lang'] == language_code]
        subset_counts = subset['label'].value_counts().sort_index()
        print(
            f"  {language_code.upper()}: "
            f"Class 0={subset_counts.get(0, 0)}, "
            f"Class 1={subset_counts.get(1, 0)}, "
            f"Total={len(subset)}"
        )
    print("\n")

def generate_detailed_report(df, model_col):
    """Print a classification report and confusion matrix for one model.

    The overall report and confusion matrix come first, followed by one
    classification report per language (languages in sorted order). Reads the
    module-level ``model_names`` mapping for the display name; a column
    missing from the mapping keeps its own name.

    Both classes are passed explicitly as ``labels=[0, 1]``. Without that,
    scikit-learn infers the classes from the data, so a slice containing a
    single class yields a 1x1 confusion matrix (making ``cm[0][1]`` fail) and
    a report whose class count no longer matches ``target_names``.

    Args:
        df: Frame with ``'label'`` and ``'lang'`` columns and the prediction
            column ``model_col``.
        model_col: Name of the prediction column to report on.

    Returns:
        None. All output goes to stdout.
    """
    class_labels = [0, 1]
    class_names = ['NOT_RECLAMATORY', 'RECLAMATORY']

    model_name = model_names.get(model_col, model_col)

    print(f"\n{'='*80}")
    print(f"DETAILED REPORT: {model_name}")
    print(f"{'='*80}")

    print("\nOverall Classification Report:")
    print(classification_report(df['label'], df[model_col],
                                labels=class_labels,
                                target_names=class_names,
                                zero_division=0))

    # With labels fixed the matrix is always 2x2: rows are the actual classes,
    # columns the predicted classes, both in class_labels order.
    cm = confusion_matrix(df['label'], df[model_col], labels=class_labels)
    print("\nConfusion Matrix:")
    print("                Predicted NOT    Predicted REC")
    print(f"Actual NOT      {cm[0][0]:<15}  {cm[0][1]:<15}")
    print(f"Actual REC      {cm[1][0]:<15}  {cm[1][1]:<15}")

    print("\n" + "-"*80)
    print("Per-Language Reports:")
    print("-"*80)

    for lang in sorted(df['lang'].unique()):
        df_lang = df[df['lang'] == lang]
        print(f"\n{lang.upper()} Language:")
        print(classification_report(df_lang['label'], df_lang[model_col],
                                    labels=class_labels,
                                    target_names=class_names,
                                    zero_division=0))


# Vanilla Prompt Analysis

In [9]:
# Instantiate the project's GenAIClassifier with its default constructor arguments.
genai_classifier = GenAIClassifier()

In [10]:
# Show the prompt built with use_advanced=False for the row labelled 0.
sample_text = train_df.text[0]
sample_language = language_mapper[train_df.lang[0]]
print(genai_classifier.create_prompt(sample_text, sample_language, use_advanced=False))

Analyze this tweet for LGBTQ+ reclamatory intent.
            Strictly give final classification answer only, don't give any explanation.

            RECLAMATORY = Pride, empowerment, self-affirmation
            NOT_RECLAMATORY = Derogatory, neutral, or unclear
            
            Tweet (English): "I've never heard anyone use the word "faggot" and not sound like an insecure 13 year old. "
            
            Let's think step by step:
            1. What term(s) are used?
            2. Is the tone positive/affirmative?
            3. Is it reclamatory?
            
            Answer:
            Classification: [RECLAMATORY / NOT_RECLAMATORY]
            


In [11]:
# Load the simple-prompt predictions. The path is built from results_root
# ("../results/train/") instead of repeating the directory as a literal, so
# the results location is configured in one place.
vannila_df = pd.read_csv(os.path.join(results_root, "train_simple_prompt.csv"))

In [12]:
# Display the loaded simple-prompt predictions.
vannila_df

Unnamed: 0,id,lang,label,HuggingFaceTB/SmolLM3-3B,microsoft/Phi-3.5-mini-instruct,tiiuae/Falcon3-3B-Instruct,Qwen/Qwen2.5-Omni-7B,google/gemma-3n-E4B-it
0,en_1021,en,0,0,0,1,0,0
1,en_1496,en,0,0,0,1,0,1
2,en_1312,en,1,1,1,1,1,1
3,en_469,en,0,0,0,1,1,0
4,en_565,en,0,1,1,1,1,1
...,...,...,...,...,...,...,...,...
2983,it_1340,it,0,0,1,1,0,1
2984,it_595,it,0,0,1,1,0,0
2985,it_844,it,0,0,0,1,0,0
2986,it_1216,it,0,0,1,1,1,1


In [13]:
# Every column other than the identifier, language and label columns is
# treated as one model's prediction column.
model_columns = [
    column for column in vannila_df.columns
    if column not in ("id", "lang", "label")
]

# Display names mirror the column names: each column maps to itself.
model_names = {column: column for column in model_columns}

In [14]:
# Evaluate the predictions stored in vannila_df.
df = vannila_df

calculate_class_distribution(df)

# Metrics over all languages, then per language, printed together.
overall_metrics = calculate_overall_metrics(df, model_columns)
language_metrics = calculate_language_wise_metrics(df, model_columns)
print_results(overall_metrics, language_metrics)

# The ranking is sorted by F1 in descending order, so its first row is the best model.
ranking = create_comparison_table(overall_metrics)
best_model = ranking.iloc[0]['model']

# Map the winning display name back to its prediction column.
best_model_col = [
    column for column, display_name in model_names.items()
    if display_name == best_model
][0]
generate_detailed_report(df, best_model_col)

CLASS DISTRIBUTION

Overall:
  Class 0 (NOT_RECLAMATORY): 2560 (85.7%)
  Class 1 (RECLAMATORY): 428 (14.3%)
  Total: 2988

Per Language:
  EN: Class 0=938, Class 1=88, Total=1026
  ES: Class 0=743, Class 1=133, Total=876
  IT: Class 0=879, Class 1=207, Total=1086


OVERALL METRICS (All Languages)
 accuracy  precision   recall       f1                           model  n_samples
 0.650602   0.188259 0.434579 0.262712        HuggingFaceTB/SmolLM3-3B       2988
 0.491633   0.191634 0.792056 0.308603 microsoft/Phi-3.5-mini-instruct       2988
 0.144578   0.143432 1.000000 0.250879      tiiuae/Falcon3-3B-Instruct       2988
 0.506359   0.159844 0.574766 0.250127            Qwen/Qwen2.5-Omni-7B       2988
 0.507363   0.178571 0.677570 0.282651          google/gemma-3n-E4B-it       2988


LANGUAGE-WISE METRICS

EN Language:
--------------------------------------------------------------------------------
                          model  accuracy  precision   recall       f1  n_samples
       Hu

# Chain-of-Thought Prompt Analysis

In [15]:
# Load the chain-of-thought predictions. The path is built from results_root
# ("../results/train/") instead of repeating the directory as a literal, so
# the results location is configured in one place.
cot_df = pd.read_csv(os.path.join(results_root, "train_cot_prompt.csv"))

In [16]:
# Display the loaded chain-of-thought predictions.
cot_df

Unnamed: 0,id,lang,label,HuggingFaceTB/SmolLM3-3B,microsoft/Phi-3.5-mini-instruct,tiiuae/Falcon3-3B-Instruct,Qwen/Qwen2.5-Omni-7B,google/gemma-3n-E4B-it
0,en_1021,en,0,0,0,1,0,0
1,en_1496,en,0,0,0,1,0,0
2,en_1312,en,1,1,1,0,1,1
3,en_469,en,0,0,0,1,1,0
4,en_565,en,0,0,1,1,1,0
...,...,...,...,...,...,...,...,...
2983,it_1340,it,0,0,0,1,0,1
2984,it_595,it,0,0,1,1,0,0
2985,it_844,it,0,0,0,1,0,0
2986,it_1216,it,0,1,1,1,1,1


In [17]:
# Derive the prediction columns from cot_df itself. The previous version then
# overwrote this list with the model_names dict left over from the earlier
# section, which made this cell depend on stale kernel state and ignore the
# columns actually present in cot_df.
model_columns = [m for m in cot_df.columns if m not in ("id", "lang", "label")]

# Display names mirror the column names: each column maps to itself.
model_names = {model_column: model_column for model_column in model_columns}

In [18]:
# Evaluate the predictions stored in cot_df.
df = cot_df

# Metrics over all languages, then per language, printed together.
overall_metrics = calculate_overall_metrics(df, model_columns)
language_metrics = calculate_language_wise_metrics(df, model_columns)
print_results(overall_metrics, language_metrics)

# The ranking is sorted by F1 in descending order, so its first row is the best model.
ranking = create_comparison_table(overall_metrics)
best_model = ranking.iloc[0]['model']

# Map the winning display name back to its prediction column.
best_model_col = [
    column for column, display_name in model_names.items()
    if display_name == best_model
][0]
generate_detailed_report(df, best_model_col)

OVERALL METRICS (All Languages)
 accuracy  precision   recall       f1                           model  n_samples
 0.647256   0.205273 0.509346 0.292617        HuggingFaceTB/SmolLM3-3B       2988
 0.556560   0.197164 0.682243 0.305919 microsoft/Phi-3.5-mini-instruct       2988
 0.280120   0.159082 0.939252 0.272081      tiiuae/Falcon3-3B-Instruct       2988
 0.548193   0.164483 0.528037 0.250832            Qwen/Qwen2.5-Omni-7B       2988
 0.561580   0.164384 0.504673 0.247991          google/gemma-3n-E4B-it       2988


LANGUAGE-WISE METRICS

EN Language:
--------------------------------------------------------------------------------
                          model  accuracy  precision   recall       f1  n_samples
       HuggingFaceTB/SmolLM3-3B  0.825536   0.242938 0.488636 0.324528       1026
microsoft/Phi-3.5-mini-instruct  0.823587   0.251337 0.534091 0.341818       1026
     tiiuae/Falcon3-3B-Instruct  0.451267   0.107438 0.738636 0.187590       1026
           Qwen/Qwen2.5-Omni-