In [1]:
import os
import torch
import pandas as pd
import altair as alt

from src.baseline.baseline import train_df, figures_root
from src.finetune.finetuner import main
from src.baseline.utils import calculate_class_distribution

✓ All random seeds set to 42
training files: ['train_en.csv', 'train_it.csv', 'train_es.csv']
Total training samples: 2988
CLASS DISTRIBUTION

Overall:
  Class 0 (NOT_RECLAMATORY): 2560 (85.7%)
  Class 1 (RECLAMATORY): 428 (14.3%)
  Total: 2988

Per Language:
  EN: Class 0=938, Class 1=88, Total=1026
  ES: Class 0=743, Class 1=133, Total=876
  IT: Class 0=879, Class 1=207, Total=1086




In [2]:
# Make sure the NLTK stopwords corpus is present locally; the call reports
# "already up-to-date" and returns True when nothing needs downloading.
# NOTE(review): no cell shown here uses stopwords directly — presumably the
# src helpers need it; confirm, and consider moving this import to the top cell.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Location of the augmented tweets, relative to the notebook directory.
AUGMENTED_CSV = "../data/augmented_multilingual_tweets.csv"

# original_data is the frame imported from the baseline module;
# augmented_data is read fresh from disk.
original_data = train_df
augmented_data = pd.read_csv(AUGMENTED_CSV)

# Quick sanity check of both frame shapes (rows, columns).
print(original_data.shape, augmented_data.shape)

(2988, 5) (5976, 8)


In [4]:
# Restrict the augmented frame to the columns the original frame has, then
# stack both frames with a fresh 0..n-1 index.
# NOTE(review): merged_data is not referenced by any later cell shown here.
shared_columns = list(original_data.columns)
merged_data = pd.concat(
    [original_data, augmented_data[shared_columns]],
    ignore_index=True,
)
print(merged_data.shape)

(8964, 5)


In [5]:
# Train on the augmented set only; the merged_data frame built earlier is not
# used for this run.
train_data = augmented_data

In [6]:
# Print the label balance (overall and per language) of the training frame.
calculate_class_distribution(train_data)

CLASS DISTRIBUTION

Overall:
  Class 0 (NOT_RECLAMATORY): 5120 (85.7%)
  Class 1 (RECLAMATORY): 856 (14.3%)
  Total: 5976

Per Language:
  EN: Class 0=1622, Class 1=340, Total=1962
  ES: Class 0=1817, Class 1=295, Total=2112
  IT: Class 0=1681, Class 1=221, Total=1902




# Fine-Tuning on Augmented Data

In [7]:
class Config:
    """Fine-tuning configuration for the augmented-data run.

    This is the configuration handed to ``main``. The cell previously defined
    ``class Config`` twice; the second definition silently replaced the first,
    so only the values below were ever in effect. The shadowed settings are
    kept as the explicit ``MergedDataConfig`` subclass instead.
    """

    # Model configuration
    MODEL_NAME = "cardiffnlp/twitter-xlm-roberta-base"  # Base model
    NUM_LABELS = 2  # Binary classification
    MAX_LENGTH = 128  # Maximum sequence length
    # Number of initial layers to freeze (original note: 0 = only train
    # classification head — TODO confirm against the finetuner implementation)
    NUM_FROZEN_LAYERS = 3

    # Training configuration
    LEARNING_RATE = 2e-5
    WEIGHT_DECAY = 0.01
    NUM_EPOCHS = 10
    BATCH_SIZE = 8
    GRADIENT_ACCUMULATION_STEPS = 2
    WARMUP_RATIO = 0.1  # Warmup as % of total steps

    # Early stopping
    PATIENCE = 3
    EVAL_STRATEGY = "epoch"  # Evaluate at end of each epoch

    # Cross-validation
    N_SPLITS = 5
    TRAIN_RATIO = 0.8  # 80% for training from each fold
    VAL_RATIO = 0.2  # 20% for validation from each fold

    # Dynamic undersampling
    DYNAMIC_UNDERSAMPLE = False  # Balance classes per epoch

    # Model saving
    MAX_MODELS_TO_SAVE = 2
    OUTPUT_DIR = "../fine_tuned_models"
    RESULTS_DIR = "../results/roberta-fine-tune/augumented/"

    # Device: CUDA when available, otherwise CPU. The print runs once, when
    # the class body is executed.
    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Training would be start on the device: {DEVICE}")


class MergedDataConfig(Config):
    """Settings of the formerly shadowed first ``Config`` definition.

    Only the two values that differed from ``Config`` are overridden; every
    other attribute (including ``DEVICE``) is inherited.
    """

    WARMUP_RATIO = 0.15  # Warmup as % of total steps
    RESULTS_DIR = "../results/roberta-fine-tune/original_and_augumented/"

Training would be start on the device: cuda
Training would be start on the device: cuda


In [8]:
# Launch the fine-tuning pipeline on train_data with the settings in Config.
main(train_data, Config)

2025-12-03 13:37:22,829 - INFO - Starting Fine-tuning Pipeline
2025-12-03 13:37:22,836 - INFO - Fold 0: Train=3824, Val=956
2025-12-03 13:37:22,837 - INFO -   Train label dist: {0: 3276, 1: 548}
2025-12-03 13:37:22,837 - INFO -   Train lang dist: {'es': 1352, 'en': 1255, 'it': 1217}
2025-12-03 13:37:22,841 - INFO - Fold 1: Train=3824, Val=957
2025-12-03 13:37:22,841 - INFO -   Train label dist: {0: 3277, 1: 547}
2025-12-03 13:37:22,842 - INFO -   Train lang dist: {'es': 1352, 'en': 1255, 'it': 1217}
2025-12-03 13:37:22,846 - INFO - Fold 2: Train=3824, Val=957
2025-12-03 13:37:22,846 - INFO -   Train label dist: {0: 3277, 1: 547}
2025-12-03 13:37:22,846 - INFO -   Train lang dist: {'es': 1352, 'en': 1256, 'it': 1216}
2025-12-03 13:37:22,850 - INFO - Fold 3: Train=3824, Val=957
2025-12-03 13:37:22,851 - INFO -   Train label dist: {0: 3278, 1: 546}
2025-12-03 13:37:22,851 - INFO -   Train lang dist: {'es': 1352, 'en': 1255, 'it': 1217}
2025-12-03 13:37:22,855 - INFO - Fold 4: Train=3824, 


FOLD 0:
  Train: 548 positive samples
  Val:   137 positive samples


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-12-03 13:37:26,256 - INFO - Froze: Embeddings + First 3 Encoder Layers
2025-12-03 13:37:26,256 - INFO - Trainable: Classification Head + Remaining Encoder Layers
2025-12-03 13:37:26,257 - INFO - Trainable parameters: 64,382,978 / 278,045,186 (23.16%)
2025-12-03 13:37:26,258 - INFO - Label weights: {0: 0.5836385836385837, 1: 3.489051094890511}
2025-12-03 13:37:26,258 - INFO - Language weights: {'es': 0.9409641154628656, 'en': 1.0136920192078043, 'it': 1.0453438653293299}
2025-12-03 13:37:26,258 - INFO - Pos weight (for BCE): 5.9781
2025-12-03 13:37:26,259 - INFO - 
Epoch 1/10
Training: 100%|██████████|


FOLD 1:
  Train: 547 positive samples
  Val:   138 positive samples


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-12-03 13:39:58,533 - INFO - Froze: Embeddings + First 3 Encoder Layers
2025-12-03 13:39:58,533 - INFO - Trainable: Classification Head + Remaining Encoder Layers
2025-12-03 13:39:58,534 - INFO - Trainable parameters: 64,382,978 / 278,045,186 (23.16%)
2025-12-03 13:39:58,535 - INFO - Label weights: {0: 0.5834604821483064, 1: 3.495429616087751}
2025-12-03 13:39:58,535 - INFO - Language weights: {'es': 0.9409641154628656, 'en': 1.0136920192078043, 'it': 1.0453438653293299}
2025-12-03 13:39:58,535 - INFO - Pos weight (for BCE): 5.9909
2025-12-03 13:39:58,537 - INFO - 
Epoch 1/10
Training: 100%|██████████|


FOLD 2:
  Train: 547 positive samples
  Val:   138 positive samples


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-12-03 13:40:59,413 - INFO - Froze: Embeddings + First 3 Encoder Layers
2025-12-03 13:40:59,414 - INFO - Trainable: Classification Head + Remaining Encoder Layers
2025-12-03 13:40:59,414 - INFO - Trainable parameters: 64,382,978 / 278,045,186 (23.16%)
2025-12-03 13:40:59,415 - INFO - Label weights: {0: 0.5834604821483064, 1: 3.495429616087751}
2025-12-03 13:40:59,415 - INFO - Language weights: {'es': 0.9409476243674836, 'en': 1.0128671880134061, 'it': 1.0461851876191102}
2025-12-03 13:40:59,416 - INFO - Pos weight (for BCE): 5.9909
2025-12-03 13:40:59,417 - INFO - 
Epoch 1/10
Training: 100%|██████████|


FOLD 3:
  Train: 546 positive samples
  Val:   138 positive samples


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-12-03 13:42:00,453 - INFO - Froze: Embeddings + First 3 Encoder Layers
2025-12-03 13:42:00,453 - INFO - Trainable: Classification Head + Remaining Encoder Layers
2025-12-03 13:42:00,454 - INFO - Trainable parameters: 64,382,978 / 278,045,186 (23.16%)
2025-12-03 13:42:00,455 - INFO - Label weights: {0: 0.5832824893227577, 1: 3.501831501831502}
2025-12-03 13:42:00,455 - INFO - Language weights: {'es': 0.9409641154628656, 'en': 1.0136920192078043, 'it': 1.0453438653293299}
2025-12-03 13:42:00,455 - INFO - Pos weight (for BCE): 6.0037
2025-12-03 13:42:00,457 - INFO - 
Epoch 1/10
Training: 100%|██████████|


FOLD 4:
  Train: 547 positive samples
  Val:   138 positive samples


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-12-03 13:43:01,543 - INFO - Froze: Embeddings + First 3 Encoder Layers
2025-12-03 13:43:01,543 - INFO - Trainable: Classification Head + Remaining Encoder Layers
2025-12-03 13:43:01,544 - INFO - Trainable parameters: 64,382,978 / 278,045,186 (23.16%)
2025-12-03 13:43:01,545 - INFO - Label weights: {0: 0.5834604821483064, 1: 3.495429616087751}
2025-12-03 13:43:01,545 - INFO - Language weights: {'es': 0.9416953224870754, 'en': 1.0129222776114961, 'it': 1.045382399901429}
2025-12-03 13:43:01,545 - INFO - Pos weight (for BCE): 5.9909
2025-12-03 13:43:01,546 - INFO - 
Epoch 1/10
Training: 100%|██████████| 

### Visualizing Fine-Tune Metrics

In [9]:
# Per-fold, per-epoch metrics table used by every plot below.
# NOTE(review): presumably written by the fine-tuning run into Config.RESULTS_DIR — confirm.
training_results_csv = '../results/roberta-fine-tune/augumented/training_results.csv'
df = pd.read_csv(training_results_csv)

In [10]:
# Overall macro-F1 against epoch, one coloured line per fold.
f1_svg_path = os.path.join(figures_root, 'f1_vs_fold_augmented.svg')

f1_x = alt.X('epoch:Q', title='Epoch')
f1_y = alt.Y('overall_macro_f1:Q', title='Macro F1 Score', scale=alt.Scale(domain=[0.3, 0.8]))
f1_tooltip = ['fold:N', 'epoch:Q', alt.Tooltip('overall_macro_f1:Q', format='.4f')]

f1_plot = (
    alt.Chart(df)
    .mark_line(point=True, size=3)
    .encode(
        x=f1_x,
        y=f1_y,
        color=alt.Color('fold:N', title='Fold'),
        tooltip=f1_tooltip,
    )
    .properties(width=600, height=300, title='Overall F1 Score by Fold')
)

# Write the SVG, then show where it was written as the cell output.
f1_plot.save(f1_svg_path)
f1_svg_path

'../figures/f1_vs_fold_augmented.svg'

![](../figures/f1_vs_fold_augmented.svg)

In [11]:
# Reshape to long format: one row per (fold, epoch, loss_type).
loss_columns = ['train_loss', 'val_loss']
loss_data = df[['fold', 'epoch'] + loss_columns].melt(
    id_vars=['fold', 'epoch'],
    var_name='loss_type',
    value_name='loss',
)

# Colour separates train vs validation loss; dash pattern separates folds.
loss_svg_path = os.path.join(figures_root, 'loss_augmented.svg')
loss_tooltip = ['fold:N', 'epoch:Q', 'loss_type:N', alt.Tooltip('loss:Q', format='.4f')]

loss_plot = (
    alt.Chart(loss_data)
    .mark_line(point=True)
    .encode(
        x='epoch:Q',
        y='loss:Q',
        color='loss_type:N',
        strokeDash='fold:N',
        tooltip=loss_tooltip,
    )
    .properties(width=600, height=300, title='Training & Validation Loss')
)

loss_plot.save(loss_svg_path)
loss_svg_path

'../figures/loss_augmented.svg'

![](../figures/loss_augmented.svg)

In [12]:
# Long-format per-language macro-F1, with the column suffix stripped and the
# language code upper-cased (e.g. 'en_macro_f1' -> 'EN').
lang_f1_columns = ['en_macro_f1', 'es_macro_f1', 'it_macro_f1']
lang_data = (
    df[['fold', 'epoch'] + lang_f1_columns]
    .melt(id_vars=['fold', 'epoch'], var_name='language', value_name='f1')
    .assign(language=lambda frame: frame['language'].str.replace('_macro_f1', '').str.upper())
)

# Colour separates languages; dash pattern separates folds.
lang_svg_path = os.path.join(figures_root, 'f1_vs_lang_augmented.svg')
lang_tooltip = ['fold:N', 'epoch:Q', 'language:N', alt.Tooltip('f1:Q', format='.4f')]

lang_plot = (
    alt.Chart(lang_data)
    .mark_line(point=True)
    .encode(
        x='epoch:Q',
        y=alt.Y('f1:Q', scale=alt.Scale(domain=[0.4, 0.95])),
        color='language:N',
        strokeDash='fold:N',
        tooltip=lang_tooltip,
    )
    .properties(width=600, height=300, title='F1 by Language')
)

lang_plot.save(lang_svg_path)
lang_svg_path

'../figures/f1_vs_lang_augmented.svg'

![](../figures/f1_vs_lang_augmented.svg)

In [13]:
# Long-format overall precision / recall / F1: one row per (fold, epoch, metric).
metrics_data = df[['fold', 'epoch', 'overall_macro_precision', 'overall_macro_recall', 'overall_macro_f1']].melt(
    id_vars=['fold', 'epoch'], var_name='metric', value_name='value'
)
metrics_data['metric'] = metrics_data['metric'].str.replace('overall_macro_', '').str.capitalize()
metrics_data['fold_epoch'] = 'F' + metrics_data['fold'].astype(str) + ':E' + metrics_data['epoch'].astype(str)

# The fold_epoch labels are strings, so the default ordinal sort is lexical and
# would place 'F0:E10' between 'F0:E1' and 'F0:E2'. Derive the column order from
# the numeric fold/epoch values and hand it to the x channel explicitly.
fold_epoch_order = (
    metrics_data.sort_values(['fold', 'epoch'])['fold_epoch'].drop_duplicates().tolist()
)

heatmap = alt.Chart(metrics_data).mark_rect().encode(
    x=alt.X('fold_epoch:O', sort=fold_epoch_order), y='metric:N',
    color=alt.Color('value:Q', scale=alt.Scale(scheme='viridis')),
    tooltip=['fold_epoch:N', 'metric:N', alt.Tooltip('value:Q', format='.4f')]
).properties(width=700, height=150, title='Precision/Recall/F1 Heatmap')

heatmap.save(os.path.join(figures_root, 'fold_vs_epoch_augmented.svg'))
os.path.join(figures_root, 'fold_vs_epoch_augmented.svg')

'../figures/fold_vs_epoch_augmented.svg'

![](../figures/fold_vs_epoch_augmented.svg)

# Inference

In [1]:
import os
import torch
import numpy as np
import pandas as pd
from typing import List, Dict
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm

# Import your training data
from src.baseline.baseline import train_df

✓ All random seeds set to 42
training files: ['train_en.csv', 'train_it.csv', 'train_es.csv']
Total training samples: 2988
CLASS DISTRIBUTION

Overall:
  Class 0 (NOT_RECLAMATORY): 2560 (85.7%)
  Class 1 (RECLAMATORY): 428 (14.3%)
  Total: 2988

Per Language:
  EN: Class 0=938, Class 1=88, Total=1026
  ES: Class 0=743, Class 1=133, Total=876
  IT: Class 0=879, Class 1=207, Total=1086




In [2]:
class InferenceConfig:
    """Static settings read by the inference cells below."""

    # Execution device: CUDA when available, otherwise CPU.
    DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

    # Pretrained backbone and size of the classification output.
    MODEL_NAME = "cardiffnlp/twitter-xlm-roberta-base"
    NUM_LABELS = 2

    # Fine-tuned weights to load on top of the backbone.
    CHECKPOINT_PATH = "../fine_tuned_models/checkpoints/fold_0_epoch_8_f1_0.6913.pt"

    # Tokenisation length and DataLoader batch size.
    MAX_LENGTH = 128
    BATCH_SIZE = 32

In [3]:
class TweetDataset(Dataset):
    """Label-free dataset that tokenizes one text per item lookup.

    Args:
        texts: Indexable collection of texts; each item is passed through
            ``str()`` before tokenization.
        tokenizer: Callable invoked with the text plus ``truncation``,
            ``padding``, ``max_length`` and ``return_tensors`` keywords.
        max_length: Length every item is truncated / padded to.
    """

    def __init__(self, texts: List[str], tokenizer, max_length: int = 128):
        self.max_length = max_length
        self.tokenizer = tokenizer
        self.texts = texts

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoded = self.tokenizer(
            str(self.texts[idx]),
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )
        # squeeze(0) drops the leading dimension of each returned field
        # (presumably a batch axis of size 1 — TODO confirm for the tokenizer in use).
        return {
            field: encoded[field].squeeze(0)
            for field in ("input_ids", "attention_mask")
        }

In [4]:
def _macro_scores(y_true, y_pred) -> Dict:
    """Return macro precision, recall and F1 for one set of labels/predictions."""
    return {
        "macro_precision": precision_score(y_true, y_pred, average="macro", zero_division=0),
        "macro_recall": recall_score(y_true, y_pred, average="macro", zero_division=0),
        "macro_f1": f1_score(y_true, y_pred, average="macro", zero_division=0),
    }


def calculate_metrics(preds: List[int], labels: List[int], languages: List[str]) -> Dict:
    """Compute macro precision/recall/F1 overall and per language.

    Args:
        preds: Predicted class ids, aligned with ``labels``.
        labels: Ground-truth class ids.
        languages: Language tag of each sample, aligned with ``preds``.

    Returns:
        A dict with an ``"overall"`` entry plus one entry per distinct
        language, each holding ``macro_precision``, ``macro_recall`` and
        ``macro_f1``. Language entries appear in order of first occurrence
        in ``languages``, so the key order is the same on every run.
    """
    metrics = {"overall": _macro_scores(labels, preds)}

    # Convert once instead of once per language.
    preds_arr = np.array(preds)
    labels_arr = np.array(labels)

    # dict.fromkeys de-duplicates like set() but keeps first-seen order.
    for lang in dict.fromkeys(languages):
        lang_mask = np.array([l == lang for l in languages], dtype=bool)

        # A tag that never compares equal to itself (e.g. NaN) selects nothing.
        if not lang_mask.any():
            continue

        metrics[lang] = _macro_scores(labels_arr[lang_mask], preds_arr[lang_mask])

    return metrics

In [10]:
def _extract_state_dict(checkpoint):
    """Return the model weights from a loaded checkpoint object.

    A dict that wraps the weights under "model_state_dict" or "state_dict" is
    unwrapped (in that priority order); anything else is assumed to already
    be the state dict.
    """
    if isinstance(checkpoint, dict):
        for wrapper_key in ("model_state_dict", "state_dict"):
            if wrapper_key in checkpoint:
                return checkpoint[wrapper_key]
    return checkpoint


def _print_metrics_report(metrics: Dict) -> None:
    """Print the overall block followed by one block per language (sorted)."""
    print("\n" + "=" * 80)
    print("INFERENCE RESULTS ON TRAIN DATA")
    print("=" * 80)

    print("\nOverall Metrics:")
    print(f"  Macro Precision: {metrics['overall']['macro_precision']:.4f}")
    print(f"  Macro Recall:    {metrics['overall']['macro_recall']:.4f}")
    print(f"  Macro F1:        {metrics['overall']['macro_f1']:.4f}")

    print("\nPer-Language Metrics:")
    for lang in sorted(k for k in metrics if k != "overall"):
        print(f"  {lang.upper()}:")
        print(f"    Precision: {metrics[lang]['macro_precision']:.4f}")
        print(f"    Recall:    {metrics[lang]['macro_recall']:.4f}")
        print(f"    F1:        {metrics[lang]['macro_f1']:.4f}")


def run_inference(df: pd.DataFrame, config: InferenceConfig) -> Dict:
    """Score ``df`` with a fine-tuned checkpoint and print macro metrics.

    Args:
        df: Frame with "text", "label" and "lang" columns.
        config: Object exposing MODEL_NAME, NUM_LABELS, CHECKPOINT_PATH,
            MAX_LENGTH, BATCH_SIZE and DEVICE.

    Returns:
        Dict with "predictions" (argmax class ids), "probabilities" (softmax
        rows), "labels", "languages" and "metrics".
    """
    print(f"Running inference on {len(df)} samples...")
    print(f"Device: {config.DEVICE}")

    # Load tokenizer and the base model with a freshly initialised head
    tokenizer = AutoTokenizer.from_pretrained(config.MODEL_NAME)
    model = AutoModelForSequenceClassification.from_pretrained(
        config.MODEL_NAME,
        num_labels=config.NUM_LABELS
    )

    # SECURITY NOTE: torch.load unpickles the file, so only load checkpoints
    # from a trusted source (consider weights_only=True where the installed
    # torch version and the checkpoint contents allow it).
    checkpoint = torch.load(config.CHECKPOINT_PATH, map_location=config.DEVICE)
    state_dict = _extract_state_dict(checkpoint)

    # strict=False tolerates key mismatches, which would otherwise leave
    # layers at their random initialisation without any hint. Surface them.
    load_report = model.load_state_dict(state_dict, strict=False)
    print(f"Loaded checkpoint from: {config.CHECKPOINT_PATH}")
    for kind, keys in (
        ("missing", load_report.missing_keys),
        ("unexpected", load_report.unexpected_keys),
    ):
        if keys:
            print(f"  WARNING: {len(keys)} {kind} key(s) while loading weights, e.g. {list(keys)[:5]}")

    # Print checkpoint metadata when the checkpoint carries it
    if isinstance(checkpoint, dict):
        for key in ["fold", "epoch", "f1_score"]:
            if key in checkpoint:
                print(f"  {key.capitalize()}: {checkpoint[key]}")

    model.to(config.DEVICE)
    model.eval()

    # Prepare dataset and dataloader (shuffle=False keeps predictions aligned
    # with the label and language lists below)
    texts = df["text"].tolist()
    labels = df["label"].tolist()
    languages = df["lang"].tolist()

    dataset = TweetDataset(texts, tokenizer, config.MAX_LENGTH)
    dataloader = DataLoader(dataset, batch_size=config.BATCH_SIZE, shuffle=False)

    # Run inference
    all_preds = []
    all_probs = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Inference"):
            input_ids = batch["input_ids"].to(config.DEVICE)
            attention_mask = batch["attention_mask"].to(config.DEVICE)

            logits = model(input_ids=input_ids, attention_mask=attention_mask).logits

            # Class probabilities and hard predictions from the same logits
            probs = torch.softmax(logits, dim=-1)
            preds = torch.argmax(logits, dim=-1)

            all_probs.extend(probs.cpu().numpy().tolist())
            all_preds.extend(preds.cpu().numpy().tolist())

    # Calculate and report metrics
    metrics = calculate_metrics(all_preds, labels, languages)
    _print_metrics_report(metrics)

    return {
        "predictions": all_preds,
        "probabilities": all_probs,
        "labels": labels,
        "languages": languages,
        "metrics": metrics
    }

In [11]:
# Score the original training frame with the checkpoint named in InferenceConfig.
# NOTE(review): train_df is training data, so these are not held-out scores.
results = run_inference(train_df, InferenceConfig)

Running inference on 2988 samples...
Device: cuda


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loaded checkpoint from: ../fine_tuned_models/checkpoints/fold_0_epoch_8_f1_0.6913.pt


Inference: 100%|██████████| 94/94 [00:02<00:00, 37.82it/s]



INFERENCE RESULTS ON TRAIN DATA

Overall Metrics:
  Macro Precision: 0.6256
  Macro Recall:    0.6417
  Macro F1:        0.6326

Per-Language Metrics:
  EN:
    Precision: 0.5329
    Recall:    0.5842
    F1:        0.5134
  ES:
    Precision: 0.6827
    Recall:    0.6815
    F1:        0.6821
  IT:
    Precision: 0.9160
    Recall:    0.6819
    F1:        0.7302
