In [1]:
import os
import torch
import pandas as pd
import altair as alt

from src.baseline.baseline import train_df, figures_root
from src.finetune.finetuner import main

✓ All random seeds set to 42
training files: ['train_es.csv', 'train_en.csv', 'train_it.csv']
Total training samples: 2988
CLASS DISTRIBUTION

Overall:
  Class 0 (NOT_RECLAMATORY): 2560 (85.7%)
  Class 1 (RECLAMATORY): 428 (14.3%)
  Total: 2988

Per Language:
  EN: Class 0=938, Class 1=88, Total=1026
  ES: Class 0=743, Class 1=133, Total=876
  IT: Class 0=879, Class 1=207, Total=1086




# Fine-Tuning

In [2]:
class Config:
    """Hyper-parameters, paths and device selection for the fine-tuning run.

    All settings are plain class attributes; the class object itself is
    passed to ``main`` rather than an instance. Defining the class prints
    the selected device as a side effect.
    """

    # --- Model ---
    MODEL_NAME = "cardiffnlp/twitter-xlm-roberta-base"  # pretrained checkpoint to start from
    NUM_LABELS = 2  # two output classes
    MAX_LENGTH = 128  # cap on sequence length
    # Count of leading layers kept frozen.
    # NOTE(review): the original note said "0 = only train classification head" — confirm against the finetuner.
    NUM_FROZEN_LAYERS = 6

    # --- Optimisation ---
    LEARNING_RATE = 5e-6
    WEIGHT_DECAY = 0.01
    NUM_EPOCHS = 10
    BATCH_SIZE = 4
    GRADIENT_ACCUMULATION_STEPS = 4
    WARMUP_RATIO = 0.05  # warmup expressed as a share of the total steps

    # --- Early stopping ---
    PATIENCE = 3
    EVAL_STRATEGY = "epoch"  # evaluation happens once each epoch finishes

    # --- Cross-validation ---
    N_SPLITS = 5
    TRAIN_RATIO = 0.8  # training share taken from each fold
    VAL_RATIO = 0.2  # validation share taken from each fold

    # --- Dynamic undersampling ---
    DYNAMIC_UNDERSAMPLE = True  # re-balance the classes every epoch

    # --- Checkpoints and results ---
    MAX_MODELS_TO_SAVE = 2
    OUTPUT_DIR = "../fine_tuned_models"
    RESULTS_DIR = "../results/roberta-fine-tune/"

    # --- Device: CUDA when torch reports it available, CPU otherwise ---
    DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    print(f"Training would be start on the device: {DEVICE}")

Training would be start on the device: cuda


In [3]:
# Run the fine-tuning pipeline on train_df, passing the Config class (not an instance) as the settings object.
# NOTE(review): presumably this call writes training_results.csv under Config.RESULTS_DIR,
# which the plotting cells read — confirm in src.finetune.finetuner.
main(train_df, Config)

2025-11-12 08:22:09,852 - INFO - Starting Fine-tuning Pipeline
2025-11-12 08:22:09,857 - INFO - Fold 0: Train=1912, Val=478
2025-11-12 08:22:09,858 - INFO -   Train label dist: {0: 1638, 1: 274}
2025-11-12 08:22:09,858 - INFO -   Train lang dist: {'it': 696, 'en': 656, 'es': 560}
2025-11-12 08:22:09,860 - INFO - Fold 1: Train=1912, Val=478
2025-11-12 08:22:09,861 - INFO -   Train label dist: {0: 1639, 1: 273}
2025-11-12 08:22:09,861 - INFO -   Train lang dist: {'it': 696, 'en': 656, 'es': 560}
2025-11-12 08:22:09,864 - INFO - Fold 2: Train=1912, Val=478
2025-11-12 08:22:09,864 - INFO -   Train label dist: {0: 1638, 1: 274}
2025-11-12 08:22:09,864 - INFO -   Train lang dist: {'it': 695, 'en': 657, 'es': 560}
2025-11-12 08:22:09,867 - INFO - Fold 3: Train=1912, Val=479
2025-11-12 08:22:09,867 - INFO -   Train label dist: {0: 1639, 1: 273}
2025-11-12 08:22:09,867 - INFO -   Train lang dist: {'it': 695, 'en': 657, 'es': 560}
2025-11-12 08:22:09,869 - INFO - Fold 4: Train=1912, Val=479
2025


FOLD 0:
  Train: 274 positive samples
  Val:   68 positive samples


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-11-12 08:22:13,133 - INFO - Froze: Embeddings + First 6 Encoder Layers
2025-11-12 08:22:13,133 - INFO - Trainable: Classification Head + Remaining Encoder Layers
2025-11-12 08:22:13,134 - INFO - Trainable parameters: 43,119,362 / 278,045,186 (15.51%)
2025-11-12 08:22:13,135 - INFO - Label weights: {0: 0.5836385836385837, 1: 3.489051094890511}
2025-11-12 08:22:13,135 - INFO - Language weights: {'it': 0.9080362792659776, 'en': 0.9634043450748787, 'es': 1.1285593756591437}
2025-11-12 08:22:13,135 - INFO - Pos weight (for BCE): 5.9781
2025-11-12 08:22:13,136 - INFO - 
Epoch 1/10
2025-11-12 08:22:13,137 - 


FOLD 1:
  Train: 273 positive samples
  Val:   69 positive samples


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-11-12 08:22:52,973 - INFO - Froze: Embeddings + First 6 Encoder Layers
2025-11-12 08:22:52,973 - INFO - Trainable: Classification Head + Remaining Encoder Layers
2025-11-12 08:22:52,974 - INFO - Trainable parameters: 43,119,362 / 278,045,186 (15.51%)
2025-11-12 08:22:52,975 - INFO - Label weights: {0: 0.5832824893227577, 1: 3.501831501831502}
2025-11-12 08:22:52,975 - INFO - Language weights: {'it': 0.9080362792659776, 'en': 0.9634043450748787, 'es': 1.1285593756591437}
2025-11-12 08:22:52,975 - INFO - Pos weight (for BCE): 6.0037
2025-11-12 08:22:52,976 - INFO - 
Epoch 1/10
2025-11-12 08:22:52,977 - 


FOLD 2:
  Train: 274 positive samples
  Val:   69 positive samples


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-11-12 08:23:32,714 - INFO - Froze: Embeddings + First 6 Encoder Layers
2025-11-12 08:23:32,715 - INFO - Trainable: Classification Head + Remaining Encoder Layers
2025-11-12 08:23:32,715 - INFO - Trainable parameters: 43,119,362 / 278,045,186 (15.51%)
2025-11-12 08:23:32,716 - INFO - Label weights: {0: 0.5836385836385837, 1: 3.489051094890511}
2025-11-12 08:23:32,717 - INFO - Language weights: {'it': 0.9093912592122662, 'en': 0.961989231586796, 'es': 1.1286195092009375}
2025-11-12 08:23:32,717 - INFO - Pos weight (for BCE): 5.9781
2025-11-12 08:23:32,718 - INFO - 
Epoch 1/10
2025-11-12 08:23:32,719 - I


FOLD 3:
  Train: 273 positive samples
  Val:   70 positive samples


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-11-12 08:24:11,552 - INFO - Froze: Embeddings + First 6 Encoder Layers
2025-11-12 08:24:11,553 - INFO - Trainable: Classification Head + Remaining Encoder Layers
2025-11-12 08:24:11,553 - INFO - Trainable parameters: 43,119,362 / 278,045,186 (15.51%)
2025-11-12 08:24:11,554 - INFO - Label weights: {0: 0.5832824893227577, 1: 3.501831501831502}
2025-11-12 08:24:11,554 - INFO - Language weights: {'it': 0.9093912592122662, 'en': 0.961989231586796, 'es': 1.1286195092009375}
2025-11-12 08:24:11,555 - INFO - Pos weight (for BCE): 6.0037
2025-11-12 08:24:11,556 - INFO - 
Epoch 1/10
2025-11-12 08:24:11,556 - I


FOLD 4:
  Train: 272 positive samples
  Val:   70 positive samples


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-11-12 08:24:50,622 - INFO - Froze: Embeddings + First 6 Encoder Layers
2025-11-12 08:24:50,622 - INFO - Trainable: Classification Head + Remaining Encoder Layers
2025-11-12 08:24:50,622 - INFO - Trainable parameters: 43,119,362 / 278,045,186 (15.51%)
2025-11-12 08:24:50,623 - INFO - Label weights: {0: 0.5829268292682926, 1: 3.514705882352941}
2025-11-12 08:24:50,623 - INFO - Language weights: {'it': 0.9093912592122662, 'en': 0.961989231586796, 'es': 1.1286195092009375}
2025-11-12 08:24:50,624 - INFO - Pos weight (for BCE): 6.0294
2025-11-12 08:24:50,625 - INFO - 
Epoch 1/10
2025-11-12 08:24:50,625 - I

# Visualizing Fine-Tune Metrics

In [4]:
# Metrics table read from the results directory; the charts below use its `fold` and `epoch` columns.
metrics_csv = '../results/roberta-fine-tune/training_results.csv'
df = pd.read_csv(metrics_csv)

In [10]:
# Overall macro-F1 against epoch, one coloured line per fold.
f1_svg_path = os.path.join(figures_root, 'f1_vs_fold.svg')

f1_encodings = dict(
    x=alt.X('epoch:Q', title='Epoch'),
    y=alt.Y('overall_macro_f1:Q', title='Macro F1 Score', scale=alt.Scale(domain=[0.3, 0.8])),
    color=alt.Color('fold:N', title='Fold'),
    tooltip=['fold:N', 'epoch:Q', alt.Tooltip('overall_macro_f1:Q', format='.4f')],
)
f1_plot = (
    alt.Chart(df)
    .mark_line(point=True, size=3)
    .encode(**f1_encodings)
    .properties(width=600, height=300, title='Overall F1 Score by Fold')
)

# Write the SVG, then leave the path as the cell's displayed value.
f1_plot.save(f1_svg_path)
f1_svg_path

'../figures/f1_vs_fold.svg'

![](../figures/f1_vs_fold.svg)

In [11]:
# Long-form loss table: one row per (fold, epoch, loss_type).
loss_key_columns = ['fold', 'epoch']
loss_value_columns = ['train_loss', 'val_loss']
loss_data = pd.melt(
    df[loss_key_columns + loss_value_columns],
    id_vars=loss_key_columns,
    var_name='loss_type',
    value_name='loss',
)

# Colour separates train from validation loss; dash pattern separates folds.
loss_svg_path = os.path.join(figures_root, 'loss.svg')
loss_plot = (
    alt.Chart(loss_data)
    .mark_line(point=True)
    .encode(
        x='epoch:Q',
        y='loss:Q',
        color='loss_type:N',
        strokeDash='fold:N',
        tooltip=['fold:N', 'epoch:Q', 'loss_type:N', alt.Tooltip('loss:Q', format='.4f')],
    )
    .properties(width=600, height=300, title='Training & Validation Loss')
)

# Write the SVG, then leave the path as the cell's displayed value.
loss_plot.save(loss_svg_path)
loss_svg_path

'../figures/loss.svg'

![](../figures/loss.svg)

In [15]:
# Long-form per-language macro-F1: one row per (fold, epoch, language),
# with the column suffix stripped and the language code upper-cased.
lang_f1_columns = ['en_macro_f1', 'es_macro_f1', 'it_macro_f1']
lang_data = (
    df[['fold', 'epoch'] + lang_f1_columns]
    .melt(id_vars=['fold', 'epoch'], var_name='language', value_name='f1')
    .assign(language=lambda frame: frame['language'].str.replace('_macro_f1', '').str.upper())
)

# Colour separates languages; dash pattern separates folds.
lang_svg_path = os.path.join(figures_root, 'f1_vs_lang.svg')
lang_plot = (
    alt.Chart(lang_data)
    .mark_line(point=True)
    .encode(
        x='epoch:Q',
        y=alt.Y('f1:Q', scale=alt.Scale(domain=[0.4, 0.95])),
        color='language:N',
        strokeDash='fold:N',
        tooltip=['fold:N', 'epoch:Q', 'language:N', alt.Tooltip('f1:Q', format='.4f')],
    )
    .properties(width=600, height=300, title='F1 by Language')
)

# Write the SVG, then leave the path as the cell's displayed value.
lang_plot.save(lang_svg_path)
lang_svg_path

'../figures/f1_vs_lang.svg'

![](../figures/f1_vs_lang.svg)

In [16]:
# Long-form table of the overall precision / recall / F1 columns:
# one row per (fold, epoch, metric), with the shared prefix stripped from the metric name.
metrics_data = df[['fold', 'epoch', 'overall_macro_precision', 'overall_macro_recall', 'overall_macro_f1']].melt(
    id_vars=['fold', 'epoch'], var_name='metric', value_name='value'
)
# regex=False: the prefix is a literal string, not a pattern.
metrics_data['metric'] = metrics_data['metric'].str.replace('overall_macro_', '', regex=False).str.capitalize()
metrics_data['fold_epoch'] = 'F' + metrics_data['fold'].astype(str) + ':E' + metrics_data['epoch'].astype(str)

# The x labels are strings, so the default ordinal sort is lexicographic and
# would place e.g. 'F0:E10' between 'F0:E1' and 'F0:E2'. Build the column order
# from the numeric (fold, epoch) pairs and pass it to the x channel explicitly.
fold_epoch_order = (
    metrics_data[['fold', 'epoch', 'fold_epoch']]
    .drop_duplicates()
    .sort_values(['fold', 'epoch'])['fold_epoch']
    .tolist()
)

heatmap = alt.Chart(metrics_data).mark_rect().encode(
    x=alt.X('fold_epoch:O', sort=fold_epoch_order), y='metric:N',
    color=alt.Color('value:Q', scale=alt.Scale(scheme='viridis')),
    tooltip=['fold_epoch:N', 'metric:N', alt.Tooltip('value:Q', format='.4f')]
).properties(width=700, height=150, title='Precision/Recall/F1 Heatmap')

# Write the SVG, then leave the path as the cell's displayed value.
heatmap.save(os.path.join(figures_root, 'fold_vs_epoch.svg'))
os.path.join(figures_root, 'fold_vs_epoch.svg')

'../figures/fold_vs_epoch.svg'

![](../figures/fold_vs_epoch.svg)