In [1]:
import os
import torch
import pandas as pd
import altair as alt

from src.baseline.baseline import train_df, figures_root
from src.finetune.finetuner import main
from src.baseline.utils import calculate_class_distribution

✓ All random seeds set to 42
training files: ['train_es.csv', 'train_en.csv', 'train_it.csv']
Total training samples: 2988
CLASS DISTRIBUTION

Overall:
  Class 0 (NOT_RECLAMATORY): 2560 (85.7%)
  Class 1 (RECLAMATORY): 428 (14.3%)
  Total: 2988

Per Language:
  EN: Class 0=938, Class 1=88, Total=1026
  ES: Class 0=743, Class 1=133, Total=876
  IT: Class 0=879, Class 1=207, Total=1086




In [2]:
# `original_data` is an alias of the training frame imported from the baseline
# module (no copy is made); the augmented tweets are read from disk.
original_data = train_df
augmented_data = pd.read_csv("../data/augmented_multilingual_tweets.csv")

# Show both shapes side by side as a quick sanity check.
print(*(frame.shape for frame in (original_data, augmented_data)))

(2988, 5) (5976, 8)


In [3]:
# Stack the augmented rows under the original ones, keeping only the columns
# the original frame has and renumbering the index from 0.
# NOTE(review): if the augmented rows are translations/paraphrases of the
# original tweets, confirm that the CV split inside `main` keeps an original
# and its augmented copies in the same fold; otherwise validation scores may
# be inflated by leakage. TODO confirm.
shared_columns = list(original_data.columns)
merged_data = pd.concat(
    [original_data, augmented_data[shared_columns]],
    ignore_index=True,
)
print(merged_data.shape)

(8964, 5)


In [4]:
# Print the overall and per-language class counts of the merged frame.
calculate_class_distribution(merged_data)

CLASS DISTRIBUTION

Overall:
  Class 0 (NOT_RECLAMATORY): 7680 (85.7%)
  Class 1 (RECLAMATORY): 1284 (14.3%)
  Total: 8964

Per Language:
  EN: Class 0=2560, Class 1=428, Total=2988
  ES: Class 0=2560, Class 1=428, Total=2988
  IT: Class 0=2560, Class 1=428, Total=2988




# Fine-Tuning on Merged Data

In [5]:
class Config:
    """Configuration for fine-tuning.

    Settings are plain class attributes; the class object itself is passed to
    ``main``.  This is the single definition for the notebook: the cell used
    to contain two copies that differed only in ``WARMUP_RATIO``, and the
    later copy silently replaced the earlier one.  The values below are the
    ones that were actually in effect.
    """

    # Model configuration
    MODEL_NAME = "cardiffnlp/twitter-xlm-roberta-base"  # Base model
    NUM_LABELS = 2  # Binary classification
    MAX_LENGTH = 128  # Maximum sequence length
    NUM_FROZEN_LAYERS = 3  # Number of initial layers to freeze (0 = only train classification head)

    # Training configuration
    LEARNING_RATE = 2e-5
    WEIGHT_DECAY = 0.01
    NUM_EPOCHS = 10
    BATCH_SIZE = 8
    GRADIENT_ACCUMULATION_STEPS = 2
    WARMUP_RATIO = 0.1  # Warmup as % of total steps (value of the definition that took effect)

    # Early stopping
    PATIENCE = 3
    EVAL_STRATEGY = "epoch"  # Evaluate at end of each epoch

    # Cross-validation
    N_SPLITS = 5
    TRAIN_RATIO = 0.8  # 80% for training from each fold
    VAL_RATIO = 0.2  # 20% for validation from each fold

    # Dynamic undersampling
    DYNAMIC_UNDERSAMPLE = False  # Balance classes per epoch

    # Model saving
    MAX_MODELS_TO_SAVE = 2
    OUTPUT_DIR = "../fine_tuned_models"
    RESULTS_DIR = "../results/roberta-fine-tune/original_and_augumented/"

    # Device: chosen once, when the class body executes (CUDA if available, else CPU).
    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Training would be start on the device: {DEVICE}")

Training would be start on the device: cuda
Training would be start on the device: cuda


In [6]:
# Run the fine-tuning pipeline on the merged data with the settings in Config.
# NOTE(review): this is the long-running cell; the recorded log shows each
# fold taking several minutes.
main(merged_data, Config)

2025-11-13 10:10:58,497 - INFO - Starting Fine-tuning Pipeline
2025-11-13 10:10:58,508 - INFO - Fold 0: Train=5736, Val=1435
2025-11-13 10:10:58,508 - INFO -   Train label dist: {0: 4914, 1: 822}
2025-11-13 10:10:58,508 - INFO -   Train lang dist: {'en': 1912, 'es': 1912, 'it': 1912}
2025-11-13 10:10:58,513 - INFO - Fold 1: Train=5736, Val=1435
2025-11-13 10:10:58,514 - INFO -   Train label dist: {0: 4915, 1: 821}
2025-11-13 10:10:58,514 - INFO -   Train lang dist: {'es': 1912, 'it': 1912, 'en': 1912}
2025-11-13 10:10:58,520 - INFO - Fold 2: Train=5736, Val=1435
2025-11-13 10:10:58,520 - INFO -   Train label dist: {0: 4915, 1: 821}
2025-11-13 10:10:58,520 - INFO -   Train lang dist: {'es': 1912, 'en': 1912, 'it': 1912}
2025-11-13 10:10:58,525 - INFO - Fold 3: Train=5736, Val=1435
2025-11-13 10:10:58,526 - INFO -   Train label dist: {0: 4916, 1: 820}
2025-11-13 10:10:58,526 - INFO -   Train lang dist: {'es': 1912, 'en': 1912, 'it': 1912}
2025-11-13 10:10:58,531 - INFO - Fold 4: Train=57


FOLD 0:
  Train: 822 positive samples
  Val:   205 positive samples


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-11-13 10:11:02,102 - INFO - Froze: Embeddings + First 3 Encoder Layers
2025-11-13 10:11:02,103 - INFO - Trainable: Classification Head + Remaining Encoder Layers
2025-11-13 10:11:02,104 - INFO - Trainable parameters: 64,382,978 / 278,045,186 (23.16%)
2025-11-13 10:11:02,105 - INFO - Label weights: {0: 0.5836385836385837, 1: 3.489051094890511}
2025-11-13 10:11:02,105 - INFO - Language weights: {'en': 1.0, 'es': 1.0, 'it': 1.0}
2025-11-13 10:11:02,105 - INFO - Pos weight (for BCE): 5.9781
2025-11-13 10:11:02,106 - INFO - 
Epoch 1/10
Training: 100%|██████████| 717/717 [00:41<00:00, 17.26it/s, loss=0.135]


FOLD 1:
  Train: 821 positive samples
  Val:   206 positive samples


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-11-13 10:13:33,203 - INFO - Froze: Embeddings + First 3 Encoder Layers
2025-11-13 10:13:33,203 - INFO - Trainable: Classification Head + Remaining Encoder Layers
2025-11-13 10:13:33,204 - INFO - Trainable parameters: 64,382,978 / 278,045,186 (23.16%)
2025-11-13 10:13:33,205 - INFO - Label weights: {0: 0.5835198372329603, 1: 3.4933008526187574}
2025-11-13 10:13:33,205 - INFO - Language weights: {'es': 1.0, 'it': 1.0, 'en': 1.0}
2025-11-13 10:13:33,205 - INFO - Pos weight (for BCE): 5.9866
2025-11-13 10:13:33,207 - INFO - 
Epoch 1/10
Training: 100%|██████████| 717/717 [00:19<00:00, 36.12it/s, loss=0.134


FOLD 2:
  Train: 821 positive samples
  Val:   206 positive samples


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-11-13 10:17:18,014 - INFO - Froze: Embeddings + First 3 Encoder Layers
2025-11-13 10:17:18,014 - INFO - Trainable: Classification Head + Remaining Encoder Layers
2025-11-13 10:17:18,015 - INFO - Trainable parameters: 64,382,978 / 278,045,186 (23.16%)
2025-11-13 10:17:18,016 - INFO - Label weights: {0: 0.5835198372329603, 1: 3.4933008526187574}
2025-11-13 10:17:18,016 - INFO - Language weights: {'es': 1.0, 'en': 1.0, 'it': 1.0}
2025-11-13 10:17:18,016 - INFO - Pos weight (for BCE): 5.9866
2025-11-13 10:17:18,018 - INFO - 
Epoch 1/10
Training: 100%|██████████| 717/717 [00:20<00:00, 35.74it/s, loss=0.133


FOLD 3:
  Train: 820 positive samples
  Val:   207 positive samples


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-11-13 10:21:02,090 - INFO - Froze: Embeddings + First 3 Encoder Layers
2025-11-13 10:21:02,091 - INFO - Trainable: Classification Head + Remaining Encoder Layers
2025-11-13 10:21:02,091 - INFO - Trainable parameters: 64,382,978 / 278,045,186 (23.16%)
2025-11-13 10:21:02,092 - INFO - Label weights: {0: 0.5834011391375101, 1: 3.497560975609756}
2025-11-13 10:21:02,092 - INFO - Language weights: {'es': 1.0, 'en': 1.0, 'it': 1.0}
2025-11-13 10:21:02,093 - INFO - Pos weight (for BCE): 5.9951
2025-11-13 10:21:02,094 - INFO - 
Epoch 1/10
Training: 100%|██████████| 717/717 [00:20<00:00, 35.70it/s, loss=0.135]


FOLD 4:
  Train: 821 positive samples
  Val:   207 positive samples


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-11-13 10:28:35,664 - INFO - Froze: Embeddings + First 3 Encoder Layers
2025-11-13 10:28:35,665 - INFO - Trainable: Classification Head + Remaining Encoder Layers
2025-11-13 10:28:35,665 - INFO - Trainable parameters: 64,382,978 / 278,045,186 (23.16%)
2025-11-13 10:28:35,666 - INFO - Label weights: {0: 0.5835028478437754, 1: 3.4939098660170522}
2025-11-13 10:28:35,666 - INFO - Language weights: {'it': 0.9996514464970374, 'es': 1.0001742767514814, 'en': 1.0001742767514814}
2025-11-13 10:28:35,667 - INFO - Pos weight (for BCE): 5.9878
2025-11-13 10:28:35,668 - INFO - 
Epoch 1/10
Training: 100%|██████████

## Visualizing Fine-Tuning Metrics

In [7]:
# Load the per-epoch training metrics. The location is derived from
# Config.RESULTS_DIR instead of repeating the literal path, so the two cannot
# drift apart (the resulting path is the same as the previously hardcoded one).
# Presumably this is the file `main` writes for the run above; verify.
df = pd.read_csv(os.path.join(Config.RESULTS_DIR, 'training_results.csv'))

In [8]:
# Overall macro-F1 per epoch, one coloured line per fold; the y-axis domain is
# fixed to [0.3, 0.8].
f1_svg_path = os.path.join(figures_root, 'f1_vs_fold_augmented.svg')

f1_plot = (
    alt.Chart(df)
    .mark_line(point=True, size=3)
    .encode(
        x=alt.X('epoch:Q', title='Epoch'),
        y=alt.Y(
            'overall_macro_f1:Q',
            title='Macro F1 Score',
            scale=alt.Scale(domain=[0.3, 0.8]),
        ),
        color=alt.Color('fold:N', title='Fold'),
        tooltip=['fold:N', 'epoch:Q', alt.Tooltip('overall_macro_f1:Q', format='.4f')],
    )
    .properties(width=600, height=300, title='Overall F1 Score by Fold')
)

# Write the figure to disk, then echo its path as the cell output.
f1_plot.save(f1_svg_path)
f1_svg_path

'../figures/f1_vs_fold_augmented.svg'

![](../figures/f1_vs_fold_augmented.svg)

In [9]:
# Reshape train/validation loss into long format: one row per
# (fold, epoch, loss_type).
loss_data = df.melt(
    id_vars=['fold', 'epoch'],
    value_vars=['train_loss', 'val_loss'],
    var_name='loss_type',
    value_name='loss',
)

# Colour separates train from validation loss; dash pattern separates folds.
loss_svg_path = os.path.join(figures_root, 'loss_augmented.svg')
loss_plot = (
    alt.Chart(loss_data)
    .mark_line(point=True)
    .encode(
        x='epoch:Q',
        y='loss:Q',
        color='loss_type:N',
        strokeDash='fold:N',
        tooltip=['fold:N', 'epoch:Q', 'loss_type:N', alt.Tooltip('loss:Q', format='.4f')],
    )
    .properties(width=600, height=300, title='Training & Validation Loss')
)

# Write the figure to disk, then echo its path as the cell output.
loss_plot.save(loss_svg_path)
loss_svg_path

'../figures/loss_augmented.svg'

![](../figures/loss_augmented.svg)

In [10]:
# Per-language macro-F1 in long format; the language label is derived from the
# source column name (e.g. 'en_macro_f1' -> 'EN').
lang_data = df.melt(
    id_vars=['fold', 'epoch'],
    value_vars=['en_macro_f1', 'es_macro_f1', 'it_macro_f1'],
    var_name='language',
    value_name='f1',
).assign(
    language=lambda frame: frame['language'].str.replace('_macro_f1', '').str.upper()
)

# Colour separates languages; dash pattern separates folds. The y-axis domain
# is fixed to [0.4, 0.95].
lang_svg_path = os.path.join(figures_root, 'f1_vs_lang_augmented.svg')
lang_plot = (
    alt.Chart(lang_data)
    .mark_line(point=True)
    .encode(
        x='epoch:Q',
        y=alt.Y('f1:Q', scale=alt.Scale(domain=[0.4, 0.95])),
        color='language:N',
        strokeDash='fold:N',
        tooltip=['fold:N', 'epoch:Q', 'language:N', alt.Tooltip('f1:Q', format='.4f')],
    )
    .properties(width=600, height=300, title='F1 by Language')
)

# Write the figure to disk, then echo its path as the cell output.
lang_plot.save(lang_svg_path)
lang_svg_path

'../figures/f1_vs_lang_augmented.svg'

![](../figures/f1_vs_lang_augmented.svg)

In [11]:
# Long-format table of the overall macro precision/recall/F1 per (fold, epoch).
metrics_data = df[['fold', 'epoch', 'overall_macro_precision', 'overall_macro_recall', 'overall_macro_f1']].melt(
    id_vars=['fold', 'epoch'], var_name='metric', value_name='value'
)
# 'overall_macro_precision' -> 'Precision', etc.
metrics_data['metric'] = metrics_data['metric'].str.replace('overall_macro_', '').str.capitalize()
# One x-axis label per (fold, epoch) pair, e.g. 'F0:E1'.
metrics_data['fold_epoch'] = 'F' + metrics_data['fold'].astype(str) + ':E' + metrics_data['epoch'].astype(str)

# The labels are strings, so the default ordinal sort would be lexicographic
# and place 'F0:E10' before 'F0:E2'. Pass the labels explicitly in numeric
# (fold, epoch) order so the columns read chronologically within each fold.
fold_epoch_order = (
    metrics_data.sort_values(['fold', 'epoch'])['fold_epoch'].drop_duplicates().tolist()
)

heatmap = alt.Chart(metrics_data).mark_rect().encode(
    x=alt.X('fold_epoch:O', sort=fold_epoch_order), y='metric:N',
    color=alt.Color('value:Q', scale=alt.Scale(scheme='viridis')),
    tooltip=['fold_epoch:N', 'metric:N', alt.Tooltip('value:Q', format='.4f')]
).properties(width=700, height=150, title='Precision/Recall/F1 Heatmap')

# Write the figure to disk, then echo its path as the cell output.
heatmap.save(os.path.join(figures_root, 'fold_vs_epoch_augmented.svg'))
os.path.join(figures_root, 'fold_vs_epoch_augmented.svg')

'../figures/fold_vs_epoch_augmented.svg'

![](../figures/fold_vs_epoch_augmented.svg)