In [1]:
import os
os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8'

import json
import logging
from pathlib import Path
from typing import List, Tuple, Dict
import warnings
import numpy as np
import pandas as pd
import torch
from torch.optim import AdamW
from transformers import (
XLMRobertaTokenizer,
XLMRobertaForSequenceClassification,
get_linear_schedule_with_warmup,
AutoModelForSequenceClassification,
AutoTokenizer
)
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm

import optuna
from optuna.samplers import TPESampler
from optuna.pruners import MedianPruner
from sklearn.model_selection import StratifiedKFold
import altair as alt

from src.baseline.baseline import train_df, SEED
from src.finetune.dataloader import MultilingualDataset, StratifiedMultilingualSplitter, DynamicUndersamplingSampler
from src.finetune.train import train_epoch, validate
from src.finetune.utils import calculate_metrics, calculate_weights
from src.finetune.finetuner import run_train, run_inference

warnings.filterwarnings("ignore")

2025-12-04 19:43:50.589620: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-12-04 19:43:50.620227: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-12-04 19:43:51.216850: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


✓ All random seeds set to 42
training files: ['train_en.csv', 'train_it.csv', 'train_es.csv']
Total training samples: 2988
CLASS DISTRIBUTION

Overall:
  Class 0 (NOT_RECLAMATORY): 2560 (85.7%)
  Class 1 (RECLAMATORY): 428 (14.3%)
  Total: 2988

Per Language:
  EN: Class 0=938, Class 1=88, Total=1026
  ES: Class 0=743, Class 1=133, Total=876
  IT: Class 0=879, Class 1=207, Total=1086




# Model Configuration

In [2]:
RUN_1 = "cardiffnlp/twitter-xlm-roberta-base"
RUN_2 = "../fine_tuned_models_mlm/twitter-xlm-roberta-base/final_MLM_model/model"  # resultant model of 6_Finetune-LM-with-Optuna-for-MLM.ipynb

RUN = RUN_2

class BaseConfig:
    MODEL_NAME = RUN  
    NUM_LABELS = 2
    MAX_LENGTH = 128
    NUM_FROZEN_LAYERS = 10      
    LEARNING_RATE = 5e-5
    WEIGHT_DECAY = 0.01
    BATCH_SIZE = 8
    NUM_EPOCHS = 10
    GRADIENT_ACCUMULATION_STEPS = 2
    WARMUP_RATIO = 0.15
    PATIENCE = 3
    EVAL_STRATEGY = "epoch"
    N_SPLITS = 5
    TRAIN_RATIO = 0.8
    VAL_RATIO = 0.2
    DYNAMIC_UNDERSAMPLE = False
    MAX_MODELS_TO_SAVE = 2
    OUTPUT_DIR = "../fine_tuned_models/mlm"
    RESULTS_DIR = "../results/fine_tuned_mlm/"
    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Model Setup

In [3]:
def setup_model(Config, num_frozen_layers: int = None):
    if num_frozen_layers is None:
        num_frozen_layers = Config.NUM_FROZEN_LAYERS
    
    tokenizer = XLMRobertaTokenizer.from_pretrained(Config.MODEL_NAME)
    model = XLMRobertaForSequenceClassification.from_pretrained(
        Config.MODEL_NAME, num_labels=Config.NUM_LABELS
    )
    
    model = model.to(Config.DEVICE)
    
    torch.nn.init.xavier_uniform_(model.classifier.dense.weight)
    torch.nn.init.zeros_(model.classifier.dense.bias)
    torch.nn.init.xavier_uniform_(model.classifier.out_proj.weight)
    torch.nn.init.zeros_(model.classifier.out_proj.bias)
    
    for param in model.roberta.embeddings.parameters():
        param.requires_grad = False
    
    num_total_layers = len(model.roberta.encoder.layer)
    for idx in range(min(num_frozen_layers, num_total_layers)):
        for param in model.roberta.encoder.layer[idx].parameters():
            param.requires_grad = False
    
    if num_frozen_layers >= num_total_layers and model.roberta.pooler is not None:
        for param in model.roberta.pooler.parameters():
            param.requires_grad = False
    
    for param in model.classifier.parameters():
        param.requires_grad = True
    
    print(f"Froze: Embeddings + First {min(num_frozen_layers, num_total_layers)} Encoder Layers")
    print("Trainable: Classification Head + Remaining Encoder Layers")
    
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total_params = sum(p.numel() for p in model.parameters())
    
    print(
        f"Trainable parameters: {trainable_params:,} / {total_params:,} ({100 * trainable_params / total_params:.2f}%)"
    )

    return model, tokenizer

# Model Storage Configuration

In [4]:
class ModelCheckpointManager:
    def init(self, max_models: int = 2, output_dir: str = "./models"):
        self.max_models = max_models
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.best_models = []

    def save_checkpoint(self, model, score: float, epoch: int, fold: int) -> bool:
        checkpoint_name = f"fold_{fold}_epoch_{epoch}_f1_{score:.4f}.pt"
        checkpoint_path = self.output_dir / checkpoint_name
    
        self.best_models.append((score, checkpoint_path, epoch, fold))
        self.best_models.sort(reverse=True, key=lambda x: x)
    
        if len(self.best_models) > self.max_models:
            worst_score, worst_path, worst_epoch, worst_fold = self.best_models.pop()
            if worst_path.exists():
                worst_path.unlink()
                logger.info(f"Deleted checkpoint: {worst_path}")
    
        torch.save(model.state_dict(), checkpoint_path)
        logger.info(
            f"Saved checkpoint: {checkpoint_path} (F1: {score:.4f}, Fold: {fold}, Epoch: {epoch})"
        )
    
        return True
    
    def get_best_models(self) -> List[Tuple]:
        return [(score, path, epoch, fold) for score, path, epoch, fold in self.best_models]

# Single Fold Training

In [5]:
def train_single_fold(train_df: pd.DataFrame, val_df: pd.DataFrame, Config, fold_id: int = 0, trial_id: int = 0):
    try:
        model, tokenizer = setup_model(Config, num_frozen_layers=Config.NUM_FROZEN_LAYERS)
    
        label_weights, language_weights, pos_weight = calculate_weights(train_df)
    
        train_dataset = MultilingualDataset(
            texts=train_df["text"].tolist(),
            labels=train_df["label"].tolist(),
            languages=train_df["lang"].tolist(),
            tokenizer=tokenizer,
            max_length=Config.MAX_LENGTH,
        )
    
        val_dataset = MultilingualDataset(
            texts=val_df["text"].tolist(),
            labels=val_df["label"].tolist(),
            languages=val_df["lang"].tolist(),
            tokenizer=tokenizer,
            max_length=Config.MAX_LENGTH,
        )
    
        optimizer = AdamW(
            model.parameters(),
            lr=Config.LEARNING_RATE,
            weight_decay=Config.WEIGHT_DECAY,
        )
    
        balanced_train_size = 3 * len(train_df[train_df["label"] == 1])
        total_steps = (
            balanced_train_size
            // (Config.BATCH_SIZE * Config.GRADIENT_ACCUMULATION_STEPS)
            * Config.NUM_EPOCHS
        )
        warmup_steps = int(Config.WARMUP_RATIO * total_steps)
    
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=warmup_steps,
            num_training_steps=total_steps,
        )
    
        best_val_f1 = 0
        patience_counter = 0
        fold_results = []
    
        for epoch in range(Config.NUM_EPOCHS):
            
            if Config.DYNAMIC_UNDERSAMPLE:
                undersampler = DynamicUndersamplingSampler(
                    train_df, minority_class=1, seed=SEED
                )
                balanced_indices = undersampler.get_balanced_indices(epoch)
                train_subset_df = train_df.iloc[balanced_indices].reset_index(drop=True)
            else:
                train_subset_df = train_df
    
            train_dataset_epoch = MultilingualDataset(
                texts=train_subset_df["text"].tolist(),
                labels=train_subset_df["label"].tolist(),
                languages=train_subset_df["lang"].tolist(),
                tokenizer=tokenizer,
                max_length=Config.MAX_LENGTH,
            )
    
            train_loader = DataLoader(
                train_dataset_epoch,
                batch_size=Config.BATCH_SIZE,
                shuffle=True,
            )
    
            val_loader = DataLoader(
                val_dataset,
                batch_size=Config.BATCH_SIZE,
                shuffle=False,
            )
    
            train_loss = train_epoch(
                model,
                train_loader,
                optimizer,
                scheduler,
                label_weights,
                language_weights,
                pos_weight,
                Config
            )
    
            val_loss, val_preds, val_labels, val_languages = validate(
                model, val_loader, Config
            )
    
            val_metrics = calculate_metrics(val_preds, val_labels, val_languages)
            current_f1 = val_metrics["overall"]["macro_f1"]
            
            print(f"Trial {trial_id}, Fold {fold_id}, Epoch {epoch+1}: F1={current_f1:.4f}")
    
            if current_f1 > best_val_f1:
                best_val_f1 = current_f1
                patience_counter = 0
            else:
                patience_counter += 1
                if patience_counter >= Config.PATIENCE:
                    logger.info(f"Early stopping at epoch {epoch}")
                    break
    
            fold_results.append({
                "epoch": epoch,
                "train_loss": train_loss,
                "val_loss": val_loss,
                "val_f1": current_f1
            })
    
        return best_val_f1, fold_results
    
    except Exception as e:
        print(f"Error in fold {fold_id}: {str(e)}")
        return 0.0, []

# Optuna Parameter Selection

In [6]:
def optuna_objective(trial, train_df: pd.DataFrame, base_config: BaseConfig):
    learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 5e-4)
    weight_decay = trial.suggest_float("weight_decay", 0.0, 0.1)
    batch_size = trial.suggest_categorical("batch_size", [8, 16, 32])
    num_epochs = trial.suggest_int("num_epochs", 5, 12)
    
    print(f"\n{'='*80}")
    print(f"Trial {trial.number}")
    print(f"{'='*80}")
    print(f"Learning Rate: {learning_rate:.2e}")
    print(f"Weight Decay: {weight_decay:.4f}")
    print(f"Batch Size: {batch_size}")
    print(f"Num Epochs: {num_epochs}")
    
    class TrialConfig(base_config.__class__):
        LEARNING_RATE = learning_rate
        WEIGHT_DECAY = weight_decay
        BATCH_SIZE = batch_size
        NUM_EPOCHS = num_epochs
        OUTPUT_DIR = f"{base_config.OUTPUT_DIR}/trial_{trial.number}"
    
    trial_config = TrialConfig()
    
    fold_scores = []
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
    
    for fold_idx, (train_idx, val_idx) in enumerate(skf.split(train_df, train_df['label'])):
        
        fold_train = train_df.iloc[train_idx].reset_index(drop=True)
        fold_val = train_df.iloc[val_idx].reset_index(drop=True)
        
        print(f"\n  Fold {fold_idx+1}/5: Train={len(fold_train)}, Val={len(fold_val)}")
        
        try:
            fold_f1, fold_results = train_single_fold(
                fold_train, 
                fold_val, 
                trial_config, 
                fold_id=fold_idx,
                trial_id=trial.number
            )
            
            fold_scores.append(fold_f1)
            print(f"  Fold {fold_idx+1} F1: {fold_f1:.4f}")
            
            avg_so_far = np.mean(fold_scores)
            trial.report(avg_so_far, fold_idx)
            
            if trial.should_prune():
                print(f"  Trial pruned at fold {fold_idx}")
                raise optuna.TrialPruned()
                
        except optuna.TrialPruned:
            raise
        except Exception as e:
            logger.error(f"Fold {fold_idx} failed: {str(e)}")
            return 0.0
    
    avg_f1 = np.mean(fold_scores)
    print(f"\nTrial {trial.number} - Average F1: {avg_f1:.4f}")
    print(f"Fold scores: {[f'{f:.4f}' for f in fold_scores]}\n")
    
    return avg_f1

# Visulize Optuna Parameter Selection Results

In [7]:
def visualize_optuna_results(study, output_dir: str = "../figures"):
    os.makedirs(output_dir, exist_ok=True)

    # Prepare data
    trial_data = []
    for trial in study.trials:
        if trial.value is not None:
            trial_data.append({
                "trial": trial.number,
                "f1": trial.value,
                "lr": trial.params.get("learning_rate", 0),
                "batch_size": trial.params.get("batch_size", 0),
                "epochs": trial.params.get("num_epochs", 0),
                "weight_decay": trial.params.get("weight_decay", 0)
            })
    
    df = pd.DataFrame(trial_data)
    
    print("\nGenerating visualizations...")
    
    # Plot 1: Optimization History
    opt_history = alt.Chart(df).mark_line(point=True).encode(
        x=alt.X("trial:Q", title="Trial Number"),
        y=alt.Y("f1:Q", title="F1 Score"),
        tooltip=["trial:Q", alt.Tooltip("f1:Q", format=".4f")]
    ).properties(
        width=700, height=400, title="Optimization History: F1 Score Over Trials"
    ).interactive()
    
    opt_history_path = f"{output_dir}/optuna_optimization_history.svg"
    opt_history.save(opt_history_path)
    print(f"Saved: optuna_optimization_history.svg")
    
    # Plot 2: Learning Rate Impact
    lr_plot = alt.Chart(df).mark_circle(size=100).encode(
        x=alt.X("lr:Q", title="Learning Rate", scale=alt.Scale(type="log")),
        y=alt.Y("f1:Q", title="F1 Score"),
        color=alt.Color("epochs:O", title="Num Epochs"),
        tooltip=["trial:Q", alt.Tooltip("f1:Q", format=".4f"), "lr:Q", "epochs:O"]
    ).properties(
        width=600, height=400, title="Learning Rate vs F1 Score"
    ).interactive()
    
    lr_path = f"{output_dir}/optuna_learning_rate_impact.svg"
    lr_plot.save(lr_path)
    print(f"Saved: optuna_learning_rate_impact.svg")
    
    # Plot 3: Batch Size Impact
    bs_plot = alt.Chart(df).mark_boxplot().encode(
        x=alt.X("batch_size:O", title="Batch Size"),
        y=alt.Y("f1:Q", title="F1 Score"),
        tooltip=["trial:Q", alt.Tooltip("f1:Q", format=".4f")]
    ).properties(
        width=500, height=400, title="Batch Size vs F1 Score"
    )
    
    bs_path = f"{output_dir}/optuna_batch_size_impact.svg"
    bs_plot.save(bs_path)
    print(f"Saved: optuna_batch_size_impact.svg")
    
    # Plot 4: Weight Decay Impact
    wd_plot = alt.Chart(df).mark_circle(size=100).encode(
        x=alt.X("weight_decay:Q", title="Weight Decay"),
        y=alt.Y("f1:Q", title="F1 Score"),
        color=alt.Color("f1:Q", scale=alt.Scale(scheme="viridis")),
        tooltip=["trial:Q", alt.Tooltip("f1:Q", format=".4f"), "weight_decay:Q"]
    ).properties(
        width=600, height=400, title="Weight Decay vs F1 Score"
    ).interactive()
    
    wd_path = f"{output_dir}/optuna_weight_decay_impact.svg"
    wd_plot.save(wd_path)
    print(f"Saved: optuna_weight_decay_impact.svg")
    
    # Plot 5: Epochs Impact
    epochs_plot = alt.Chart(df).mark_boxplot().encode(
        x=alt.X("epochs:O", title="Number of Epochs"),
        y=alt.Y("f1:Q", title="F1 Score"),
        tooltip=["trial:Q", alt.Tooltip("f1:Q", format=".4f")]
    ).properties(
        width=600, height=400, title="Number of Epochs vs F1 Score"
    )
    
    epochs_path = f"{output_dir}/optuna_epochs_impact.svg"
    epochs_plot.save(epochs_path)
    print(f"Saved: optuna_epochs_impact.svg")
    
    # Save trials dataframe
    trials_csv = f"{output_dir}/optuna_trials.csv"
    df.to_csv(trials_csv, index=False)
    print(f"Saved: optuna_trials.csv")
    
    print(f"\nAll visualizations saved to: {output_dir}\n")
    
    return df

In [8]:
def run_optuna_optimization(train_df: pd.DataFrame, n_trials: int = 10):
    base_config = BaseConfig()

    print("\n" + " "*80)
    print("STEP 1: OPTUNA HYPERPARAMETER OPTIMIZATION")
    print(" "*80)
    print(f"Number of trials: {n_trials}")
    print(f"CV folds per trial: 5")
    print(f"Total model trainings: {n_trials * 5}")
    print(" "*80 + "\n")
    
    sampler = TPESampler(seed=SEED)
    pruner = MedianPruner(n_startup_trials=2, n_warmup_steps=10)
    
    study = optuna.create_study(
        direction="maximize",
        sampler=sampler,
        pruner=pruner,
        study_name="classification_optimization"
    )
    
    study.optimize(
        lambda trial: optuna_objective(trial, train_df, base_config),
        n_trials=n_trials,
        gc_after_trial=True,
        show_progress_bar=False
    )
    
    best_trial = study.best_trial
    
    print("\n" + "="*80)
    print("BEST TRIAL")
    print("="*80)
    print(f"Trial: {best_trial.number}")
    print(f"Best F1 Score: {best_trial.value:.4f}")
    print(f"\nBest Hyperparameters:")
    for key, value in best_trial.params.items():
        print(f"  {key}: {value}")
    print("="*80 + "\n")
    
    # Visualize
    print("\n" + " "*80)
    print(" STEP 2: VISUALIZE RESULTS")
    print(" "*80 + "\n")
    
    trial_df = visualize_optuna_results(study)
    
    print(f"Top 5 Trials:")
    print(trial_df.nlargest(5, "f1")[["trial", "f1", "lr", "batch_size", "epochs"]])
    
    return study, best_trial

# Train Final Model with Best Parameters

In [9]:
def train_final_model(best_trial, train_df: pd.DataFrame, base_config: BaseConfig):
    print("\n" + " "*80)
    print("STEP 3: TRAIN FINAL MODEL WITH BEST HYPERPARAMETERS")
    print(" "*80)
    
    class FinalConfig(BaseConfig):
        LEARNING_RATE = best_trial.params["learning_rate"]
        WEIGHT_DECAY = best_trial.params["weight_decay"]
        BATCH_SIZE = best_trial.params["batch_size"]
        NUM_EPOCHS = best_trial.params["num_epochs"]
        OUTPUT_DIR = f"{base_config.OUTPUT_DIR}/final_model"
        RESULTS_DIR = f"{base_config.RESULTS_DIR}final_model/"
    
    final_config = FinalConfig()
    
    print(f"\nFinal Training Configuration:")
    print(f"Learning Rate: {final_config.LEARNING_RATE:.2e}")
    print(f"Weight Decay: {final_config.WEIGHT_DECAY:.4f}")
    print(f"Batch Size: {final_config.BATCH_SIZE}")
    print(f"Num Epochs: {final_config.NUM_EPOCHS}\n")
    
    final_model_path = run_train(train_df, final_config)
    
    return final_config, final_model_path

# Inference

In [10]:
class InferenceConfig:
    MODEL_NAME = "cardiffnlp/twitter-xlm-roberta-base"
    NUM_LABELS = 2
    MAX_LENGTH = 128
    BATCH_SIZE = 32
    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    CHECKPOINT_PATH = None # Will be set from best model

class TestDataset(Dataset):
    def __init__(self, texts: List[str], tokenizer, max_length: int = 128):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt"
        )
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
        }


def run_inference(df: pd.DataFrame, config: InferenceConfig) -> Dict:
    print(f"Running inference on {len(df)} samples...")
    print(f"Device: {config.DEVICE}")
    
    tokenizer = AutoTokenizer.from_pretrained(config.MODEL_NAME)
    
    model = AutoModelForSequenceClassification.from_pretrained(
        config.MODEL_NAME,
        num_labels=config.NUM_LABELS
    )
    
    checkpoint = torch.load(config.CHECKPOINT_PATH, map_location=config.DEVICE)
    
    if isinstance(checkpoint, dict):
        if "model_state_dict" in checkpoint:
            state_dict = checkpoint["model_state_dict"]
        elif "state_dict" in checkpoint:
            state_dict = checkpoint["state_dict"]
        else:
            state_dict = checkpoint
    else:
        state_dict = checkpoint
    
    model.load_state_dict(state_dict, strict=False)
    print(f"Loaded checkpoint from: {config.CHECKPOINT_PATH}")
    
    model.to(config.DEVICE)
    model.eval()
    
    texts = df["text"].tolist()
    labels = df["label"].tolist()
    languages = df["lang"].tolist()
    
    dataset = TestDataset(texts, tokenizer, config.MAX_LENGTH)
    dataloader = DataLoader(dataset, batch_size=config.BATCH_SIZE, shuffle=False)
    
    all_preds = []
    all_probs = []
    
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Inference"):
            input_ids = batch["input_ids"].to(config.DEVICE)
            attention_mask = batch["attention_mask"].to(config.DEVICE)
    
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
    
            probs = torch.softmax(logits, dim=-1)
            preds = torch.argmax(logits, dim=-1)
    
            all_probs.extend(probs.cpu().numpy().tolist())
            all_preds.extend(preds.cpu().numpy().tolist())
    
    metrics = calculate_metrics(all_preds, labels, languages)
    
    print("\n" + "="*80)
    print("INFERENCE RESULTS ON TRAINING DATA")
    print("="*80)
    
    print(f"\nOverall Metrics:")
    print(f"Macro Precision:{metrics['overall']['macro_precision']:.4f}")
    print(f"Macro Recall:{metrics['overall']['macro_recall']:.4f}")
    print(f"Macro F1:{metrics['overall']['macro_f1']:.4f}")
    
    print(f"\nPer-Language Metrics:")
    for lang in sorted([k for k in metrics.keys() if k != "overall"]):
        print(f"{lang.upper()}:")
        print(f"Precision:{metrics[lang]['macro_precision']:.4f}")
        print(f"Recall:{metrics[lang]['macro_recall']:.4f}")
        print(f"F1:{metrics[lang]['macro_f1']:.4f}")
    
    print("="*80 + "\n")
    
    return {
        "predictions": all_preds,
        "probabilities": all_probs,
        "labels": labels,
        "languages": languages,
        "metrics": metrics
    }


# Actual Run

### Load Data

In [11]:
original_data = train_df
augmented_data = pd.read_csv("../data/augmented_multilingual_tweets.csv")
print(original_data.shape, augmented_data.shape)

(2988, 5) (5976, 8)


In [12]:
merged_data = pd.concat([original_data, augmented_data[list(original_data.columns)]], ignore_index=True)
print(merged_data.shape)

(8964, 5)


### Optuna Parameter Selection

In [13]:
base_config = BaseConfig()
study, best_trial = run_optuna_optimization(merged_data, n_trials=10)

[I 2025-12-04 19:44:58,211] A new study created in memory with name: classification_optimization



                                                                                
STEP 1: OPTUNA HYPERPARAMETER OPTIMIZATION
                                                                                
Number of trials: 10
CV folds per trial: 5
Total model trainings: 50
                                                                                


Trial 0
Learning Rate: 4.33e-05
Weight Decay: 0.0951
Batch Size: 8
Num Epochs: 6

  Fold 1/5: Train=7171, Val=1793


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at ../fine_tuned_models_mlm/twitter-xlm-roberta-base/final_MLM_model/model and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-12-04 19:44:59,762 - INFO - Label weights: {0: 0.5835774739583334, 1: 3.491236611489776}
2025-12-04 19:44:59,762 - INFO - Language weights: {'es': 0.9847470941344993, 'en': 1.0008296471794096, 'it': 1.0144232586860908}
2025-12-04 19:44:59,762 - INFO - Pos weight (for BCE): 5.9825


Froze: Embeddings + First 10 Encoder Layers
Trainable: Classification Head + Remaining Encoder Layers
Trainable parameters: 14,767,874 / 278,045,186 (5.31%)


Training: 100%|██████████| 897/897 [00:12<00:00, 71.06it/s, loss=0.137]
Validating: 100%|██████████| 225/225 [00:02<00:00, 109.99it/s]


Trial 0, Fold 0, Epoch 1: F1=0.4614


Training: 100%|██████████| 897/897 [00:12<00:00, 72.66it/s, loss=0.13] 
Validating: 100%|██████████| 225/225 [00:01<00:00, 112.76it/s]


Trial 0, Fold 0, Epoch 2: F1=0.4614


Training: 100%|██████████| 897/897 [00:12<00:00, 73.97it/s, loss=0.124]
Validating: 100%|██████████| 225/225 [00:02<00:00, 112.45it/s]


Trial 0, Fold 0, Epoch 3: F1=0.4614


Training: 100%|██████████| 897/897 [00:12<00:00, 73.33it/s, loss=0.115]
Validating: 100%|██████████| 225/225 [00:02<00:00, 109.49it/s]


Trial 0, Fold 0, Epoch 4: F1=0.4766


Training: 100%|██████████| 897/897 [00:12<00:00, 71.60it/s, loss=0.11]  
Validating: 100%|██████████| 225/225 [00:01<00:00, 113.03it/s]


Trial 0, Fold 0, Epoch 5: F1=0.5657


Training: 100%|██████████| 897/897 [00:12<00:00, 72.16it/s, loss=0.106]
Validating: 100%|██████████| 225/225 [00:02<00:00, 107.94it/s]


Trial 0, Fold 0, Epoch 6: F1=0.5855
  Fold 1 F1: 0.5855

  Fold 2/5: Train=7171, Val=1793


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at ../fine_tuned_models_mlm/twitter-xlm-roberta-base/final_MLM_model/model and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-12-04 19:46:26,999 - INFO - Label weights: {0: 0.5835774739583334, 1: 3.491236611489776}
2025-12-04 19:46:26,999 - INFO - Language weights: {'it': 0.990547924474408, 'en': 1.0000803940404794, 'es': 1.0093716814851124}
2025-12-04 19:46:26,999 - INFO - Pos weight (for BCE): 5.9825


Froze: Embeddings + First 10 Encoder Layers
Trainable: Classification Head + Remaining Encoder Layers
Trainable parameters: 14,767,874 / 278,045,186 (5.31%)


Training: 100%|██████████| 897/897 [00:12<00:00, 72.85it/s, loss=0.224]
Validating: 100%|██████████| 225/225 [00:01<00:00, 112.75it/s]


Trial 0, Fold 1, Epoch 1: F1=0.4237


Training: 100%|██████████| 897/897 [00:12<00:00, 72.64it/s, loss=0.131]
Validating: 100%|██████████| 225/225 [00:02<00:00, 109.76it/s]


Trial 0, Fold 1, Epoch 2: F1=0.4614


Training: 100%|██████████| 897/897 [00:12<00:00, 71.34it/s, loss=0.12] 
Validating: 100%|██████████| 225/225 [00:01<00:00, 113.06it/s]


Trial 0, Fold 1, Epoch 3: F1=0.4614


Training: 100%|██████████| 897/897 [00:12<00:00, 73.80it/s, loss=0.114]
Validating: 100%|██████████| 225/225 [00:01<00:00, 113.47it/s]


Trial 0, Fold 1, Epoch 4: F1=0.4612


Training: 100%|██████████| 897/897 [00:12<00:00, 71.38it/s, loss=0.11] 
Validating: 100%|██████████| 225/225 [00:02<00:00, 107.04it/s]


Trial 0, Fold 1, Epoch 5: F1=0.5305


Training: 100%|██████████| 897/897 [00:12<00:00, 71.85it/s, loss=0.108]
Validating: 100%|██████████| 225/225 [00:02<00:00, 109.24it/s]


Trial 0, Fold 1, Epoch 6: F1=0.5992
  Fold 2 F1: 0.5992

  Fold 3/5: Train=7171, Val=1793


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at ../fine_tuned_models_mlm/twitter-xlm-roberta-base/final_MLM_model/model and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-12-04 19:47:54,412 - INFO - Label weights: {0: 0.5835774739583334, 1: 3.491236611489776}
2025-12-04 19:47:54,412 - INFO - Language weights: {'it': 0.9959626621507102, 'en': 1.0005485094858537, 'es': 1.003488828363436}
2025-12-04 19:47:54,412 - INFO - Pos weight (for BCE): 5.9825


Froze: Embeddings + First 10 Encoder Layers
Trainable: Classification Head + Remaining Encoder Layers
Trainable parameters: 14,767,874 / 278,045,186 (5.31%)


Training: 100%|██████████| 897/897 [00:12<00:00, 71.41it/s, loss=0.57] 
Validating: 100%|██████████| 225/225 [00:02<00:00, 108.09it/s]


Trial 0, Fold 2, Epoch 1: F1=0.1254


Training: 100%|██████████| 897/897 [00:12<00:00, 71.33it/s, loss=0.22] 
Validating: 100%|██████████| 225/225 [00:02<00:00, 110.38it/s]


Trial 0, Fold 2, Epoch 2: F1=0.4614


Training: 100%|██████████| 897/897 [00:12<00:00, 73.49it/s, loss=0.132]
Validating: 100%|██████████| 225/225 [00:02<00:00, 106.66it/s]


Trial 0, Fold 2, Epoch 3: F1=0.4614


Training: 100%|██████████| 897/897 [00:12<00:00, 72.11it/s, loss=0.124]
Validating: 100%|██████████| 225/225 [00:02<00:00, 108.10it/s]


Trial 0, Fold 2, Epoch 4: F1=0.4831


Training: 100%|██████████| 897/897 [00:12<00:00, 71.82it/s, loss=0.118]
Validating: 100%|██████████| 225/225 [00:02<00:00, 111.34it/s]


Trial 0, Fold 2, Epoch 5: F1=0.5546


Training: 100%|██████████| 897/897 [00:12<00:00, 71.45it/s, loss=0.115]
Validating: 100%|██████████| 225/225 [00:02<00:00, 104.52it/s]


Trial 0, Fold 2, Epoch 6: F1=0.6148
  Fold 3 F1: 0.6148

  Fold 4/5: Train=7171, Val=1793


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at ../fine_tuned_models_mlm/twitter-xlm-roberta-base/final_MLM_model/model and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-12-04 19:49:22,509 - INFO - Label weights: {0: 0.5835774739583334, 1: 3.491236611489776}
2025-12-04 19:49:22,510 - INFO - Language weights: {'en': 0.9905452280968097, 'es': 0.9996594041813475, 'it': 1.0097953677218428}
2025-12-04 19:49:22,510 - INFO - Pos weight (for BCE): 5.9825


Froze: Embeddings + First 10 Encoder Layers
Trainable: Classification Head + Remaining Encoder Layers
Trainable parameters: 14,767,874 / 278,045,186 (5.31%)


Training: 100%|██████████| 897/897 [00:12<00:00, 71.69it/s, loss=0.373]
Validating: 100%|██████████| 225/225 [00:02<00:00, 106.90it/s]


Trial 0, Fold 3, Epoch 1: F1=0.1254


Training: 100%|██████████| 897/897 [00:12<00:00, 71.88it/s, loss=0.175]
Validating: 100%|██████████| 225/225 [00:02<00:00, 109.20it/s]


Trial 0, Fold 3, Epoch 2: F1=0.4614


Training: 100%|██████████| 897/897 [00:12<00:00, 71.56it/s, loss=0.131]
Validating: 100%|██████████| 225/225 [00:02<00:00, 110.22it/s]


Trial 0, Fold 3, Epoch 3: F1=0.4614


Training: 100%|██████████| 897/897 [00:12<00:00, 72.57it/s, loss=0.119]
Validating: 100%|██████████| 225/225 [00:02<00:00, 108.94it/s]


Trial 0, Fold 3, Epoch 4: F1=0.4924


Training: 100%|██████████| 897/897 [00:12<00:00, 73.53it/s, loss=0.113]
Validating: 100%|██████████| 225/225 [00:01<00:00, 113.79it/s]


Trial 0, Fold 3, Epoch 5: F1=0.6592


Training: 100%|██████████| 897/897 [00:12<00:00, 73.33it/s, loss=0.111]
Validating: 100%|██████████| 225/225 [00:01<00:00, 113.27it/s]


Trial 0, Fold 3, Epoch 6: F1=0.5981
  Fold 4 F1: 0.6592

  Fold 5/5: Train=7172, Val=1792


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at ../fine_tuned_models_mlm/twitter-xlm-roberta-base/final_MLM_model/model and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-12-04 19:50:49,837 - INFO - Label weights: {0: 0.5836588541666666, 1: 3.4883268482490273}
2025-12-04 19:50:49,837 - INFO - Language weights: {'it': 0.9894548630811957, 'es': 1.0027361364111447, 'en': 1.0078090005076596}
2025-12-04 19:50:49,837 - INFO - Pos weight (for BCE): 5.9767


Froze: Embeddings + First 10 Encoder Layers
Trainable: Classification Head + Remaining Encoder Layers
Trainable parameters: 14,767,874 / 278,045,186 (5.31%)


Training: 100%|██████████| 897/897 [00:12<00:00, 73.20it/s, loss=0.17] 
Validating: 100%|██████████| 224/224 [00:01<00:00, 112.98it/s]


Trial 0, Fold 4, Epoch 1: F1=0.5353


Training: 100%|██████████| 897/897 [00:13<00:00, 67.51it/s, loss=0.126]
Validating: 100%|██████████| 224/224 [00:02<00:00, 110.55it/s]


Trial 0, Fold 4, Epoch 2: F1=0.4615


Training: 100%|██████████| 897/897 [00:13<00:00, 67.55it/s, loss=0.122]
Validating: 100%|██████████| 224/224 [00:02<00:00, 110.78it/s]


Trial 0, Fold 4, Epoch 3: F1=0.4696


Training: 100%|██████████| 897/897 [00:12<00:00, 69.72it/s, loss=0.114]
Validating: 100%|██████████| 224/224 [00:01<00:00, 112.67it/s]
[I 2025-12-04 19:51:49,604] Trial 0 finished with value: 0.49174287418015855 and parameters: {'learning_rate': 4.3284502212938785e-05, 'weight_decay': 0.09507143064099162, 'batch_size': 8, 'num_epochs': 6}. Best is trial 0 with value: 0.49174287418015855.


Trial 0, Fold 4, Epoch 4: F1=0.4996
Error in fold 4: name 'logger' is not defined
  Fold 5 F1: 0.0000

Trial 0 - Average F1: 0.4917
Fold scores: ['0.5855', '0.5992', '0.6148', '0.6592', '0.0000']


Trial 1
Learning Rate: 1.26e-05
Weight Decay: 0.0866
Batch Size: 16
Num Epochs: 12

  Fold 1/5: Train=7171, Val=1793


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at ../fine_tuned_models_mlm/twitter-xlm-roberta-base/final_MLM_model/model and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-12-04 19:51:50,435 - INFO - Label weights: {0: 0.5835774739583334, 1: 3.491236611489776}
2025-12-04 19:51:50,435 - INFO - Language weights: {'es': 0.9847470941344993, 'en': 1.0008296471794096, 'it': 1.0144232586860908}
2025-12-04 19:51:50,435 - INFO - Pos weight (for BCE): 5.9825


Froze: Embeddings + First 10 Encoder Layers
Trainable: Classification Head + Remaining Encoder Layers
Trainable parameters: 14,767,874 / 278,045,186 (5.31%)


Training: 100%|██████████| 449/449 [00:09<00:00, 46.19it/s, loss=0.262]
Validating: 100%|██████████| 113/113 [00:01<00:00, 70.04it/s]


Trial 1, Fold 0, Epoch 1: F1=0.1501


Training: 100%|██████████| 449/449 [00:10<00:00, 43.47it/s, loss=0.218]
Validating: 100%|██████████| 113/113 [00:01<00:00, 68.64it/s]


Trial 1, Fold 0, Epoch 2: F1=0.5405


Training: 100%|██████████| 449/449 [00:09<00:00, 45.91it/s, loss=0.162]
Validating: 100%|██████████| 113/113 [00:01<00:00, 70.32it/s]


Trial 1, Fold 0, Epoch 3: F1=0.4614


Training: 100%|██████████| 449/449 [00:09<00:00, 46.43it/s, loss=0.154]
Validating: 100%|██████████| 113/113 [00:01<00:00, 69.64it/s]


Trial 1, Fold 0, Epoch 4: F1=0.4614


Training: 100%|██████████| 449/449 [00:09<00:00, 46.50it/s, loss=0.144]
Validating: 100%|██████████| 113/113 [00:01<00:00, 67.83it/s]


Trial 1, Fold 0, Epoch 5: F1=0.4614
Error in fold 0: name 'logger' is not defined
  Fold 1 F1: 0.0000

  Fold 2/5: Train=7171, Val=1793


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at ../fine_tuned_models_mlm/twitter-xlm-roberta-base/final_MLM_model/model and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-12-04 19:52:48,523 - INFO - Label weights: {0: 0.5835774739583334, 1: 3.491236611489776}
2025-12-04 19:52:48,524 - INFO - Language weights: {'it': 0.990547924474408, 'en': 1.0000803940404794, 'es': 1.0093716814851124}
2025-12-04 19:52:48,524 - INFO - Pos weight (for BCE): 5.9825


Froze: Embeddings + First 10 Encoder Layers
Trainable: Classification Head + Remaining Encoder Layers
Trainable parameters: 14,767,874 / 278,045,186 (5.31%)


Training: 100%|██████████| 449/449 [00:09<00:00, 45.46it/s, loss=0.14] 
Validating: 100%|██████████| 113/113 [00:01<00:00, 68.20it/s]


Trial 1, Fold 1, Epoch 1: F1=0.4614


Training: 100%|██████████| 449/449 [00:09<00:00, 45.70it/s, loss=0.138]
Validating: 100%|██████████| 113/113 [00:01<00:00, 70.36it/s]


Trial 1, Fold 1, Epoch 2: F1=0.4614


Training: 100%|██████████| 449/449 [00:09<00:00, 46.37it/s, loss=0.133]
Validating: 100%|██████████| 113/113 [00:01<00:00, 66.09it/s]


Trial 1, Fold 1, Epoch 3: F1=0.4614


Training: 100%|██████████| 449/449 [00:09<00:00, 46.63it/s, loss=0.128]
Validating: 100%|██████████| 113/113 [00:01<00:00, 70.67it/s]


Trial 1, Fold 1, Epoch 4: F1=0.4614
Error in fold 1: name 'logger' is not defined
  Fold 2 F1: 0.0000

  Fold 3/5: Train=7171, Val=1793


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at ../fine_tuned_models_mlm/twitter-xlm-roberta-base/final_MLM_model/model and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-12-04 19:53:34,866 - INFO - Label weights: {0: 0.5835774739583334, 1: 3.491236611489776}
2025-12-04 19:53:34,867 - INFO - Language weights: {'it': 0.9959626621507102, 'en': 1.0005485094858537, 'es': 1.003488828363436}
2025-12-04 19:53:34,867 - INFO - Pos weight (for BCE): 5.9825


Froze: Embeddings + First 10 Encoder Layers
Trainable: Classification Head + Remaining Encoder Layers
Trainable parameters: 14,767,874 / 278,045,186 (5.31%)


Training: 100%|██████████| 449/449 [00:09<00:00, 46.81it/s, loss=0.214]
Validating: 100%|██████████| 113/113 [00:01<00:00, 70.65it/s]


Trial 1, Fold 2, Epoch 1: F1=0.3452


Training: 100%|██████████| 449/449 [00:09<00:00, 46.81it/s, loss=0.174]
Validating: 100%|██████████| 113/113 [00:01<00:00, 68.59it/s]


Trial 1, Fold 2, Epoch 2: F1=0.4647


Training: 100%|██████████| 449/449 [00:09<00:00, 46.21it/s, loss=0.131]
Validating: 100%|██████████| 113/113 [00:01<00:00, 69.86it/s]


Trial 1, Fold 2, Epoch 3: F1=0.4614


Training: 100%|██████████| 449/449 [00:09<00:00, 46.22it/s, loss=0.125]
Validating: 100%|██████████| 113/113 [00:01<00:00, 70.60it/s]


Trial 1, Fold 2, Epoch 4: F1=0.4614


Training: 100%|██████████| 449/449 [00:09<00:00, 46.88it/s, loss=0.122]
Validating: 100%|██████████| 113/113 [00:01<00:00, 70.47it/s]


Trial 1, Fold 2, Epoch 5: F1=0.4614
Error in fold 2: name 'logger' is not defined
  Fold 3 F1: 0.0000

  Fold 4/5: Train=7171, Val=1793


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at ../fine_tuned_models_mlm/twitter-xlm-roberta-base/final_MLM_model/model and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-12-04 19:54:31,878 - INFO - Label weights: {0: 0.5835774739583334, 1: 3.491236611489776}
2025-12-04 19:54:31,878 - INFO - Language weights: {'en': 0.9905452280968097, 'es': 0.9996594041813475, 'it': 1.0097953677218428}
2025-12-04 19:54:31,878 - INFO - Pos weight (for BCE): 5.9825


Froze: Embeddings + First 10 Encoder Layers
Trainable: Classification Head + Remaining Encoder Layers
Trainable parameters: 14,767,874 / 278,045,186 (5.31%)


Training: 100%|██████████| 449/449 [00:09<00:00, 46.77it/s, loss=0.417]
Validating: 100%|██████████| 113/113 [00:01<00:00, 70.66it/s]


Trial 1, Fold 3, Epoch 1: F1=0.1254


Training: 100%|██████████| 449/449 [00:09<00:00, 45.57it/s, loss=0.319]
Validating: 100%|██████████| 113/113 [00:01<00:00, 67.67it/s]


Trial 1, Fold 3, Epoch 2: F1=0.3189


Training: 100%|██████████| 449/449 [00:09<00:00, 46.22it/s, loss=0.16] 
Validating: 100%|██████████| 113/113 [00:01<00:00, 70.47it/s]


Trial 1, Fold 3, Epoch 3: F1=0.4614


Training: 100%|██████████| 449/449 [00:09<00:00, 46.09it/s, loss=0.124]
Validating: 100%|██████████| 113/113 [00:01<00:00, 68.79it/s]


Trial 1, Fold 3, Epoch 4: F1=0.4614


Training: 100%|██████████| 449/449 [00:09<00:00, 46.40it/s, loss=0.124]
Validating: 100%|██████████| 113/113 [00:01<00:00, 65.78it/s]


Trial 1, Fold 3, Epoch 5: F1=0.4614


Training: 100%|██████████| 449/449 [00:09<00:00, 45.84it/s, loss=0.121]
Validating: 100%|██████████| 113/113 [00:01<00:00, 68.97it/s]


Trial 1, Fold 3, Epoch 6: F1=0.4614
Error in fold 3: name 'logger' is not defined
  Fold 4 F1: 0.0000

  Fold 5/5: Train=7172, Val=1792


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at ../fine_tuned_models_mlm/twitter-xlm-roberta-base/final_MLM_model/model and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-12-04 19:55:40,900 - INFO - Label weights: {0: 0.5836588541666666, 1: 3.4883268482490273}
2025-12-04 19:55:40,901 - INFO - Language weights: {'it': 0.9894548630811957, 'es': 1.0027361364111447, 'en': 1.0078090005076596}
2025-12-04 19:55:40,901 - INFO - Pos weight (for BCE): 5.9767


Froze: Embeddings + First 10 Encoder Layers
Trainable: Classification Head + Remaining Encoder Layers
Trainable parameters: 14,767,874 / 278,045,186 (5.31%)


Training: 100%|██████████| 449/449 [00:09<00:00, 45.75it/s, loss=0.15] 
Validating: 100%|██████████| 112/112 [00:01<00:00, 68.24it/s]


Trial 1, Fold 4, Epoch 1: F1=0.4615


Training: 100%|██████████| 449/449 [00:09<00:00, 46.31it/s, loss=0.145]
Validating: 100%|██████████| 112/112 [00:01<00:00, 67.88it/s]


Trial 1, Fold 4, Epoch 2: F1=0.4615


Training: 100%|██████████| 449/449 [00:09<00:00, 46.22it/s, loss=0.143]
Validating: 100%|██████████| 112/112 [00:01<00:00, 68.33it/s]


Trial 1, Fold 4, Epoch 3: F1=0.4615


Training: 100%|██████████| 449/449 [00:09<00:00, 46.03it/s, loss=0.134]
Validating: 100%|██████████| 112/112 [00:01<00:00, 65.80it/s]
[I 2025-12-04 19:56:26,571] Trial 1 finished with value: 0.0 and parameters: {'learning_rate': 1.2551115172973821e-05, 'weight_decay': 0.08661761457749352, 'batch_size': 16, 'num_epochs': 12}. Best is trial 0 with value: 0.49174287418015855.


Trial 1, Fold 4, Epoch 4: F1=0.4615
Error in fold 4: name 'logger' is not defined
  Fold 5 F1: 0.0000

Trial 1 - Average F1: 0.0000
Fold scores: ['0.0000', '0.0000', '0.0000', '0.0000', '0.0000']


Trial 2
Learning Rate: 2.60e-04
Weight Decay: 0.0212
Batch Size: 32
Num Epochs: 9

  Fold 1/5: Train=7171, Val=1793


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at ../fine_tuned_models_mlm/twitter-xlm-roberta-base/final_MLM_model/model and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-12-04 19:56:27,376 - INFO - Label weights: {0: 0.5835774739583334, 1: 3.491236611489776}
2025-12-04 19:56:27,376 - INFO - Language weights: {'es': 0.9847470941344993, 'en': 1.0008296471794096, 'it': 1.0144232586860908}
2025-12-04 19:56:27,376 - INFO - Pos weight (for BCE): 5.9825


Froze: Embeddings + First 10 Encoder Layers
Trainable: Classification Head + Remaining Encoder Layers
Trainable parameters: 14,767,874 / 278,045,186 (5.31%)


Training: 100%|██████████| 225/225 [00:07<00:00, 28.36it/s, loss=0.14] 
Validating: 100%|██████████| 57/57 [00:01<00:00, 38.47it/s]


Trial 2, Fold 0, Epoch 1: F1=0.4612


Training: 100%|██████████| 225/225 [00:07<00:00, 28.24it/s, loss=0.118]
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.17it/s]


Trial 2, Fold 0, Epoch 2: F1=0.5133


Training: 100%|██████████| 225/225 [00:07<00:00, 28.30it/s, loss=0.102] 
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.45it/s]


Trial 2, Fold 0, Epoch 3: F1=0.5915


Training: 100%|██████████| 225/225 [00:07<00:00, 28.61it/s, loss=0.0907]
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.41it/s]


Trial 2, Fold 0, Epoch 4: F1=0.6348


Training: 100%|██████████| 225/225 [00:07<00:00, 28.17it/s, loss=0.0848]
Validating: 100%|██████████| 57/57 [00:01<00:00, 37.92it/s]


Trial 2, Fold 0, Epoch 5: F1=0.7124


Training: 100%|██████████| 225/225 [00:08<00:00, 27.29it/s, loss=0.0766]
Validating: 100%|██████████| 57/57 [00:01<00:00, 37.10it/s]


Trial 2, Fold 0, Epoch 6: F1=0.7609


Training: 100%|██████████| 225/225 [00:08<00:00, 27.85it/s, loss=0.0677]
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.41it/s]


Trial 2, Fold 0, Epoch 7: F1=0.7597


Training: 100%|██████████| 225/225 [00:07<00:00, 28.32it/s, loss=0.0611]
Validating: 100%|██████████| 57/57 [00:01<00:00, 38.22it/s]


Trial 2, Fold 0, Epoch 8: F1=0.7870


Training: 100%|██████████| 225/225 [00:07<00:00, 28.13it/s, loss=0.0576]
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.44it/s]


Trial 2, Fold 0, Epoch 9: F1=0.7833
  Fold 1 F1: 0.7870

  Fold 2/5: Train=7171, Val=1793


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at ../fine_tuned_models_mlm/twitter-xlm-roberta-base/final_MLM_model/model and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-12-04 19:57:53,427 - INFO - Label weights: {0: 0.5835774739583334, 1: 3.491236611489776}
2025-12-04 19:57:53,427 - INFO - Language weights: {'it': 0.990547924474408, 'en': 1.0000803940404794, 'es': 1.0093716814851124}
2025-12-04 19:57:53,428 - INFO - Pos weight (for BCE): 5.9825


Froze: Embeddings + First 10 Encoder Layers
Trainable: Classification Head + Remaining Encoder Layers
Trainable parameters: 14,767,874 / 278,045,186 (5.31%)


Training: 100%|██████████| 225/225 [00:07<00:00, 28.55it/s, loss=0.484]
Validating: 100%|██████████| 57/57 [00:01<00:00, 38.54it/s]


Trial 2, Fold 1, Epoch 1: F1=0.1254


Training: 100%|██████████| 225/225 [00:08<00:00, 28.04it/s, loss=0.139]
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.50it/s]


Trial 2, Fold 1, Epoch 2: F1=0.5168


Training: 100%|██████████| 225/225 [00:07<00:00, 28.73it/s, loss=0.103]
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.67it/s]


Trial 2, Fold 1, Epoch 3: F1=0.6097


Training: 100%|██████████| 225/225 [00:07<00:00, 28.74it/s, loss=0.0933]
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.70it/s]


Trial 2, Fold 1, Epoch 4: F1=0.6076


Training: 100%|██████████| 225/225 [00:07<00:00, 28.72it/s, loss=0.0838]
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.85it/s]


Trial 2, Fold 1, Epoch 5: F1=0.7049


Training: 100%|██████████| 225/225 [00:07<00:00, 28.70it/s, loss=0.0766]
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.82it/s]


Trial 2, Fold 1, Epoch 6: F1=0.7116


Training: 100%|██████████| 225/225 [00:07<00:00, 28.71it/s, loss=0.0686]
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.86it/s]


Trial 2, Fold 1, Epoch 7: F1=0.7113


Training: 100%|██████████| 225/225 [00:07<00:00, 28.78it/s, loss=0.0643]
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.75it/s]


Trial 2, Fold 1, Epoch 8: F1=0.7879


Training: 100%|██████████| 225/225 [00:07<00:00, 28.67it/s, loss=0.0593]
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.26it/s]


Trial 2, Fold 1, Epoch 9: F1=0.8113
  Fold 2 F1: 0.8113

  Fold 3/5: Train=7171, Val=1793


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at ../fine_tuned_models_mlm/twitter-xlm-roberta-base/final_MLM_model/model and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-12-04 19:59:17,970 - INFO - Label weights: {0: 0.5835774739583334, 1: 3.491236611489776}
2025-12-04 19:59:17,971 - INFO - Language weights: {'it': 0.9959626621507102, 'en': 1.0005485094858537, 'es': 1.003488828363436}
2025-12-04 19:59:17,971 - INFO - Pos weight (for BCE): 5.9825


Froze: Embeddings + First 10 Encoder Layers
Trainable: Classification Head + Remaining Encoder Layers
Trainable parameters: 14,767,874 / 278,045,186 (5.31%)


Training: 100%|██████████| 225/225 [00:08<00:00, 28.05it/s, loss=0.219]
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.63it/s]


Trial 2, Fold 2, Epoch 1: F1=0.3707


Training: 100%|██████████| 225/225 [00:07<00:00, 28.14it/s, loss=0.116]
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.76it/s]


Trial 2, Fold 2, Epoch 2: F1=0.5061


Training: 100%|██████████| 225/225 [00:07<00:00, 28.65it/s, loss=0.101] 
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.56it/s]


Trial 2, Fold 2, Epoch 3: F1=0.6777


Training: 100%|██████████| 225/225 [00:07<00:00, 28.48it/s, loss=0.0923]
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.59it/s]


Trial 2, Fold 2, Epoch 4: F1=0.7175


Training: 100%|██████████| 225/225 [00:07<00:00, 28.64it/s, loss=0.0834]
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.27it/s]


Trial 2, Fold 2, Epoch 5: F1=0.7340


Training: 100%|██████████| 225/225 [00:07<00:00, 28.63it/s, loss=0.0761]
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.43it/s]


Trial 2, Fold 2, Epoch 6: F1=0.7660


Training: 100%|██████████| 225/225 [00:07<00:00, 28.73it/s, loss=0.0685]
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.73it/s]


Trial 2, Fold 2, Epoch 7: F1=0.7624


Training: 100%|██████████| 225/225 [00:07<00:00, 28.75it/s, loss=0.0616]
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.85it/s]


Trial 2, Fold 2, Epoch 8: F1=0.7705


Training: 100%|██████████| 225/225 [00:07<00:00, 28.72it/s, loss=0.0566]
Validating: 100%|██████████| 57/57 [00:01<00:00, 38.00it/s]


Trial 2, Fold 2, Epoch 9: F1=0.7188
  Fold 3 F1: 0.7705

  Fold 4/5: Train=7171, Val=1793


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at ../fine_tuned_models_mlm/twitter-xlm-roberta-base/final_MLM_model/model and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-12-04 20:00:42,794 - INFO - Label weights: {0: 0.5835774739583334, 1: 3.491236611489776}
2025-12-04 20:00:42,795 - INFO - Language weights: {'en': 0.9905452280968097, 'es': 0.9996594041813475, 'it': 1.0097953677218428}
2025-12-04 20:00:42,795 - INFO - Pos weight (for BCE): 5.9825


Froze: Embeddings + First 10 Encoder Layers
Trainable: Classification Head + Remaining Encoder Layers
Trainable parameters: 14,767,874 / 278,045,186 (5.31%)


Training: 100%|██████████| 225/225 [00:07<00:00, 28.62it/s, loss=0.575]
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.70it/s]


Trial 2, Fold 3, Epoch 1: F1=0.1254


Training: 100%|██████████| 225/225 [00:07<00:00, 28.65it/s, loss=0.144]
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.62it/s]


Trial 2, Fold 3, Epoch 2: F1=0.5323


Training: 100%|██████████| 225/225 [00:07<00:00, 28.72it/s, loss=0.104]
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.93it/s]


Trial 2, Fold 3, Epoch 3: F1=0.5986


Training: 100%|██████████| 225/225 [00:07<00:00, 28.63it/s, loss=0.0947]
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.79it/s]


Trial 2, Fold 3, Epoch 4: F1=0.7076


Training: 100%|██████████| 225/225 [00:07<00:00, 28.66it/s, loss=0.0847]
Validating: 100%|██████████| 57/57 [00:01<00:00, 38.52it/s]


Trial 2, Fold 3, Epoch 5: F1=0.7107


Training: 100%|██████████| 225/225 [00:08<00:00, 27.84it/s, loss=0.0782]
Validating: 100%|██████████| 57/57 [00:01<00:00, 37.95it/s]


Trial 2, Fold 3, Epoch 6: F1=0.7317


Training: 100%|██████████| 225/225 [00:07<00:00, 28.13it/s, loss=0.0742]
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.09it/s]


Trial 2, Fold 3, Epoch 7: F1=0.6799


Training: 100%|██████████| 225/225 [00:07<00:00, 28.53it/s, loss=0.0679]
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.14it/s]


Trial 2, Fold 3, Epoch 8: F1=0.7429


Training: 100%|██████████| 225/225 [00:07<00:00, 28.49it/s, loss=0.0573]
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.38it/s]


Trial 2, Fold 3, Epoch 9: F1=0.8177
  Fold 4 F1: 0.8177

  Fold 5/5: Train=7172, Val=1792


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at ../fine_tuned_models_mlm/twitter-xlm-roberta-base/final_MLM_model/model and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-12-04 20:02:07,837 - INFO - Label weights: {0: 0.5836588541666666, 1: 3.4883268482490273}
2025-12-04 20:02:07,837 - INFO - Language weights: {'it': 0.9894548630811957, 'es': 1.0027361364111447, 'en': 1.0078090005076596}
2025-12-04 20:02:07,837 - INFO - Pos weight (for BCE): 5.9767


Froze: Embeddings + First 10 Encoder Layers
Trainable: Classification Head + Remaining Encoder Layers
Trainable parameters: 14,767,874 / 278,045,186 (5.31%)


Training: 100%|██████████| 225/225 [00:07<00:00, 28.48it/s, loss=0.304]
Validating: 100%|██████████| 56/56 [00:01<00:00, 38.64it/s]


Trial 2, Fold 4, Epoch 1: F1=0.1318


Training: 100%|██████████| 225/225 [00:07<00:00, 28.50it/s, loss=0.129]
Validating: 100%|██████████| 56/56 [00:01<00:00, 38.34it/s]


Trial 2, Fold 4, Epoch 2: F1=0.5057


Training: 100%|██████████| 225/225 [00:07<00:00, 28.19it/s, loss=0.108]
Validating: 100%|██████████| 56/56 [00:01<00:00, 38.66it/s]


Trial 2, Fold 4, Epoch 3: F1=0.5107


Training: 100%|██████████| 225/225 [00:07<00:00, 28.35it/s, loss=0.0917]
Validating: 100%|██████████| 56/56 [00:01<00:00, 38.07it/s]


Trial 2, Fold 4, Epoch 4: F1=0.6355


Training: 100%|██████████| 225/225 [00:07<00:00, 28.45it/s, loss=0.083] 
Validating: 100%|██████████| 56/56 [00:01<00:00, 38.47it/s]


Trial 2, Fold 4, Epoch 5: F1=0.6635


Training: 100%|██████████| 225/225 [00:07<00:00, 28.29it/s, loss=0.0777]
Validating: 100%|██████████| 56/56 [00:01<00:00, 38.88it/s]


Trial 2, Fold 4, Epoch 6: F1=0.6817


Training: 100%|██████████| 225/225 [00:07<00:00, 28.41it/s, loss=0.0716]
Validating: 100%|██████████| 56/56 [00:01<00:00, 37.81it/s]


Trial 2, Fold 4, Epoch 7: F1=0.7728


Training: 100%|██████████| 225/225 [00:07<00:00, 28.28it/s, loss=0.0626]
Validating: 100%|██████████| 56/56 [00:01<00:00, 38.35it/s]


Trial 2, Fold 4, Epoch 8: F1=0.7454


Training: 100%|██████████| 225/225 [00:07<00:00, 28.50it/s, loss=0.0565]
Validating: 100%|██████████| 56/56 [00:01<00:00, 38.51it/s]
[I 2025-12-04 20:03:32,428] Trial 2 finished with value: 0.7918504359347013 and parameters: {'learning_rate': 0.00025959425503112657, 'weight_decay': 0.021233911067827616, 'batch_size': 32, 'num_epochs': 9}. Best is trial 2 with value: 0.7918504359347013.


Trial 2, Fold 4, Epoch 9: F1=0.7424
  Fold 5 F1: 0.7728

Trial 2 - Average F1: 0.7919
Fold scores: ['0.7870', '0.8113', '0.7705', '0.8177', '0.7728']


Trial 3
Learning Rate: 5.42e-05
Weight Decay: 0.0291
Batch Size: 8
Num Epochs: 7

  Fold 1/5: Train=7171, Val=1793


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at ../fine_tuned_models_mlm/twitter-xlm-roberta-base/final_MLM_model/model and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-12-04 20:03:33,274 - INFO - Label weights: {0: 0.5835774739583334, 1: 3.491236611489776}
2025-12-04 20:03:33,274 - INFO - Language weights: {'es': 0.9847470941344993, 'en': 1.0008296471794096, 'it': 1.0144232586860908}
2025-12-04 20:03:33,274 - INFO - Pos weight (for BCE): 5.9825


Froze: Embeddings + First 10 Encoder Layers
Trainable: Classification Head + Remaining Encoder Layers
Trainable parameters: 14,767,874 / 278,045,186 (5.31%)


Training: 100%|██████████| 897/897 [00:12<00:00, 73.27it/s, loss=0.163]
Validating: 100%|██████████| 225/225 [00:02<00:00, 110.73it/s]


Trial 3, Fold 0, Epoch 1: F1=0.4612


Training: 100%|██████████| 897/897 [00:12<00:00, 73.95it/s, loss=0.122]
Validating: 100%|██████████| 225/225 [00:02<00:00, 111.89it/s]


Trial 3, Fold 0, Epoch 2: F1=0.4614


Training: 100%|██████████| 897/897 [00:12<00:00, 74.00it/s, loss=0.117]
Validating: 100%|██████████| 225/225 [00:02<00:00, 112.11it/s]


Trial 3, Fold 0, Epoch 3: F1=0.4614


Training: 100%|██████████| 897/897 [00:12<00:00, 73.65it/s, loss=0.108]
Validating: 100%|██████████| 225/225 [00:02<00:00, 109.68it/s]


Trial 3, Fold 0, Epoch 4: F1=0.4651


Training: 100%|██████████| 897/897 [00:12<00:00, 74.02it/s, loss=0.105] 
Validating: 100%|██████████| 225/225 [00:02<00:00, 108.90it/s]


Trial 3, Fold 0, Epoch 5: F1=0.5216


Training: 100%|██████████| 897/897 [00:12<00:00, 73.46it/s, loss=0.104] 
Validating: 100%|██████████| 225/225 [00:02<00:00, 112.01it/s]


Trial 3, Fold 0, Epoch 6: F1=0.6147


Training: 100%|██████████| 897/897 [00:12<00:00, 74.00it/s, loss=0.1]  
Validating: 100%|██████████| 225/225 [00:02<00:00, 112.37it/s]


Trial 3, Fold 0, Epoch 7: F1=0.6356
  Fold 1 F1: 0.6356

  Fold 2/5: Train=7171, Val=1793


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at ../fine_tuned_models_mlm/twitter-xlm-roberta-base/final_MLM_model/model and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-12-04 20:05:13,369 - INFO - Label weights: {0: 0.5835774739583334, 1: 3.491236611489776}
2025-12-04 20:05:13,370 - INFO - Language weights: {'it': 0.990547924474408, 'en': 1.0000803940404794, 'es': 1.0093716814851124}
2025-12-04 20:05:13,370 - INFO - Pos weight (for BCE): 5.9825


Froze: Embeddings + First 10 Encoder Layers
Trainable: Classification Head + Remaining Encoder Layers
Trainable parameters: 14,767,874 / 278,045,186 (5.31%)


Training: 100%|██████████| 897/897 [00:12<00:00, 73.86it/s, loss=0.435]
Validating: 100%|██████████| 225/225 [00:02<00:00, 112.07it/s]


Trial 3, Fold 1, Epoch 1: F1=0.1254


Training: 100%|██████████| 897/897 [00:12<00:00, 74.06it/s, loss=0.182]
Validating: 100%|██████████| 225/225 [00:01<00:00, 112.94it/s]


Trial 3, Fold 1, Epoch 2: F1=0.4614


Training: 100%|██████████| 897/897 [00:12<00:00, 73.64it/s, loss=0.136]
Validating: 100%|██████████| 225/225 [00:02<00:00, 110.78it/s]


Trial 3, Fold 1, Epoch 3: F1=0.4692


Training: 100%|██████████| 897/897 [00:12<00:00, 72.95it/s, loss=0.127]
Validating: 100%|██████████| 225/225 [00:02<00:00, 109.66it/s]


Trial 3, Fold 1, Epoch 4: F1=0.5381


Training: 100%|██████████| 897/897 [00:12<00:00, 73.51it/s, loss=0.115]
Validating: 100%|██████████| 225/225 [00:01<00:00, 113.69it/s]


Trial 3, Fold 1, Epoch 5: F1=0.6219


Training: 100%|██████████| 897/897 [00:12<00:00, 74.56it/s, loss=0.111]
Validating: 100%|██████████| 225/225 [00:01<00:00, 113.11it/s]


Trial 3, Fold 1, Epoch 6: F1=0.6253


Training: 100%|██████████| 897/897 [00:12<00:00, 74.54it/s, loss=0.107] 
Validating: 100%|██████████| 225/225 [00:01<00:00, 113.18it/s]


Trial 3, Fold 1, Epoch 7: F1=0.6953
  Fold 2 F1: 0.6953

  Fold 3/5: Train=7171, Val=1793


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at ../fine_tuned_models_mlm/twitter-xlm-roberta-base/final_MLM_model/model and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-12-04 20:06:53,210 - INFO - Label weights: {0: 0.5835774739583334, 1: 3.491236611489776}
2025-12-04 20:06:53,211 - INFO - Language weights: {'it': 0.9959626621507102, 'en': 1.0005485094858537, 'es': 1.003488828363436}
2025-12-04 20:06:53,211 - INFO - Pos weight (for BCE): 5.9825


Froze: Embeddings + First 10 Encoder Layers
Trainable: Classification Head + Remaining Encoder Layers
Trainable parameters: 14,767,874 / 278,045,186 (5.31%)


Training: 100%|██████████| 897/897 [00:12<00:00, 74.48it/s, loss=0.167]
Validating: 100%|██████████| 225/225 [00:01<00:00, 113.28it/s]


Trial 3, Fold 2, Epoch 1: F1=0.4614


Training: 100%|██████████| 897/897 [00:12<00:00, 74.52it/s, loss=0.165]
Validating: 100%|██████████| 225/225 [00:01<00:00, 113.16it/s]


Trial 3, Fold 2, Epoch 2: F1=0.4614


Training: 100%|██████████| 897/897 [00:12<00:00, 74.55it/s, loss=0.149]
Validating: 100%|██████████| 225/225 [00:01<00:00, 112.99it/s]


Trial 3, Fold 2, Epoch 3: F1=0.4614


Training: 100%|██████████| 897/897 [00:12<00:00, 74.48it/s, loss=0.132]
Validating: 100%|██████████| 225/225 [00:01<00:00, 113.59it/s]


Trial 3, Fold 2, Epoch 4: F1=0.4654


Training: 100%|██████████| 897/897 [00:12<00:00, 74.50it/s, loss=0.118]
Validating: 100%|██████████| 225/225 [00:01<00:00, 113.29it/s]


Trial 3, Fold 2, Epoch 5: F1=0.5239


Training: 100%|██████████| 897/897 [00:12<00:00, 74.48it/s, loss=0.112]
Validating: 100%|██████████| 225/225 [00:01<00:00, 113.36it/s]


Trial 3, Fold 2, Epoch 6: F1=0.5863


Training: 100%|██████████| 897/897 [00:12<00:00, 74.52it/s, loss=0.105]
Validating: 100%|██████████| 225/225 [00:01<00:00, 113.36it/s]


Trial 3, Fold 2, Epoch 7: F1=0.6393
  Fold 3 F1: 0.6393

  Fold 4/5: Train=7171, Val=1793


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at ../fine_tuned_models_mlm/twitter-xlm-roberta-base/final_MLM_model/model and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-12-04 20:08:32,194 - INFO - Label weights: {0: 0.5835774739583334, 1: 3.491236611489776}
2025-12-04 20:08:32,195 - INFO - Language weights: {'en': 0.9905452280968097, 'es': 0.9996594041813475, 'it': 1.0097953677218428}
2025-12-04 20:08:32,195 - INFO - Pos weight (for BCE): 5.9825


Froze: Embeddings + First 10 Encoder Layers
Trainable: Classification Head + Remaining Encoder Layers
Trainable parameters: 14,767,874 / 278,045,186 (5.31%)


Training: 100%|██████████| 897/897 [00:12<00:00, 74.52it/s, loss=0.14] 
Validating: 100%|██████████| 225/225 [00:01<00:00, 113.74it/s]


Trial 3, Fold 3, Epoch 1: F1=0.4614


Training: 100%|██████████| 897/897 [00:12<00:00, 74.59it/s, loss=0.141]
Validating: 100%|██████████| 225/225 [00:01<00:00, 113.68it/s]


Trial 3, Fold 3, Epoch 2: F1=0.4614


Training: 100%|██████████| 897/897 [00:12<00:00, 74.56it/s, loss=0.13] 
Validating: 100%|██████████| 225/225 [00:01<00:00, 113.07it/s]


Trial 3, Fold 3, Epoch 3: F1=0.4614


Training: 100%|██████████| 897/897 [00:12<00:00, 74.41it/s, loss=0.117]
Validating: 100%|██████████| 225/225 [00:01<00:00, 113.62it/s]


Trial 3, Fold 3, Epoch 4: F1=0.4850


Training: 100%|██████████| 897/897 [00:12<00:00, 74.67it/s, loss=0.111]
Validating: 100%|██████████| 225/225 [00:01<00:00, 113.56it/s]


Trial 3, Fold 3, Epoch 5: F1=0.6050


Training: 100%|██████████| 897/897 [00:12<00:00, 74.40it/s, loss=0.108]
Validating: 100%|██████████| 225/225 [00:01<00:00, 113.19it/s]


Trial 3, Fold 3, Epoch 6: F1=0.6397


Training: 100%|██████████| 897/897 [00:12<00:00, 74.48it/s, loss=0.106] 
Validating: 100%|██████████| 225/225 [00:01<00:00, 113.26it/s]


Trial 3, Fold 3, Epoch 7: F1=0.6262
  Fold 4 F1: 0.6397

  Fold 5/5: Train=7172, Val=1792


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at ../fine_tuned_models_mlm/twitter-xlm-roberta-base/final_MLM_model/model and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-12-04 20:10:11,137 - INFO - Label weights: {0: 0.5836588541666666, 1: 3.4883268482490273}
2025-12-04 20:10:11,138 - INFO - Language weights: {'it': 0.9894548630811957, 'es': 1.0027361364111447, 'en': 1.0078090005076596}
2025-12-04 20:10:11,138 - INFO - Pos weight (for BCE): 5.9767


Froze: Embeddings + First 10 Encoder Layers
Trainable: Classification Head + Remaining Encoder Layers
Trainable parameters: 14,767,874 / 278,045,186 (5.31%)


Training: 100%|██████████| 897/897 [00:12<00:00, 74.49it/s, loss=0.4]  
Validating: 100%|██████████| 224/224 [00:01<00:00, 113.34it/s]


Trial 3, Fold 4, Epoch 1: F1=0.1250


Training: 100%|██████████| 897/897 [00:12<00:00, 74.56it/s, loss=0.173]
Validating: 100%|██████████| 224/224 [00:01<00:00, 112.09it/s]


Trial 3, Fold 4, Epoch 2: F1=0.4615


Training: 100%|██████████| 897/897 [00:12<00:00, 74.67it/s, loss=0.127]
Validating: 100%|██████████| 224/224 [00:01<00:00, 112.54it/s]


Trial 3, Fold 4, Epoch 3: F1=0.4615


Training: 100%|██████████| 897/897 [00:12<00:00, 74.43it/s, loss=0.117]
Validating: 100%|██████████| 224/224 [00:01<00:00, 113.18it/s]


Trial 3, Fold 4, Epoch 4: F1=0.4848


Training: 100%|██████████| 897/897 [00:12<00:00, 74.47it/s, loss=0.109]
Validating: 100%|██████████| 224/224 [00:01<00:00, 113.01it/s]


Trial 3, Fold 4, Epoch 5: F1=0.5369


Training: 100%|██████████| 897/897 [00:12<00:00, 74.55it/s, loss=0.109]
Validating: 100%|██████████| 224/224 [00:01<00:00, 113.17it/s]


Trial 3, Fold 4, Epoch 6: F1=0.6139


Training: 100%|██████████| 897/897 [00:12<00:00, 74.55it/s, loss=0.105]
Validating: 100%|██████████| 224/224 [00:01<00:00, 113.28it/s]
[I 2025-12-04 20:11:49,369] Trial 3 finished with value: 0.6484322783086813 and parameters: {'learning_rate': 5.4182823195332406e-05, 'weight_decay': 0.029122914019804193, 'batch_size': 8, 'num_epochs': 7}. Best is trial 2 with value: 0.7918504359347013.


Trial 3, Fold 4, Epoch 7: F1=0.6322
  Fold 5 F1: 0.6322

Trial 3 - Average F1: 0.6484
Fold scores: ['0.6356', '0.6953', '0.6393', '0.6397', '0.6322']


Trial 4
Learning Rate: 5.95e-05
Weight Decay: 0.0785
Batch Size: 32
Num Epochs: 5

  Fold 1/5: Train=7171, Val=1793


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at ../fine_tuned_models_mlm/twitter-xlm-roberta-base/final_MLM_model/model and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-12-04 20:11:50,190 - INFO - Label weights: {0: 0.5835774739583334, 1: 3.491236611489776}
2025-12-04 20:11:50,190 - INFO - Language weights: {'es': 0.9847470941344993, 'en': 1.0008296471794096, 'it': 1.0144232586860908}
2025-12-04 20:11:50,190 - INFO - Pos weight (for BCE): 5.9825


Froze: Embeddings + First 10 Encoder Layers
Trainable: Classification Head + Remaining Encoder Layers
Trainable parameters: 14,767,874 / 278,045,186 (5.31%)


Training: 100%|██████████| 225/225 [00:07<00:00, 28.68it/s, loss=0.132]
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.44it/s]


Trial 4, Fold 0, Epoch 1: F1=0.4614


Training: 100%|██████████| 225/225 [00:07<00:00, 28.65it/s, loss=0.12] 
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.77it/s]


Trial 4, Fold 0, Epoch 2: F1=0.4614


Training: 100%|██████████| 225/225 [00:07<00:00, 28.57it/s, loss=0.11] 
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.42it/s]


Trial 4, Fold 0, Epoch 3: F1=0.4771


Training: 100%|██████████| 225/225 [00:07<00:00, 28.65it/s, loss=0.102]
Validating: 100%|██████████| 57/57 [00:01<00:00, 38.31it/s]


Trial 4, Fold 0, Epoch 4: F1=0.6125


Training: 100%|██████████| 225/225 [00:08<00:00, 28.09it/s, loss=0.0943]
Validating: 100%|██████████| 57/57 [00:01<00:00, 38.79it/s]


Trial 4, Fold 0, Epoch 5: F1=0.6342
  Fold 1 F1: 0.6342

  Fold 2/5: Train=7171, Val=1793


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at ../fine_tuned_models_mlm/twitter-xlm-roberta-base/final_MLM_model/model and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-12-04 20:12:37,733 - INFO - Label weights: {0: 0.5835774739583334, 1: 3.491236611489776}
2025-12-04 20:12:37,733 - INFO - Language weights: {'it': 0.990547924474408, 'en': 1.0000803940404794, 'es': 1.0093716814851124}
2025-12-04 20:12:37,733 - INFO - Pos weight (for BCE): 5.9825


Froze: Embeddings + First 10 Encoder Layers
Trainable: Classification Head + Remaining Encoder Layers
Trainable parameters: 14,767,874 / 278,045,186 (5.31%)


Training: 100%|██████████| 225/225 [00:07<00:00, 28.44it/s, loss=0.182]
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.49it/s]


Trial 4, Fold 1, Epoch 1: F1=0.5415


Training: 100%|██████████| 225/225 [00:07<00:00, 28.51it/s, loss=0.121]
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.41it/s]


Trial 4, Fold 1, Epoch 2: F1=0.4612


Training: 100%|██████████| 225/225 [00:07<00:00, 28.49it/s, loss=0.107]
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.38it/s]


Trial 4, Fold 1, Epoch 3: F1=0.5339


Training: 100%|██████████| 225/225 [00:07<00:00, 28.56it/s, loss=0.1]   
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.36it/s]


Trial 4, Fold 1, Epoch 4: F1=0.6538


Training: 100%|██████████| 225/225 [00:07<00:00, 28.51it/s, loss=0.0963]
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.44it/s]


Trial 4, Fold 1, Epoch 5: F1=0.6574
  Fold 2 F1: 0.6574

  Fold 3/5: Train=7171, Val=1793


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at ../fine_tuned_models_mlm/twitter-xlm-roberta-base/final_MLM_model/model and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-12-04 20:13:25,204 - INFO - Label weights: {0: 0.5835774739583334, 1: 3.491236611489776}
2025-12-04 20:13:25,204 - INFO - Language weights: {'it': 0.9959626621507102, 'en': 1.0005485094858537, 'es': 1.003488828363436}
2025-12-04 20:13:25,204 - INFO - Pos weight (for BCE): 5.9825


Froze: Embeddings + First 10 Encoder Layers
Trainable: Classification Head + Remaining Encoder Layers
Trainable parameters: 14,767,874 / 278,045,186 (5.31%)


Training: 100%|██████████| 225/225 [00:07<00:00, 28.50it/s, loss=0.145]
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.46it/s]


Trial 4, Fold 2, Epoch 1: F1=0.4611


Training: 100%|██████████| 225/225 [00:07<00:00, 28.57it/s, loss=0.125]
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.41it/s]


Trial 4, Fold 2, Epoch 2: F1=0.4614


Training: 100%|██████████| 225/225 [00:07<00:00, 28.52it/s, loss=0.109]
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.52it/s]


Trial 4, Fold 2, Epoch 3: F1=0.5061


Training: 100%|██████████| 225/225 [00:07<00:00, 28.46it/s, loss=0.0997]
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.33it/s]


Trial 4, Fold 2, Epoch 4: F1=0.5196


Training: 100%|██████████| 225/225 [00:07<00:00, 28.52it/s, loss=0.094] 
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.44it/s]


Trial 4, Fold 2, Epoch 5: F1=0.6110
  Fold 3 F1: 0.6110

  Fold 4/5: Train=7171, Val=1793


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at ../fine_tuned_models_mlm/twitter-xlm-roberta-base/final_MLM_model/model and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-12-04 20:14:12,695 - INFO - Label weights: {0: 0.5835774739583334, 1: 3.491236611489776}
2025-12-04 20:14:12,695 - INFO - Language weights: {'en': 0.9905452280968097, 'es': 0.9996594041813475, 'it': 1.0097953677218428}
2025-12-04 20:14:12,695 - INFO - Pos weight (for BCE): 5.9825


Froze: Embeddings + First 10 Encoder Layers
Trainable: Classification Head + Remaining Encoder Layers
Trainable parameters: 14,767,874 / 278,045,186 (5.31%)


Training: 100%|██████████| 225/225 [00:07<00:00, 28.53it/s, loss=0.176]
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.45it/s]


Trial 4, Fold 3, Epoch 1: F1=0.4939


Training: 100%|██████████| 225/225 [00:07<00:00, 28.50it/s, loss=0.123]
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.53it/s]


Trial 4, Fold 3, Epoch 2: F1=0.4614


Training: 100%|██████████| 225/225 [00:07<00:00, 28.51it/s, loss=0.108]
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.46it/s]


Trial 4, Fold 3, Epoch 3: F1=0.5407


Training: 100%|██████████| 225/225 [00:07<00:00, 28.55it/s, loss=0.101] 
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.31it/s]


Trial 4, Fold 3, Epoch 4: F1=0.5660


Training: 100%|██████████| 225/225 [00:07<00:00, 28.47it/s, loss=0.0944]
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.59it/s]


Trial 4, Fold 3, Epoch 5: F1=0.5755
  Fold 4 F1: 0.5755

  Fold 5/5: Train=7172, Val=1792


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at ../fine_tuned_models_mlm/twitter-xlm-roberta-base/final_MLM_model/model and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-12-04 20:15:00,141 - INFO - Label weights: {0: 0.5836588541666666, 1: 3.4883268482490273}
2025-12-04 20:15:00,142 - INFO - Language weights: {'it': 0.9894548630811957, 'es': 1.0027361364111447, 'en': 1.0078090005076596}
2025-12-04 20:15:00,142 - INFO - Pos weight (for BCE): 5.9767


Froze: Embeddings + First 10 Encoder Layers
Trainable: Classification Head + Remaining Encoder Layers
Trainable parameters: 14,767,874 / 278,045,186 (5.31%)


Training: 100%|██████████| 225/225 [00:07<00:00, 28.51it/s, loss=0.137]
Validating: 100%|██████████| 56/56 [00:01<00:00, 39.01it/s]


Trial 4, Fold 4, Epoch 1: F1=0.4615


Training: 100%|██████████| 225/225 [00:07<00:00, 28.54it/s, loss=0.129]
Validating: 100%|██████████| 56/56 [00:01<00:00, 38.76it/s]


Trial 4, Fold 4, Epoch 2: F1=0.4615


Training: 100%|██████████| 225/225 [00:07<00:00, 28.54it/s, loss=0.114]
Validating: 100%|██████████| 56/56 [00:01<00:00, 38.74it/s]


Trial 4, Fold 4, Epoch 3: F1=0.4962


Training: 100%|██████████| 225/225 [00:07<00:00, 28.55it/s, loss=0.105]
Validating: 100%|██████████| 56/56 [00:01<00:00, 38.97it/s]


Trial 4, Fold 4, Epoch 4: F1=0.5887


Training: 100%|██████████| 225/225 [00:07<00:00, 28.42it/s, loss=0.0979]
Validating: 100%|██████████| 56/56 [00:01<00:00, 39.02it/s]
[I 2025-12-04 20:15:46,871] Trial 4 finished with value: 0.6154495927977643 and parameters: {'learning_rate': 5.954553793888986e-05, 'weight_decay': 0.07851759613930137, 'batch_size': 32, 'num_epochs': 5}. Best is trial 2 with value: 0.7918504359347013.


Trial 4, Fold 4, Epoch 5: F1=0.5992
  Fold 5 F1: 0.5992

Trial 4 - Average F1: 0.6154
Fold scores: ['0.6342', '0.6574', '0.6110', '0.5755', '0.5992']


Trial 5
Learning Rate: 1.08e-04
Weight Decay: 0.0171
Batch Size: 32
Num Epochs: 11

  Fold 1/5: Train=7171, Val=1793


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at ../fine_tuned_models_mlm/twitter-xlm-roberta-base/final_MLM_model/model and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-12-04 20:15:47,701 - INFO - Label weights: {0: 0.5835774739583334, 1: 3.491236611489776}
2025-12-04 20:15:47,701 - INFO - Language weights: {'es': 0.9847470941344993, 'en': 1.0008296471794096, 'it': 1.0144232586860908}
2025-12-04 20:15:47,701 - INFO - Pos weight (for BCE): 5.9825


Froze: Embeddings + First 10 Encoder Layers
Trainable: Classification Head + Remaining Encoder Layers
Trainable parameters: 14,767,874 / 278,045,186 (5.31%)


Training: 100%|██████████| 225/225 [00:07<00:00, 28.59it/s, loss=0.134]
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.56it/s]


Trial 5, Fold 0, Epoch 1: F1=0.4612


Training: 100%|██████████| 225/225 [00:07<00:00, 28.56it/s, loss=0.12] 
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.66it/s]


Trial 5, Fold 0, Epoch 2: F1=0.4614


Training: 100%|██████████| 225/225 [00:07<00:00, 28.60it/s, loss=0.108]
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.41it/s]


Trial 5, Fold 0, Epoch 3: F1=0.4727


Training: 100%|██████████| 225/225 [00:07<00:00, 28.66it/s, loss=0.102]
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.52it/s]


Trial 5, Fold 0, Epoch 4: F1=0.5713


Training: 100%|██████████| 225/225 [00:07<00:00, 28.62it/s, loss=0.0975]
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.53it/s]


Trial 5, Fold 0, Epoch 5: F1=0.6258


Training: 100%|██████████| 225/225 [00:07<00:00, 28.62it/s, loss=0.0897]
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.59it/s]


Trial 5, Fold 0, Epoch 6: F1=0.6384


Training: 100%|██████████| 225/225 [00:07<00:00, 28.60it/s, loss=0.0826]
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.37it/s]


Trial 5, Fold 0, Epoch 7: F1=0.6814


Training: 100%|██████████| 225/225 [00:07<00:00, 28.57it/s, loss=0.0814]
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.59it/s]


Trial 5, Fold 0, Epoch 8: F1=0.6585


Training: 100%|██████████| 225/225 [00:07<00:00, 28.58it/s, loss=0.077] 
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.46it/s]


Trial 5, Fold 0, Epoch 9: F1=0.7201


Training: 100%|██████████| 225/225 [00:07<00:00, 28.46it/s, loss=0.0744]
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.51it/s]


Trial 5, Fold 0, Epoch 10: F1=0.7163


Training: 100%|██████████| 225/225 [00:07<00:00, 28.49it/s, loss=0.0675]
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.47it/s]


Trial 5, Fold 0, Epoch 11: F1=0.7396
  Fold 1 F1: 0.7396

  Fold 2/5: Train=7171, Val=1793


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at ../fine_tuned_models_mlm/twitter-xlm-roberta-base/final_MLM_model/model and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-12-04 20:17:31,027 - INFO - Label weights: {0: 0.5835774739583334, 1: 3.491236611489776}
2025-12-04 20:17:31,028 - INFO - Language weights: {'it': 0.990547924474408, 'en': 1.0000803940404794, 'es': 1.0093716814851124}
2025-12-04 20:17:31,028 - INFO - Pos weight (for BCE): 5.9825


Froze: Embeddings + First 10 Encoder Layers
Trainable: Classification Head + Remaining Encoder Layers
Trainable parameters: 14,767,874 / 278,045,186 (5.31%)


Training: 100%|██████████| 225/225 [00:07<00:00, 28.53it/s, loss=0.147]
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.44it/s]


Trial 5, Fold 1, Epoch 1: F1=0.4784


Training: 100%|██████████| 225/225 [00:07<00:00, 28.61it/s, loss=0.124]
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.54it/s]


Trial 5, Fold 1, Epoch 2: F1=0.4614


Training: 100%|██████████| 225/225 [00:07<00:00, 28.54it/s, loss=0.112]
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.47it/s]


Trial 5, Fold 1, Epoch 3: F1=0.4836


Training: 100%|██████████| 225/225 [00:07<00:00, 28.57it/s, loss=0.102]
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.51it/s]


Trial 5, Fold 1, Epoch 4: F1=0.5381


Training: 100%|██████████| 225/225 [00:07<00:00, 28.58it/s, loss=0.0974]
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.48it/s]


Trial 5, Fold 1, Epoch 5: F1=0.6493


Training: 100%|██████████| 225/225 [00:07<00:00, 28.59it/s, loss=0.0906]
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.37it/s]


Trial 5, Fold 1, Epoch 6: F1=0.6308


Training: 100%|██████████| 225/225 [00:07<00:00, 28.55it/s, loss=0.0863]
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.46it/s]


Trial 5, Fold 1, Epoch 7: F1=0.7184


Training: 100%|██████████| 225/225 [00:07<00:00, 28.55it/s, loss=0.0847]
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.35it/s]


Trial 5, Fold 1, Epoch 8: F1=0.7590


Training: 100%|██████████| 225/225 [00:07<00:00, 28.53it/s, loss=0.0782]
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.37it/s]


Trial 5, Fold 1, Epoch 9: F1=0.7587


Training: 100%|██████████| 225/225 [00:07<00:00, 28.25it/s, loss=0.0744]
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.29it/s]


Trial 5, Fold 1, Epoch 10: F1=0.7322


Training: 100%|██████████| 225/225 [00:07<00:00, 28.44it/s, loss=0.0699]
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.23it/s]


Trial 5, Fold 1, Epoch 11: F1=0.7521
Error in fold 1: name 'logger' is not defined
  Fold 2 F1: 0.0000

  Fold 3/5: Train=7171, Val=1793


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at ../fine_tuned_models_mlm/twitter-xlm-roberta-base/final_MLM_model/model and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-12-04 20:19:14,578 - INFO - Label weights: {0: 0.5835774739583334, 1: 3.491236611489776}
2025-12-04 20:19:14,578 - INFO - Language weights: {'it': 0.9959626621507102, 'en': 1.0005485094858537, 'es': 1.003488828363436}
2025-12-04 20:19:14,579 - INFO - Pos weight (for BCE): 5.9825


Froze: Embeddings + First 10 Encoder Layers
Trainable: Classification Head + Remaining Encoder Layers
Trainable parameters: 14,767,874 / 278,045,186 (5.31%)


Training: 100%|██████████| 225/225 [00:07<00:00, 28.30it/s, loss=0.478]
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.28it/s]


Trial 5, Fold 2, Epoch 1: F1=0.1254


Training: 100%|██████████| 225/225 [00:07<00:00, 28.56it/s, loss=0.156]
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.48it/s]


Trial 5, Fold 2, Epoch 2: F1=0.4612


Training: 100%|██████████| 225/225 [00:07<00:00, 28.56it/s, loss=0.109]
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.66it/s]


Trial 5, Fold 2, Epoch 3: F1=0.4802


Training: 100%|██████████| 225/225 [00:07<00:00, 28.63it/s, loss=0.103]
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.57it/s]


Trial 5, Fold 2, Epoch 4: F1=0.5923


Training: 100%|██████████| 225/225 [00:07<00:00, 28.67it/s, loss=0.0943]
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.48it/s]


Trial 5, Fold 2, Epoch 5: F1=0.5728


Training: 100%|██████████| 225/225 [00:07<00:00, 28.63it/s, loss=0.0893]
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.64it/s]


Trial 5, Fold 2, Epoch 6: F1=0.5891


Training: 100%|██████████| 225/225 [00:07<00:00, 28.63it/s, loss=0.0853]
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.70it/s]


Trial 5, Fold 2, Epoch 7: F1=0.6803


Training: 100%|██████████| 225/225 [00:07<00:00, 28.55it/s, loss=0.0793]
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.58it/s]


Trial 5, Fold 2, Epoch 8: F1=0.7370


Training: 100%|██████████| 225/225 [00:07<00:00, 28.63it/s, loss=0.0752]
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.52it/s]


Trial 5, Fold 2, Epoch 9: F1=0.7118


Training: 100%|██████████| 225/225 [00:07<00:00, 28.60it/s, loss=0.0708]
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.69it/s]


Trial 5, Fold 2, Epoch 10: F1=0.7526


Training: 100%|██████████| 225/225 [00:07<00:00, 28.62it/s, loss=0.0673]
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.69it/s]


Trial 5, Fold 2, Epoch 11: F1=0.8030
  Fold 3 F1: 0.8030

  Fold 4/5: Train=7171, Val=1793


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at ../fine_tuned_models_mlm/twitter-xlm-roberta-base/final_MLM_model/model and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-12-04 20:20:57,873 - INFO - Label weights: {0: 0.5835774739583334, 1: 3.491236611489776}
2025-12-04 20:20:57,873 - INFO - Language weights: {'en': 0.9905452280968097, 'es': 0.9996594041813475, 'it': 1.0097953677218428}
2025-12-04 20:20:57,873 - INFO - Pos weight (for BCE): 5.9825


Froze: Embeddings + First 10 Encoder Layers
Trainable: Classification Head + Remaining Encoder Layers
Trainable parameters: 14,767,874 / 278,045,186 (5.31%)


Training: 100%|██████████| 225/225 [00:07<00:00, 28.63it/s, loss=0.29] 
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.61it/s]


Trial 5, Fold 3, Epoch 1: F1=0.1296


Training: 100%|██████████| 225/225 [00:07<00:00, 28.63it/s, loss=0.157]
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.49it/s]


Trial 5, Fold 3, Epoch 2: F1=0.4614


Training: 100%|██████████| 225/225 [00:07<00:00, 28.56it/s, loss=0.118]
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.70it/s]


Trial 5, Fold 3, Epoch 3: F1=0.5278


Training: 100%|██████████| 225/225 [00:07<00:00, 28.63it/s, loss=0.106]
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.41it/s]


Trial 5, Fold 3, Epoch 4: F1=0.5996


Training: 100%|██████████| 225/225 [00:07<00:00, 28.41it/s, loss=0.0991]
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.17it/s]


Trial 5, Fold 3, Epoch 5: F1=0.6426


Training: 100%|██████████| 225/225 [00:08<00:00, 28.03it/s, loss=0.0944]
Validating: 100%|██████████| 57/57 [00:01<00:00, 38.11it/s]


Trial 5, Fold 3, Epoch 6: F1=0.6448


Training: 100%|██████████| 225/225 [00:08<00:00, 28.01it/s, loss=0.0868]
Validating: 100%|██████████| 57/57 [00:01<00:00, 38.51it/s]


Trial 5, Fold 3, Epoch 7: F1=0.7230


Training: 100%|██████████| 225/225 [00:07<00:00, 28.59it/s, loss=0.0827]
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.67it/s]


Trial 5, Fold 3, Epoch 8: F1=0.7304


Training: 100%|██████████| 225/225 [00:07<00:00, 28.65it/s, loss=0.0786]
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.69it/s]


Trial 5, Fold 3, Epoch 9: F1=0.7477


Training: 100%|██████████| 225/225 [00:07<00:00, 28.67it/s, loss=0.0726]
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.56it/s]


Trial 5, Fold 3, Epoch 10: F1=0.7400


Training: 100%|██████████| 225/225 [00:07<00:00, 28.64it/s, loss=0.0693]
Validating: 100%|██████████| 57/57 [00:01<00:00, 39.72it/s]


Trial 5, Fold 3, Epoch 11: F1=0.7824
  Fold 4 F1: 0.7824

  Fold 5/5: Train=7172, Val=1792


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at ../fine_tuned_models_mlm/twitter-xlm-roberta-base/final_MLM_model/model and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-12-04 20:22:41,523 - INFO - Label weights: {0: 0.5836588541666666, 1: 3.4883268482490273}
2025-12-04 20:22:41,523 - INFO - Language weights: {'it': 0.9894548630811957, 'es': 1.0027361364111447, 'en': 1.0078090005076596}
2025-12-04 20:22:41,523 - INFO - Pos weight (for BCE): 5.9767


Froze: Embeddings + First 10 Encoder Layers
Trainable: Classification Head + Remaining Encoder Layers
Trainable parameters: 14,767,874 / 278,045,186 (5.31%)


Training: 100%|██████████| 225/225 [00:07<00:00, 28.66it/s, loss=0.186]
Validating: 100%|██████████| 56/56 [00:01<00:00, 39.00it/s]


Trial 5, Fold 4, Epoch 1: F1=0.5113


Training: 100%|██████████| 225/225 [00:07<00:00, 28.65it/s, loss=0.121]
Validating: 100%|██████████| 56/56 [00:01<00:00, 39.19it/s]


Trial 5, Fold 4, Epoch 2: F1=0.4615


Training: 100%|██████████| 225/225 [00:07<00:00, 28.67it/s, loss=0.111]
Validating: 100%|██████████| 56/56 [00:01<00:00, 38.98it/s]


Trial 5, Fold 4, Epoch 3: F1=0.4959


Training: 100%|██████████| 225/225 [00:07<00:00, 28.64it/s, loss=0.101] 
Validating: 100%|██████████| 56/56 [00:01<00:00, 38.67it/s]


Trial 5, Fold 4, Epoch 4: F1=0.6068


Training: 100%|██████████| 225/225 [00:07<00:00, 28.64it/s, loss=0.096] 
Validating: 100%|██████████| 56/56 [00:01<00:00, 39.04it/s]


Trial 5, Fold 4, Epoch 5: F1=0.6478


Training: 100%|██████████| 225/225 [00:07<00:00, 28.65it/s, loss=0.0916]
Validating: 100%|██████████| 56/56 [00:01<00:00, 39.01it/s]


Trial 5, Fold 4, Epoch 6: F1=0.6126


Training: 100%|██████████| 225/225 [00:07<00:00, 28.64it/s, loss=0.0898]
Validating: 100%|██████████| 56/56 [00:01<00:00, 38.93it/s]


Trial 5, Fold 4, Epoch 7: F1=0.6774


Training: 100%|██████████| 225/225 [00:07<00:00, 28.68it/s, loss=0.0822]
Validating: 100%|██████████| 56/56 [00:01<00:00, 38.85it/s]


Trial 5, Fold 4, Epoch 8: F1=0.7112


Training: 100%|██████████| 225/225 [00:07<00:00, 28.66it/s, loss=0.0755]
Validating: 100%|██████████| 56/56 [00:01<00:00, 39.02it/s]


Trial 5, Fold 4, Epoch 9: F1=0.7472


Training: 100%|██████████| 225/225 [00:07<00:00, 28.66it/s, loss=0.0736]
Validating: 100%|██████████| 56/56 [00:01<00:00, 39.20it/s]


Trial 5, Fold 4, Epoch 10: F1=0.7704


Training: 100%|██████████| 225/225 [00:07<00:00, 28.66it/s, loss=0.0667]
Validating: 100%|██████████| 56/56 [00:01<00:00, 38.87it/s]
[I 2025-12-04 20:24:23,844] Trial 5 finished with value: 0.6190891776643856 and parameters: {'learning_rate': 0.00010769622478263136, 'weight_decay': 0.017052412368729154, 'batch_size': 32, 'num_epochs': 11}. Best is trial 2 with value: 0.7918504359347013.


Trial 5, Fold 4, Epoch 11: F1=0.7527
  Fold 5 F1: 0.7704

Trial 5 - Average F1: 0.6191
Fold scores: ['0.7396', '0.0000', '0.8030', '0.7824', '0.7704']


Trial 6
Learning Rate: 3.29e-05
Weight Decay: 0.0098
Batch Size: 8
Num Epochs: 8

  Fold 1/5: Train=7171, Val=1793


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at ../fine_tuned_models_mlm/twitter-xlm-roberta-base/final_MLM_model/model and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-12-04 20:24:24,661 - INFO - Label weights: {0: 0.5835774739583334, 1: 3.491236611489776}
2025-12-04 20:24:24,661 - INFO - Language weights: {'es': 0.9847470941344993, 'en': 1.0008296471794096, 'it': 1.0144232586860908}
2025-12-04 20:24:24,661 - INFO - Pos weight (for BCE): 5.9825


Froze: Embeddings + First 10 Encoder Layers
Trainable: Classification Head + Remaining Encoder Layers
Trainable parameters: 14,767,874 / 278,045,186 (5.31%)


Training: 100%|██████████| 897/897 [00:12<00:00, 74.39it/s, loss=0.175]
Validating: 100%|██████████| 225/225 [00:01<00:00, 113.46it/s]


Trial 6, Fold 0, Epoch 1: F1=0.4612


Training: 100%|██████████| 897/897 [00:12<00:00, 74.46it/s, loss=0.141]
Validating: 100%|██████████| 225/225 [00:01<00:00, 113.29it/s]


Trial 6, Fold 0, Epoch 2: F1=0.4614


Training: 100%|██████████| 897/897 [00:12<00:00, 74.51it/s, loss=0.131]
Validating: 100%|██████████| 225/225 [00:01<00:00, 113.25it/s]


Trial 6, Fold 0, Epoch 3: F1=0.4614


Training: 100%|██████████| 897/897 [00:12<00:00, 74.48it/s, loss=0.124]
Validating: 100%|██████████| 225/225 [00:01<00:00, 113.72it/s]


Trial 6, Fold 0, Epoch 4: F1=0.4614


Training: 100%|██████████| 897/897 [00:12<00:00, 74.42it/s, loss=0.119]
Validating: 100%|██████████| 225/225 [00:01<00:00, 112.82it/s]


Trial 6, Fold 0, Epoch 5: F1=0.4614
Error in fold 0: name 'logger' is not defined
  Fold 1 F1: 0.0000

  Fold 2/5: Train=7171, Val=1793


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at ../fine_tuned_models_mlm/twitter-xlm-roberta-base/final_MLM_model/model and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-12-04 20:25:35,615 - INFO - Label weights: {0: 0.5835774739583334, 1: 3.491236611489776}
2025-12-04 20:25:35,616 - INFO - Language weights: {'it': 0.990547924474408, 'en': 1.0000803940404794, 'es': 1.0093716814851124}
2025-12-04 20:25:35,616 - INFO - Pos weight (for BCE): 5.9825


Froze: Embeddings + First 10 Encoder Layers
Trainable: Classification Head + Remaining Encoder Layers
Trainable parameters: 14,767,874 / 278,045,186 (5.31%)


Training: 100%|██████████| 897/897 [00:12<00:00, 74.46it/s, loss=0.146]
Validating: 100%|██████████| 225/225 [00:01<00:00, 113.67it/s]


Trial 6, Fold 1, Epoch 1: F1=0.4614


Training: 100%|██████████| 897/897 [00:12<00:00, 74.51it/s, loss=0.136]
Validating: 100%|██████████| 225/225 [00:01<00:00, 113.42it/s]


Trial 6, Fold 1, Epoch 2: F1=0.4614


Training: 100%|██████████| 897/897 [00:12<00:00, 74.40it/s, loss=0.127]
Validating: 100%|██████████| 225/225 [00:01<00:00, 112.88it/s]


Trial 6, Fold 1, Epoch 3: F1=0.4614


Training: 100%|██████████| 897/897 [00:12<00:00, 74.50it/s, loss=0.122]
Validating: 100%|██████████| 225/225 [00:01<00:00, 112.84it/s]


Trial 6, Fold 1, Epoch 4: F1=0.4614
Error in fold 1: name 'logger' is not defined
  Fold 2 F1: 0.0000

  Fold 3/5: Train=7171, Val=1793


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at ../fine_tuned_models_mlm/twitter-xlm-roberta-base/final_MLM_model/model and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-12-04 20:26:32,508 - INFO - Label weights: {0: 0.5835774739583334, 1: 3.491236611489776}
2025-12-04 20:26:32,509 - INFO - Language weights: {'it': 0.9959626621507102, 'en': 1.0005485094858537, 'es': 1.003488828363436}
2025-12-04 20:26:32,509 - INFO - Pos weight (for BCE): 5.9825


Froze: Embeddings + First 10 Encoder Layers
Trainable: Classification Head + Remaining Encoder Layers
Trainable parameters: 14,767,874 / 278,045,186 (5.31%)


Training: 100%|██████████| 897/897 [00:12<00:00, 74.49it/s, loss=0.162]
Validating: 100%|██████████| 225/225 [00:01<00:00, 113.57it/s]


Trial 6, Fold 2, Epoch 1: F1=0.5138


Training: 100%|██████████| 897/897 [00:12<00:00, 74.54it/s, loss=0.13] 
Validating: 100%|██████████| 225/225 [00:01<00:00, 113.83it/s]


Trial 6, Fold 2, Epoch 2: F1=0.4614


Training: 100%|██████████| 897/897 [00:12<00:00, 74.57it/s, loss=0.127]
Validating: 100%|██████████| 225/225 [00:01<00:00, 113.59it/s]


Trial 6, Fold 2, Epoch 3: F1=0.4614


Training: 100%|██████████| 897/897 [00:12<00:00, 74.49it/s, loss=0.118]
Validating: 100%|██████████| 225/225 [00:01<00:00, 113.66it/s]


Trial 6, Fold 2, Epoch 4: F1=0.4614
Error in fold 2: name 'logger' is not defined
  Fold 3 F1: 0.0000

  Fold 4/5: Train=7171, Val=1793


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at ../fine_tuned_models_mlm/twitter-xlm-roberta-base/final_MLM_model/model and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-12-04 20:27:29,332 - INFO - Label weights: {0: 0.5835774739583334, 1: 3.491236611489776}
2025-12-04 20:27:29,332 - INFO - Language weights: {'en': 0.9905452280968097, 'es': 0.9996594041813475, 'it': 1.0097953677218428}
2025-12-04 20:27:29,333 - INFO - Pos weight (for BCE): 5.9825


Froze: Embeddings + First 10 Encoder Layers
Trainable: Classification Head + Remaining Encoder Layers
Trainable parameters: 14,767,874 / 278,045,186 (5.31%)


Training: 100%|██████████| 897/897 [00:12<00:00, 74.38it/s, loss=0.137]
Validating: 100%|██████████| 225/225 [00:01<00:00, 113.65it/s]


Trial 6, Fold 3, Epoch 1: F1=0.4614


Training: 100%|██████████| 897/897 [00:12<00:00, 74.24it/s, loss=0.126]
Validating: 100%|██████████| 225/225 [00:02<00:00, 111.63it/s]


Trial 6, Fold 3, Epoch 2: F1=0.4614


Training: 100%|██████████| 897/897 [00:12<00:00, 74.25it/s, loss=0.124]
Validating: 100%|██████████| 225/225 [00:01<00:00, 113.72it/s]


Trial 6, Fold 3, Epoch 3: F1=0.4614


Training: 100%|██████████| 897/897 [00:12<00:00, 74.48it/s, loss=0.12] 
Validating: 100%|██████████| 225/225 [00:01<00:00, 112.94it/s]


Trial 6, Fold 3, Epoch 4: F1=0.4614
Error in fold 3: name 'logger' is not defined
  Fold 4 F1: 0.0000

  Fold 5/5: Train=7172, Val=1792


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at ../fine_tuned_models_mlm/twitter-xlm-roberta-base/final_MLM_model/model and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-12-04 20:28:26,336 - INFO - Label weights: {0: 0.5836588541666666, 1: 3.4883268482490273}
2025-12-04 20:28:26,337 - INFO - Language weights: {'it': 0.9894548630811957, 'es': 1.0027361364111447, 'en': 1.0078090005076596}
2025-12-04 20:28:26,337 - INFO - Pos weight (for BCE): 5.9767


Froze: Embeddings + First 10 Encoder Layers
Trainable: Classification Head + Remaining Encoder Layers
Trainable parameters: 14,767,874 / 278,045,186 (5.31%)


Training: 100%|██████████| 897/897 [00:12<00:00, 74.39it/s, loss=0.23] 
Validating: 100%|██████████| 224/224 [00:01<00:00, 112.58it/s]


Trial 6, Fold 4, Epoch 1: F1=0.3041


Training: 100%|██████████| 897/897 [00:12<00:00, 74.31it/s, loss=0.143]
Validating: 100%|██████████| 224/224 [00:01<00:00, 112.56it/s]


Trial 6, Fold 4, Epoch 2: F1=0.4615


Training: 100%|██████████| 897/897 [00:12<00:00, 74.29it/s, loss=0.124]
Validating: 100%|██████████| 224/224 [00:01<00:00, 112.48it/s]


Trial 6, Fold 4, Epoch 3: F1=0.4615


Training: 100%|██████████| 897/897 [00:12<00:00, 74.50it/s, loss=0.116]
Validating: 100%|██████████| 224/224 [00:01<00:00, 112.79it/s]


Trial 6, Fold 4, Epoch 4: F1=0.4615


Training: 100%|██████████| 897/897 [00:12<00:00, 74.41it/s, loss=0.111]
Validating: 100%|██████████| 224/224 [00:01<00:00, 112.69it/s]


Trial 6, Fold 4, Epoch 5: F1=0.4814


Training: 100%|██████████| 897/897 [00:12<00:00, 74.38it/s, loss=0.105] 
Validating: 100%|██████████| 224/224 [00:01<00:00, 112.97it/s]


Trial 6, Fold 4, Epoch 6: F1=0.4812


Training: 100%|██████████| 897/897 [00:12<00:00, 74.46it/s, loss=0.106]
Validating: 100%|██████████| 224/224 [00:01<00:00, 112.74it/s]


Trial 6, Fold 4, Epoch 7: F1=0.5728


Training: 100%|██████████| 897/897 [00:12<00:00, 74.44it/s, loss=0.105]
Validating: 100%|██████████| 224/224 [00:01<00:00, 112.84it/s]
[I 2025-12-04 20:30:18,804] Trial 6 finished with value: 0.1232348065783837 and parameters: {'learning_rate': 3.292529363110524e-05, 'weight_decay': 0.009767211400638388, 'batch_size': 8, 'num_epochs': 8}. Best is trial 2 with value: 0.7918504359347013.


Trial 6, Fold 4, Epoch 8: F1=0.6162
  Fold 5 F1: 0.6162

Trial 6 - Average F1: 0.1232
Fold scores: ['0.0000', '0.0000', '0.0000', '0.0000', '0.6162']


Trial 7
Learning Rate: 1.14e-05
Weight Decay: 0.0909
Batch Size: 16
Num Epochs: 9

  Fold 1/5: Train=7171, Val=1793


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at ../fine_tuned_models_mlm/twitter-xlm-roberta-base/final_MLM_model/model and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-12-04 20:30:19,635 - INFO - Label weights: {0: 0.5835774739583334, 1: 3.491236611489776}
2025-12-04 20:30:19,635 - INFO - Language weights: {'es': 0.9847470941344993, 'en': 1.0008296471794096, 'it': 1.0144232586860908}
2025-12-04 20:30:19,635 - INFO - Pos weight (for BCE): 5.9825


Froze: Embeddings + First 10 Encoder Layers
Trainable: Classification Head + Remaining Encoder Layers
Trainable parameters: 14,767,874 / 278,045,186 (5.31%)


Training: 100%|██████████| 449/449 [00:09<00:00, 46.51it/s, loss=0.399]
Validating: 100%|██████████| 113/113 [00:01<00:00, 70.19it/s]


Trial 7, Fold 0, Epoch 1: F1=0.1254


Training: 100%|██████████| 449/449 [00:09<00:00, 46.52it/s, loss=0.285]
Validating: 100%|██████████| 113/113 [00:01<00:00, 69.90it/s]


Trial 7, Fold 0, Epoch 2: F1=0.4976


Training: 100%|██████████| 449/449 [00:09<00:00, 46.58it/s, loss=0.142]
Validating: 100%|██████████| 113/113 [00:01<00:00, 69.95it/s]


Trial 7, Fold 0, Epoch 3: F1=0.4614


Training: 100%|██████████| 449/449 [00:09<00:00, 46.56it/s, loss=0.123]
Validating: 100%|██████████| 113/113 [00:01<00:00, 69.86it/s]


Trial 7, Fold 0, Epoch 4: F1=0.4614


Training: 100%|██████████| 449/449 [00:09<00:00, 46.61it/s, loss=0.12] 
Validating: 100%|██████████| 113/113 [00:01<00:00, 69.33it/s]


Trial 7, Fold 0, Epoch 5: F1=0.4614
Error in fold 0: name 'logger' is not defined
  Fold 1 F1: 0.0000

  Fold 2/5: Train=7171, Val=1793


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at ../fine_tuned_models_mlm/twitter-xlm-roberta-base/final_MLM_model/model and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-12-04 20:31:16,713 - INFO - Label weights: {0: 0.5835774739583334, 1: 3.491236611489776}
2025-12-04 20:31:16,714 - INFO - Language weights: {'it': 0.990547924474408, 'en': 1.0000803940404794, 'es': 1.0093716814851124}
2025-12-04 20:31:16,714 - INFO - Pos weight (for BCE): 5.9825


Froze: Embeddings + First 10 Encoder Layers
Trainable: Classification Head + Remaining Encoder Layers
Trainable parameters: 14,767,874 / 278,045,186 (5.31%)


Training: 100%|██████████| 449/449 [00:09<00:00, 46.50it/s, loss=0.181]
Validating: 100%|██████████| 113/113 [00:01<00:00, 69.96it/s]


Trial 7, Fold 1, Epoch 1: F1=0.4728


Training: 100%|██████████| 449/449 [00:09<00:00, 46.61it/s, loss=0.156]
Validating: 100%|██████████| 113/113 [00:01<00:00, 69.87it/s]


Trial 7, Fold 1, Epoch 2: F1=0.4614


Training: 100%|██████████| 449/449 [00:09<00:00, 46.49it/s, loss=0.134]
Validating: 100%|██████████| 113/113 [00:01<00:00, 69.81it/s]


Trial 7, Fold 1, Epoch 3: F1=0.4614


Training: 100%|██████████| 449/449 [00:09<00:00, 46.59it/s, loss=0.132]
Validating: 100%|██████████| 113/113 [00:01<00:00, 70.11it/s]


Trial 7, Fold 1, Epoch 4: F1=0.4614
Error in fold 1: name 'logger' is not defined
  Fold 2 F1: 0.0000

  Fold 3/5: Train=7171, Val=1793


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at ../fine_tuned_models_mlm/twitter-xlm-roberta-base/final_MLM_model/model and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-12-04 20:32:02,517 - INFO - Label weights: {0: 0.5835774739583334, 1: 3.491236611489776}
2025-12-04 20:32:02,518 - INFO - Language weights: {'it': 0.9959626621507102, 'en': 1.0005485094858537, 'es': 1.003488828363436}
2025-12-04 20:32:02,518 - INFO - Pos weight (for BCE): 5.9825


Froze: Embeddings + First 10 Encoder Layers
Trainable: Classification Head + Remaining Encoder Layers
Trainable parameters: 14,767,874 / 278,045,186 (5.31%)


Training: 100%|██████████| 449/449 [00:09<00:00, 46.54it/s, loss=0.285]
Validating: 100%|██████████| 113/113 [00:01<00:00, 69.88it/s]


Trial 7, Fold 2, Epoch 1: F1=0.1296


Training: 100%|██████████| 449/449 [00:09<00:00, 46.52it/s, loss=0.223]
Validating: 100%|██████████| 113/113 [00:01<00:00, 69.44it/s]


Trial 7, Fold 2, Epoch 2: F1=0.5448


Training: 100%|██████████| 449/449 [00:09<00:00, 46.52it/s, loss=0.155]
Validating: 100%|██████████| 113/113 [00:01<00:00, 69.52it/s]


Trial 7, Fold 2, Epoch 3: F1=0.4614


Training: 100%|██████████| 449/449 [00:09<00:00, 46.45it/s, loss=0.143]
Validating: 100%|██████████| 113/113 [00:01<00:00, 69.84it/s]


Trial 7, Fold 2, Epoch 4: F1=0.4614


Training: 100%|██████████| 449/449 [00:09<00:00, 46.48it/s, loss=0.135]
Validating: 100%|██████████| 113/113 [00:01<00:00, 69.46it/s]


Trial 7, Fold 2, Epoch 5: F1=0.4614
Error in fold 2: name 'logger' is not defined
  Fold 3 F1: 0.0000

  Fold 4/5: Train=7171, Val=1793


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at ../fine_tuned_models_mlm/twitter-xlm-roberta-base/final_MLM_model/model and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-12-04 20:32:59,681 - INFO - Label weights: {0: 0.5835774739583334, 1: 3.491236611489776}
2025-12-04 20:32:59,681 - INFO - Language weights: {'en': 0.9905452280968097, 'es': 0.9996594041813475, 'it': 1.0097953677218428}
2025-12-04 20:32:59,682 - INFO - Pos weight (for BCE): 5.9825


Froze: Embeddings + First 10 Encoder Layers
Trainable: Classification Head + Remaining Encoder Layers
Trainable parameters: 14,767,874 / 278,045,186 (5.31%)


Training: 100%|██████████| 449/449 [00:09<00:00, 46.55it/s, loss=0.128]
Validating: 100%|██████████| 113/113 [00:01<00:00, 69.57it/s]


Trial 7, Fold 3, Epoch 1: F1=0.4614


Training: 100%|██████████| 449/449 [00:09<00:00, 46.44it/s, loss=0.127]
Validating: 100%|██████████| 113/113 [00:01<00:00, 70.00it/s]


Trial 7, Fold 3, Epoch 2: F1=0.4614


Training: 100%|██████████| 449/449 [00:09<00:00, 46.58it/s, loss=0.125]
Validating: 100%|██████████| 113/113 [00:01<00:00, 69.76it/s]


Trial 7, Fold 3, Epoch 3: F1=0.4614


Training: 100%|██████████| 449/449 [00:09<00:00, 46.60it/s, loss=0.122]
Validating: 100%|██████████| 113/113 [00:01<00:00, 70.04it/s]


Trial 7, Fold 3, Epoch 4: F1=0.4614
Error in fold 3: name 'logger' is not defined
  Fold 4 F1: 0.0000

  Fold 5/5: Train=7172, Val=1792


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at ../fine_tuned_models_mlm/twitter-xlm-roberta-base/final_MLM_model/model and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-12-04 20:33:45,495 - INFO - Label weights: {0: 0.5836588541666666, 1: 3.4883268482490273}
2025-12-04 20:33:45,495 - INFO - Language weights: {'it': 0.9894548630811957, 'es': 1.0027361364111447, 'en': 1.0078090005076596}
2025-12-04 20:33:45,496 - INFO - Pos weight (for BCE): 5.9767


Froze: Embeddings + First 10 Encoder Layers
Trainable: Classification Head + Remaining Encoder Layers
Trainable parameters: 14,767,874 / 278,045,186 (5.31%)


Training: 100%|██████████| 449/449 [00:09<00:00, 46.51it/s, loss=0.7]  
Validating: 100%|██████████| 112/112 [00:01<00:00, 68.94it/s]


Trial 7, Fold 4, Epoch 1: F1=0.1250


Training: 100%|██████████| 449/449 [00:09<00:00, 46.56it/s, loss=0.517]
Validating: 100%|██████████| 112/112 [00:01<00:00, 69.09it/s]


Trial 7, Fold 4, Epoch 2: F1=0.1250


Training: 100%|██████████| 449/449 [00:09<00:00, 46.52it/s, loss=0.214]
Validating: 100%|██████████| 112/112 [00:01<00:00, 69.34it/s]


Trial 7, Fold 4, Epoch 3: F1=0.4615


Training: 100%|██████████| 449/449 [00:09<00:00, 46.55it/s, loss=0.137]
Validating: 100%|██████████| 112/112 [00:01<00:00, 69.44it/s]


Trial 7, Fold 4, Epoch 4: F1=0.4615


Training: 100%|██████████| 449/449 [00:09<00:00, 46.53it/s, loss=0.128]
Validating: 100%|██████████| 112/112 [00:01<00:00, 69.19it/s]


Trial 7, Fold 4, Epoch 5: F1=0.4615


Training: 100%|██████████| 449/449 [00:09<00:00, 46.49it/s, loss=0.123]
Validating: 100%|██████████| 112/112 [00:01<00:00, 65.14it/s]
[I 2025-12-04 20:34:53,296] Trial 7 finished with value: 0.0 and parameters: {'learning_rate': 1.1439974749291259e-05, 'weight_decay': 0.0909320402078782, 'batch_size': 16, 'num_epochs': 9}. Best is trial 2 with value: 0.7918504359347013.


Trial 7, Fold 4, Epoch 6: F1=0.4615
Error in fold 4: name 'logger' is not defined
  Fold 5 F1: 0.0000

Trial 7 - Average F1: 0.0000
Fold scores: ['0.0000', '0.0000', '0.0000', '0.0000', '0.0000']


Trial 8
Learning Rate: 8.49e-05
Weight Decay: 0.0185
Batch Size: 8
Num Epochs: 12

  Fold 1/5: Train=7171, Val=1793


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at ../fine_tuned_models_mlm/twitter-xlm-roberta-base/final_MLM_model/model and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-12-04 20:34:54,145 - INFO - Label weights: {0: 0.5835774739583334, 1: 3.491236611489776}
2025-12-04 20:34:54,145 - INFO - Language weights: {'es': 0.9847470941344993, 'en': 1.0008296471794096, 'it': 1.0144232586860908}
2025-12-04 20:34:54,145 - INFO - Pos weight (for BCE): 5.9825


Froze: Embeddings + First 10 Encoder Layers
Trainable: Classification Head + Remaining Encoder Layers
Trainable parameters: 14,767,874 / 278,045,186 (5.31%)


Training: 100%|██████████| 897/897 [00:12<00:00, 72.88it/s, loss=0.131]
Validating: 100%|██████████| 225/225 [00:02<00:00, 112.10it/s]


Trial 8, Fold 0, Epoch 1: F1=0.4614


Training: 100%|██████████| 897/897 [00:12<00:00, 74.11it/s, loss=0.132]
Validating: 100%|██████████| 225/225 [00:01<00:00, 112.81it/s]


Trial 8, Fold 0, Epoch 2: F1=0.4614


Training: 100%|██████████| 897/897 [00:12<00:00, 74.20it/s, loss=0.124]
Validating: 100%|██████████| 225/225 [00:01<00:00, 113.14it/s]


Trial 8, Fold 0, Epoch 3: F1=0.4614


Training: 100%|██████████| 897/897 [00:12<00:00, 74.22it/s, loss=0.115]
Validating: 100%|██████████| 225/225 [00:01<00:00, 112.79it/s]


Trial 8, Fold 0, Epoch 4: F1=0.4614
Error in fold 0: name 'logger' is not defined
  Fold 1 F1: 0.0000

  Fold 2/5: Train=7171, Val=1793


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at ../fine_tuned_models_mlm/twitter-xlm-roberta-base/final_MLM_model/model and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-12-04 20:35:51,479 - INFO - Label weights: {0: 0.5835774739583334, 1: 3.491236611489776}
2025-12-04 20:35:51,479 - INFO - Language weights: {'it': 0.990547924474408, 'en': 1.0000803940404794, 'es': 1.0093716814851124}
2025-12-04 20:35:51,479 - INFO - Pos weight (for BCE): 5.9825


Froze: Embeddings + First 10 Encoder Layers
Trainable: Classification Head + Remaining Encoder Layers
Trainable parameters: 14,767,874 / 278,045,186 (5.31%)


Training: 100%|██████████| 897/897 [00:12<00:00, 74.26it/s, loss=0.14] 
Validating: 100%|██████████| 225/225 [00:01<00:00, 112.91it/s]


Trial 8, Fold 1, Epoch 1: F1=0.4608


Training: 100%|██████████| 897/897 [00:12<00:00, 74.41it/s, loss=0.131]
Validating: 100%|██████████| 225/225 [00:01<00:00, 112.56it/s]


Trial 8, Fold 1, Epoch 2: F1=0.4614


Training: 100%|██████████| 897/897 [00:12<00:00, 74.37it/s, loss=0.125]
Validating: 100%|██████████| 225/225 [00:01<00:00, 112.52it/s]


Trial 8, Fold 1, Epoch 3: F1=0.4614


Training: 100%|██████████| 897/897 [00:12<00:00, 74.38it/s, loss=0.117]
Validating: 100%|██████████| 225/225 [00:01<00:00, 112.70it/s]


Trial 8, Fold 1, Epoch 4: F1=0.4614


Training: 100%|██████████| 897/897 [00:12<00:00, 74.30it/s, loss=0.109]
Validating: 100%|██████████| 225/225 [00:02<00:00, 112.47it/s]


Trial 8, Fold 1, Epoch 5: F1=0.4771


Training: 100%|██████████| 897/897 [00:12<00:00, 74.34it/s, loss=0.103] 
Validating: 100%|██████████| 225/225 [00:02<00:00, 112.27it/s]


Trial 8, Fold 1, Epoch 6: F1=0.5795


Training: 100%|██████████| 897/897 [00:12<00:00, 74.25it/s, loss=0.104] 
Validating: 100%|██████████| 225/225 [00:01<00:00, 112.61it/s]


Trial 8, Fold 1, Epoch 7: F1=0.6599


Training: 100%|██████████| 897/897 [00:12<00:00, 74.36it/s, loss=0.101] 
Validating: 100%|██████████| 225/225 [00:01<00:00, 112.92it/s]


Trial 8, Fold 1, Epoch 8: F1=0.6505


Training: 100%|██████████| 897/897 [00:12<00:00, 74.33it/s, loss=0.102] 
Validating: 100%|██████████| 225/225 [00:01<00:00, 112.62it/s]


Trial 8, Fold 1, Epoch 9: F1=0.6638


Training: 100%|██████████| 897/897 [00:12<00:00, 74.42it/s, loss=0.0945]
Validating: 100%|██████████| 225/225 [00:01<00:00, 113.19it/s]


Trial 8, Fold 1, Epoch 10: F1=0.6632


Training: 100%|██████████| 897/897 [00:12<00:00, 74.34it/s, loss=0.0951]
Validating: 100%|██████████| 225/225 [00:01<00:00, 112.83it/s]


Trial 8, Fold 1, Epoch 11: F1=0.7078


Training: 100%|██████████| 897/897 [00:12<00:00, 74.30it/s, loss=0.09]  
Validating: 100%|██████████| 225/225 [00:01<00:00, 112.72it/s]


Trial 8, Fold 1, Epoch 12: F1=0.7246
  Fold 2 F1: 0.7246

  Fold 3/5: Train=7171, Val=1793


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at ../fine_tuned_models_mlm/twitter-xlm-roberta-base/final_MLM_model/model and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-12-04 20:38:41,096 - INFO - Label weights: {0: 0.5835774739583334, 1: 3.491236611489776}
2025-12-04 20:38:41,097 - INFO - Language weights: {'it': 0.9959626621507102, 'en': 1.0005485094858537, 'es': 1.003488828363436}
2025-12-04 20:38:41,097 - INFO - Pos weight (for BCE): 5.9825


Froze: Embeddings + First 10 Encoder Layers
Trainable: Classification Head + Remaining Encoder Layers
Trainable parameters: 14,767,874 / 278,045,186 (5.31%)


Training: 100%|██████████| 897/897 [00:12<00:00, 74.41it/s, loss=0.123]
Validating: 100%|██████████| 225/225 [00:01<00:00, 113.23it/s]


Trial 8, Fold 2, Epoch 1: F1=0.4614


Training: 100%|██████████| 897/897 [00:12<00:00, 74.47it/s, loss=0.123]
Validating: 100%|██████████| 225/225 [00:01<00:00, 113.51it/s]


Trial 8, Fold 2, Epoch 2: F1=0.4614


Training: 100%|██████████| 897/897 [00:12<00:00, 74.29it/s, loss=0.119]
Validating: 100%|██████████| 225/225 [00:01<00:00, 112.85it/s]


Trial 8, Fold 2, Epoch 3: F1=0.4614


Training: 100%|██████████| 897/897 [00:12<00:00, 74.30it/s, loss=0.114]
Validating: 100%|██████████| 225/225 [00:01<00:00, 112.87it/s]


Trial 8, Fold 2, Epoch 4: F1=0.4654


Training: 100%|██████████| 897/897 [00:12<00:00, 74.31it/s, loss=0.107]
Validating: 100%|██████████| 225/225 [00:01<00:00, 113.06it/s]


Trial 8, Fold 2, Epoch 5: F1=0.4848


Training: 100%|██████████| 897/897 [00:12<00:00, 74.38it/s, loss=0.107]
Validating: 100%|██████████| 225/225 [00:01<00:00, 113.36it/s]


Trial 8, Fold 2, Epoch 6: F1=0.5503


Training: 100%|██████████| 897/897 [00:12<00:00, 74.35it/s, loss=0.103] 
Validating: 100%|██████████| 225/225 [00:01<00:00, 112.95it/s]


Trial 8, Fold 2, Epoch 7: F1=0.5656


Training: 100%|██████████| 897/897 [00:12<00:00, 74.33it/s, loss=0.104]
Validating: 100%|██████████| 225/225 [00:01<00:00, 113.35it/s]


Trial 8, Fold 2, Epoch 8: F1=0.6678


Training: 100%|██████████| 897/897 [00:12<00:00, 74.31it/s, loss=0.1]   
Validating: 100%|██████████| 225/225 [00:02<00:00, 112.39it/s]


Trial 8, Fold 2, Epoch 9: F1=0.7013


Training: 100%|██████████| 897/897 [00:12<00:00, 74.23it/s, loss=0.0975]
Validating: 100%|██████████| 225/225 [00:01<00:00, 113.22it/s]


Trial 8, Fold 2, Epoch 10: F1=0.6191


Training: 100%|██████████| 897/897 [00:12<00:00, 74.46it/s, loss=0.0934]
Validating: 100%|██████████| 225/225 [00:01<00:00, 112.76it/s]


Trial 8, Fold 2, Epoch 11: F1=0.6926


Training: 100%|██████████| 897/897 [00:12<00:00, 74.35it/s, loss=0.0958]
Validating: 100%|██████████| 225/225 [00:02<00:00, 112.43it/s]


Trial 8, Fold 2, Epoch 12: F1=0.7081
  Fold 3 F1: 0.7081

  Fold 4/5: Train=7171, Val=1793


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at ../fine_tuned_models_mlm/twitter-xlm-roberta-base/final_MLM_model/model and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-12-04 20:41:30,637 - INFO - Label weights: {0: 0.5835774739583334, 1: 3.491236611489776}
2025-12-04 20:41:30,637 - INFO - Language weights: {'en': 0.9905452280968097, 'es': 0.9996594041813475, 'it': 1.0097953677218428}
2025-12-04 20:41:30,638 - INFO - Pos weight (for BCE): 5.9825


Froze: Embeddings + First 10 Encoder Layers
Trainable: Classification Head + Remaining Encoder Layers
Trainable parameters: 14,767,874 / 278,045,186 (5.31%)


Training: 100%|██████████| 897/897 [00:12<00:00, 74.34it/s, loss=0.15] 
Validating: 100%|██████████| 225/225 [00:01<00:00, 113.48it/s]


Trial 8, Fold 3, Epoch 1: F1=0.4614


Training: 100%|██████████| 897/897 [00:12<00:00, 74.48it/s, loss=0.142]
Validating: 100%|██████████| 225/225 [00:01<00:00, 112.62it/s]


Trial 8, Fold 3, Epoch 2: F1=0.4614


Training: 100%|██████████| 897/897 [00:12<00:00, 74.35it/s, loss=0.129]
Validating: 100%|██████████| 225/225 [00:01<00:00, 112.96it/s]


Trial 8, Fold 3, Epoch 3: F1=0.4614


Training: 100%|██████████| 897/897 [00:12<00:00, 74.33it/s, loss=0.121]
Validating: 100%|██████████| 225/225 [00:01<00:00, 112.99it/s]


Trial 8, Fold 3, Epoch 4: F1=0.4614
Error in fold 3: name 'logger' is not defined
  Fold 4 F1: 0.0000

  Fold 5/5: Train=7172, Val=1792


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at ../fine_tuned_models_mlm/twitter-xlm-roberta-base/final_MLM_model/model and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-12-04 20:42:27,596 - INFO - Label weights: {0: 0.5836588541666666, 1: 3.4883268482490273}
2025-12-04 20:42:27,596 - INFO - Language weights: {'it': 0.9894548630811957, 'es': 1.0027361364111447, 'en': 1.0078090005076596}
2025-12-04 20:42:27,596 - INFO - Pos weight (for BCE): 5.9767


Froze: Embeddings + First 10 Encoder Layers
Trainable: Classification Head + Remaining Encoder Layers
Trainable parameters: 14,767,874 / 278,045,186 (5.31%)


Training: 100%|██████████| 897/897 [00:12<00:00, 74.27it/s, loss=0.25] 
Validating: 100%|██████████| 224/224 [00:01<00:00, 112.54it/s]


Trial 8, Fold 4, Epoch 1: F1=0.2095


Training: 100%|██████████| 897/897 [00:12<00:00, 74.36it/s, loss=0.15] 
Validating: 100%|██████████| 224/224 [00:01<00:00, 112.68it/s]


Trial 8, Fold 4, Epoch 2: F1=0.4615


Training: 100%|██████████| 897/897 [00:12<00:00, 74.43it/s, loss=0.132]
Validating: 100%|██████████| 224/224 [00:01<00:00, 112.34it/s]


Trial 8, Fold 4, Epoch 3: F1=0.4615


Training: 100%|██████████| 897/897 [00:12<00:00, 74.30it/s, loss=0.122]
Validating: 100%|██████████| 224/224 [00:01<00:00, 112.55it/s]


Trial 8, Fold 4, Epoch 4: F1=0.5027


Training: 100%|██████████| 897/897 [00:12<00:00, 74.51it/s, loss=0.115]
Validating: 100%|██████████| 224/224 [00:01<00:00, 112.93it/s]


Trial 8, Fold 4, Epoch 5: F1=0.5239


Training: 100%|██████████| 897/897 [00:12<00:00, 74.37it/s, loss=0.11] 
Validating: 100%|██████████| 224/224 [00:02<00:00, 111.91it/s]


Trial 8, Fold 4, Epoch 6: F1=0.5805


Training: 100%|██████████| 897/897 [00:12<00:00, 74.27it/s, loss=0.108] 
Validating: 100%|██████████| 224/224 [00:01<00:00, 112.91it/s]


Trial 8, Fold 4, Epoch 7: F1=0.6309


Training: 100%|██████████| 897/897 [00:12<00:00, 74.40it/s, loss=0.108]
Validating: 100%|██████████| 224/224 [00:01<00:00, 112.91it/s]


Trial 8, Fold 4, Epoch 8: F1=0.6189


Training: 100%|██████████| 897/897 [00:12<00:00, 74.45it/s, loss=0.101] 
Validating: 100%|██████████| 224/224 [00:01<00:00, 112.72it/s]


Trial 8, Fold 4, Epoch 9: F1=0.6821


Training: 100%|██████████| 897/897 [00:12<00:00, 74.34it/s, loss=0.0957]
Validating: 100%|██████████| 224/224 [00:01<00:00, 112.39it/s]


Trial 8, Fold 4, Epoch 10: F1=0.6500


Training: 100%|██████████| 897/897 [00:12<00:00, 74.45it/s, loss=0.0965]
Validating: 100%|██████████| 224/224 [00:01<00:00, 112.66it/s]


Trial 8, Fold 4, Epoch 11: F1=0.6643


Training: 100%|██████████| 897/897 [00:12<00:00, 74.36it/s, loss=0.0932]
Validating: 100%|██████████| 224/224 [00:01<00:00, 112.53it/s]
[I 2025-12-04 20:45:16,356] Trial 8 finished with value: 0.4320221269194707 and parameters: {'learning_rate': 8.488762161408708e-05, 'weight_decay': 0.018485445552552705, 'batch_size': 8, 'num_epochs': 12}. Best is trial 2 with value: 0.7918504359347013.


Trial 8, Fold 4, Epoch 12: F1=0.7274
  Fold 5 F1: 0.7274

Trial 8 - Average F1: 0.4320
Fold scores: ['0.0000', '0.7246', '0.7081', '0.0000', '0.7274']


Trial 9
Learning Rate: 1.04e-04
Weight Decay: 0.0922
Batch Size: 16
Num Epochs: 7

  Fold 1/5: Train=7171, Val=1793


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at ../fine_tuned_models_mlm/twitter-xlm-roberta-base/final_MLM_model/model and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-12-04 20:45:17,166 - INFO - Label weights: {0: 0.5835774739583334, 1: 3.491236611489776}
2025-12-04 20:45:17,166 - INFO - Language weights: {'es': 0.9847470941344993, 'en': 1.0008296471794096, 'it': 1.0144232586860908}
2025-12-04 20:45:17,166 - INFO - Pos weight (for BCE): 5.9825


Froze: Embeddings + First 10 Encoder Layers
Trainable: Classification Head + Remaining Encoder Layers
Trainable parameters: 14,767,874 / 278,045,186 (5.31%)


Training: 100%|██████████| 449/449 [00:09<00:00, 46.43it/s, loss=0.173]
Validating: 100%|██████████| 113/113 [00:01<00:00, 69.55it/s]


Trial 9, Fold 0, Epoch 1: F1=0.4816


Training: 100%|██████████| 449/449 [00:09<00:00, 46.39it/s, loss=0.128]
Validating: 100%|██████████| 113/113 [00:01<00:00, 70.20it/s]


Trial 9, Fold 0, Epoch 2: F1=0.4614


Training: 100%|██████████| 449/449 [00:09<00:00, 46.39it/s, loss=0.114]
Validating: 100%|██████████| 113/113 [00:01<00:00, 69.52it/s]


Trial 9, Fold 0, Epoch 3: F1=0.4878


Training: 100%|██████████| 449/449 [00:09<00:00, 46.44it/s, loss=0.103]
Validating: 100%|██████████| 113/113 [00:01<00:00, 69.58it/s]


Trial 9, Fold 0, Epoch 4: F1=0.6119


Training: 100%|██████████| 449/449 [00:09<00:00, 46.48it/s, loss=0.0967]
Validating: 100%|██████████| 113/113 [00:01<00:00, 69.63it/s]


Trial 9, Fold 0, Epoch 5: F1=0.6521


Training: 100%|██████████| 449/449 [00:09<00:00, 46.54it/s, loss=0.0907]
Validating: 100%|██████████| 113/113 [00:01<00:00, 69.70it/s]


Trial 9, Fold 0, Epoch 6: F1=0.6692


Training: 100%|██████████| 449/449 [00:09<00:00, 46.50it/s, loss=0.0861]
Validating: 100%|██████████| 113/113 [00:01<00:00, 69.56it/s]


Trial 9, Fold 0, Epoch 7: F1=0.7351
  Fold 1 F1: 0.7351

  Fold 2/5: Train=7171, Val=1793


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at ../fine_tuned_models_mlm/twitter-xlm-roberta-base/final_MLM_model/model and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-12-04 20:46:36,964 - INFO - Label weights: {0: 0.5835774739583334, 1: 3.491236611489776}
2025-12-04 20:46:36,965 - INFO - Language weights: {'it': 0.990547924474408, 'en': 1.0000803940404794, 'es': 1.0093716814851124}
2025-12-04 20:46:36,965 - INFO - Pos weight (for BCE): 5.9825


Froze: Embeddings + First 10 Encoder Layers
Trainable: Classification Head + Remaining Encoder Layers
Trainable parameters: 14,767,874 / 278,045,186 (5.31%)


Training: 100%|██████████| 449/449 [00:09<00:00, 46.42it/s, loss=0.566]
Validating: 100%|██████████| 113/113 [00:01<00:00, 69.86it/s]


Trial 9, Fold 1, Epoch 1: F1=0.1254


Training: 100%|██████████| 449/449 [00:09<00:00, 46.52it/s, loss=0.158]
Validating: 100%|██████████| 113/113 [00:01<00:00, 69.84it/s]


Trial 9, Fold 1, Epoch 2: F1=0.4612


Training: 100%|██████████| 449/449 [00:09<00:00, 46.47it/s, loss=0.114]
Validating: 100%|██████████| 113/113 [00:01<00:00, 69.90it/s]


Trial 9, Fold 1, Epoch 3: F1=0.5573


Training: 100%|██████████| 449/449 [00:09<00:00, 46.41it/s, loss=0.104]
Validating: 100%|██████████| 113/113 [00:01<00:00, 69.38it/s]


Trial 9, Fold 1, Epoch 4: F1=0.6097


Training: 100%|██████████| 449/449 [00:09<00:00, 46.47it/s, loss=0.099] 
Validating: 100%|██████████| 113/113 [00:01<00:00, 69.63it/s]


Trial 9, Fold 1, Epoch 5: F1=0.6703


Training: 100%|██████████| 449/449 [00:09<00:00, 46.40it/s, loss=0.0914]
Validating: 100%|██████████| 113/113 [00:01<00:00, 69.69it/s]


Trial 9, Fold 1, Epoch 6: F1=0.6307


Training: 100%|██████████| 449/449 [00:09<00:00, 46.51it/s, loss=0.088] 
Validating: 100%|██████████| 113/113 [00:01<00:00, 69.68it/s]


Trial 9, Fold 1, Epoch 7: F1=0.7500
  Fold 2 F1: 0.7500

  Fold 3/5: Train=7171, Val=1793


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at ../fine_tuned_models_mlm/twitter-xlm-roberta-base/final_MLM_model/model and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-12-04 20:47:56,769 - INFO - Label weights: {0: 0.5835774739583334, 1: 3.491236611489776}
2025-12-04 20:47:56,769 - INFO - Language weights: {'it': 0.9959626621507102, 'en': 1.0005485094858537, 'es': 1.003488828363436}
2025-12-04 20:47:56,770 - INFO - Pos weight (for BCE): 5.9825


Froze: Embeddings + First 10 Encoder Layers
Trainable: Classification Head + Remaining Encoder Layers
Trainable parameters: 14,767,874 / 278,045,186 (5.31%)


Training: 100%|██████████| 449/449 [00:09<00:00, 46.36it/s, loss=0.374]
Validating: 100%|██████████| 113/113 [00:01<00:00, 69.81it/s]


Trial 9, Fold 2, Epoch 1: F1=0.1289


Training: 100%|██████████| 449/449 [00:09<00:00, 46.49it/s, loss=0.166]
Validating: 100%|██████████| 113/113 [00:01<00:00, 70.18it/s]


Trial 9, Fold 2, Epoch 2: F1=0.4614


Training: 100%|██████████| 449/449 [00:09<00:00, 46.44it/s, loss=0.129]
Validating: 100%|██████████| 113/113 [00:01<00:00, 70.15it/s]


Trial 9, Fold 2, Epoch 3: F1=0.5224


Training: 100%|██████████| 449/449 [00:09<00:00, 46.36it/s, loss=0.112]
Validating: 100%|██████████| 113/113 [00:01<00:00, 70.03it/s]


Trial 9, Fold 2, Epoch 4: F1=0.6667


Training: 100%|██████████| 449/449 [00:09<00:00, 46.39it/s, loss=0.104]
Validating: 100%|██████████| 113/113 [00:01<00:00, 69.33it/s]


Trial 9, Fold 2, Epoch 5: F1=0.6602


Training: 100%|██████████| 449/449 [00:09<00:00, 46.36it/s, loss=0.0954]
Validating: 100%|██████████| 113/113 [00:01<00:00, 69.62it/s]


Trial 9, Fold 2, Epoch 6: F1=0.6982


Training: 100%|██████████| 449/449 [00:09<00:00, 46.47it/s, loss=0.0893]
Validating: 100%|██████████| 113/113 [00:01<00:00, 69.81it/s]


Trial 9, Fold 2, Epoch 7: F1=0.6221
  Fold 3 F1: 0.6982

  Fold 4/5: Train=7171, Val=1793


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at ../fine_tuned_models_mlm/twitter-xlm-roberta-base/final_MLM_model/model and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-12-04 20:49:16,617 - INFO - Label weights: {0: 0.5835774739583334, 1: 3.491236611489776}
2025-12-04 20:49:16,617 - INFO - Language weights: {'en': 0.9905452280968097, 'es': 0.9996594041813475, 'it': 1.0097953677218428}
2025-12-04 20:49:16,618 - INFO - Pos weight (for BCE): 5.9825


Froze: Embeddings + First 10 Encoder Layers
Trainable: Classification Head + Remaining Encoder Layers
Trainable parameters: 14,767,874 / 278,045,186 (5.31%)


Training: 100%|██████████| 449/449 [00:09<00:00, 46.37it/s, loss=0.275]
Validating: 100%|██████████| 113/113 [00:01<00:00, 68.35it/s]


Trial 9, Fold 3, Epoch 1: F1=0.1327


Training: 100%|██████████| 449/449 [00:09<00:00, 45.64it/s, loss=0.139]
Validating: 100%|██████████| 113/113 [00:01<00:00, 67.73it/s]


Trial 9, Fold 3, Epoch 2: F1=0.4812


Training: 100%|██████████| 449/449 [00:09<00:00, 45.18it/s, loss=0.117]
Validating: 100%|██████████| 113/113 [00:01<00:00, 68.36it/s]


Trial 9, Fold 3, Epoch 3: F1=0.5200


Training: 100%|██████████| 449/449 [00:09<00:00, 45.23it/s, loss=0.105]
Validating: 100%|██████████| 113/113 [00:01<00:00, 65.82it/s]


Trial 9, Fold 3, Epoch 4: F1=0.6123


Training: 100%|██████████| 449/449 [00:09<00:00, 45.16it/s, loss=0.0994]
Validating: 100%|██████████| 113/113 [00:01<00:00, 67.79it/s]


Trial 9, Fold 3, Epoch 5: F1=0.5945


Training: 100%|██████████| 449/449 [00:09<00:00, 46.34it/s, loss=0.0951]
Validating: 100%|██████████| 113/113 [00:01<00:00, 70.00it/s]


Trial 9, Fold 3, Epoch 6: F1=0.6962


Training: 100%|██████████| 449/449 [00:09<00:00, 46.29it/s, loss=0.0897]
Validating: 100%|██████████| 113/113 [00:01<00:00, 67.62it/s]


Trial 9, Fold 3, Epoch 7: F1=0.6777
  Fold 4 F1: 0.6962

  Fold 5/5: Train=7172, Val=1792


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at ../fine_tuned_models_mlm/twitter-xlm-roberta-base/final_MLM_model/model and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-12-04 20:50:37,772 - INFO - Label weights: {0: 0.5836588541666666, 1: 3.4883268482490273}
2025-12-04 20:50:37,772 - INFO - Language weights: {'it': 0.9894548630811957, 'es': 1.0027361364111447, 'en': 1.0078090005076596}
2025-12-04 20:50:37,773 - INFO - Pos weight (for BCE): 5.9767


Froze: Embeddings + First 10 Encoder Layers
Trainable: Classification Head + Remaining Encoder Layers
Trainable parameters: 14,767,874 / 278,045,186 (5.31%)


Training: 100%|██████████| 449/449 [00:09<00:00, 46.06it/s, loss=0.998]
Validating: 100%|██████████| 112/112 [00:01<00:00, 64.42it/s]


Trial 9, Fold 4, Epoch 1: F1=0.1250


Training: 100%|██████████| 449/449 [00:09<00:00, 45.44it/s, loss=0.237]
Validating: 100%|██████████| 112/112 [00:01<00:00, 69.45it/s]


Trial 9, Fold 4, Epoch 2: F1=0.4615


Training: 100%|██████████| 449/449 [00:09<00:00, 46.06it/s, loss=0.118]
Validating: 100%|██████████| 112/112 [00:01<00:00, 65.76it/s]


Trial 9, Fold 4, Epoch 3: F1=0.5304


Training: 100%|██████████| 449/449 [00:09<00:00, 45.05it/s, loss=0.106]
Validating: 100%|██████████| 112/112 [00:01<00:00, 67.68it/s]


Trial 9, Fold 4, Epoch 4: F1=0.5922


Training: 100%|██████████| 449/449 [00:09<00:00, 45.79it/s, loss=0.0973]
Validating: 100%|██████████| 112/112 [00:01<00:00, 69.59it/s]


Trial 9, Fold 4, Epoch 5: F1=0.6821


Training: 100%|██████████| 449/449 [00:09<00:00, 46.63it/s, loss=0.0925]
Validating: 100%|██████████| 112/112 [00:01<00:00, 69.53it/s]


Trial 9, Fold 4, Epoch 6: F1=0.7076


Training: 100%|██████████| 449/449 [00:09<00:00, 46.56it/s, loss=0.0877]
Validating: 100%|██████████| 112/112 [00:01<00:00, 69.49it/s]
[I 2025-12-04 20:51:57,842] Trial 9 finished with value: 0.7174014345909159 and parameters: {'learning_rate': 0.0001037084466895453, 'weight_decay': 0.09218742350231168, 'batch_size': 16, 'num_epochs': 7}. Best is trial 2 with value: 0.7918504359347013.


Trial 9, Fold 4, Epoch 7: F1=0.6957
  Fold 5 F1: 0.7076

Trial 9 - Average F1: 0.7174
Fold scores: ['0.7351', '0.7500', '0.6982', '0.6962', '0.7076']


BEST TRIAL
Trial: 2
Best F1 Score: 0.7919

Best Hyperparameters:
  learning_rate: 0.00025959425503112657
  weight_decay: 0.021233911067827616
  batch_size: 32
  num_epochs: 9


                                                                                
 STEP 2: VISUALIZE RESULTS
                                                                                


Generating visualizations...
✓ Saved: optuna_optimization_history.svg
✓ Saved: optuna_learning_rate_impact.svg
✓ Saved: optuna_batch_size_impact.svg
✓ Saved: optuna_weight_decay_impact.svg
✓ Saved: optuna_epochs_impact.svg
✓ Saved: optuna_trials.csv

All visualizations saved to: ../figures

Top 5 Trials:
   trial        f1        lr  batch_size  epochs
2      2  0.791850  0.000260          32       9
9      9  0.717401  0.000104          16       7
3      3  0.648432  0.00005

### Final Model Training w.r.t Best Parameters

In [14]:
final_config, final_model_path = train_final_model(best_trial, augmented_data, base_config)

2025-12-04 20:53:11,567 - INFO - Fold 0: Train=3824, Val=956
2025-12-04 20:53:11,567 - INFO -   Train label dist: {0: 3276, 1: 548}
2025-12-04 20:53:11,568 - INFO -   Train lang dist: {'es': 1352, 'en': 1255, 'it': 1217}
2025-12-04 20:53:11,573 - INFO - Fold 1: Train=3824, Val=957
2025-12-04 20:53:11,574 - INFO -   Train label dist: {0: 3277, 1: 547}
2025-12-04 20:53:11,575 - INFO -   Train lang dist: {'es': 1352, 'en': 1255, 'it': 1217}
2025-12-04 20:53:11,581 - INFO - Fold 2: Train=3824, Val=957
2025-12-04 20:53:11,582 - INFO -   Train label dist: {0: 3277, 1: 547}
2025-12-04 20:53:11,583 - INFO -   Train lang dist: {'es': 1352, 'en': 1256, 'it': 1216}
2025-12-04 20:53:11,589 - INFO - Fold 3: Train=3824, Val=957
2025-12-04 20:53:11,589 - INFO -   Train label dist: {0: 3278, 1: 546}
2025-12-04 20:53:11,590 - INFO -   Train lang dist: {'es': 1352, 'en': 1255, 'it': 1217}
2025-12-04 20:53:11,594 - INFO - Fold 4: Train=3824, Val=957
2025-12-04 20:53:11,594 - INFO -   Train label dist: {0


                                                                                
STEP 3: TRAIN FINAL MODEL WITH BEST HYPERPARAMETERS
                                                                                

Final Training Configuration:
  Learning Rate: 2.60e-04
  Weight Decay: 0.0212
  Batch Size: 32
  Num Epochs: 9


FOLD 0:
  Train: 548 positive samples
  Val:   137 positive samples


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at ../fine_tuned_models_mlm/twitter-xlm-roberta-base/final_MLM_model/model and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-12-04 20:53:12,316 - INFO - Froze: Embeddings + First 10 Encoder Layers
2025-12-04 20:53:12,317 - INFO - Trainable: Classification Head + Remaining Encoder Layers
2025-12-04 20:53:12,317 - INFO - Trainable parameters: 14,767,874 / 278,045,186 (5.31%)
2025-12-04 20:53:12,318 - INFO - Label weights: {0: 0.5836385836385837, 1: 3.489051094890511}
2025-12-04 20:53:12,318 - INFO - Language weights: {'es': 0.9409641154628656, 'en': 1.0136920192078043, 'it': 1.0453438653293299}
2025-12-04 20:53:12,318 - INFO - Pos weight (for BCE): 5.9781
2025-12-04 20:53:12,319 - INFO - 
E


FOLD 1:
  Train: 547 positive samples
  Val:   138 positive samples


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at ../fine_tuned_models_mlm/twitter-xlm-roberta-base/final_MLM_model/model and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-12-04 20:54:04,959 - INFO - Froze: Embeddings + First 10 Encoder Layers
2025-12-04 20:54:04,960 - INFO - Trainable: Classification Head + Remaining Encoder Layers
2025-12-04 20:54:04,960 - INFO - Trainable parameters: 14,767,874 / 278,045,186 (5.31%)
2025-12-04 20:54:04,962 - INFO - Label weights: {0: 0.5834604821483064, 1: 3.495429616087751}
2025-12-04 20:54:04,962 - INFO - Language weights: {'es': 0.9409641154628656, 'en': 1.0136920192078043, 'it': 1.0453438653293299}
2025-12-04 20:54:04,962 - INFO - Pos weight (for BCE): 5.9909
2025-12-04 20:54:04,963 - INFO - 
E


FOLD 2:
  Train: 547 positive samples
  Val:   138 positive samples


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at ../fine_tuned_models_mlm/twitter-xlm-roberta-base/final_MLM_model/model and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-12-04 20:54:56,193 - INFO - Froze: Embeddings + First 10 Encoder Layers
2025-12-04 20:54:56,193 - INFO - Trainable: Classification Head + Remaining Encoder Layers
2025-12-04 20:54:56,194 - INFO - Trainable parameters: 14,767,874 / 278,045,186 (5.31%)
2025-12-04 20:54:56,195 - INFO - Label weights: {0: 0.5834604821483064, 1: 3.495429616087751}
2025-12-04 20:54:56,195 - INFO - Language weights: {'es': 0.9409476243674836, 'en': 1.0128671880134061, 'it': 1.0461851876191102}
2025-12-04 20:54:56,195 - INFO - Pos weight (for BCE): 5.9909
2025-12-04 20:54:56,196 - INFO - 
E


FOLD 3:
  Train: 546 positive samples
  Val:   138 positive samples


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at ../fine_tuned_models_mlm/twitter-xlm-roberta-base/final_MLM_model/model and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-12-04 20:55:35,232 - INFO - Froze: Embeddings + First 10 Encoder Layers
2025-12-04 20:55:35,233 - INFO - Trainable: Classification Head + Remaining Encoder Layers
2025-12-04 20:55:35,233 - INFO - Trainable parameters: 14,767,874 / 278,045,186 (5.31%)
2025-12-04 20:55:35,234 - INFO - Label weights: {0: 0.5832824893227577, 1: 3.501831501831502}
2025-12-04 20:55:35,235 - INFO - Language weights: {'es': 0.9409641154628656, 'en': 1.0136920192078043, 'it': 1.0453438653293299}
2025-12-04 20:55:35,235 - INFO - Pos weight (for BCE): 6.0037
2025-12-04 20:55:35,236 - INFO - 
E


FOLD 4:
  Train: 547 positive samples
  Val:   138 positive samples


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at ../fine_tuned_models_mlm/twitter-xlm-roberta-base/final_MLM_model/model and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-12-04 20:56:26,252 - INFO - Froze: Embeddings + First 10 Encoder Layers
2025-12-04 20:56:26,252 - INFO - Trainable: Classification Head + Remaining Encoder Layers
2025-12-04 20:56:26,253 - INFO - Trainable parameters: 14,767,874 / 278,045,186 (5.31%)
2025-12-04 20:56:26,254 - INFO - Label weights: {0: 0.5834604821483064, 1: 3.495429616087751}
2025-12-04 20:56:26,254 - INFO - Language weights: {'es': 0.9416953224870754, 'en': 1.0129222776114961, 'it': 1.045382399901429}
2025-12-04 20:56:26,254 - INFO - Pos weight (for BCE): 5.9909
2025-12-04 20:56:26,255 - INFO - 
Ep

### Inference with Peformance Metrics

In [15]:
inference_config = InferenceConfig()
inference_config.CHECKPOINT_PATH = str(final_model_path)

inference_results = run_inference(merged_data, inference_config)

Running inference on 8964 samples...
Device: cuda


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loaded checkpoint from: ../fine_tuned_models/mlm/final_model/checkpoints/fold_0_epoch_7_f1_0.7841.pt


Inference: 100%|██████████| 281/281 [00:07<00:00, 38.21it/s]



INFERENCE RESULTS ON TRAINING DATA

Overall Metrics:
  Macro Precision: 0.8027
  Macro Recall:    0.7472
  Macro F1:        0.7706

Per-Language Metrics:
  EN:
    Precision: 0.7634
    Recall:    0.7952
    F1:        0.7777
  ES:
    Precision: 0.8446
    Recall:    0.8117
    F1:        0.8268
  IT:
    Precision: 0.8677
    Recall:    0.6347
    F1:        0.6785



In [16]:
print(f"\nOptuna Trials: {len(study.trials)}")
print(f"Best Trial: {best_trial.number}")
print(f"Best F1 Score (CV): {best_trial.value:.4f}")
print(f"Final Inference F1: {inference_results['metrics']['overall']['macro_f1']:.4f}")
print(f"\nBest Hyperparameters:")
for key, val in best_trial.params.items():
    print(f"  {key}: {val}")
print(f"\nModel saved at: {final_config.OUTPUT_DIR}")
print(f"Results saved at: {final_config.RESULTS_DIR}")


Optuna Trials: 10
Best Trial: 2
Best F1 Score (CV): 0.7919
Final Inference F1: 0.7706

Best Hyperparameters:
  learning_rate: 0.00025959425503112657
  weight_decay: 0.021233911067827616
  batch_size: 32
  num_epochs: 9

Model saved at: ../fine_tuned_models/mlm/final_model
Results saved at: ../results/fine_tuned_mlm/final_model/


# Prediction Threshold Validation

In [17]:
import numpy as np
import pandas as pd
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report

In [18]:
probs = np.array(inference_results['probabilities'])  # Shape: (N, 2)
probs_class_1 = probs[:, 1]  # Get positive class probabilities
labels = np.array(inference_results['labels'])
languages = inference_results['languages']

print(f"Probability stats:")
print(f"Min:{probs_class_1.min():.4f}")
print(f"Max:{probs_class_1.max():.4f}")
print(f"Mean:{probs_class_1.mean():.4f}")
print(f"Median:{np.median(probs_class_1):.4f}")

results = []

for threshold in np.arange(0.1, 1.0, 0.1):
    # Apply threshold
    preds = (probs_class_1 >= threshold).astype(int)
    
    # Calculate metrics
    f1 = f1_score(labels, preds, average="macro", zero_division=0)
    precision = precision_score(labels, preds, average="macro", zero_division=0)
    recall = recall_score(labels, preds, average="macro", zero_division=0)
    
    results.append({
        "threshold": threshold,
        "f1": f1,
        "precision": precision,
        "recall": recall
    })

threshold_df = pd.DataFrame(results)
print(threshold_df.to_string(index=False))


print("\n" + "="*80)
print("LANGUAGE-SPECIFIC OPTIMAL THRESHOLDS")
print("="*80)

language_optimal_thresholds = {}

for lang in sorted(set(languages)):
    lang_mask = np.array(languages) == lang
    lang_probs = probs_class_1[lang_mask]
    lang_labels = np.array(labels)[lang_mask]
    
    best_threshold = None
    best_f1 = 0
    best_precision = 0
    best_recall = 0
    
    # Test all thresholds for this language
    for threshold in np.arange(0.05, 1.0, 0.05):
        preds = (lang_probs >= threshold).astype(int)
        f1 = f1_score(lang_labels, preds, average="macro", zero_division=0)
        
        if f1 > best_f1:
            best_f1 = f1
            best_threshold = threshold
            best_precision = precision_score(lang_labels, preds, average="macro", zero_division=0)
            best_recall = recall_score(lang_labels, preds, average="macro", zero_division=0)
    
    language_optimal_thresholds[lang] = best_threshold
    
    print(f"\n{lang.upper()}:")
    print(f"Optimal Threshold: {best_threshold:.2f}")
    print(f"F1 Score: {best_f1:.4f}")
    print(f"Precision: {best_precision:.4f}")
    print(f"Recall: {best_recall:.4f}")

print("\n" + "="*80)
print("APPLYING LANGUAGE-SPECIFIC THRESHOLDS")
print("="*80)

language_specific_preds = np.zeros_like(probs_class_1, dtype=int)

for lang in sorted(set(languages)):
    lang_mask = np.array(languages) == lang
    threshold = language_optimal_thresholds[lang]
    language_specific_preds[lang_mask] = (probs_class_1[lang_mask] >= threshold).astype(int)

print("\nOVERALL METRICS COMPARISON:")
print("-" * 80)

default_preds = (probs_class_1 >= 0.5).astype(int)
default_f1 = f1_score(labels, default_preds, average="macro", zero_division=0)
default_precision = precision_score(labels, default_preds, average="macro", zero_division=0)
default_recall = recall_score(labels, default_preds, average="macro", zero_division=0)

langspec_f1 = f1_score(labels, language_specific_preds, average="macro", zero_division=0)
langspec_precision = precision_score(labels, language_specific_preds, average="macro", zero_division=0)
langspec_recall = recall_score(labels, language_specific_preds, average="macro", zero_division=0)

print(f"\nDefault Threshold (0.5):")
print(f"F1: {default_f1:.4f}")
print(f"Precision: {default_precision:.4f}")
print(f"Recall: {default_recall:.4f}")

print(f"\nLanguage-Specific Thresholds:")
for lang in sorted(set(languages)):
    print(f"  {lang.upper()}: {language_optimal_thresholds[lang]:.2f}", end="")
print()

print(f"F1: {langspec_f1:.4f}")
print(f"Precision: {langspec_precision:.4f}")
print(f"Recall: {langspec_recall:.4f}")

print(f"\nOverall Improvement:")
print(f"F1 Change: {langspec_f1 - default_f1:+.4f} ({(langspec_f1/default_f1 - 1)*100:+.1f}%)")
print(f"Precision Change: {langspec_precision - default_precision:+.4f}")
print(f"Recall Change: {langspec_recall - default_recall:+.4f}")

print("\n" + "="*80)
print("PER-LANGUAGE COMPARISON")
print("="*80)

for lang in sorted(set(languages)):
    lang_mask = np.array(languages) == lang
    lang_labels = np.array(labels)[lang_mask]
    
    default_lang_preds = default_preds[lang_mask]
    default_lang_f1 = f1_score(lang_labels, default_lang_preds, average="macro", zero_division=0)
    default_lang_precision = precision_score(lang_labels, default_lang_preds, average="macro", zero_division=0)
    default_lang_recall = recall_score(lang_labels, default_lang_preds, average="macro", zero_division=0)
    
    langspec_lang_preds = language_specific_preds[lang_mask]
    langspec_lang_f1 = f1_score(lang_labels, langspec_lang_preds, average="macro", zero_division=0)
    langspec_lang_precision = precision_score(lang_labels, langspec_lang_preds, average="macro", zero_division=0)
    langspec_lang_recall = recall_score(lang_labels, langspec_lang_preds, average="macro", zero_division=0)
    
    print(f"\n{lang.upper()}:")
    print(f"Default (0.5):")
    print(f"F1: {default_lang_f1:.4f}, Precision: {default_lang_precision:.4f}, Recall: {default_lang_recall:.4f}")
    print(f"Language-Specific ({language_optimal_thresholds[lang]:.2f}):")
    print(f"F1: {langspec_lang_f1:.4f}, Precision: {langspec_lang_precision:.4f}, Recall: {langspec_lang_recall:.4f}")
    print(f"Improvement: {langspec_lang_f1 - default_lang_f1:+.4f} ({(langspec_lang_f1/default_lang_f1 - 1)*100:+.1f}%)")

print("\n" + "="*80)
print("DETAILED CLASSIFICATION REPORT - LANGUAGE-SPECIFIC THRESHOLDS")
print("="*80)

print("\nOverall Classification Report:")
print(classification_report(labels, language_specific_preds, target_names=['Class 0', 'Class 1']))

print("\nPer-Language Classification Reports:")
for lang in sorted(set(languages)):
    lang_mask = np.array(languages) == lang
    lang_labels = np.array(labels)[lang_mask]
    lang_preds = language_specific_preds[lang_mask]
    
    print(f"\n{lang.upper()} (Threshold: {language_optimal_thresholds[lang]:.2f}):")
    print(classification_report(lang_labels, lang_preds, target_names=['Class 0', 'Class 1']))

results_df = pd.DataFrame({
    'language': list(language_optimal_thresholds.keys()),
    'optimal_threshold': list(language_optimal_thresholds.values())
})

print("\n" + "="*80)
print("SUMMARY TABLE")
print("="*80)
print(results_df.to_string(index=False))

Probability stats:
Min:0.0010
Max:0.9996
Mean:0.2448
Median:0.1993
 threshold       f1  precision   recall
       0.1 0.417996   0.598311 0.664659
       0.2 0.574726   0.628261 0.761282
       0.3 0.690419   0.674178 0.810448
       0.4 0.759620   0.734950 0.798832
       0.5 0.770560   0.802662 0.747208
       0.6 0.722379   0.852444 0.675470
       0.7 0.632847   0.870776 0.599615
       0.8 0.564557   0.882454 0.554968
       0.9 0.511879   0.902955 0.525441

LANGUAGE-SPECIFIC OPTIMAL THRESHOLDS

EN:
Optimal Threshold: 0.50
F1 Score: 0.7777
Precision: 0.7634
Recall: 0.7952

ES:
Optimal Threshold: 0.45
F1 Score: 0.8314
Precision: 0.8177
Recall: 0.8473

IT:
Optimal Threshold: 0.35
F1 Score: 0.7710
Precision: 0.7807
Recall: 0.7624

APPLYING LANGUAGE-SPECIFIC THRESHOLDS

OVERALL METRICS COMPARISON:
--------------------------------------------------------------------------------

Default Threshold (0.5):
F1: 0.7706
Precision: 0.8027
Recall: 0.7472

Language-Specific Thresholds:
  EN: 0.

# Submission

In [19]:
from src.finetune.finetuner import run_test

In [20]:
class InferenceConfig:
    MODEL_NAME = "cardiffnlp/twitter-xlm-roberta-base"
    NUM_LABELS = 2
    CHECKPOINT_PATH = "../fine_tuned_models/mlm/final_model/checkpoints/fold_0_epoch_8_f1_0.7845.pt"
    
    # YOUR OPTIMAL THRESHOLDS
    LANGUAGE_THRESHOLDS = {
        'en': 0.50,
        'es': 0.50,
        'it': 0.50
    }

    MAX_LENGTH = 128
    BATCH_SIZE = 32
    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class InferenceConfigThreshold:
    MODEL_NAME = "cardiffnlp/twitter-xlm-roberta-base"
    NUM_LABELS = 2
    # CHECKPOINT_PATH = "../fine_tuned_models/base/final_model/checkpoints/fold_0_epoch_8_f1_0.7829.pt"
    CHECKPOINT_PATH = "../fine_tuned_models/mlm/final_model/checkpoints/fold_0_epoch_8_f1_0.7845.pt"
    
    # YOUR OPTIMAL THRESHOLDS
    LANGUAGE_THRESHOLDS = {
        'en': 0.50,
        'es': 0.45,
        'it': 0.35
    }
    MAX_LENGTH = 128
    BATCH_SIZE = 32
    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [21]:
test_root = "../data/test_sets/Test Set/"
test_files = [file for file in os.listdir(test_root) if (file.endswith(".csv") and ("test" in file))]
print(f"training files: {test_files}")

test_df = pd.DataFrame()


for file in test_files:
    temp_df = pd.read_csv(os.path.join(test_root, file))
    if "en" in file:
        temp_df["bio"] = [None] * temp_df.shape[0]
    test_df = pd.concat([test_df, temp_df], ignore_index=True)

training files: ['es_test.csv', 'it_test.csv', 'en_test.csv']


In [22]:
test_df.shape

(1995, 4)

In [23]:
results = run_test(test_df, InferenceConfig)

Running inference on 1995 samples...
Device: cuda


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loaded checkpoint from: ../fine_tuned_models/mlm/final_model/checkpoints/fold_0_epoch_8_f1_0.7845.pt


Inference: 100%|██████████| 63/63 [00:01<00:00, 39.00it/s]


In [24]:
submission_folder = "../submissions/"
os.makedirs(submission_folder, exist_ok=True)
submission_file = os.path.join(submission_folder, "multipride2025_KIT-TIP-NLP_2.tsv")
final_submission = dict()
final_submission["id"] = results["ids"]
final_submission["label"] = results["predictions"]
final_submission["lang"] = results["languages"]


final_submission = pd.DataFrame.from_dict(final_submission)
print(final_submission.shape)
final_submission.to_csv(submission_file, sep="\t", index=False)

(1995, 3)


### For ablation Study 

In [25]:
results = run_test(test_df, InferenceConfigThreshold)

Running inference on 1995 samples...
Device: cuda


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loaded checkpoint from: ../fine_tuned_models/mlm/final_model/checkpoints/fold_0_epoch_8_f1_0.7845.pt


Inference: 100%|██████████| 63/63 [00:01<00:00, 39.00it/s]


In [26]:
submission_file = os.path.join(submission_folder, "multipride2025_KIT-TIP-NLP_4.tsv")
final_submission = dict()
final_submission["id"] = results["ids"]
final_submission["label"] = results["predictions"]
final_submission["lang"] = results["languages"]


final_submission = pd.DataFrame.from_dict(final_submission)
print(final_submission.shape)
final_submission.to_csv(submission_file, sep="\t", index=False, )

(1995, 3)
