<a href="https://colab.research.google.com/github/silvsilvsilv/multilingualcodeswitchingthesis/blob/main/XLM_RoBERTa_L.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# =========================================================
# XLM-RoBERTa-Large + LoRA (no noise) with selective unfreezing
# Data: CSVs with columns ["text","label"] at explicit Drive paths
# =========================================================
!pip -q install -U transformers datasets peft accelerate evaluate scikit-learn

import os, numpy as np
import torch
import torch.nn as nn

from google.colab import drive
# Mount Google Drive at /content/drive; the dataset CSV paths configured in
# this notebook all live under /content/drive/MyDrive.
drive.mount('/content/drive')

from datasets import load_dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)
from peft import LoraConfig, get_peft_model, TaskType
import evaluate


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# -------------------------
# Explicit paths (EDIT ONLY IF YOUR PATHS CHANGE)
# -------------------------
# Absolute locations on the mounted Google Drive. Each CSV is checked after
# loading for the columns "text" and "label".
TRAIN_CSV = "/content/drive/MyDrive/Machine_Learning/dataset/unique_train_dataset_cleaned (1).csv"
VAL_CSV   = "/content/drive/MyDrive/Machine_Learning/dataset/unique_validation_dataset_cleaned (1).csv"
TEST_CSV  = "/content/drive/MyDrive/Machine_Learning/dataset/unique_test_dataset_cleaned (1).csv"

# -------------------------
# Model / training config
# -------------------------
MODEL_NAME = "xlm-roberta-large"  # checkpoint used for both the tokenizer and the model
MAX_LEN = 256                     # tokenizer max_length; longer inputs are truncated
BATCH_SIZE = 16                   # per-device batch size for training and evaluation
EPOCHS = 3                        # num_train_epochs
LR = 2e-5                         # learning rate passed to TrainingArguments
SEED = 42                         # TrainingArguments seed

# LoRA config (forwarded unchanged to peft.LoraConfig)
LORA_R = 16
LORA_ALPHA = 32
LORA_DROPOUT = 0.05
LORA_TARGETS = ["query", "value"]  # typical for RoBERTa/XLM-R

# Selective unfreezing
LAST_K_TO_UNFREEZE = 2     # number of final encoder layers made fully trainable; try 1–4
UNFREEZE_LAYER_NORM = True # also make every nn.LayerNorm in the backbone trainable
# NOTE(review): no active code reads this flag — its only use is inside the
# commented-out gradient-checkpointing block in the unfreezing cell.
GRADIENT_CHECKPOINTING = True

# bfloat16 on A100/L4; else fp16
# True only when CUDA is available and device 0 reports compute capability
# major version >= 8; used to choose the mixed-precision mode in TrainingArguments.
USE_BF16 = torch.cuda.is_available() and torch.cuda.get_device_capability(0)[0] >= 8


In [None]:
# -------------------------
# Load dataset (expects columns: text, label)
# -------------------------
# Map each split name to its CSV and load all three in one call.
data_files = {"train": TRAIN_CSV, "validation": VAL_CSV, "test": TEST_CSV}
raw = load_dataset("csv", data_files=data_files)

# Fail fast if any split lacks one of the two columns the pipeline relies on.
for split, split_ds in raw.items():
    cols = set(split_ds.column_names)
    missing = {"text", "label"} - cols
    if missing:
        raise KeyError(f"Split '{split}' must have columns: 'text' and 'label'. Found: {cols}")

def ensure_int_label(example):
    """Coerce ``example["label"]`` to a Python ``int`` in place.

    The same mapping object that was passed in is returned, which is the
    shape ``Dataset.map`` expects from a per-example function.
    Raises whatever ``int()`` raises when the value cannot be converted.
    """
    label_as_int = int(example["label"])
    example["label"] = label_as_int
    return example

# Normalise the label column of every split to plain ints.
raw = raw.map(ensure_int_label)

# Binary task: the name<->id maps are declared once and the inverse/size are
# derived from them so the three values cannot drift apart.
label2id = {"LABEL_0": 0, "LABEL_1": 1}
id2label = {idx: name for name, idx in label2id.items()}
NUM_LABELS = len(label2id)

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/21767 [00:00<?, ? examples/s]

Map:   0%|          | 0/2800 [00:00<?, ? examples/s]

Map:   0%|          | 0/2808 [00:00<?, ? examples/s]

In [None]:

# -------------------------
# Tokenizer & preprocessing
# -------------------------
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

def preprocess(ex):
    """Tokenize a batch of rows.

    ``ex`` is a batch (``batched=True``) whose "text" entry holds the raw
    strings. Inputs longer than ``MAX_LEN`` tokens are truncated; padding is
    left to the data collator so each batch is padded only to its own longest
    sequence.
    """
    return tokenizer(ex["text"], truncation=True, max_length=MAX_LEN)

# Keep only tokenized features + label.
# Every original column except "label" (including the raw "text") is dropped
# here, instead of keeping the strings around and relying on the Trainer to
# prune unused columns later.
cols_to_remove = [c for c in raw["train"].column_names if c != "label"]
encoded = raw.map(preprocess, batched=True, remove_columns=cols_to_remove)
encoded = encoded.rename_column("label", "labels")  # name expected by the model's forward()
encoded = encoded.with_format("torch")


Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Map:   0%|          | 0/21767 [00:00<?, ? examples/s]

Map:   0%|          | 0/2800 [00:00<?, ? examples/s]

Map:   0%|          | 0/2808 [00:00<?, ? examples/s]

In [None]:
# -------------------------
# Build model
# -------------------------
# Sequence-classification head configuration; the head itself is newly
# initialised on top of the pretrained encoder.
model_init_kwargs = {
    "num_labels": NUM_LABELS,
    "id2label": id2label,
    "label2id": label2id,
}
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, **model_init_kwargs)


model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# -------------------------
# Apply LoRA
# -------------------------
# Adapter definition: which modules get adapters first, then the adapter
# hyperparameters taken from the config cell.
lora_cfg = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    target_modules=LORA_TARGETS,
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    bias="none",
)
# Rebinds `model` to the PEFT-wrapped model.
model = get_peft_model(model, lora_cfg)

In [None]:
# -------------------------
# Selective unfreezing (AFTER LoRA is attached)
# -------------------------
def get_backbone(m: nn.Module) -> nn.Module:
    """Return the innermost model reachable through known wrapper attributes.

    Lookup order:
      1. ``m.base_model.model`` when both attributes exist,
      2. ``m.model`` when it exists,
      3. ``m`` itself otherwise.
    """
    wrapper = getattr(m, "base_model", None)
    if hasattr(wrapper, "model"):
        return wrapper.model
    return getattr(m, "model", m)

def freeze_all_except_lora(m: nn.Module, extra_trainable=("modules_to_save",)):
    """Freeze every parameter except LoRA adapters and PEFT-managed task heads.

    A parameter stays trainable when any of the following holds:
      * its name contains ``"lora_"``,
      * it carries a truthy ``is_lora`` attribute,
      * its name contains one of the substrings in ``extra_trainable``.

    Why ``modules_to_save``: with ``TaskType.SEQ_CLS`` PEFT wraps the
    classification head in a ``modules_to_save`` wrapper, so the head's
    parameter names contain ``modules_to_save`` and never ``lora_``. Matching
    on ``"lora_"`` alone therefore freezes the freshly initialised head at its
    random weights, leaving the encoder to adapt around a fixed random
    projection. The frozen ``original_module`` copy inside the wrapper is
    intentionally left frozen.

    Args:
        m: model whose ``requires_grad`` flags are rewritten in place.
        extra_trainable: name substrings that mark additional parameters to
            keep trainable. Pass ``()`` to keep only LoRA parameters.
    """
    for n, p in m.named_parameters():
        is_adapter = "lora_" in n or bool(getattr(p, "is_lora", False))
        is_extra = any(marker in n for marker in extra_trainable)
        p.requires_grad = is_adapter or is_extra

def unfreeze_last_k_transformer_layers(backbone: nn.Module, last_k: int, unfreeze_ln: bool):
    """Make the final ``last_k`` encoder layers (and optionally all LayerNorms) trainable.

    Args:
        backbone: model exposing ``.roberta.encoder.layer`` and
            ``.config.num_hidden_layers``.
        last_k: how many layers, counted from the end, to unfreeze. Values
            outside ``[0, num_hidden_layers]`` are clamped into that range.
        unfreeze_ln: when true, every ``nn.LayerNorm`` found anywhere in
            ``backbone`` is made trainable as well, not only those in the
            last ``last_k`` layers.

    Raises:
        RuntimeError: if ``backbone`` has no ``roberta`` attribute.

    Only ever sets ``requires_grad`` to True; nothing is frozen here.
    """
    if not hasattr(backbone, "roberta"):
        raise RuntimeError("Expected Roberta/XLM-R backbone with '.roberta'.")

    enc = backbone.roberta.encoder
    depth = backbone.config.num_hidden_layers
    clamped_k = max(0, min(last_k, depth))

    for layer_idx in range(depth - clamped_k, depth):
        for weight in enc.layer[layer_idx].parameters():
            weight.requires_grad = True

    if not unfreeze_ln:
        return

    layer_norms = (mod for mod in backbone.modules() if isinstance(mod, nn.LayerNorm))
    for norm in layer_norms:
        for weight in norm.parameters():
            weight.requires_grad = True

# Order matters: reset the requires_grad flags first, then re-enable the
# chosen encoder layers / LayerNorms on the unwrapped backbone.
freeze_all_except_lora(model)
backbone = get_backbone(model)
unfreeze_last_k_transformer_layers(
    backbone,
    last_k=LAST_K_TO_UNFREEZE,
    unfreeze_ln=UNFREEZE_LAYER_NORM,
)

# # Optional: gradient checkpointing
# if GRADIENT_CHECKPOINTING:
#     try:
#         model.gradient_checkpointing_enable()
#     except Exception:
#         pass

In [None]:
# -------------------------
# Trainable summary
# -------------------------
def print_trainable_summary(m: nn.Module, max_list=25):
    """Print how many parameters are trainable, plus the first few of their names.

    Output format: one headline line with trainable count, total count and
    percentage, followed by up to ``max_list`` bullet lines naming trainable
    parameters in ``named_parameters()`` order. Returns nothing.
    """
    named = list(m.named_parameters())
    total = sum(p.numel() for _, p in named)

    trainable_named = [(name, p) for name, p in named if p.requires_grad]
    trainable = sum(p.numel() for _, p in trainable_named)

    # max(1, total) keeps the percentage defined for a parameter-less module.
    pct = 100.0 * trainable / max(1, total)
    print(f"Trainable params: {trainable:,} / {total:,} ({pct:.2f}%)")

    # A non-positive max_list lists nothing.
    for name, _ in trainable_named[:max(max_list, 0)]:
        print("  •", name)
# Report which parameters are currently marked trainable on the wrapped model.
print_trainable_summary(model)

Trainable params: 26,857,472 / 562,516,996 (4.77%)
  • base_model.model.roberta.embeddings.LayerNorm.weight
  • base_model.model.roberta.embeddings.LayerNorm.bias
  • base_model.model.roberta.encoder.layer.0.attention.self.query.lora_A.default.weight
  • base_model.model.roberta.encoder.layer.0.attention.self.query.lora_B.default.weight
  • base_model.model.roberta.encoder.layer.0.attention.self.value.lora_A.default.weight
  • base_model.model.roberta.encoder.layer.0.attention.self.value.lora_B.default.weight
  • base_model.model.roberta.encoder.layer.0.attention.output.LayerNorm.weight
  • base_model.model.roberta.encoder.layer.0.attention.output.LayerNorm.bias
  • base_model.model.roberta.encoder.layer.0.output.LayerNorm.weight
  • base_model.model.roberta.encoder.layer.0.output.LayerNorm.bias
  • base_model.model.roberta.encoder.layer.1.attention.self.query.lora_A.default.weight
  • base_model.model.roberta.encoder.layer.1.attention.self.query.lora_B.default.weight
  • base_model.mo

In [None]:

# -------------------------
# Collator & metrics
# -------------------------
# Pads each batch dynamically using the tokenizer's padding settings.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Metric objects are loaded once here and reused on every evaluation pass.
acc, f1, prec, rec = (
    evaluate.load(metric_name)
    for metric_name in ("accuracy", "f1", "precision", "recall")
)

def compute_metrics(eval_pred):
    """Turn ``(logits, labels)`` into accuracy plus weighted precision/recall/F1.

    Predictions are the argmax over the last axis of the logits. The returned
    dict has the keys "accuracy", "precision", "recall" and "f1", in that order.
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)

    scores = {"accuracy": acc.compute(predictions=preds, references=labels)["accuracy"]}
    # The remaining three metrics share the same call shape and averaging mode.
    for key, metric in (("precision", prec), ("recall", rec), ("f1", f1)):
        result = metric.compute(predictions=preds, references=labels, average="weighted")
        scores[key] = result[key]
    return scores


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

In [None]:
# -------------------------
# TrainingArguments & Trainer
# -------------------------
# NOTE(review): checkpoints are written to the local /content filesystem, not
# to the mounted Drive, despite the "_drive" suffix in the directory name.
out_dir = "/content/xlmr_large_lora_unfreeze_drive"
args = TrainingArguments(
    output_dir=out_dir,
    learning_rate=LR,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    lr_scheduler_type="cosine",
    warmup_ratio=0.06,
    weight_decay=0.01,
    logging_steps=50,
    # Evaluate and checkpoint on the same schedule so that
    # load_best_model_at_end can pair every checkpoint with a metric value.
    eval_strategy="steps",
    eval_steps=200,
    save_strategy="steps",
    save_steps=200,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1",  # key returned by compute_metrics
    gradient_accumulation_steps=1,
    # Mixed precision: bf16 when USE_BF16 is set, fp16 on other CUDA devices,
    # and plain fp32 when no GPU is present (fp16 must not be requested on CPU).
    fp16=torch.cuda.is_available() and not USE_BF16,
    bf16=USE_BF16,
    max_grad_norm=1.0,
    report_to="none",
    seed=SEED,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded["train"],
    eval_dataset=encoded["validation"],
    # `processing_class` replaces the deprecated `tokenizer` argument; the
    # install cell pulls the latest transformers, so the old name cannot be
    # relied on.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


  trainer = Trainer(


In [None]:
# -------------------------
# Train, Validate, Test
# -------------------------
train_result = trainer.train()

# Validation metrics (default "eval_" key prefix) on the configured eval_dataset.
val_metrics = trainer.evaluate()
print("Validation:", val_metrics)

# Held-out test split. A distinct key prefix keeps these numbers from being
# confused with validation metrics in the printed dict and the trainer's log
# history (keys become "test_loss", "test_f1", ...).
test_metrics = trainer.evaluate(encoded["test"], metric_key_prefix="test")
print("Test:", test_metrics)

# Save model (optional)
# trainer.save_model(out_dir + "/best")

Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
200,0.6263,0.551555,0.760357,0.762993,0.760357,0.759559
400,0.4471,0.419773,0.821786,0.828436,0.821786,0.820703
600,0.4369,0.379983,0.839643,0.84048,0.839643,0.839489
800,0.429,0.399761,0.824286,0.829843,0.824286,0.823694
1000,0.4025,0.363129,0.845357,0.853625,0.845357,0.844292
1200,0.4009,0.350575,0.851429,0.851429,0.851429,0.851429
1400,0.3556,0.344323,0.853929,0.854342,0.853929,0.853852
1600,0.3935,0.343486,0.85,0.854691,0.85,0.849388
1800,0.3139,0.351237,0.862143,0.86397,0.862143,0.861906
2000,0.3622,0.338624,0.860714,0.863088,0.860714,0.860413


Validation: {'eval_loss': 0.3320234417915344, 'eval_accuracy': 0.8635714285714285, 'eval_precision': 0.8657996834299043, 'eval_recall': 0.8635714285714285, 'eval_f1': 0.8632939598367901, 'eval_runtime': 6.9121, 'eval_samples_per_second': 405.089, 'eval_steps_per_second': 25.318, 'epoch': 3.0}
Test: {'eval_loss': 0.3241073489189148, 'eval_accuracy': 0.8657407407407407, 'eval_precision': 0.8677608761265053, 'eval_recall': 0.8657407407407407, 'eval_f1': 0.8655809745023813, 'eval_runtime': 7.0537, 'eval_samples_per_second': 398.089, 'eval_steps_per_second': 24.951, 'epoch': 3.0}
