In [31]:
# ✅ XLM-R fine-tuning (version-compatible)
import os, inspect, pandas as pd
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

# If your notebook lives in /notebooks, go to project root so paths work:
os.chdir("..") if os.path.basename(os.getcwd()) == "notebooks" else None

train_df = pd.read_csv("data/splits/train.csv")
dev_df   = pd.read_csv("data/splits/dev.csv")


# ---- Map string labels to integers ----
label2id = {"neutral": 0, "distress": 1}
id2label = {v: k for k, v in label2id.items()}

if train_df["label"].dtype == object:
    train_df["labels"] = train_df["label"].map(label2id)
    dev_df["labels"]   = dev_df["label"].map(label2id)
else:
    # If already numeric, ensure column is named 'labels'
    train_df["labels"] = train_df["label"]
    dev_df["labels"]   = dev_df["label"]

# Keep only the columns we need for loading; drop others to be safe
train_df[["text", "labels"]].to_csv("train_temp.csv", index=False)
dev_df[["text", "labels"]].to_csv("dev_temp.csv", index=False)

dataset = load_dataset("csv", data_files={"train":"train_temp.csv", "validation":"dev_temp.csv"})

# ---- Tokenizer & tokenization ----
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

def tokenize(batch):
    return tokenizer(batch["text"], truncation=True, padding="max_length", max_length=128)

dataset = dataset.map(tokenize, batched=True)
# Remove everything except the model inputs + labels
keep_cols = {"input_ids", "attention_mask", "labels"}
for split in dataset:
    drop_cols = [c for c in dataset[split].column_names if c not in keep_cols]
    dataset[split] = dataset[split].remove_columns(drop_cols)
dataset.set_format("torch")

# ---- Model ----
model = AutoModelForSequenceClassification.from_pretrained(
    "xlm-roberta-base",
    num_labels=2,
    id2label=id2label,
    label2id=label2id
)

# ---- TrainingArguments (compatible across versions) ----
ta_params = inspect.signature(TrainingArguments.__init__).parameters
kwargs = dict(
    output_dir="./results",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    learning_rate=2e-5,
    logging_dir="./logs",
)
if "evaluation_strategy" in ta_params:
    kwargs["evaluation_strategy"] = "epoch"
if "save_strategy" in ta_params:
    kwargs["save_strategy"] = "no"

args = TrainingArguments(**kwargs)

# ---- Trainer ----
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    tokenizer=tokenizer,
)

trainer.train()
metrics = trainer.evaluate()
metrics



Generating train split: 37 examples [00:00, 17821.46 examples/s]
Generating validation split: 5 examples [00:00, 2516.38 examples/s]
Map: 100%|██████████| 37/37 [00:00<00:00, 13558.38 examples/s]
Map: 100%|██████████| 5/5 [00:00<00:00, 2392.37 examples/s]
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss


{'eval_loss': 0.6766914129257202,
 'eval_runtime': 0.0267,
 'eval_samples_per_second': 187.132,
 'eval_steps_per_second': 37.426,
 'epoch': 3.0}

In [32]:
import numpy as np, pandas as pd
from sklearn.metrics import classification_report, confusion_matrix

pred_out = trainer.predict(dataset["validation"])
y_true  = pred_out.label_ids
y_prob  = pred_out.predictions
y_pred  = y_prob.argmax(axis=1)

print(classification_report(y_true, y_pred, target_names=["neutral","distress"]))

from pathlib import Path, PurePath
import json
from sklearn.metrics import classification_report
import numpy as np

# Create folder if missing
Path("reports/experiments").mkdir(parents=True, exist_ok=True)

# Save results
with open("reports/experiments/xlmr_metrics.json", "w") as f:
    json.dump({
        "eval_loss": float(pred_out.metrics.get("test_loss", np.nan)),
        "report": classification_report(
            y_true, y_pred, target_names=["neutral","distress"], output_dict=True
        )
    }, f, indent=2)

print("✅ Metrics saved to reports/experiments/xlmr_metrics.json")



              precision    recall  f1-score   support

     neutral       0.00      0.00      0.00         2
    distress       0.60      1.00      0.75         3

    accuracy                           0.60         5
   macro avg       0.30      0.50      0.38         5
weighted avg       0.36      0.60      0.45         5

✅ Metrics saved to reports/experiments/xlmr_metrics.json


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [33]:
# ✅ Save fine-tuned model and tokenizer
trainer.save_model("models/xlmr-burmese")
tokenizer.save_pretrained("models/xlmr-burmese")

print("✅ Model and tokenizer saved to: models/xlmr-burmese")



✅ Model and tokenizer saved to: models/xlmr-burmese
