<a href="https://colab.research.google.com/github/shrushti20/Coursera_HTML-CSS-Javascript-for-Web-Developers/blob/master/goemo_xlmr_base.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
os.environ["WANDB_DISABLED"] = "true"

APIkey- dc7ea0d0de5bd531c9a038d69fa8aa882a3c080b

In [3]:
!pip install -q "transformers" "datasets" "accelerate" scikit-learn

from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
import torch
from google.colab import drive

# ---- MOUNT DRIVE ONCE ----
drive.mount('/content/drive')

# ---- LOAD GOEMOTIONS (MULTI-LABEL) ----
goemo_raw = load_dataset("go_emotions")

# keep only examples with exactly ONE label
def filter_single_label(example):
    return len(example["labels"]) == 1

goemo_filtered = goemo_raw.filter(filter_single_label)

# convert list[int] -> single int
def squeeze_label(example):
    example["label"] = example["labels"][0]
    return example

goemo_filtered = goemo_filtered.map(squeeze_label)
goemo_filtered = goemo_filtered.remove_columns(["labels"])

print(goemo_filtered)

# ---- LABEL MAPPINGS ----
goemo_label_names = goemo_raw["train"].features["labels"].feature.names
num_labels = len(goemo_label_names)

id2label = {i: l for i, l in enumerate(goemo_label_names)}
label2id = {l: i for i, l in enumerate(goemo_label_names)}

print("Num labels:", num_labels)
print("First labels:", goemo_label_names[:10])

# ---- TOKENIZER + MODEL (XLM-R BASE) ----
xlmr_model_name = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(xlmr_model_name)

model = AutoModelForSequenceClassification.from_pretrained(
    xlmr_model_name,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

# ---- TOKENIZE DATA ----
def tokenize_goemo(batch):
    return tokenizer(
        batch["text"],
        padding="max_length",
        truncation=True,
        max_length=128,
    )

encoded_goemo = goemo_filtered.map(tokenize_goemo, batched=True)
encoded_goemo = encoded_goemo.rename_column("label", "labels")
encoded_goemo.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels"],
)

# ---- METRICS ----
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, preds)
    macro_f1 = f1_score(labels, preds, average="macro")
    return {"accuracy": acc, "macro_f1": macro_f1}

# ---- TRAINING ARGS (NO FANCY STUFF, VERSION-SAFE) ----
training_args = TrainingArguments(
    output_dir="goemo_xlmr_base",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_steps=100,
)

# ---- TRAIN ----
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_goemo["train"],
    eval_dataset=encoded_goemo["validation"],
    compute_metrics=compute_metrics,
)

trainer.train()
eval_results = trainer.evaluate(encoded_goemo["test"])
print("Test results:", eval_results)

# ---- SAVE LOCALLY ----
model.save_pretrained("goemo_xlmr_base")
tokenizer.save_pretrained("goemo_xlmr_base")
print("Saved locally in /content/goemo_xlmr_base")

# ---- COPY TO DRIVE (PERMANENT) ----
!rm -rf /content/drive/MyDrive/goemo_xlmr_base
!cp -r goemo_xlmr_base /content/drive/MyDrive/

print("✅ Saved XLM-R GoEmotions permanently to Drive: /content/drive/MyDrive/goemo_xlmr_base")


Mounted at /content/drive


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

simplified/train-00000-of-00001.parquet:   0%|          | 0.00/2.77M [00:00<?, ?B/s]

simplified/validation-00000-of-00001.par(…):   0%|          | 0.00/350k [00:00<?, ?B/s]

simplified/test-00000-of-00001.parquet:   0%|          | 0.00/347k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/43410 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5426 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5427 [00:00<?, ? examples/s]

Filter:   0%|          | 0/43410 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5426 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5427 [00:00<?, ? examples/s]

Map:   0%|          | 0/36308 [00:00<?, ? examples/s]

Map:   0%|          | 0/4548 [00:00<?, ? examples/s]

Map:   0%|          | 0/4590 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'id', 'label'],
        num_rows: 36308
    })
    validation: Dataset({
        features: ['text', 'id', 'label'],
        num_rows: 4548
    })
    test: Dataset({
        features: ['text', 'id', 'label'],
        num_rows: 4590
    })
})
Num labels: 28
First labels: ['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment']


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/36308 [00:00<?, ? examples/s]

Map:   0%|          | 0/4548 [00:00<?, ? examples/s]

Map:   0%|          | 0/4590 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss
100,2.801
200,2.5809
300,2.6049
400,2.4432
500,2.2056
600,2.106
700,1.9895
800,1.9332
900,1.8288
1000,1.7749


Test results: {'eval_loss': 1.2976247072219849, 'eval_accuracy': 0.6213507625272331, 'eval_macro_f1': 0.46833476832373633, 'eval_runtime': 29.7603, 'eval_samples_per_second': 154.232, 'eval_steps_per_second': 9.644, 'epoch': 3.0}
Saved locally in /content/goemo_xlmr_base
✅ Saved XLM-R GoEmotions permanently to Drive: /content/drive/MyDrive/goemo_xlmr_base


In [6]:
!ls /content/drive/MyDrive/goemo_xlmr_base


checkpoint-1000  checkpoint-500  runs


2. Load TweetEval Emotion dataset# New section    ait_xlmr_base

In [8]:
from datasets import load_dataset

tweetemo = load_dataset("tweet_eval", "emotion")
print(tweetemo)
print(tweetemo["train"][0])


README.md: 0.00B [00:00, ?B/s]

emotion/train-00000-of-00001.parquet:   0%|          | 0.00/233k [00:00<?, ?B/s]

emotion/test-00000-of-00001.parquet:   0%|          | 0.00/105k [00:00<?, ?B/s]

emotion/validation-00000-of-00001.parque(…):   0%|          | 0.00/28.6k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3257 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1421 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/374 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 3257
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1421
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 374
    })
})
{'text': "“Worry is a down payment on a problem you may never have'. \xa0Joyce Meyer.  #motivation #leadership #worry", 'label': 2}


In [9]:
label_names = tweetemo["train"].features["label"].names
num_labels = len(label_names)
id2label = {i: l for i, l in enumerate(label_names)}
label2id = {l: i for i, l in enumerate(label_names)}

print("Labels:", label_names)
print("Num labels:", num_labels)


Labels: ['anger', 'joy', 'optimism', 'sadness']
Num labels: 4


In [10]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

xlmr_model_name = "xlm-roberta-base"

tokenizer_ait = AutoTokenizer.from_pretrained(xlmr_model_name)

model_ait = AutoModelForSequenceClassification.from_pretrained(
    xlmr_model_name,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
def tokenize_tweetemo(batch):
    return tokenizer_ait(
        batch["text"],
        padding="max_length",
        truncation=True,
        max_length=128,
    )

encoded_ait = tweetemo.map(tokenize_tweetemo, batched=True)
encoded_ait = encoded_ait.rename_column("label", "labels")
encoded_ait.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels"],
)


Map:   0%|          | 0/3257 [00:00<?, ? examples/s]

Map:   0%|          | 0/1421 [00:00<?, ? examples/s]

Map:   0%|          | 0/374 [00:00<?, ? examples/s]

In [12]:
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, preds)
    macro_f1 = f1_score(labels, preds, average="macro")
    return {"accuracy": acc, "macro_f1": macro_f1}


In [13]:
from transformers import TrainingArguments, Trainer

training_args_ait = TrainingArguments(
    output_dir="ait_xlmr_base",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_steps=50,
    report_to="none",   # <- avoids wandb
)


In [14]:
trainer_ait = Trainer(
    model=model_ait,
    args=training_args_ait,
    train_dataset=encoded_ait["train"],
    eval_dataset=encoded_ait["validation"],
    compute_metrics=compute_metrics,
)

trainer_ait.train()
ait_eval_results = trainer_ait.evaluate(encoded_ait["test"])
print("AIT XLM-R test:", ait_eval_results)


Step,Training Loss
50,1.2922
100,1.1064
150,0.9913
200,0.8697
250,0.7058
300,0.6973
350,0.6957
400,0.6529
450,0.5729
500,0.5175


AIT XLM-R test: {'eval_loss': 0.6332455277442932, 'eval_accuracy': 0.7952146375791695, 'eval_macro_f1': 0.7528767785348682, 'eval_runtime': 9.2661, 'eval_samples_per_second': 153.354, 'eval_steps_per_second': 9.605, 'epoch': 3.0}


In [9]:
!pip install -q "transformers" "datasets" "accelerate" scikit-learn

from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
import torch
from google.colab import drive
import os

# disable wandb noise
os.environ["WANDB_DISABLED"] = "true"

# ---- mount Drive ----
drive.mount('/content/drive')

# ---- 1. load TweetEval Emotion ----
tweetemo = load_dataset("tweet_eval", "emotion")
print(tweetemo)
print("Example:", tweetemo["train"][0])

# ---- 2. label names & mappings ----
label_names = tweetemo["train"].features["label"].names
num_labels = len(label_names)
id2label = {i: l for i, l in enumerate(label_names)}
label2id = {l: i for i, l in enumerate(label_names)}

print("Labels:", label_names)

# ---- 3. tokenizer + model (XLM-R) ----
xlmr_model_name = "xlm-roberta-base"

tokenizer_ait = AutoTokenizer.from_pretrained(xlmr_model_name)

model_ait = AutoModelForSequenceClassification.from_pretrained(
    xlmr_model_name,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

# ---- 4. tokenize dataset ----
def tokenize_tweetemo(batch):
    return tokenizer_ait(
        batch["text"],
        padding="max_length",
        truncation=True,
        max_length=128,
    )

encoded_ait = tweetemo.map(tokenize_tweetemo, batched=True)
encoded_ait = encoded_ait.rename_column("label", "labels")
encoded_ait.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels"],
)

# ---- 5. metrics ----
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, preds)
    macro_f1 = f1_score(labels, preds, average="macro")
    return {"accuracy": acc, "macro_f1": macro_f1}

# ---- 6. training args ----
training_args_ait = TrainingArguments(
    output_dir="ait_xlmr_base",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_steps=50,
    report_to="none",   # no wandb
)

# ---- 7. train + evaluate ----
trainer_ait = Trainer(
    model=model_ait,
    args=training_args_ait,
    train_dataset=encoded_ait["train"],
    eval_dataset=encoded_ait["validation"],
    compute_metrics=compute_metrics,
)

trainer_ait.train()
ait_eval_results = trainer_ait.evaluate(encoded_ait["test"])
print("AIT XLM-R test:", ait_eval_results)

# ---- 8. save locally ----
trainer_ait.save_model("ait_xlmr_base")
tokenizer_ait.save_pretrained("ait_xlmr_base")
print("Saved locally in /content/ait_xlmr_base")

# ---- 9. copy to Drive (permanent) ----
!rm -rf /content/drive/MyDrive/ait_xlmr_base
!cp -r /content/ait_xlmr_base /content/drive/MyDrive/

print("✅ AIT XLM-R saved permanently to Drive: /content/drive/MyDrive/ait_xlmr_base")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

emotion/train-00000-of-00001.parquet:   0%|          | 0.00/233k [00:00<?, ?B/s]

emotion/test-00000-of-00001.parquet:   0%|          | 0.00/105k [00:00<?, ?B/s]

emotion/validation-00000-of-00001.parque(…):   0%|          | 0.00/28.6k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3257 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1421 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/374 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 3257
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1421
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 374
    })
})
Example: {'text': "“Worry is a down payment on a problem you may never have'. \xa0Joyce Meyer.  #motivation #leadership #worry", 'label': 2}
Labels: ['anger', 'joy', 'optimism', 'sadness']


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/3257 [00:00<?, ? examples/s]

Map:   0%|          | 0/1421 [00:00<?, ? examples/s]

Map:   0%|          | 0/374 [00:00<?, ? examples/s]



Step,Training Loss
50,1.3198
100,1.2009
150,1.0417
200,0.9897
250,0.7474
300,0.7351
350,0.7351
400,0.6875
450,0.6224
500,0.5286




AIT XLM-R test: {'eval_loss': 0.6387863159179688, 'eval_accuracy': 0.7902885292047853, 'eval_macro_f1': 0.7556616036085796, 'eval_runtime': 602.2435, 'eval_samples_per_second': 2.36, 'eval_steps_per_second': 0.148, 'epoch': 3.0}
Saved locally in /content/ait_xlmr_base
✅ AIT XLM-R saved permanently to Drive: /content/drive/MyDrive/ait_xlmr_base


In [10]:
!ls /content/drive/MyDrive/ait_xlmr_base

checkpoint-500	model.safetensors	 tokenizer_config.json
checkpoint-612	sentencepiece.bpe.model  tokenizer.json
config.json	special_tokens_map.json  training_args.bin
