<a href="https://colab.research.google.com/github/sudama-inc/llm_finetuning/blob/main/XLMR_FineTune.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!wget https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/emotion/train_text.txt
!wget https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/emotion/train_labels.txt
!wget https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/emotion/test_text.txt
!wget https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/emotion/test_labels.txt
!wget https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/emotion/val_text.txt
!wget https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/emotion/val_labels.txt

In [None]:
!pip install datasets
!pip install torch
!pip install transformers
!pip install accelerate -U

In [None]:
import torch
import datasets
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback, set_seed

In [None]:
def _read_lines(path):
    """Return the lines of a UTF-8 text file without their newline terminators.

    The file is split on "\n" only (not ``str.splitlines``) so that other
    Unicode line-break characters that may occur inside a tweet cannot shift
    the text lines relative to the label lines.
    """
    # Context manager closes the handle; explicit encoding avoids depending on the locale.
    with open(path, "r", encoding="utf-8") as handle:
        lines = handle.read().split("\n")
    # A file ending in "\n" yields one trailing empty element; drop only that one,
    # so a file without a final newline does not lose its last example.
    if lines and lines[-1] == "":
        lines.pop()
    return lines


def _load_split(split):
    """Load `<split>_text.txt` and `<split>_labels.txt` from the working directory.

    Returns:
        dict with keys "text" (list of str) and "labels" (list of int).

    Raises:
        ValueError: if the two files do not contain the same number of lines,
            or (from ``int``) if a label line is not an integer.
    """
    texts = _read_lines(f"{split}_text.txt")
    labels = [int(item) for item in _read_lines(f"{split}_labels.txt")]
    if len(texts) != len(labels):
        raise ValueError(
            f"{split}: {len(texts)} text lines but {len(labels)} label lines"
        )
    return {"text": texts, "labels": labels}


# One entry per split, each holding parallel "text" and "labels" lists.
data_dict = {split: _load_split(split) for split in ("train", "test", "val")}

In [None]:
# Wrap each split's {"text": [...], "labels": [...]} dict in a `datasets.Dataset`.
train_dataset, test_dataset, val_dataset = (
    datasets.Dataset.from_dict(data_dict[split_name])
    for split_name in ("train", "test", "val")
)

In [None]:
# Instantiate the fast tokenizer published with the "xlm-roberta-base" checkpoint.
# NOTE(review): the model is later loaded from a different checkpoint name (MODEL_PATH);
# presumably both share the same vocabulary — confirm.
tokenizer = AutoTokenizer.from_pretrained(
    "xlm-roberta-base",
    use_fast=True,
)

In [None]:
def tokenize_batch(batch):
    """Tokenize the "text" column of a batch of examples.

    Sequences are truncated to the tokenizer's maximum length but deliberately
    NOT padded here: the Trainer below is given the tokenizer and pads each
    training/evaluation batch dynamically, so padding at map time would only
    inflate every example to the longest text in its (large) map batch.

    Args:
        batch: mapping with a "text" key holding a list of strings.

    Returns:
        The tokenizer output (input ids and attention mask) for the batch.
    """
    return tokenizer(batch["text"], truncation=True)


# Tokenize every split with the same function; `batched=True` passes lists of examples.
train_dataset = train_dataset.map(tokenize_batch, batched=True)
test_dataset = test_dataset.map(tokenize_batch, batched=True)
val_dataset = val_dataset.map(tokenize_batch, batched=True)

Map:   0%|          | 0/3257 [00:00<?, ? examples/s]

Map:   0%|          | 0/1421 [00:00<?, ? examples/s]

Map:   0%|          | 0/374 [00:00<?, ? examples/s]

In [None]:
# Bundle the three tokenized splits under their split names.
splits = {"train": train_dataset, "test": test_dataset, "val": val_dataset}
dataset = datasets.DatasetDict(splits)

In [None]:
# Display the DatasetDict summary (feature names and row count of each split).
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 3257
    })
    test: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 1421
    })
    val: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 374
    })
})

In [None]:
# Single seed shared by `set_seed` here and by `TrainingArguments(seed=...)` below,
# so that the run is repeatable.
RANDOM_SEED = 42
set_seed(RANDOM_SEED)

In [None]:
# Training configuration consumed by the TrainingArguments / model cells below.
# Upper bound on training epochs; early stopping may end training sooner.
EPOCHS = 10
# Per-device batch size, used for both training and evaluation.
BATCH_SIZE = 64
LEARNING_RATE = 2e-5
# Pretrained checkpoint to fine-tune (a sentiment model; its classifier head is replaced).
MODEL_PATH = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
# NOTE(review): not referenced by any visible cell; presumably -1 means "use all examples" — confirm.
MAX_TRAINING_EXAMPLES = -1
# Number of target classes (labels 0-3 in the emotion dataset).
NUM_LABELS = 4

In [None]:
# Hyper-parameters and bookkeeping options for the Trainer.
training_args = TrainingArguments(
    # --- where artifacts go ---
    output_dir="./results",
    logging_dir="./logs",
    # --- optimisation ---
    num_train_epochs=EPOCHS,
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    weight_decay=0.01,
    warmup_steps=100,
    # --- log, evaluate and checkpoint on the same 100-step cadence ---
    logging_steps=100,
    evaluation_strategy="steps",
    eval_steps=100,
    save_steps=100,
    # --- reproducibility / model selection ---
    seed=RANDOM_SEED,
    load_best_model_at_end=True,
)

In [None]:
# Load the pretrained checkpoint with a NUM_LABELS-way classification head.
# The checkpoint's own head has a different number of outputs, so
# `ignore_mismatched_sizes=True` lets the head be re-initialised instead of raising.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_PATH,
    num_labels=NUM_LABELS,
    ignore_mismatched_sizes=True,
)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-xlm-roberta-base-sentiment and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([4]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Stop training once the monitored evaluation metric has failed to improve
# by at least 0.001 for 3 consecutive evaluations.
early_stopping = EarlyStoppingCallback(
    early_stopping_patience=3,
    early_stopping_threshold=0.001,
)

# Train on the train split and evaluate on the validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    callbacks=[early_stopping],
)

In [None]:
# Run fine-tuning; the returned TrainOutput (global step, training loss, metrics) is displayed.
# Training may stop before EPOCHS epochs because of the early-stopping callback.
trainer.train()

You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
100,1.0306,0.626676
200,0.4269,0.651655
300,0.224,0.765253
400,0.1271,0.805789


TrainOutput(global_step=400, training_loss=0.4521696758270264, metrics={'train_runtime': 518.9169, 'train_samples_per_second': 62.765, 'train_steps_per_second': 0.983, 'total_flos': 1194885328219128.0, 'train_loss': 0.4521696758270264, 'epoch': 7.84})

In [None]:
# Persist the trainer's current model to disk.
# NOTE(review): with load_best_model_at_end=True this is presumably the best checkpoint — confirm.
trainer.save_model("./results/best_model")

In [None]:
# Run inference on the held-out test split. The result is read through its named
# fields instead of being unpacked into `_`, which would overwrite IPython's
# "last output" variable for the rest of the session.
test_output = trainer.predict(test_dataset)
test_pred_raw = test_output.predictions    # raw per-class scores, one row per example
test_pred_labels = test_output.label_ids   # gold labels, in the same order as the predictions

# Predicted class = index of the highest score in each row.
test_preds = np.argmax(test_pred_raw, axis=1)

# Per-class precision / recall / F1 against the gold labels returned by `predict`.
print(classification_report(test_pred_labels, test_preds))

              precision    recall  f1-score   support

           0       0.79      0.87      0.83       558
           1       0.80      0.85      0.82       358
           2       0.74      0.40      0.52       123
           3       0.80      0.76      0.78       382

    accuracy                           0.79      1421
   macro avg       0.78      0.72      0.74      1421
weighted avg       0.79      0.79      0.79      1421

