<a href="https://colab.research.google.com/github/sabre-code/tweet-emotion/blob/main/roberta_large.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers[torch]
!pip install datasets

Collecting transformers[torch]
  Downloading transformers-4.34.0-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m28.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers[torch])
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m41.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers[torch])
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m64.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers[torch])
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m69.0 MB/s

In [2]:

from datasets import load_dataset
import pandas as pd
import torch
from transformers import AutoTokenizer

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
emotions = load_dataset("emotion")
emotions

Downloading builder script:   0%|          | 0.00/3.97k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/3.28k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/8.78k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/592k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/74.0k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/74.9k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/16000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

In [6]:
df = pd.DataFrame(emotions['train'][:])
df.iloc[1]

text     i can go from feeling so hopeless to so damned...
label                                                    0
Name: 1, dtype: object

In [7]:
def label_int2str(row):
  return emotions["train"].features["label"].int2str(row)


df["label_name"] = df["label"].apply(label_int2str)
df.head()


Unnamed: 0,text,label,label_name
0,i didnt feel humiliated,0,sadness
1,i can go from feeling so hopeless to so damned...,0,sadness
2,im grabbing a minute to post i feel greedy wrong,3,anger
3,i am ever feeling nostalgic about the fireplac...,2,love
4,i am feeling grouchy,3,anger


In [8]:
model_ckpt = "roberta-large"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

Downloading (…)lve/main/config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [9]:
def tokenize(batch):
  return tokenizer(batch['text'], padding=True, truncation=True, return_tensors='pt')

In [10]:
print(tokenize(emotions["train"][:3]))

{'input_ids': tensor([[    0,   118, 46405,   619, 32386,     2,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1],
        [    0,   118,    64,   213,    31,  2157,    98, 24418,     7,    98,
         37689,  7917,    95,    31,   145,   198,   951,    54, 14534,     8,
            16, 24628,     2],
        [    0,   757, 16004,    10,  2289,     7,   618,   939,   619, 34405,
          1593,     2,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])}


In [11]:
emotions_encoded = emotions.map(tokenize, batched=True, batch_size=None)
print(emotions_encoded["train"].column_names)

Map:   0%|          | 0/16000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

['text', 'label', 'input_ids', 'attention_mask']


In [32]:
#print(pd.DataFrame(emotions_encoded['train'][1]))

In [12]:
from transformers import AutoModelForSequenceClassification
num_labels = 6
model = (AutoModelForSequenceClassification.from_pretrained(model_ckpt, num_labels=num_labels).to(device))

Downloading model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
from sklearn.metrics import accuracy_score, f1_score
def compute_metrics(pred):
  labels = pred.label_ids
  preds = pred.predictions.argmax(-1)
  f1 = f1_score(labels, preds, average="weighted")
  acc = accuracy_score(labels, preds)
  return {"accuracy": acc, "f1": f1}

In [14]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [37]:
!pip install accelerate -U



In [24]:
from transformers import Trainer, TrainingArguments
optim = torch.optim.Adam(model.parameters(), lr=4e-4)

batch_size = 32
logging_steps = 32
model_name = f"{model_ckpt}-finetuned-emotion"
training_args = TrainingArguments(output_dir=model_name,
  num_train_epochs=3,
  per_device_train_batch_size=batch_size,
  per_device_eval_batch_size=batch_size,
  logging_steps=logging_steps,
  push_to_hub=True,
  )

In [17]:
#len(emotions_encoded["train"]) // batch_size

500

In [25]:
from transformers import Trainer
trainer = Trainer(model=model, args=training_args,
  compute_metrics=compute_metrics,
  train_dataset=emotions_encoded["train"],
  eval_dataset=emotions_encoded["validation"],
  tokenizer = tokenizer,
  optimizers=(optim, None))


trainer.train();

In [21]:
preds_output = trainer.predict(emotions_encoded["validation"])
preds_output_test = trainer.predict(emotions_encoded["validation"])

In [22]:
preds_output.metrics

{'test_loss': 1.5797349214553833,
 'test_accuracy': 0.352,
 'test_f1': 0.18328994082840236,
 'test_runtime': 27.8305,
 'test_samples_per_second': 71.864,
 'test_steps_per_second': 2.264}

In [23]:
preds_output_test.metrics

{'test_loss': 1.5797349214553833,
 'test_accuracy': 0.352,
 'test_f1': 0.18328994082840236,
 'test_runtime': 28.194,
 'test_samples_per_second': 70.937,
 'test_steps_per_second': 2.235}