In [39]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AdamW, get_linear_schedule_with_warmup
import numpy as np

In [40]:
data = pd.read_csv("/content/tweet_emotions.csv")
#data = pd.read_csv("sample.csv")

In [41]:
emotions = data["sentiment"].unique()
emotion_id = {emotion: i for i, emotion in enumerate(emotions)}

data["emotion_id"] = data["sentiment"].map(emotion_id)

Creating the evaluation dataset

In [42]:
np.random.seed(42)
eval_ratio = 0.1
total = data.shape[0]
eval_end = int(total * eval_ratio)
eval_data = data[:eval_end]
data = data[eval_end:]
print(total, len(eval_data), len(data))

40000 4000 36000


Setting up the model


In [43]:
model_nm = "FacebookAI/roberta-base"

In [10]:
#! pip install -q datasets

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m18.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m388.9/388.9 kB[0m [31m29.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [44]:
from datasets import Dataset, DatasetDict
ds = Dataset.from_pandas(data)
# ds_eval = Dataset.from_pandas(eval_data)
# ds_dict = DatasetDict({"train": ds, "eval": ds_eval})

Tokenizer

In [45]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
tokz = AutoTokenizer.from_pretrained(model_nm)

loading configuration file https://huggingface.co/FacebookAI/roberta-base/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/a1462ede3948796d4f0b92fc0538bec1d072c3e9185be0d18eaa90b079ef5a1e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b
Model config RobertaConfig {
  "_name_or_path": "FacebookAI/roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.17.0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading file https://huggingface.co/

In [46]:
def tok_func(x): return tokz(x["content"])

In [47]:
tokz_ds = ds.map(tok_func, batched=True)

Map:   0%|          | 0/36000 [00:00<?, ? examples/s]

In [48]:
eval_tokz_ds = ds.map(tok_func, batched=True)

Map:   0%|          | 0/36000 [00:00<?, ? examples/s]

In [49]:
eval_ds = Dataset.from_pandas(eval_data).map(tok_func, batched=True)

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

In [50]:
tokz_ds[0]

{'tweet_id': 1960162032,
 'sentiment': 'neutral',
 'content': 'my to do list is bananas, before i leave for Europe BOOOO',
 'emotion_id': 3,
 'input_ids': [0,
  4783,
  7,
  109,
  889,
  16,
  31130,
  6,
  137,
  939,
  989,
  13,
  1005,
  163,
  47123,
  2],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

Train-Test split


In [51]:
dds = tokz_ds.train_test_split(test_size=0.2)
dds

DatasetDict({
    train: Dataset({
        features: ['tweet_id', 'sentiment', 'content', 'emotion_id', 'input_ids', 'attention_mask'],
        num_rows: 28800
    })
    test: Dataset({
        features: ['tweet_id', 'sentiment', 'content', 'emotion_id', 'input_ids', 'attention_mask'],
        num_rows: 7200
    })
})

Args

In [52]:
bs = 96
lr = 8e-5
epochs = 4

In [53]:
from transformers import TrainingArguments, Trainer

In [54]:
#!pip install accelerate -U

In [55]:
#!pip install -U transformers

In [19]:
# import accelerate
# import transformers
# accelerate.__version__, transformers.__version__

('0.29.3', '4.17.0')

In [23]:
#pip install transformers==4.17 --- solved the error of the install accelerate!
#torch default version is not compatible with the accelerate version

In [60]:
args = TrainingArguments(
    "outputs",
    learning_rate=lr,
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=bs*2,
    num_train_epochs=epochs,
    warmup_ratio=0.1,
    weight_decay=0.01,
    lr_scheduler_type="cosine",
    fp16=True,
    evaluation_strategy="epoch",
    report_to="none",
    save_total_limit=3
)

PyTorch: setting up devices


Metrics

In [62]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from transformers import EvalPrediction
import torch
from torch.nn.functional import cross_entropy

def multiclass_metrics(predictions, labels):
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    y_pred = np.argmax(probs, axis=1)
    y_true = labels
    accuracy = accuracy_score(y_true, y_pred)
    loss = cross_entropy(torch.Tensor(predictions), torch.Tensor(labels))
    metrics ={"accuracy":accuracy, 'loss':loss.item()}
    return metrics


def compute_metrics(p:EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    result = multiclass_metrics(predictions= preds, labels = p[1])
    return result

In [63]:
model = AutoModelForSequenceClassification.from_pretrained(model_nm, num_labels=6)
trainer = Trainer(
    model,
    args,
    train_dataset=dds["train"],
    eval_dataset=dds["test"],
    tokenizer=tokz,
    compute_metrics=compute_metrics
)

loading configuration file https://huggingface.co/FacebookAI/roberta-base/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/a1462ede3948796d4f0b92fc0538bec1d072c3e9185be0d18eaa90b079ef5a1e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b
Model config RobertaConfig {
  "_name_or_path": "FacebookAI/roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "ro

In [64]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: tweet_id, content, sentiment, emotion_id. If tweet_id, content, sentiment, emotion_id are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 28800
  Num Epochs = 4
  Instantaneous batch size per device = 96
  Total train batch size (w. parallel, distributed & accumulation) = 96
  Gradient Accumulation steps = 1
  Total optimization steps = 1200


KeyError: 'loss'

In [None]:
#tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case = True)