In [1]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AdamW, get_linear_schedule_with_warmup
import numpy as np

In [2]:
# Alternative inputs tried earlier (disabled, kept for reference):
#data = pd.read_csv("/content/tweet_emotions.csv")
#data = pd.read_csv("sample.csv")
# Load the labelled corpus from the working directory; the preview in the next
# cell shows a free-text `text` column and an integer `label` column.
data = pd.read_csv("text.csv")

In [3]:
data.head()

Unnamed: 0.1,Unnamed: 0,text,label
0,0,i just feel really helpless and heavy hearted,4
1,1,ive enjoyed being able to slouch about relax a...,0
2,2,i gave up my internship with the dmrg and am f...,4
3,3,i dont know i feel so lost,0
4,4,i am a kindergarten teacher and i am thoroughl...,4


In [4]:
# emotions = data["sentiment"].unique()
# emotion_id = {emotion: i for i, emotion in enumerate(emotions)}

# data["emotion_id"] = data["sentiment"].map(emotion_id)

In [5]:
# Integer class id -> emotion name. The position of each name in the tuple is
# the id it is looked up by in the `label` column.
mapping = dict(enumerate(("sadness", "joy", "love", "anger", "fear", "surprise")))

# Attach a human-readable emotion column next to the numeric label.
data["emotion"] = data["label"].map(mapping)

In [6]:
data.head()

Unnamed: 0.1,Unnamed: 0,text,label,emotion
0,0,i just feel really helpless and heavy hearted,4,fear
1,1,ive enjoyed being able to slouch about relax a...,0,sadness
2,2,i gave up my internship with the dmrg and am f...,4,fear
3,3,i dont know i feel so lost,0,sadness
4,4,i am a kindergarten teacher and i am thoroughl...,4,fear


Creating the evaluation dataset

In [7]:

# Shuffle every row, then carve off the leading 15% as a held-out evaluation set.
# sample() is called without random_state, so the NumPy global seed set here is
# what makes the shuffle repeatable.
np.random.seed(42)
data = data.sample(frac=1).reset_index(drop=True)

eval_ratio = 0.15
total = len(data)
eval_end = int(eval_ratio * total)

# Rows before the cut become the evaluation frame; the remainder stays in `data`,
# which the following cells tokenize and split.
eval_data, data = data[:eval_end], data[eval_end:]
print(total, len(eval_data), len(data), len(eval_data) + len(data))

416809 62521 354288 416809


Setting up the model


In [8]:
model_nm = "FacebookAI/roberta-base"

In [9]:
#! pip install -q datasets

In [10]:
from datasets import Dataset, DatasetDict
ds = Dataset.from_pandas(data)
# ds_eval = Dataset.from_pandas(eval_data)
# ds_dict = DatasetDict({"train": ds, "eval": ds_eval})

Tokenizer

In [11]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
tokz = AutoTokenizer.from_pretrained(model_nm)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [12]:
def tok_func(x):
    """Tokenize the ``text`` field of a dataset batch with the shared tokenizer.

    Args:
        x: A mapping with a ``"text"`` entry (a single string or, when used
            with ``Dataset.map(..., batched=True)``, a list of strings).

    Returns:
        The tokenizer output (``input_ids`` and ``attention_mask`` for this
        tokenizer), which ``Dataset.map`` merges into the dataset as columns.
    """
    # truncation=True caps each sequence at the tokenizer's maximum model input
    # length, so an unusually long text cannot overflow the model's position
    # embeddings during training or evaluation.
    return tokz(x["text"], truncation=True)

In [13]:
tokz_ds = ds.map(tok_func, batched=True)

Map:   0%|          | 0/354288 [00:00<?, ? examples/s]

In [14]:
#eval_tokz_ds = ds.map(tok_func, batched=True)

In [15]:
# Tokenize the held-out evaluation frame with the same tok_func used for the training data.
# NOTE(review): eval_ds is not passed to the Trainer in the cells shown here —
# confirm it is consumed by a later evaluation step.
eval_ds = Dataset.from_pandas(eval_data).map(tok_func, batched=True)

Map:   0%|          | 0/62521 [00:00<?, ? examples/s]

In [16]:
tokz_ds[0]

{'Unnamed: 0': 89246,
 'text': 'i feel a little inadequate to advise them as i have no knowledge of the timing of their local nectar flows',
 'label': 0,
 'emotion': 'sadness',
 'input_ids': [0,
  118,
  619,
  10,
  410,
  15650,
  7,
  12922,
  106,
  25,
  939,
  33,
  117,
  2655,
  9,
  5,
  5801,
  9,
  49,
  400,
  295,
  39459,
  7964,
  2],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1]}

Train-Test split


In [17]:
# Hold out 20% of the tokenized rows for per-epoch validation.
# A fixed seed keeps the train/test membership identical across kernel restarts;
# without it the split is re-drawn on every run and results are not comparable.
dds = tokz_ds.train_test_split(test_size=0.2, seed=42)
dds

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'text', 'label', 'emotion', 'input_ids', 'attention_mask'],
        num_rows: 283430
    })
    test: Dataset({
        features: ['Unnamed: 0', 'text', 'label', 'emotion', 'input_ids', 'attention_mask'],
        num_rows: 70858
    })
})

Args

In [18]:
# Hyperparameters consumed by the TrainingArguments cell below.
bs = 96  # per-device training batch size (evaluation uses bs*2)
lr = 8e-5  # learning rate handed to the optimizer schedule
epochs = 1  # NOTE(review): the recorded training log reports "Num Epochs = 4" — confirm the intended value

In [19]:
from transformers import TrainingArguments, Trainer

In [20]:
#!pip install accelerate -U

In [21]:
#!pip install -U transformers

In [22]:
# import accelerate
# import transformers
# accelerate.__version__, transformers.__version__

In [23]:
#!pip install transformers==4.17
# Pinning transformers to 4.17 resolved the error raised when installing accelerate:
# the default torch version is not compatible with that accelerate version.

In [24]:
# Fine-tuning configuration; checkpoints are written under ./outputs.
args = TrainingArguments(
    output_dir="outputs",
    # --- optimisation ---
    num_train_epochs=epochs,
    learning_rate=lr,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    weight_decay=0.01,
    fp16=True,
    # --- batching ---
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=2 * bs,
    # --- evaluation / bookkeeping ---
    evaluation_strategy="epoch",
    save_total_limit=3,
    report_to="none",
)

Metrics

In [25]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from transformers import EvalPrediction
import torch
from torch.nn.functional import cross_entropy

def multi_label_metrics(predictions, labels):
    """Compute accuracy for single-label (multi-class) predictions.

    Despite the name, every example is scored against exactly one class: the
    predicted class is the arg-max over the last axis of ``predictions``.

    Args:
        predictions: Array-like of raw class scores (logits), one row per
            example and one column per class.
        labels: Array-like of integer class ids, one per example.

    Returns:
        ``{"accuracy": float}`` — the fraction of examples whose arg-max class
        equals the label. Empty input yields ``0.0``.

    Raises:
        ValueError: If the number of predictions and labels differ.
    """
    scores = np.asarray(predictions)
    y_true = np.asarray(labels).reshape(-1)
    if y_true.size == 0:
        return {"accuracy": 0.0}

    # Arg-max is taken on the raw logits. Squashing them through a float32
    # sigmoid first is unnecessary (sigmoid is monotonic) and harmful: large
    # logits all saturate to exactly 1.0, and the resulting tie makes arg-max
    # return the first saturated class instead of the largest one.
    y_pred = scores.argmax(axis=-1).reshape(-1)
    if y_pred.shape != y_true.shape:
        raise ValueError(
            f"Found {y_pred.shape[0]} predictions but {y_true.shape[0]} labels"
        )

    # Plain Python float so the value logs/serialises cleanly.
    return {"accuracy": float(np.mean(y_pred == y_true))}


def compute_metrics(p:EvalPrediction):
    """Adapt a Trainer ``EvalPrediction`` to ``multi_label_metrics``.

    Args:
        p: Evaluation output carrying ``predictions`` (an array, or a tuple
            whose first element is the logits) and ``label_ids``.

    Returns:
        The metrics dict produced by ``multi_label_metrics``.
    """
    # Some models return a tuple of outputs; the logits are the first entry.
    logits = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    # Read the labels through the named field rather than positional index 1,
    # so this does not depend on EvalPrediction's tuple-style indexing.
    result = multi_label_metrics(predictions=logits, labels=p.label_ids)
    print(result)
    return result

In [26]:
# Pretrained encoder named by model_nm with a six-class classification head.
model = AutoModelForSequenceClassification.from_pretrained(model_nm, num_labels=6)

# Train on the "train" split of dds and validate on its "test" split;
# accuracy is reported through compute_metrics.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dds["train"],
    eval_dataset=dds["test"],
    tokenizer=tokz,
    compute_metrics=compute_metrics,
)

Some weights of the model checkpoint at FacebookAI/roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.out

In [None]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text, emotion, Unnamed: 0. If text, emotion, Unnamed: 0 are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 283430
  Num Epochs = 4
  Instantaneous batch size per device = 96
  Total train batch size (w. parallel, distributed & accumulation) = 96
  Gradient Accumulation steps = 1
  Total optimization steps = 11812


Epoch,Training Loss,Validation Loss,Accuracy
1,0.1174,0.104828,0.937989


Saving model checkpoint to outputs/checkpoint-500
Configuration saved in outputs/checkpoint-500/config.json
Model weights saved in outputs/checkpoint-500/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-500/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-500/special_tokens_map.json
Saving model checkpoint to outputs/checkpoint-1000
Configuration saved in outputs/checkpoint-1000/config.json
Model weights saved in outputs/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-1000/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-1000/special_tokens_map.json
Saving model checkpoint to outputs/checkpoint-1500
Configuration saved in outputs/checkpoint-1500/config.json
Model weights saved in outputs/checkpoint-1500/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-1500/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-1500/special_tokens_map.json
Saving model check

{'accuracy': 0.9379886533630641}


Saving model checkpoint to outputs/checkpoint-3000
Configuration saved in outputs/checkpoint-3000/config.json
Model weights saved in outputs/checkpoint-3000/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-3000/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-3000/special_tokens_map.json
Deleting older checkpoint [outputs/checkpoint-1500] due to args.save_total_limit
Saving model checkpoint to outputs/checkpoint-3500
Configuration saved in outputs/checkpoint-3500/config.json
Model weights saved in outputs/checkpoint-3500/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-3500/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-3500/special_tokens_map.json
Deleting older checkpoint [outputs/checkpoint-2000] due to args.save_total_limit


In [None]:
#tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case = True)