In [1]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset, Features, Value, ClassLabel, DatasetDict

In [29]:
import pandas as pd
text = pd.read_csv("emotions_final.csv", header=0, names=['label', 'text'])
text.sample(3)

emotions = text.label.unique()
emotions

class_names = list(emotions)
emotion_features = Features({'text': Value('string'), 'label': ClassLabel(5,names=class_names)})

labels = ClassLabel(5, class_names)

In [30]:
sentences = load_dataset('csv', data_files='emotions_final.csv', column_names=['label', 'text'])

Using custom data configuration default-3cdbca8706c45705
Reusing dataset csv (C:\Users\grace\.cache\huggingface\datasets\csv\default-3cdbca8706c45705\0.0.0\433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)
100%|██████████| 1/1 [00:00<00:00, 1001.03it/s]


In [42]:
train_dataset, test_dataset= sentences['train'].train_test_split(test_size=0.2, shuffle=True, seed=88).values()
test_dataset, validation_dataset = test_dataset.train_test_split(test_size=0.3, shuffle=True, seed=88).values()
sentences = DatasetDict({"train":train_dataset,"test": test_dataset, "validation": validation_dataset})

In [43]:
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [44]:
def tokenize(batch):
    tokens = tokenizer(batch['text'], padding=True, truncation=True)
    tokens['label'] = labels.str2int(batch['label'])
    return tokens

In [45]:
emotions_encoded = sentences.map(tokenize, batched=True, batch_size=None)

100%|██████████| 1/1 [00:00<00:00,  1.43ba/s]
100%|██████████| 1/1 [00:00<00:00, 13.51ba/s]
100%|██████████| 1/1 [00:00<00:00, 38.45ba/s]


In [46]:
emotions_encoded = emotions_encoded.cast_column('label', labels)

Casting the dataset: 100%|██████████| 1/1 [00:01<00:00,  1.82s/ba]
Casting the dataset: 100%|██████████| 1/1 [00:00<00:00,  5.99ba/s]
Casting the dataset: 100%|██████████| 1/1 [00:00<00:00, 18.18ba/s]


In [47]:
num_labels = 5

model = (AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels).to(device))

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [48]:
emotions_encoded["train"].features

{'label': ClassLabel(num_classes=5, names=['happiness', 'neutral', 'anxiety', 'sadness', 'anger'], id=None),
 'text': Value(dtype='string', id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'token_type_ids': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}

In [49]:
emotions_encoded.set_format("torch", columns=["input_ids", "attention_mask", "label"])
emotions_encoded["train"].features

{'label': ClassLabel(num_classes=5, names=['happiness', 'neutral', 'anxiety', 'sadness', 'anger'], id=None),
 'text': Value(dtype='string', id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'token_type_ids': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}

In [50]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1}

In [51]:
from transformers import TrainingArguments

batch_size = 16
logging_steps = len(emotions_encoded["train"]) // batch_size
training_args = TrainingArguments(output_dir="results",
                                  num_train_epochs=10,
                                  learning_rate=2e-5,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  load_best_model_at_end=True,
                                  metric_for_best_model="f1",
                                  weight_decay=0.01,
                                  evaluation_strategy="epoch",
                                  save_strategy="epoch",
                                  disable_tqdm=False)

In [52]:
emotions_encoded

DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 6261
    })
    test: Dataset({
        features: ['label', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1096
    })
    validation: Dataset({
        features: ['label', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 470
    })
})

In [53]:
from transformers import Trainer

trainer = Trainer(model=model, args=training_args,
                  compute_metrics=compute_metrics,
                  train_dataset=emotions_encoded["train"],
                  eval_dataset=emotions_encoded["validation"])
trainer.train();

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 6261
  Num Epochs = 10
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 3920
 10%|█         | 392/3920 [02:00<14:33,  4.04it/s]The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 470
  Batch size = 16
                                                  
 10%|█         | 392/3920 [02:01<14:33,  4.04it/s]Saving mode

{'eval_loss': 0.6452142000198364, 'eval_accuracy': 0.7723404255319148, 'eval_f1': 0.7717691969876473, 'eval_runtime': 1.0782, 'eval_samples_per_second': 435.906, 'eval_steps_per_second': 27.824, 'epoch': 1.0}


Model weights saved in results\checkpoint-392\pytorch_model.bin
 13%|█▎        | 500/3920 [02:34<16:51,  3.38it/s]  

{'loss': 0.8364, 'learning_rate': 1.7448979591836738e-05, 'epoch': 1.28}


 20%|██        | 784/3920 [03:59<12:53,  4.05it/s]The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 470
  Batch size = 16
                                                  
 20%|██        | 784/3920 [04:00<12:53,  4.05it/s]Saving model checkpoint to results\checkpoint-784
Configuration saved in results\checkpoint-784\config.json


{'eval_loss': 0.6572643518447876, 'eval_accuracy': 0.7702127659574468, 'eval_f1': 0.7725313793055609, 'eval_runtime': 1.0462, 'eval_samples_per_second': 449.229, 'eval_steps_per_second': 28.674, 'epoch': 2.0}


Model weights saved in results\checkpoint-784\pytorch_model.bin
 26%|██▌       | 1000/3920 [05:05<14:26,  3.37it/s]

{'loss': 0.4479, 'learning_rate': 1.4897959183673472e-05, 'epoch': 2.55}


 30%|███       | 1176/3920 [05:57<11:04,  4.13it/s]The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 470
  Batch size = 16
                                                   
 30%|███       | 1176/3920 [05:58<11:04,  4.13it/s]Saving model checkpoint to results\checkpoint-1176
Configuration saved in results\checkpoint-1176\config.json


{'eval_loss': 0.6375669836997986, 'eval_accuracy': 0.8021276595744681, 'eval_f1': 0.8014495146848897, 'eval_runtime': 1.0207, 'eval_samples_per_second': 460.474, 'eval_steps_per_second': 29.392, 'epoch': 3.0}


Model weights saved in results\checkpoint-1176\pytorch_model.bin
 38%|███▊      | 1500/3920 [07:35<11:48,  3.42it/s]

{'loss': 0.272, 'learning_rate': 1.2346938775510204e-05, 'epoch': 3.83}


 40%|████      | 1568/3920 [07:55<09:30,  4.12it/s]The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 470
  Batch size = 16
                                                   
 40%|████      | 1568/3920 [07:56<09:30,  4.12it/s]Saving model checkpoint to results\checkpoint-1568
Configuration saved in results\checkpoint-1568\config.json


{'eval_loss': 0.8435006737709045, 'eval_accuracy': 0.7936170212765957, 'eval_f1': 0.7936145278637838, 'eval_runtime': 1.0145, 'eval_samples_per_second': 463.296, 'eval_steps_per_second': 29.572, 'epoch': 4.0}


Model weights saved in results\checkpoint-1568\pytorch_model.bin
 50%|█████     | 1960/3920 [09:52<07:58,  4.10it/s]The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 470
  Batch size = 16
                                                   
 50%|█████     | 1960/3920 [09:53<07:58,  4.10it/s]Saving model checkpoint to results\checkpoint-1960
Configuration saved in results\checkpoint-1960\config.json


{'eval_loss': 0.9125773906707764, 'eval_accuracy': 0.7978723404255319, 'eval_f1': 0.7991018728303025, 'eval_runtime': 1.019, 'eval_samples_per_second': 461.252, 'eval_steps_per_second': 29.442, 'epoch': 5.0}


Model weights saved in results\checkpoint-1960\pytorch_model.bin
 51%|█████     | 2000/3920 [10:06<09:25,  3.40it/s]

{'loss': 0.1389, 'learning_rate': 9.795918367346939e-06, 'epoch': 5.1}


 60%|██████    | 2352/3920 [11:49<06:18,  4.15it/s]The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 470
  Batch size = 16
                                                   
 60%|██████    | 2352/3920 [11:50<06:18,  4.15it/s]Saving model checkpoint to results\checkpoint-2352
Configuration saved in results\checkpoint-2352\config.json


{'eval_loss': 1.0749258995056152, 'eval_accuracy': 0.7872340425531915, 'eval_f1': 0.7880043918313167, 'eval_runtime': 1.0132, 'eval_samples_per_second': 463.863, 'eval_steps_per_second': 29.608, 'epoch': 6.0}


Model weights saved in results\checkpoint-2352\pytorch_model.bin
 64%|██████▍   | 2500/3920 [12:36<07:02,  3.36it/s]

{'loss': 0.0852, 'learning_rate': 7.244897959183675e-06, 'epoch': 6.38}


 70%|███████   | 2744/3920 [13:48<04:48,  4.08it/s]The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 470
  Batch size = 16
                                                   
 70%|███████   | 2744/3920 [13:49<04:48,  4.08it/s]Saving model checkpoint to results\checkpoint-2744
Configuration saved in results\checkpoint-2744\config.json


{'eval_loss': 1.0908101797103882, 'eval_accuracy': 0.7893617021276595, 'eval_f1': 0.7887452642560124, 'eval_runtime': 1.0482, 'eval_samples_per_second': 448.371, 'eval_steps_per_second': 28.619, 'epoch': 7.0}


Model weights saved in results\checkpoint-2744\pytorch_model.bin
 77%|███████▋  | 3000/3920 [15:07<04:32,  3.38it/s]

{'loss': 0.0592, 'learning_rate': 4.693877551020409e-06, 'epoch': 7.65}


 80%|████████  | 3136/3920 [15:48<03:15,  4.02it/s]The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 470
  Batch size = 16
                                                   
 80%|████████  | 3136/3920 [15:49<03:15,  4.02it/s]Saving model checkpoint to results\checkpoint-3136
Configuration saved in results\checkpoint-3136\config.json


{'eval_loss': 1.1592433452606201, 'eval_accuracy': 0.8, 'eval_f1': 0.8002943848144972, 'eval_runtime': 1.0662, 'eval_samples_per_second': 440.8, 'eval_steps_per_second': 28.136, 'epoch': 8.0}


Model weights saved in results\checkpoint-3136\pytorch_model.bin
 89%|████████▉ | 3500/3920 [17:38<02:05,  3.35it/s]

{'loss': 0.0381, 'learning_rate': 2.1428571428571427e-06, 'epoch': 8.93}


 90%|█████████ | 3528/3920 [17:46<01:37,  4.03it/s]The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 470
  Batch size = 16
                                                   
 90%|█████████ | 3528/3920 [17:47<01:37,  4.03it/s]Saving model checkpoint to results\checkpoint-3528
Configuration saved in results\checkpoint-3528\config.json


{'eval_loss': 1.1812270879745483, 'eval_accuracy': 0.8042553191489362, 'eval_f1': 0.8043095167183316, 'eval_runtime': 1.0522, 'eval_samples_per_second': 446.667, 'eval_steps_per_second': 28.511, 'epoch': 9.0}


Model weights saved in results\checkpoint-3528\pytorch_model.bin
100%|██████████| 3920/3920 [19:45<00:00,  4.11it/s]The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 470
  Batch size = 16
                                                   
100%|██████████| 3920/3920 [19:46<00:00,  4.11it/s]Saving model checkpoint to results\checkpoint-3920
Configuration saved in results\checkpoint-3920\config.json


{'eval_loss': 1.211344599723816, 'eval_accuracy': 0.7957446808510639, 'eval_f1': 0.7958733479510078, 'eval_runtime': 1.0447, 'eval_samples_per_second': 449.871, 'eval_steps_per_second': 28.715, 'epoch': 10.0}


Model weights saved in results\checkpoint-3920\pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from results\checkpoint-3528 (score: 0.8043095167183316).
100%|██████████| 3920/3920 [19:48<00:00,  3.30it/s]

{'train_runtime': 1188.3976, 'train_samples_per_second': 52.684, 'train_steps_per_second': 3.299, 'train_loss': 0.24288988964898245, 'epoch': 10.0}





In [60]:
results = trainer.evaluate()
results

The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 470
  Batch size = 16
100it [00:10,  9.63it/s]


{'eval_loss': 1.1812270879745483,
 'eval_accuracy': 0.8042553191489362,
 'eval_f1': 0.8043095167183316,
 'eval_runtime': 1.3743,
 'eval_samples_per_second': 342.003,
 'eval_steps_per_second': 21.83,
 'epoch': 10.0}

In [61]:
preds_output = trainer.predict(emotions_encoded["test"])
preds_output.metrics

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 1096
  Batch size = 16
100%|██████████| 69/69 [00:03<00:00, 21.54it/s]

{'test_loss': 1.189466953277588,
 'test_accuracy': 0.781021897810219,
 'test_f1': 0.7798639468951565,
 'test_runtime': 3.5763,
 'test_samples_per_second': 306.459,
 'test_steps_per_second': 19.293}

In [62]:
import numpy as np
from datasets import Dataset, load_dataset

def preprocess_function(examples):
    # Tokenize the texts
    result = tokenizer(examples['sentence'], padding=False, max_length=None, truncation=True, verbose=False)
    return result

def predict(dataframe):
    eval_dataset = Dataset.from_pandas(dataframe)
    eval_dataset = eval_dataset.map(preprocess_function, batched=False, load_from_cache_file=True)
    # Initialize our Trainer
    predictions = trainer.predict(test_dataset=eval_dataset).predictions
    # Adding a softmax layer to get probabilities. If you want class labels instead -  np.argmax(predictions, axis=1)
    return predictions

In [63]:
example = pd.DataFrame([''], columns=['sentence'])
predictions = predict(example)
predictions = np.argmax(predictions, axis=1)
emotions[predictions]

100%|██████████| 1/1 [00:00<?, ?ex/s]
The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: sentence. If sentence are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 1
  Batch size = 16


array(['neutral'], dtype=object)

In [64]:
predictions

array([1], dtype=int64)

70it [00:16, 21.54it/s]                        

In [59]:
model.save_pretrained('./model')
tokenizer.save_pretrained('./model')

Configuration saved in ./model\config.json
Model weights saved in ./model\pytorch_model.bin
tokenizer config file saved in ./model\tokenizer_config.json
Special tokens file saved in ./model\special_tokens_map.json


('./model\\tokenizer_config.json',
 './model\\special_tokens_map.json',
 './model\\vocab.txt',
 './model\\added_tokens.json',
 './model\\tokenizer.json')