In [58]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import pandas as pd
import logging
from datasets import load_dataset
from sklearn.model_selection import train_test_split

# Keep the "transformers" logger quiet (WARNING and above only) while the root
# logger is configured at INFO so that other libraries' progress messages
# are still shown.
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)
logging.basicConfig(level=logging.INFO)

In [59]:
# Emotion class names; the position of a name in this list is its class id.
labels = [
    'admiration', 'amusement', 'anger', 'annoyance',
    'approval', 'caring', 'confusion', 'curiosity',
    'desire', 'disappointment', 'disapproval', 'disgust',
    'embarrassment', 'excitement', 'fear', 'gratitude',
    'grief', 'joy', 'love', 'nervousness',
    'optimism', 'pride', 'realization', 'relief',
    'remorse', 'sadness', 'surprise', 'neutral',
]

# Lookup table from class name to its integer class id.
mapping = {name: idx for idx, name in enumerate(labels)}
mapping, len(labels)

({'admiration': 0,
  'amusement': 1,
  'anger': 2,
  'annoyance': 3,
  'approval': 4,
  'caring': 5,
  'confusion': 6,
  'curiosity': 7,
  'desire': 8,
  'disappointment': 9,
  'disapproval': 10,
  'disgust': 11,
  'embarrassment': 12,
  'excitement': 13,
  'fear': 14,
  'gratitude': 15,
  'grief': 16,
  'joy': 17,
  'love': 18,
  'nervousness': 19,
  'optimism': 20,
  'pride': 21,
  'realization': 22,
  'relief': 23,
  'remorse': 24,
  'sadness': 25,
  'surprise': 26,
  'neutral': 27},
 28)

In [60]:
# Load the raw GoEmotions annotations. The code below relies on the CSV having
# a "text" column plus one numeric column per emotion named in `labels`
# (presumably 0/1 indicators -- TODO confirm against the dataset README).
# NOTE(review): this absolute path is machine-specific; adjust it before
# running the notebook anywhere else.
path = "/Users/cha/Desktop/Code/nlp-intent-classification/data/full_dataset/goemotions_1.csv"
raw_df = pd.read_csv(path)

# idxmax returns the FIRST column when every emotion column is tied, so a row
# with no emotion flagged would silently be labelled 'admiration' (class 0).
# Drop such rows instead of training on a fabricated label.
has_label = raw_df[labels].any(axis=1)

# Collapse each remaining row to a single class id: the first flagged emotion
# (in `labels` order), translated to its integer index through `mapping`.
# The result keeps only the two columns the classifier consumes.
df = raw_df.loc[has_label, ["text"]].assign(
    labels=raw_df.loc[has_label, labels].idxmax(axis=1).map(mapping)
)
df.head()

Unnamed: 0,text,labels
0,That game hurt.,25
1,>sexuality shouldn’t be a grouping category I...,0
2,"You do right, if you don't care then fuck 'em!",27
3,Man I love reddit.,18
4,"[NAME] was nowhere near them, he was by the Fa...",27


In [61]:
# Smoke-test subset: only the first 50 rows are used, split 80/20 with a fixed
# seed so the partition is identical on every run.
sample_df = df.iloc[:50]
train, test = train_test_split(sample_df, random_state=42, test_size=0.2)
len(train), len(test)

(40, 10)

### Training and evaluating the model

In [62]:
# Training options: a single epoch, and allow an existing output directory
# to be overwritten on re-runs.
model_args = ClassificationArgs()
model_args.num_train_epochs = 1
model_args.overwrite_output_dir = True

# RoBERTa-base with a 28-way classification head, running on CPU.
model = ClassificationModel(
    model_type="roberta",
    model_name="roberta-base",
    num_labels=28,
    args=model_args,
    use_cuda=False,
)

# Fine-tune on the training split.
model.train_model(train)

# Score the held-out split; also returns the raw model outputs and the
# misclassified examples.
result, model_outputs, wrong_predictions = model.eval_model(test)

# Classify one ad-hoc sentence with the fine-tuned model.
sample_texts = ["Sam was a Wizard"]
predictions, raw_outputs = model.predict(sample_texts)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'lm_head.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epochs 0/1. Running Loss:    3.2526: 100%|██████████| 5/5 [00:07<00:00,  1.43s/it]
Epoch 1 of 1: 100%|██████████| 1/1 [00:08<00:00,  8.41s/it]
INFO:simpletransformers.classification.classification_model: Training of roberta model complete. Saved to outputs/.
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.
 10%|█         | 1/10 [00:02<00:19,  2.16s/it]
INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_roberta_128_28_2


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Running Evaluation: 100%|██████████| 2/2 [00:00<00:00,  3.44it/s]
INFO:simpletransformers.classification.classification_model:{'mcc': 0.0, 'eval_loss': 3.3264105319976807}
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.
100%|██████████| 1/1 [00:02<00:00,  2.07s/it]


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


100%|██████████| 1/1 [00:00<00:00, 11.07it/s]


In [63]:
# Display the class id(s) returned by model.predict for the sample sentence;
# each id is a position in the `labels` list.
predictions

array([13])