In [1]:
from datasets import load_dataset

In [2]:
# Load the "emotion" dataset. The echoed value is a DatasetDict with
# train / validation / test splits, each holding 'text' and 'label' columns.
emotions = load_dataset("emotion")
emotions

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

## Device-agnostic code

In [3]:
import torch

In [4]:
# Prefer the GPU when PyTorch reports one; otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = "cuda"
else:
  device = "cpu"
device

'cuda'

## Tokenizing the Dataset

In [5]:
from transformers import AutoTokenizer

In [6]:
# Checkpoint name shared by the tokenizer here and, in later cells, by the
# model loader and the output-directory / Hub repo name.
model_ckpt = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [7]:
def tokenize(batch):
  """Encode the "text" entry of a batch with the notebook-level tokenizer.

  Args:
    batch: Mapping with a "text" entry holding the raw strings to encode.

  Returns:
    The tokenizer's output for those texts, produced with padding and
    truncation both enabled.
  """
  texts = batch["text"]
  encoded = tokenizer(texts, padding=True, truncation=True)
  return encoded

In [8]:
# Apply `tokenize` to every split. batched=True hands it many rows per call;
# per the datasets docs, batch_size=None passes each split as one single
# batch, so padding=True pads to the longest text of the whole split.
# NOTE(review): confirm this against the installed datasets version.
emotions_encoded = emotions.map(tokenize, batched = True, batch_size = None)

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

## Loading a Pre-trained model

In [9]:
from transformers import AutoModelForSequenceClassification

In [10]:
# Read the number of classes from the dataset's label feature instead of
# hard-coding 6, so the classification head always matches the data.
# Assumes the "label" column is a ClassLabel feature (it exposes num_classes).
num_labels = emotions["train"].features["label"].num_classes

# Load the pretrained checkpoint with a sequence-classification head sized to
# `num_labels`, then move it to the selected device. The head weights are
# newly initialised (see the warning in the cell output), so the model must
# be fine-tuned before use.
# NOTE(review): id2label / label2id are not passed, so predictions are
# reported with generic LABEL_<n> names.
model = (AutoModelForSequenceClassification
             .from_pretrained(model_ckpt, num_labels=num_labels)
             .to(device))

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Defining the Performance Metric

In [11]:
from sklearn.metrics import accuracy_score, f1_score
def compute_metrics(pred):
  """Compute accuracy and weighted F1 for one evaluation pass.

  Args:
    pred: Object exposing `.label_ids` (reference labels) and `.predictions`
      (per-class scores; the arg-max over the last axis is the predicted
      class). Presumably the prediction object the Trainer hands to its
      `compute_metrics` callback -- confirm against the transformers docs.

  Returns:
    Dict with the keys "accuracy" and "f1".
  """
  y_true = pred.label_ids
  y_pred = pred.predictions.argmax(-1)
  return {
      "accuracy": accuracy_score(y_true, y_pred),
      "f1": f1_score(y_true, y_pred, average="weighted"),
  }

## Training the model

## Logging in to the Hugging Face Hub

In [12]:
from huggingface_hub import notebook_login

# Authenticate with the Hugging Face Hub (renders a login widget) ahead of the
# cells that push the model: push_to_hub=True and trainer.push_to_hub(...).
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [13]:
from transformers import Trainer, TrainingArguments
batch_size = 64
model_name = f"{model_ckpt}-finetuned-emotion"
# Log every len(train) // batch_size steps, i.e. about once per epoch when
# training on a single device.
logging_steps = len(emotions_encoded["train"]) // batch_size

training_args = TrainingArguments(
    # Where checkpoints are written (also used as the Hub repo name).
    output_dir=model_name,
    # Optimisation schedule.
    num_train_epochs=2,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Evaluate on the validation split at the end of every epoch.
    evaluation_strategy="epoch",
    # Logging / progress reporting.
    logging_steps=logging_steps,
    log_level="error",
    disable_tqdm=False,
    # Upload the model to the Hub (requires the login above).
    push_to_hub=True,
)

In [14]:
from transformers import Trainer
# Assemble the Trainer from the model, arguments, encoded splits, tokenizer
# and metric callback built in the earlier cells.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=emotions_encoded["train"],
    eval_dataset=emotions_encoded["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
# The trailing semicolon suppresses the notebook echo of train()'s return value.
trainer.train();

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.8141,0.313119,0.9015,0.900262
2,0.2441,0.219107,0.923,0.92291


## Saving and Sharing the Model

In [15]:
# Push the trained model to the Hub with this commit message; the cell output
# shows the resulting repository URL.
trainer.push_to_hub(commit_message="Training completed!")

'https://huggingface.co/theahmadfaiq/distilbert-base-uncased-finetuned-emotion/tree/main/'

In [16]:
from transformers import pipeline

In [17]:
# Hub repo id of the fine-tuned model pushed in the previous section. Building
# the pipeline downloads its config, weights and tokenizer files (see the
# download progress in the cell output).
model_id = "theahmadfaiq/distilbert-base-uncased-finetuned-emotion"
classifier = pipeline("text-classification", model=model_id)

Downloading (…)lve/main/config.json:   0%|          | 0.00/883 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [18]:
# Classify one hand-written example and keep the score of every label rather
# than only the top one.
# NOTE(review): newer transformers releases deprecate return_all_scores in
# favour of top_k=None, which also changes the nesting/order of the result --
# confirm against the installed version before switching.
custom_tweet = "I saw a movie today and it was really good."
preds = classifier(custom_tweet, return_all_scores=True)



In [25]:
# Show the per-label scores: a nested list holding one dict per label.
preds

[[{'label': 'LABEL_0', 'score': 0.015116711147129536},
  {'label': 'LABEL_1', 'score': 0.9522669315338135},
  {'label': 'LABEL_2', 'score': 0.0077761756256222725},
  {'label': 'LABEL_3', 'score': 0.008004298433661461},
  {'label': 'LABEL_4', 'score': 0.007570923306047916},
  {'label': 'LABEL_5', 'score': 0.009264891967177391}]]

`LABEL_1` is *joy*, so the model has predicted correctly.