In [None]:
import os

import datasets
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import transformers

In [None]:
from huggingface_hub import list_datasets, login
# Read the Hub access token from the HF_TOKEN environment variable rather than
# keeping a token literal in the notebook. An unset or empty variable yields
# None, in which case login() asks for the token interactively.
token = os.environ.get("HF_TOKEN") or None
login(token=token)

In [None]:
# List the ids of up to 10 Hub datasets matching the search term "emotion".
# The result gets its own name so it does not rebind the `datasets` module
# imported at the top of the notebook.
emotion_dataset_infos = list_datasets(search="emotion", limit=10)
for ds in emotion_dataset_infos:
    print(ds.id)

In [None]:
from datasets import load_dataset
# Load the 'dair-ai/emotion' dataset; the result is indexed by split name
# ("train", "validation", "test") throughout the rest of the notebook.
emotions_dataset = load_dataset('dair-ai/emotion')

In [None]:
from transformers import DistilBertTokenizerFast
# Sample sentence used to try out the tokenizer.
text = "Tokenizing text is a core task of NLP."

# Load the fast DistilBERT tokenizer for the uncased base checkpoint.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Encode the sentence and show what the tokenizer returns.
encoded_text = tokenizer(text)
print(encoded_text)

In [None]:
def tokenize(batch):
    """Tokenize the 'text' entries of ``batch`` with the notebook-level tokenizer.

    Padding and truncation are both switched on in the tokenizer call.
    Returns whatever the tokenizer returns for the batch of texts.
    """
    texts = batch['text']
    return tokenizer(texts, padding=True, truncation=True)

In [None]:
import pandas as pd

# Switch the dataset output format to pandas, then slice the whole train
# split to obtain it as a DataFrame.
emotions_dataset.set_format("pandas")
train_split = emotions_dataset["train"]
df = train_split[:]
df.head()

# Looking at the Class Distribution

In [None]:
import matplotlib.pyplot as plt

# Horizontal bar chart of how many rows carry each label value.
class_counts = df["label"].value_counts()
class_counts.plot.barh()
plt.title("Frequency of Classes")
plt.show()

In [None]:
# Count whitespace-separated words per text and store the count as a column.
words_per_text = df["text"].str.split().map(len)
df["Words per tweet"] = words_per_text

# One box per label value; outliers are hidden (showfliers=False).
df.boxplot(column="Words per tweet", by="label", grid=False,
           showfliers=False, color="black")
# Blank out the super-title and the x-axis label.
plt.suptitle("")
plt.xlabel("")
plt.show()

In [None]:
# Undo the earlier set_format("pandas") call now that the DataFrame view of
# the train split is no longer needed.
emotions_dataset.reset_format()

# From Text to Tokens

## Character Tokenization

In [None]:
# Character-level tokenization: every character of the sentence (spaces and
# punctuation included) becomes its own token.
text = "Tokenizing text is a core task of NLP."
tokenized_text = [character for character in text]
print(tokenized_text)

In [None]:
# Vocabulary: each distinct token, in sorted order, is mapped to its position.
unique_tokens = sorted(set(tokenized_text))
token2idx = dict(zip(unique_tokens, range(len(unique_tokens))))

# Replace every token of the sequence by its vocabulary index.
input_ids = list(map(token2idx.__getitem__, tokenized_text))

In [None]:
import torch
import torch.nn.functional as F
# Turn the ids into a tensor, then expand each id into a one-hot row that has
# one column per entry of token2idx (a 2D tensor of one-hot vectors).
input_ids = torch.tensor(input_ids)
vocabulary_size = len(token2idx)
one_hot_encodings = F.one_hot(input_ids, num_classes=vocabulary_size)
print(one_hot_encodings.shape)

In [None]:
# Show the first token next to its index and its one-hot row, one per line.
print(
    f"Token: {tokenized_text[0]}",
    f"Tensor index: {input_ids[0]}",
    f"One-hot: {one_hot_encodings[0]}",
    sep="\n",
)

## Word Tokenization

In [None]:
# Word-level tokenization: split the sentence on whitespace, so punctuation
# stays attached to the neighbouring word.
tokenized_text = text.split()
print(tokenized_text)

## Subword Tokenization

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer that belongs to the checkpoint used in this notebook.
model_ckpt = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

# Encode the current `text` and show the raw encoding.
encoded_text = tokenizer(text)
print(encoded_text)

# Map the ids back to their token strings.
tokens = tokenizer.convert_ids_to_tokens(encoded_text["input_ids"])
print(tokens)

# Join the tokens back into a single string.
reconstructed_text = tokenizer.convert_tokens_to_string(tokens)
print(reconstructed_text)

In [None]:
# Show three tokenizer attributes: vocab_size, model_max_length and
# model_input_names (the latter is reused below to filter batch keys).
print(tokenizer.vocab_size, tokenizer.model_max_length, tokenizer.model_input_names)

## Tokenizing the Whole Dataset

In [None]:
def tokenize(batch):
    """Tokenize the "text" entries of ``batch`` with the notebook-level tokenizer.

    Both padding and truncation are enabled in the tokenizer call.
    Returns whatever the tokenizer returns for the batch of texts.
    """
    texts = batch["text"]
    return tokenizer(texts, truncation=True, padding=True)


# Try the function on the first two training examples.
sample_batch = emotions_dataset["train"][:2]
print(tokenize(sample_batch))

In [None]:
# Apply tokenize() to every split. With batched=True and batch_size=None each
# split is handed to tokenize() as one batch (see the datasets `map` docs), so
# padding=True pads to the longest sequence of that whole split.
emotions_encoded = emotions_dataset.map(tokenize, batched=True, batch_size=None)

# Training a TEXT CLASSIFIER

In [None]:
from transformers import AutoModel
import torch

model_ckpt = "distilbert-base-uncased"

# Prefer CUDA when torch reports it as available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

# Load the checkpoint through AutoModel and move it to the chosen device.
model = AutoModel.from_pretrained(model_ckpt)
model = model.to(device)

In [None]:
# Display whether torch can see a CUDA device (the same check used to pick `device`).
torch.cuda.is_available()

## Transformers as Feature Extractors

In [None]:
# Encode a short sentence and ask the tokenizer for PyTorch tensors.
text = "this is a test"
inputs = tokenizer(text, return_tensors="pt")

# Expected layout of the printed shape: [batch_size, n_tokens]
input_shape = inputs['input_ids'].size()
print(f"Input tensor shape: {input_shape}")

In [None]:
# Move every encoded tensor to the device the model lives on.
device_inputs = {}
for name, tensor in inputs.items():
    device_inputs[name] = tensor.to(device)
inputs = device_inputs

# Run the model without tracking gradients and show its raw output.
with torch.no_grad():
    outputs = model(**inputs)
print(outputs)

In [None]:
# Display the size of the last hidden state returned by the forward pass above.
outputs.last_hidden_state.size()
# [bs, n_tok, hidden_dim]

In [None]:
def extract_hidden_states(batch):
    """Return the first-position last hidden state for every sequence in ``batch``.

    Only the entries of ``batch`` whose key is listed in
    ``tokenizer.model_input_names`` are passed to the notebook-level ``model``,
    after being moved to ``device``. The forward pass runs without gradient
    tracking.

    Returns:
        dict with a single key "hidden_state" holding a NumPy array taken
        from index 0 along the token axis (the [CLS] position).
    """
    model_inputs = {}
    for name, tensor in batch.items():
        if name in tokenizer.model_input_names:
            model_inputs[name] = tensor.to(device)

    with torch.no_grad():
        model_output = model(**model_inputs)

    # Keep only the vector at position 0 of each sequence and hand it back
    # on the CPU as NumPy.
    cls_vectors = model_output.last_hidden_state[:, 0]
    return {"hidden_state": cls_vectors.cpu().numpy()}

In [None]:
# Return the listed columns as torch tensors; extract_hidden_states() calls
# .to(device) on the batch values, so it needs tensors here.
emotions_encoded.set_format("torch", columns=["input_ids", "attention_mask", "label"])

In [None]:
# Extract hidden states across all splits in one go. batched=True makes map()
# pass batches (not single rows) to extract_hidden_states, which adds a
# "hidden_state" column to every split.
emotions_hidden = emotions_encoded.map(extract_hidden_states, batched=True)

## Creating a feature matrix

In [None]:
import numpy as np

# Build the feature matrices (hidden states) and label vectors for the
# classifier. The held-out arrays come from the "validation" split, the same
# split the fine-tuning section evaluates on, so both approaches are scored
# on identical data and the "test" split stays untouched.
X_train = np.array(emotions_hidden["train"]["hidden_state"])
X_valid = np.array(emotions_hidden["validation"]["hidden_state"])
y_train = np.array(emotions_hidden["train"]["label"])
y_valid = np.array(emotions_hidden["validation"]["label"])
X_train.shape, X_valid.shape, y_train.shape, y_valid.shape

## Visualizing the Training Set

In [None]:
from umap import UMAP
from sklearn.preprocessing import MinMaxScaler
import pandas as pd

# Rescale every feature with MinMaxScaler before projecting.
feature_scaler = MinMaxScaler()
X_scaled = feature_scaler.fit_transform(X_train)

# Fit a 2-component UMAP with the cosine metric on the scaled features.
mapper = UMAP(n_components=2, metric="cosine").fit(X_scaled)

# Collect the 2D embedding together with the training labels.
df_emb = pd.DataFrame(mapper.embedding_, columns=["X", "Y"]).assign(label=y_train)
df_emb.head()

In [None]:
import matplotlib.pyplot as plt

# One panel per class on a 3x2 grid; flatten so panels can be indexed by class id.
fig, axes = plt.subplots(3, 2, figsize=(5, 5))
axes = axes.flatten()
# More colormaps than classes are listed; zip() below stops at the shorter sequence.
cmaps = ['Greys', 'Purples', 'Blues', 'Greens', 'Oranges', 'Reds', 'YlOrBr', 'YlOrRd', 'OrRd', 'PuRd', 'RdPu', 'BuPu']
labels = emotions_dataset["train"].features["label"].names
print("Unique classes:", labels)

for i, (label, cmap) in enumerate(zip(labels, cmaps)):
    panel = axes[i]
    # Rows of the embedding whose integer label equals this class id.
    df_emb_sub = df_emb[df_emb["label"] == i]
    panel.hexbin(df_emb_sub["X"], df_emb_sub["Y"], cmap=cmap, gridsize=20, linewidths=(0,))
    panel.set_title(label)
    panel.set_xticks([])
    panel.set_yticks([])

plt.tight_layout()
plt.show()

## Training a simple classifier

In [None]:
from sklearn.linear_model import LogisticRegression

# max_iter is raised to 3000 to give the solver room to converge.
lr_clf = LogisticRegression(max_iter=3000).fit(X_train, y_train)

# Score on the held-out features; shown as the cell output.
lr_clf.score(X_valid, y_valid)

In [None]:
from sklearn.dummy import DummyClassifier

# Baseline that uses the "most_frequent" strategy, for comparison with lr_clf.
dummy_clf = DummyClassifier(strategy="most_frequent").fit(X_train, y_train)

# Score of the baseline on the same held-out data; shown as the cell output.
dummy_clf.score(X_valid, y_valid)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

def plot_confusion_matrix(y_preds, y_true, labels):
    """Plot a confusion matrix computed with ``normalize="true"``.

    Args:
        y_preds: predicted labels.
        y_true: ground-truth labels.
        labels: display names used for the matrix axes.

    The figure is shown with ``plt.show()``; nothing is returned.
    """
    normalized_cm = confusion_matrix(y_true, y_preds, normalize="true")
    figure, axis = plt.subplots(figsize=(6, 6))
    display = ConfusionMatrixDisplay(confusion_matrix=normalized_cm,
                                     display_labels=labels)
    # Two-decimal cell values, no colorbar, drawn onto the axis created above.
    display.plot(cmap="Blues", values_format=".2f", ax=axis, colorbar=False)
    plt.title("Normalized confusion matrix")
    plt.show()

In [None]:
# Predict with the logistic-regression classifier on the held-out features
# and plot the confusion matrix against the true labels.
y_preds = lr_clf.predict(X_valid)
plot_confusion_matrix(y_preds, y_valid, labels)

# Fine-tuning Transformers

## Loading a pretrained model

In [None]:
from transformers import AutoModelForSequenceClassification

# Size the classification head from the dataset's label names instead of a
# hard-coded count, so it always matches the data being classified.
num_labels = len(emotions_dataset["train"].features["label"].names)
model_ckpt = "distilbert-base-uncased"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# From here on `model` is the sequence-classification model, moved to `device`.
model = (AutoModelForSequenceClassification
         .from_pretrained(model_ckpt, num_labels=num_labels)
         .to(device))

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.

Classification head has not yet been trained. Some parts of the model are randomly initialized.

## Defining the performance metrics

In [None]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    """Compute accuracy and weighted F1 for a prediction object.

    Args:
        pred: object exposing ``label_ids`` (true labels) and ``predictions``
            (scores whose argmax over the last axis is the predicted label).

    Returns:
        dict with the keys "accuracy" and "f1".
    """
    true_ids = pred.label_ids
    predicted_ids = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(true_ids, predicted_ids),
        "f1": f1_score(true_ids, predicted_ids, average="weighted"),
    }

## Training the model

In [None]:
from huggingface_hub import login
# Read the Hub access token from the HF_TOKEN environment variable rather than
# keeping a token literal in the notebook. An unset or empty variable yields
# None, in which case login() asks for the token interactively.
token = os.environ.get("HF_TOKEN") or None
login(token=token)

In [None]:
from transformers import Trainer, TrainingArguments

bs = 64
# Number of whole batches that fit into the training split.
logging_steps = len(emotions_encoded["train"]) // bs
# Also used as the output directory name.
model_name = f"{model_ckpt}-finetuned_emotion"

# NOTE(review): logging_strategy="epoch" is set together with logging_steps;
# confirm which of the two is meant to drive logging.
training_args = TrainingArguments(
    output_dir=model_name,
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=bs,
    logging_strategy="epoch",
    logging_steps=logging_steps,
    log_level="error",
    disable_tqdm=False,
    push_to_hub=True,
)

In [None]:
from transformers import Trainer

# Wire the model, arguments, datasets, metric function and tokenizer together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=emotions_encoded["train"],
    eval_dataset=emotions_encoded["validation"],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

# The trailing semicolon keeps the return value out of the cell output.
trainer.train();

In [None]:
# Run the trained model over the validation split.
preds_outputs = trainer.predict(emotions_encoded["validation"])

# Show the metrics, the shape of the predictions and their first ten rows.
validation_predictions = preds_outputs.predictions
print(preds_outputs.metrics, validation_predictions.shape, validation_predictions[:10])

In [None]:
# Predicted class = index of the largest score along axis 1.
y_preds = np.argmax(preds_outputs.predictions, axis=1)

# True labels of the train and validation splits, plus the class names used
# to label the confusion matrix.
y_train = np.array(emotions_encoded["train"]["label"])
y_valid = np.array(emotions_encoded["validation"]["label"])
labels = emotions_dataset["train"].features["label"].names

# Kept as the last expression so the notebook actually displays it; a bare
# expression in the middle of a cell is silently discarded.
y_preds.shape

In [None]:
# Confusion matrix of the fine-tuned model's predictions on the validation split.
plot_confusion_matrix(y_preds, y_valid, labels)

## Error Analysis

A simple yet powerful technique is to sort the validation samples by the model loss. When we pass the label during the forward pass, the loss is automatically calculated and returned. Here's a function that returns the loss along with the predicted label:

In [None]:
from torch.nn.functional import cross_entropy

def forward_pass_with_label(batch):
    """Return the per-example loss and predicted label for ``batch``.

    Entries of ``batch`` whose key is listed in ``tokenizer.model_input_names``
    are moved to ``device`` and passed to the notebook-level ``model``;
    ``batch["label"]`` supplies the targets. No gradients are tracked.

    Returns:
        dict with "loss" (cross-entropy with ``reduction="none"``, i.e. one
        value per example) and "predicted_label" (argmax of the logits over
        the last axis), both as NumPy arrays.
    """
    model_inputs = {}
    for name, tensor in batch.items():
        if name in tokenizer.model_input_names:
            model_inputs[name] = tensor.to(device)
    # Targets must sit on the same device as the logits.
    targets = batch["label"].to(device)

    with torch.no_grad():
        logits = model(**model_inputs).logits
        per_example_loss = cross_entropy(logits, targets, reduction="none")
        predicted = torch.argmax(logits, axis=-1)

    return {"loss": per_example_loss.cpu().numpy(),
            "predicted_label": predicted.cpu().numpy()}

def label_int2str(row):
    """Convert an integer label to its string name via the train split's "label" feature."""
    label_feature = emotions_dataset["train"].features["label"]
    return label_feature.int2str(row)

In [None]:
# Switch the encoded dataset back to PyTorch tensors for the model inputs
# and the label.
emotions_encoded.set_format(
    "torch", columns=["input_ids", "attention_mask", "label"])

# Run forward_pass_with_label over the validation split in batches of 16 and
# store the result (with its added "loss" and "predicted_label") back in place.
validation_with_loss = emotions_encoded["validation"].map(
    forward_pass_with_label, batched=True, batch_size=16)
emotions_encoded["validation"] = validation_with_loss

In [None]:
# Create a DataFrame with texts, losses, and predicted/true labels
emotions_encoded.set_format("pandas")
cols = ["text", "label", "predicted_label", "loss"]
# Take an explicit copy of the column subset: the two columns below are then
# assigned on an independent frame, which avoids pandas' SettingWithCopyWarning.
df_test = emotions_encoded["validation"][:][cols].copy()
# Replace the integer class ids by their string names.
df_test["label"] = df_test["label"].apply(label_int2str)
df_test["predicted_label"] = df_test["predicted_label"].apply(label_int2str)
df_test.head()

In [None]:
# The 10 validation samples with the highest loss, i.e. the ones the model
# gets most wrong (or that may be mislabeled).
df_test.sort_values("loss", ascending=False).head(10)

In [None]:
# The 10 validation samples with the smallest loss, i.e. the ones the model
# is most confident about.
df_test.sort_values("loss", ascending=True).head(10)

## Saving and sharing the model 

In [None]:
# Push the trainer's results to the Hub with the given commit message
# (requires the login performed earlier in the notebook).
trainer.push_to_hub(commit_message="Training completed!")

In [None]:
from transformers import pipeline

# Hub id of the fine-tuned model. Replace the username prefix
# ('sergi24sanchez') with your own Hub username when reproducing this notebook.
model_id = "sergi24sanchez/distilbert-base-uncased-finetuned_emotion"
classifier = pipeline("text-classification", model=model_id)

In [None]:
# Alternative example input:
# custom_tweet = "I saw a movie last Sunday which really moved me and inspired me to live my life a different way."
custom_tweet = "i should have taken more photos"
# Ask for the score of every class rather than only the top one; the plot
# below pairs these scores with `labels` by position.
# NOTE(review): `return_all_scores` may be deprecated in recent transformers
# releases - confirm against the installed version before changing it.
preds = classifier(custom_tweet, return_all_scores=True)

In [None]:
# One row per class for the single input text.
preds_df = pd.DataFrame(preds[0])

# Scores are scaled to percentages and paired with `labels` by position.
class_percentages = 100 * preds_df["score"]
plt.bar(labels, class_percentages, color='C0')
plt.title(f'"{custom_tweet}"\n')
plt.ylabel("Class probability (%)")
plt.show()