# Importing required libraries

In [1]:
!git clone https://github.com/nlp-with-transformers/notebooks.git
%cd notebooks
from install import *
install_requirements()

!pip -q install transformers 
import numpy as np 
import pandas as pd
from transformers import * 
from datasets import list_datasets 

In [2]:
# Fetch the dataset listing and report its size plus the first ten entries.
all_datasets = list_datasets()
for message in (
    f"There are {len(all_datasets)} datasets currently available on the Hub",
    f"The first 10 are: {all_datasets[:10]}",
):
    print(message)


In [3]:
from datasets import load_dataset
# Load the "emotion" dataset by name.
emotions = load_dataset(path="emotion")

In [4]:
# Display the training split as the cell output.
emotions["train"]

In [5]:
# Display the column names of the training split.
emotions["train"].column_names

In [6]:
# Load data that is not hosted on the Hugging Face Hub: a remote file read
# through the generic "csv" loader, using ";" as separator and explicit
# column names.
dataset_url = "https://www.dropbox.com/s/1pzkadrvffbqw6o/train.txt?dl=1"
emotions_remote = load_dataset(
    "csv",
    data_files=dataset_url,
    sep=";",
    names=["text", "label"],
)


In [7]:
# Make the dataset return pandas objects when indexed.
emotions.set_format(type="pandas")

In [8]:
# Slice the whole training split and display its first rows
# (the "pandas" output format was set just before this cell).
emotions['train'][:].head()

In [9]:
import pandas as pd
# Ensure the pandas output format is active, then materialise the full
# training split as `df` and preview it.
emotions.set_format("pandas")
df = emotions["train"][:]
df.head()

In [10]:
def label_int2str(row):
    """Translate an integer class id into its string label name.

    The conversion is delegated to the ``int2str`` method of the "label"
    feature of the training split of the global ``emotions`` dataset.
    """
    label_feature = emotions["train"].features["label"]
    return label_feature.int2str(row)
# Add a human-readable label column next to the integer labels.
label_names = df["label"].apply(label_int2str)
df["label_name"] = label_names
df.head()

## EDA: looking at the data distribution


In [11]:
import matplotlib.pyplot as plt
# Horizontal bar chart of how often each label name occurs, smallest first.
class_counts = df["label_name"].value_counts(ascending=True)
class_counts.plot.barh()
plt.title("Frequency of Classes")
plt.show()


In [12]:
# Number of whitespace-separated tokens per text. `.str.len()` is the
# vectorised equivalent of `.apply(len)` and yields NaN instead of raising
# when a text entry is missing.
df["Words Per Tweet"] = df["text"].str.split().str.len()
# One box per label name; outliers are hidden (showfliers=False).
df.boxplot(
    "Words Per Tweet",
    by="label_name",
    grid=False,
    showfliers=False,
    color="black",
)
# Blank out the figure-level title.
plt.suptitle("")
# Render explicitly, as the class-frequency plot does, so the cell does not
# end by echoing the Text object returned by suptitle.
plt.show()


In [13]:
emotions.reset_format() ## resetting to the default datasets output format (undoes set_format("pandas"))

In [14]:
from transformers import AutoTokenizer
# Checkpoint identifier; the tokenizer is loaded for this checkpoint.
model_ckpt = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [15]:
# Load the tokenizer for the checkpoint through AutoTokenizer.
# NOTE(review): these two statements repeat the previous cell verbatim, so
# this cell looks redundant - confirm whether it can be dropped.
model_ckpt = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)


In [16]:
def tokenize(batch):
    """Tokenize the "text" entries of ``batch`` with the global tokenizer.

    Padding and truncation are both enabled; the tokenizer's output is
    returned unchanged.
    """
    texts = batch["text"]
    return tokenizer(texts, padding=True, truncation=True)

In [17]:
# Run `tokenize` over the dataset in batched mode to obtain the tokenizer
# outputs (input ids and attention mask) for the text column.
# NOTE(review): batch_size=None presumably hands each split over as a single
# batch - confirm against the installed datasets version.
emotions_encoded = emotions.map(
    tokenize,
    batched=True,
    batch_size=None,
)

## Training the text classifier

In [18]:
from transformers import AutoModel
import torch
model_ckpt = "distilbert-base-uncased"
# Use CUDA when it is available, otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
# Load the pretrained model for the checkpoint and move it to that device.
model = AutoModel.from_pretrained(model_ckpt)
model = model.to(device)


In [19]:
# Switch the output format to "torch" for the columns listed below.
emotions_encoded.set_format(
    "torch",
    columns=["input_ids", "attention_mask", "label"],
)

In [20]:
def extract_hidden_states(batch):
    """Return the model's last hidden state at position 0 for each example.

    Only the batch entries whose key is one of the tokenizer's model input
    names are forwarded to the model, after being moved to ``device``. The
    forward pass runs without gradient tracking. The result is a dict with
    a single "hidden_state" key holding a NumPy array.
    """
    inputs = {}
    for name, tensor in batch.items():
        if name in tokenizer.model_input_names:
            inputs[name] = tensor.to(device)
    # Inference only: no gradients are needed
    with torch.no_grad():
        outputs = model(**inputs)
        last_hidden_state = outputs.last_hidden_state
    # Keep only the vector at sequence position 0 (the [CLS] token)
    cls_vectors = last_hidden_state[:, 0]
    return {"hidden_state": cls_vectors.cpu().numpy()}

In [21]:
# Apply extract_hidden_states in batched mode; the result gains the
# "hidden_state" column that the function returns.
emotions_hidden = emotions_encoded.map(extract_hidden_states, batched=True)


In [22]:
import numpy as np
# Turn the hidden states and labels of the train and validation splits into
# NumPy arrays, then display the shapes of the two feature matrices.
train_split = emotions_hidden["train"]
valid_split = emotions_hidden["validation"]
X_train = np.array(train_split["hidden_state"])
X_valid = np.array(valid_split["hidden_state"])
y_train = np.array(train_split["label"])
y_valid = np.array(valid_split["label"])
X_train.shape, X_valid.shape


In [23]:
from huggingface_hub import notebook_login
# Log in to the Hugging Face Hub from the notebook; the training arguments
# defined further down set push_to_hub=True.
notebook_login()
# Metric functions used by compute_metrics.
from sklearn.metrics import accuracy_score, f1_score
def compute_metrics(pred):
    """Compute accuracy and weighted F1 for a prediction object.

    ``pred`` must expose ``label_ids`` (the reference labels) and
    ``predictions`` (per-class scores); the predicted class is the argmax
    over the last axis. Returns a dict with "accuracy" and "f1" keys.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred, average="weighted"),
    }
    

In [26]:
from transformers import AutoModelForSequenceClassification
# Take the number of classes from the dataset's label feature rather than
# hard-coding it, so the classification head always matches the data.
num_labels = emotions["train"].features["label"].num_classes
# Load the checkpoint with a sequence-classification head of that size and
# move it to the selected device.
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt, num_labels=num_labels
).to(device)

In [29]:
from transformers import Trainer, TrainingArguments

batch_size = 64
# Number of full batches in the training split; used as the logging interval.
logging_steps = len(emotions_encoded["train"]) // batch_size
# The output directory name is derived from the checkpoint name.
model_name = f"{model_ckpt}-finetuned-emotion"
training_args = TrainingArguments(
    output_dir=model_name,
    num_train_epochs=2,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    disable_tqdm=False,
    logging_steps=logging_steps,
    push_to_hub=True,
    log_level="error",
)

# Fine-tune on the train split, evaluating on the validation split with
# compute_metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=emotions_encoded["train"],
    eval_dataset=emotions_encoded["validation"],
    tokenizer=tokenizer,
)
trainer.train();

# Run the trained model over the validation split and keep the output.
preds_output = trainer.predict(emotions_encoded["validation"])

In [30]:
from torch.nn.functional import cross_entropy
def forward_pass_with_label(batch):
    """Return the per-example loss and predicted label for ``batch``.

    The batch entries named in the tokenizer's model input names are moved
    to ``device`` and passed to the global ``model`` without gradient
    tracking. The loss is the unreduced cross-entropy between the logits
    and ``batch["label"]``; the prediction is the argmax of the logits.
    Both are returned as NumPy arrays under "loss" and "predicted_label".
    """
    inputs = {}
    for name, tensor in batch.items():
        if name in tokenizer.model_input_names:
            inputs[name] = tensor.to(device)
    with torch.no_grad():
        logits = model(**inputs).logits
        pred_label = torch.argmax(logits, axis=-1)
        # reduction="none" keeps one loss value per example
        targets = batch["label"].to(device)
        loss = cross_entropy(logits, targets, reduction="none")
    # Move results to the CPU and convert them to NumPy arrays
    return {
        "loss": loss.cpu().numpy(),
        "predicted_label": pred_label.cpu().numpy(),
    }

In [31]:
# Switch the encoded dataset to PyTorch tensors for the listed columns
emotions_encoded.set_format(
    "torch", columns=["input_ids", "attention_mask", "label"]
)
# Add the outputs of forward_pass_with_label to the validation split
validation_with_loss = emotions_encoded["validation"].map(
    forward_pass_with_label, batched=True, batch_size=16
)
emotions_encoded["validation"] = validation_with_loss


In [32]:
# Pull the validation split into a DataFrame restricted to the columns of
# interest, then replace both integer label columns with label names.
emotions_encoded.set_format("pandas")
cols = ["text", "label", "predicted_label", "loss"]
df_test = emotions_encoded["validation"][:][cols]
for label_col in ("label", "predicted_label"):
    df_test[label_col] = df_test[label_col].apply(label_int2str)


In [33]:
# Display the 10 validation examples with the highest loss.
df_test.sort_values("loss", ascending=False).head(10)
