## I. FINETUNING

##### 1. Import packages
`pip install transformers torch datasets numpy pandas`

In [None]:
import torch
import numpy as np
import pandas as pd
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset

# settings.py
from settings import dataset_name, model_name, finetuned_model_name, finetuned_models_folder
from functions import tokenize, get_tokenizer, get_metrics, plot_confusion_matrix

##### 2. Load dataset "emotion" - use train and validation subset
```
emotion => Labelled Twitter text messages classified into 6 different sentiments:
    1. 'sadness'
    2. 'anger'
    3. 'love'
    4. 'surprise'
    5. 'fear'
    6. 'joy'
(On the first download, this may take a while)
```
##### We use the train subset to finetune the generic BERT model for our specific task, here classifying Twitter messages. You could use any other labeled text dataset for classification, depending on your needs.

In [None]:
# Download (or load from the local cache) every split of the configured dataset.
dataset = load_dataset(dataset_name)

# Build the label-name list so that position i holds the name of class id i.
# Collecting the unique 'label_text' values in order of first appearance would depend on
# how the rows happen to be ordered and would mislabel everything that is indexed by the
# integer class id (e.g. the axes of the confusion matrix plotted after training).
train_split = dataset['train']
label_table = pd.DataFrame({'label': list(train_split['label']),
                            'label_text': list(train_split['label_text'])})
labels = (label_table.drop_duplicates()
                     .sort_values('label')['label_text']
                     .astype(str)
                     .tolist())

print('Labels of "emotion" train dataset:', labels)
print('Subsets of "emotion" dataset: ', list(dataset.keys()))
print(f'Size of "emotion" train/validation datset: {len(dataset["train"])}/{len(dataset["validation"])}')

In [None]:
print('"""  Example: """')
index = 42
# Fetch the single row once. Indexing by column first (dataset['train']['text'][index])
# reads the whole column just to pick one element out of it.
example = dataset['train'][index]
print(f"TEXT: '{example['text']}'")
print(f"LABEL: {example['label']}")
print(f"LABEL_TEXT: {example['label_text']}")

##### 3. Tokenize text (train and validation subsets)


In [None]:
# Tokenize the train and validation splits with identical settings.
# batched=True together with batch_size=None passes each split to `tokenize` as one single batch.
map_options = dict(fn_kwargs={'model_name': model_name}, batched=True, batch_size=None)
emotions_train_tokenized = dataset['train'].map(tokenize, **map_options)
emotions_validation_tokenized = dataset['validation'].map(tokenize, **map_options)

##### 4. Now finetune pretrained BERT

In [None]:
# Use the first CUDA GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print(f'device: {device} is used to finetune model!')

###### "AutoModelForSequenceClassification" replaces the head (last linear layer) of the pretrained model with a randomly initialized linear layer. The number of neurons in this new linear layer must correspond to the number of classes/labels in our dataset (here: the number of sentiments = 6). Note that the code below does not freeze any parameters, so finetuning updates the new linear layer together with the pretrained layers; to train only the new head, the pretrained layers would have to be frozen explicitly (e.g. by setting `requires_grad = False` on their parameters):

In [None]:
# The classification head needs one output per class label.
num_labels = len(labels)
# Load the pretrained checkpoint with a head sized for `num_labels`, then move it to the chosen device.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
model = model.to(device)

In [None]:
# Settings - check docs for further details: https://huggingface.co/docs/transformers/training
batch_size = 64
num_epochs = 5
learning_rate = 2e-5
weight_decay = 0.01
# Log roughly once per epoch (number of full batches in the train split).
# Clamp to at least 1: a train split smaller than `batch_size` would otherwise give 0,
# which is not a valid interval for step-based logging.
logging_steps = max(1, len(emotions_train_tokenized) // batch_size)
# Folder the checkpoints and the final finetuned model are written to.
finetuned_model_name_and_path = f"./{finetuned_models_folder}/{finetuned_model_name}"
# Tokenizer belonging to the pretrained checkpoint; it is handed to the Trainer later on.
tokenizer = get_tokenizer(model_name=model_name)

In [None]:
# Training Arguments - check docs for further details: https://huggingface.co/docs/transformers/training
import inspect

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy` and deprecated
# the old keyword. Pick whichever name the installed version accepts so this cell runs on both.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
eval_strategy_key = "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"

training_args = TrainingArguments(output_dir=finetuned_model_name_and_path,
                                  num_train_epochs=num_epochs,
                                  learning_rate=learning_rate,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  weight_decay=weight_decay,
                                  disable_tqdm=False,
                                  logging_steps=logging_steps,
                                  push_to_hub=False,
                                  log_level="error",
                                  # Evaluate on the validation split at the end of every epoch.
                                  **{eval_strategy_key: "epoch"})

In [None]:
# Trainer setup - check docs for further details: https://huggingface.co/docs/transformers/training
# Bundles the model, the training configuration, both tokenized splits,
# the tokenizer and the metric function into one Trainer object.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=emotions_train_tokenized,
    eval_dataset=emotions_validation_tokenized,
    tokenizer=tokenizer,
    compute_metrics=get_metrics,
)

###### FINETUNING STARTS:
###### WARNING: This takes some time (~ 3 to 30 mins or more) depending on "settings" above and your GPU/CPU/hardware setup:

In [None]:
# Start finetuning with the Trainer configured above.
# NOTE(review): nothing visible in this notebook freezes any parameters, so this presumably
# updates the whole model and not only the last linear layer - confirm if head-only
# training was intended.
trainer.train()

###### Metrics (for the validation dataset) for the model that was just finetuned:

In [None]:
# Run the finetuned model over the tokenized validation split; the returned object
# carries the raw predictions as well as the computed metrics.
validation_set_predictions = trainer.predict(test_dataset=emotions_validation_tokenized)
print('General metrics for the finetuned model: ', validation_set_predictions.metrics)

In [None]:
# The predicted class of each sample is the index of its highest score along axis 1.
raw_scores = validation_set_predictions.predictions
y_pred = np.argmax(raw_scores, axis=1)
# Reference labels of the validation split, to compare against the predictions.
y_true = list(dataset['validation']['label'])
print('Further metrics for finetuned model: Confusion Matrix')
plot_confusion_matrix(y_pred, y_true, labels)

##### 5. Save finetuned model for later predictions

In [None]:
# Write the finetuned model to the folder configured in the settings cell so it can be reloaded later.
trainer.save_model(finetuned_model_name_and_path)

##### 6. Clear memory

In [None]:
# Free the memory held by the model and the trainer.
import gc

# Drop the notebook's own references first ...
del model
del trainer
# ... then collect objects that are only kept alive by reference cycles, so their tensors
# are really freed before the CUDA cache is emptied (empty_cache() can only release
# cached blocks that no live tensor occupies anymore).
gc.collect()
torch.cuda.empty_cache()