1. Installing required modules

In [None]:
# Installs required modules

!pip install --upgrade watermark blackcellmagic
!pip install datasets
!pip install transformers
!pip install umap-learn
!pip install nlpaug
!pip install "ray[tune]"
!pip install hyperopt

2. Importing required modules

In [None]:
# Loads Black extension and imports required modules
%load_ext blackcellmagic

import torch
from datasets import load_dataset, DatasetDict, Features, Value, ClassLabel
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from transformers import set_seed
from umap import UMAP
from sklearn.preprocessing import MinMaxScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, ConfusionMatrixDisplay, confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV
from ray.tune.suggest.hyperopt import HyperOptSearch
from ray.tune.schedulers import ASHAScheduler
from ray import tune
import json
import os
import shutil
from joblib import dump, load
# Seeding the transformers, torch and numpy random number generators so that
# repeated runs of the notebook start from the same random state
set_seed(0)
torch.manual_seed(0)
np.random.seed(0)

3. Defining utility functions for the entire project

In [None]:
def prepare_dataset(working_dir=None):
  """Loads the processed CSV files and builds the train/valid/test splits.

  Args:
    working_dir: Directory containing "train_dataset_processed.csv" and
      "test_dataset_processed.csv". Defaults to the current working
      directory, resolved when the function is called.

  Returns:
    A DatasetDict with the splits "train" and "valid" (an 80/20 split of the
    train CSV) and "test" (the test CSV), without the rows whose text is
    missing.
  """
  # Resolving the default at call time: a default of os.getcwd() in the
  # signature would be frozen when the function is defined
  if working_dir is None:
    working_dir = os.getcwd()

  data_files = {
      "train": os.path.join(working_dir, "train_dataset_processed.csv"),
      "test": os.path.join(working_dir, "test_dataset_processed.csv"),
  }
  class_names = ["negative", "neutral", "positive"]
  data_columns = Features({"label": ClassLabel(names=class_names), "text": Value("string")})
  original_data = load_dataset("csv", data_files=data_files, column_names=["label", "text"], features=data_columns)

  # Holding out 20% of the training data for validation
  train_valid_data = original_data["train"].train_test_split(test_size=0.2)
  project_dataset = DatasetDict({
      "train": train_valid_data["train"],
      "valid": train_valid_data["test"],
      "test": original_data["test"]})

  # Removing empty text entries
  project_dataset = project_dataset.filter(lambda example: example["text"] is not None)
  return project_dataset


def plot_confusion_matrix(y_preds, y_true, labels, title):
  """Plots the confusion matrix of the predictions, normalized over the true labels.

  Args:
    y_preds: Predicted class labels.
    y_true: Ground-truth class labels.
    labels: Display names of the classes, used as tick labels.
    title: Text appended to "Normalized confusion matrix for " in the plot title.
  """
  normalized_cm = confusion_matrix(y_true, y_preds, normalize="true")
  _, axes = plt.subplots(figsize=(6, 6))
  ConfusionMatrixDisplay(
      confusion_matrix=normalized_cm,
      display_labels=labels,
  ).plot(cmap="Blues", values_format=".2f", ax=axes, colorbar=False)
  plt.title("Normalized confusion matrix for " + title)
  plt.show()

def print_classification_report(y_preds, y_true, title, labels=None):
  """Plots the confusion matrix and prints the classification report.

  Args:
    y_preds: Predicted class labels.
    y_true: Ground-truth class labels.
    title: Description of the evaluated data/model, used in the plot title
      and in the report heading.
    labels: Optional class names, in label-id order. Defaults to the
      project's classes: "negative", "neutral", "positive".
  """
  if labels is None:
    labels = ["negative", "neutral", "positive"]

  plot_confusion_matrix(y_preds, y_true, labels, title)
  print("\n\n")
  print("Classification report for " + title)
  report = classification_report(y_true, y_preds, target_names=labels)
  separator = "-" * 75
  print(separator)
  print(report)
  print(separator)

def display_data_properties(dataset):
  """Prints sample training rows and plots the class frequencies of every split.

  Args:
    dataset: Dataset dictionary with the splits "train", "valid" and "test".
      The "label" feature of the train split must provide int2str, which is
      used to turn the label ids into class names.
  """
  # Split name -> wording used in the plot title
  split_titles = {"train": "training", "valid": "validation", "test": "test"}
  label_int2str = dataset["train"].features["label"].int2str

  # Using pandas for checking the data distribution
  dataset.set_format(type="pandas")
  try:
    for position, (split, split_title) in enumerate(split_titles.items()):
      split_df = dataset[split][:]
      split_df["label_name"] = split_df["label"].apply(label_int2str)

      # Printing the first rows of the train data
      if split == "train":
        print(split_df.head())

      # Every plot after the first one gets its own figure
      if position > 0:
        plt.figure()

      # Displaying the frequency of classes in this split
      split_df["label_name"].value_counts(ascending=True).plot.barh()
      plt.title("Frequency of Classes in " + split_title + " data")
      plt.show()
  finally:
    # Resetting the dataset format even when plotting fails, so the caller
    # never ends up with a dataset stuck in pandas format
    dataset.reset_format()

# display_data_properties(dataset)

# 2. Transformers section

## 2.1. Completing project setup for transformer models

In [None]:
working_dir = os.getcwd()   # Sets the working directory for the project
project_dataset = prepare_dataset(working_dir)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Class weights for the loss: the more samples a class has, the smaller its weight.
# NOTE(review): the per-class sample counts are hard-coded; presumably they were
# counted on the training CSV - confirm they still match the data in use.
class_samples = [9023, 12366, 6416]
total_samples = sum(class_samples)
class_weights = [1 - (x / total_samples) for x in class_samples]
class_weights = torch.FloatTensor(class_weights).to(device)

model_checkpoint = "distilbert-base-uncased"

# Directory for the training outputs of this checkpoint
model_output_dir = working_dir + "/output-" + model_checkpoint
os.makedirs(model_output_dir, exist_ok=True)

# Directory in which the best model of this checkpoint is kept
os.makedirs(working_dir + "/best_models/" + model_checkpoint, exist_ok=True)

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

## 2.2 Preparing input data

2.2.1. Tokenizing the input

In [None]:
def encode_dataset(tokenizer, project_dataset):
  """Tokenizes the "text" column of the given dataset.

  Args:
    tokenizer: Callable tokenizer, invoked on the list of texts of a batch
      with padding and truncation enabled.
    project_dataset: Object providing map(); its batches must contain a
      "text" entry.

  Returns:
    The result of project_dataset.map() applied with the tokenization function.
  """
  def _tokenize_batch(examples):
    texts = examples["text"]
    return tokenizer(texts, padding=True, truncation=True)

  # batched=True with batch_size=None is forwarded unchanged to map()
  encoded = project_dataset.map(_tokenize_batch, batched=True, batch_size=None)
  return encoded

# Tokenizing the project dataset with the tokenizer of the selected checkpoint
dataset_encoded = encode_dataset(tokenizer, project_dataset)

## 2.3 Training a classifier

### 2.3.1 Training a classifier using feature extraction

2.3.1.1 Extracting the last hidden states

In [None]:
def extract_hidden_states_features(model_checkpoint, dataset_encoded):
  """Adds a "hidden_state" column computed with the pretrained base model.

  Relies on the notebook-level `tokenizer` and `device`. Note that the format
  of `dataset_encoded` is switched to torch in place.

  Args:
    model_checkpoint: Name or path of the checkpoint loaded with AutoModel.
    dataset_encoded: Tokenized dataset with the columns "input_ids",
      "attention_mask" and "label".

  Returns:
    The dataset returned by map(), extended with the "hidden_state" column.
  """
  base_model = AutoModel.from_pretrained(model_checkpoint).to(device)

  def _first_position_hidden_state(batch):
    # Keeping only the entries the model accepts and moving them to the device
    model_inputs = dict()
    for name, tensor in batch.items():
      if name in tokenizer.model_input_names:
        model_inputs[name] = tensor.to(device)

    with torch.no_grad():
      outputs = base_model(**model_inputs)

    # Index 0 of the second axis: presumably the [CLS] token - TODO confirm
    first_position = outputs.last_hidden_state[:, 0]
    return {"hidden_state": first_position.cpu().numpy()}

  # Extracting hidden states across all dataset splits in one go
  dataset_encoded.set_format("torch", columns=["input_ids", "attention_mask", "label"])
  return dataset_encoded.map(_first_position_hidden_state, batched=True)

# Computing the hidden-state features used as input for the MLP classifier
dataset_hidden = extract_hidden_states_features(model_checkpoint, dataset_encoded)

2.3.1.2 Creating train, validation, test datasets of the hidden states as required for the classifier

In [None]:
# Creating the feature matrices (hidden states) and the label vectors of the
# train, validation and test splits
X_train, X_valid, X_test = (
    np.array(dataset_hidden[split]["hidden_state"]) for split in ("train", "valid", "test")
)
y_train, y_valid, y_test = (
    np.array(dataset_hidden[split]["label"]) for split in ("train", "valid", "test")
)

2.3.1.3 Training an MLP classifier using the hidden states and labels

In [None]:
# Fitting a baseline MLP on the hidden states; early stopping monitors 20% of
# the training data that is set aside by the classifier itself
mlp = MLPClassifier(early_stopping=True,
                    validation_fraction=0.2,
                    random_state=0)
mlp.fit(X_train, y_train)

train_accuracy = mlp.score(X_train, y_train) * 100
print("Training accuracy: %.2f %%" % train_accuracy)
valid_accuracy = mlp.score(X_valid, y_valid) * 100
print("Validation accuracy: %.2f %%"  % valid_accuracy)

In [None]:
def hyperparameter_search_mlp_model():
  """Grid-searches MLP hyperparameters on the notebook-level training data.

  Uses the notebook-level X_train/y_train for the search and X_valid/y_valid
  for the reported validation accuracy.

  Returns:
    The best_estimator_ of the grid search, so that the caller can keep using
    the tuned model instead of losing it when the function returns.
  """
  param_grid = {
     'learning_rate_init': [0.001, 0.0001, 0.00001],
     'hidden_layer_sizes': [(512, 512, 512, 512, 512), (1024, 1024, 1024, 1024)],
     'alpha': [0.0001, 0.001]
     }

  base_mlp = MLPClassifier(random_state=0,
                           validation_fraction=0.2,
                           early_stopping=True)

  grid_search = GridSearchCV(base_mlp, param_grid, verbose=3).fit(X_train, y_train)
  best_mlp = grid_search.best_estimator_
  print("Best model training accuracy: %.2f %%" % (best_mlp.score(X_train, y_train) * 100))
  print("Best model validation accuracy: %.2f %%"  % (best_mlp.score(X_valid, y_valid) * 100))
  return best_mlp

# hyperparameter_search_mlp_model()

2.3.1.4 Generating results to document a baseline

Plots the training loss and validation accuracy of the MLP model

In [None]:
# Plotting the training loss and the validation score recorded while fitting
# the MLP on an explicit axes, with a title and an axis label so that the
# figure can be read on its own
fig, ax = plt.subplots()
ax.plot(mlp.loss_curve_, label="training loss")
ax.plot(mlp.validation_scores_, label="validation accuracy")
ax.set_xlabel("Iteration")
ax.set_title("MLP training loss and validation accuracy")
ax.legend();

Provides the confusion matrix and classification report for validation data and test data of the project

In [None]:
# Evaluating the MLP on the validation features
y_preds_v = mlp.predict(X_valid)
print_classification_report(y_preds_v, y_valid, "project_dataset[\"valid\"] using MLP")

# Evaluating the MLP on the test features
y_preds_t = mlp.predict(X_test)
print_classification_report(y_preds_t, y_test, "project_dataset[\"test\"] using MLP")

2.3.1.5 Saving the trained model

In [None]:
# Persisting the fitted MLP with joblib inside the best_models directory
mlp_save_filename = working_dir + "/best_models/mlp.joblib"
dump(mlp, mlp_save_filename)

2.3.1.6 Loading saved model and testing performance on test data

In [None]:
# Reloading the MLP from disk and evaluating it on the test features to check
# that the saved file can be used on its own
loaded_mlp = load(working_dir + "/best_models/mlp.joblib")
y_preds_t = loaded_mlp.predict(X_test)
print_classification_report(y_preds_t, y_test, "project_dataset[\"test\"] using MLP")

### 2.3.2 Training a classifier using fine tuning

2.3.2.1 Defining functions required for training a classifier by fine-tuning a transformer model

In [None]:
# Defining the performance metrics that will be used to evaluate our model's performance during fine-tuning
def compute_metrics(pred):
  """Computes the evaluation metrics of a set of predictions.

  Uses the notebook-level `class_weights` and `device`.

  Args:
    pred: Prediction object exposing `predictions` (the per-class scores, as
      a numpy array) and `label_ids` (the true labels, as a numpy array).

  Returns:
    A dict with "accuracy", the weighted "precision", "recall" and "f1_score",
    and "loss": the class-weighted cross-entropy as a plain Python float.
  """
  labels = pred.label_ids
  preds = pred.predictions.argmax(-1)

  # No gradients are needed for a metric; .item() turns the (possibly CUDA)
  # tensor into a float that can be logged and serialized
  loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights)
  with torch.no_grad():
    loss_val = loss_fct(torch.from_numpy(pred.predictions).to(device),
                        torch.from_numpy(labels).to(device)).item()

  acc = accuracy_score(labels, preds)
  f1 = f1_score(labels, preds, average="weighted")
  prec = precision_score(labels, preds, average="weighted")
  rec = recall_score(labels, preds, average="weighted")
  return {"accuracy": acc, "precision": prec, "recall": rec, "f1_score": f1, "loss": loss_val}


def model_init(trial=None):
  """Builds a fresh sequence-classification model for `model_checkpoint`.

  The base dropout value is read from "model_hp_values.json" in
  `model_output_dir` when available and is 0.3 otherwise.

  Args:
    trial: Optional hyperparameter-search trial. Accepting it lets the Trainer
      hand over the sampled hyperparameters; when it is a dict (assumed to be
      the form used by the Ray Tune backend - confirm against the Trainer
      documentation), the dropout values it contains override the base
      dropout value. Without it the sampled dropout values would never reach
      the model. Any other value is ignored.

  Returns:
    The model, moved to `device`.

  Raises:
    ValueError: If `model_checkpoint` is not a supported checkpoint.
  """
  dropout_val = 0.3
  try:
    with open(model_output_dir + "/model_hp_values.json", "r") as f:
      prev_hp = json.load(f)
      dropout_val = prev_hp["dropout"]
      print("Found previously searched dropout value: ", dropout_val)
  except (OSError, ValueError, KeyError, TypeError):
    # Missing/unreadable file or no stored dropout value
    print("No previous dropout value found. Using default setting of", dropout_val)

  # Names of the dropout settings understood by each supported checkpoint
  dropout_names_by_checkpoint = {
      "distilbert-base-uncased": ("dropout", "attention_dropout", "qa_dropout"),
      "bert-base-uncased": ("attention_probs_dropout_prob", "hidden_dropout_prob", "classifier_dropout"),
  }
  if model_checkpoint not in dropout_names_by_checkpoint:
    raise ValueError("Unsupported model checkpoint: " + str(model_checkpoint))

  # Values sampled for the current trial take precedence over the base value
  trial_params = trial if isinstance(trial, dict) else dict()
  dropout_kwargs = {name: trial_params.get(name, dropout_val)
                    for name in dropout_names_by_checkpoint[model_checkpoint]}

  return (AutoModelForSequenceClassification.from_pretrained(model_checkpoint,
                                                             num_labels=3,
                                                             return_dict=True,
                                                             **dropout_kwargs).to(device))

def compute_objective(metrics):
  """Returns the value to optimize during the hyperparameter search.

  Args:
    metrics: Mapping of evaluation metrics; must contain "eval_loss".

  Returns:
    The "eval_loss" entry of `metrics`.
  """
  eval_loss = metrics["eval_loss"]
  return eval_loss

2.3.2.2 Defining the project trainer class

In [None]:
class ProjectTrainer(Trainer):
    """Trainer that uses a class-weighted cross-entropy loss.

    The weights come from the notebook-level `class_weights`.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Computes the class-weighted cross-entropy loss of a batch.

        Args:
            model: Model called with the batch inputs.
            inputs: Batch of model inputs, including the "labels" entry.
            return_outputs: Whether the model outputs are returned as well.
            num_items_in_batch: Accepted for compatibility with Trainer
                versions that pass this keyword; not used by this loss.

        Returns:
            The loss, or the tuple (loss, outputs) if `return_outputs` is set.
        """
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # The weights must live on the same device as the logits
        loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights.to(logits.device))
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

# Training configuration: evaluation runs every 500 steps, outputs and logs go
# to the model output directory, at most one checkpoint is kept and the best
# model is loaded at the end of training
training_args = TrainingArguments(output_dir=model_output_dir,
                                  evaluation_strategy="steps",
                                  logging_strategy="steps",
                                  eval_steps=500,
                                  disable_tqdm=False,
                                  logging_dir=model_output_dir + "/logs",
                                  save_total_limit=1,
                                  load_best_model_at_end=True)

# The trainer receives model_init instead of a model instance, so the model is
# created by calling model_init
trainer = ProjectTrainer(args=training_args,
                            tokenizer=tokenizer,
                            train_dataset=dataset_encoded["train"],
                            eval_dataset=dataset_encoded["valid"],
                            model_init=model_init,
                            compute_metrics=compute_metrics)

2.3.2.3 Performing hyperparameter tuning for the model

In [None]:
def project_hyperparameter_tuning():
  """Runs a hyperparameter search and keeps the best model found so far.

  The search space is centered on the learning rate stored in
  "model_hp_values.json" (in `model_output_dir`) when a previous search left
  one. The model and the hyperparameters of the best trial replace the stored
  ones only when no previous objective exists or when the new objective is
  lower than the stored one. All other files of the search are removed from
  `model_output_dir` afterwards.

  Returns:
    The best trial returned by trainer.hyperparameter_search, or None if the
    search failed before producing one.
  """
  hp_values_filename = "model_hp_values.json"
  hp_values_path = model_output_dir + "/" + hp_values_filename

  # Loading the results of the previous search; without usable results the
  # default learning rate range is used and no previous objective is known
  prev_hp = dict()
  learning_rate_space = tune.loguniform(1e-5, 1e-4)
  try:
    with open(hp_values_path, "r") as f:
      loaded_hp = json.load(f)
    plr = loaded_hp["learning_rate"]
    learning_rate_space = tune.loguniform(plr * 0.1, plr * 10)
    prev_hp = loaded_hp
  except (OSError, ValueError, KeyError, TypeError):
    prev_hp = dict()

  hp_search_space = {
      "per_device_train_batch_size": tune.choice([8, 16, 32, 64]),
      "learning_rate": learning_rate_space,
      "weight_decay": tune.choice([0.1, 0.2, 0.3]),
      "num_train_epochs": tune.choice([1, 2, 3, 4, 5])
  }

  if model_checkpoint == "distilbert-base-uncased":
    dropout_names = ("dropout", "attention_dropout", "qa_dropout")
  elif model_checkpoint == "bert-base-uncased":
    dropout_names = ("attention_probs_dropout_prob", "hidden_dropout_prob", "classifier_dropout")
  else:
    dropout_names = ()
  for dropout_name in dropout_names:
    hp_search_space[dropout_name] = tune.uniform(0.1, 0.3)

  def _checkpoint_order(name):
    # Orders "checkpoint-<step>" names by their numeric step; a plain string
    # sort would place "checkpoint-1000" before "checkpoint-500"
    suffix = name.rsplit("-", 1)[-1]
    return (int(suffix) if suffix.isdigit() else -1, name)

  best_trial = None
  try:
    best_trial = trainer.hyperparameter_search(hp_space=lambda _: hp_search_space,
                                               compute_objective=compute_objective,
                                               direction="minimize",
                                               backend="ray",
                                               n_trials=50,
                                               search_alg=HyperOptSearch(metric="objective", mode="min"),
                                               scheduler=ASHAScheduler(metric="objective", mode="min"),
                                               local_dir=model_output_dir + "/ray_asha_results_bert",
                                               log_to_file=True)

    # The previous objective is stored with the previous hyperparameters, not
    # in the search space
    prev_objective = prev_hp.get("objective")
    save_best_model = prev_objective is None
    if prev_objective is not None and prev_objective > best_trial.objective:
      print("New best hyperparameters found.")
      save_best_model = True

    if save_best_model:
      save_model_dir = working_dir + "/best_models/" + model_checkpoint
      for f in os.listdir(save_model_dir):
        os.remove(os.path.join(save_model_dir, f))

      best_run_dir = model_output_dir + "/run-" + best_trial.run_id
      best_model_checkpoint = max(os.listdir(best_run_dir), key=_checkpoint_order)
      copy_model_dir = os.path.join(best_run_dir, best_model_checkpoint)

      for f in os.listdir(copy_model_dir):
        shutil.copy(os.path.join(copy_model_dir, f), save_model_dir)
      shutil.make_archive(save_model_dir, 'zip', save_model_dir)

      print("Saved model associated with the new best hyperparameters in this directory: ", save_model_dir)
      print("Download the zip file to use the model later.")
    else:
      print("Could not find better hyperparameters in this search.")

    for f in os.listdir(model_output_dir):
      # The stored hyperparameters stay in place when they are still the best
      if f == hp_values_filename and not save_best_model:
        continue
      path = os.path.join(model_output_dir, f)
      try:
        shutil.rmtree(path)
      except OSError:
        os.remove(path)
    print("Cleared all files from this hyperparameter search from the following directory: ", model_output_dir)

    if save_best_model:
      hp_values = best_trial.hyperparameters
      hp_values["objective"] = best_trial.objective
      with open(hp_values_path, "w") as f:
        f.write(json.dumps(hp_values))
      print("New hyperparameters written to " + model_output_dir + "/model_hp_values.json")
  except Exception as e:
    # Reporting the failure together with its cause instead of hiding it
    print("Hyperparameter search did not complete successfully.")
    print("Error: ", e)

  return best_trial

# Running the hyperparameter search; the result is None if the search failed
# before producing a best trial
current_best_trial = project_hyperparameter_tuning()
print("Best hyperparameters from this search: ", current_best_trial)

In [None]:
# Creating a zip archive of the best-model directory of this checkpoint; the
# archive is written next to that directory
save_model_dir = working_dir + "/best_models/" + model_checkpoint
shutil.make_archive(save_model_dir, 'zip', save_model_dir)

2.3.2.4 Training the model with the last best set of hyperparameters found.

This function exists because the hyperparameter tuning phase sometimes fails to complete due to memory-related issues. It allows us to work with any improved hyperparameters found during unsuccessful hyperparameter tuning attempts by manually going through the logs.

In [None]:
def train_with_last_best_hyperparameters():
  """Fine-tunes the model with the last best hyperparameters and plots the losses.

  The hyperparameters are read from "model_hp_values.json" in
  `model_output_dir`; manually chosen values are used when that file cannot
  be read. The trained model and the tokenizer are saved to
  `model_output_dir` + "/best_model".
  """
  try:
    with open(model_output_dir + "/model_hp_values.json", "r") as f:
      hp_values = json.load(f)
    print("Set trainer.args using previous hyperparamter search results.")
  except (OSError, ValueError):
    # NOTE(review): "dropout" below only ends up as an attribute of
    # trainer.args; model_init takes its dropout value from the JSON file
    hp_values = {
        "learning_rate": 1.5544610095816334e-05,
        "num_train_epochs": 2,
        "per_device_train_batch_size": 64,
        "seed": 25,
        "weight_decay": 0.1,
        "dropout": 0.12078133498900412,
    }
    print("No previous hyperparameter search results found. Manually setting the hyperparameter values.")

  for n, v in hp_values.items():
    if n == "seed":
      v = int(v)
    setattr(trainer.args, n, v)

  # Callbacks have to be registered on the trainer: an attribute set on
  # trainer.args is never picked up. Removing a previously added instance
  # first keeps the callback unique when this function is run again.
  trainer.remove_callback(EarlyStoppingCallback)
  trainer.add_callback(EarlyStoppingCallback(early_stopping_patience=3))

  trainer.train()

  model_save_path = model_output_dir +  "/best_model"
  tokenizer.save_pretrained(model_save_path)
  trainer.save_model(model_save_path)

  # Collecting the logged training and validation losses
  log_history = trainer.state.log_history
  train_losses = [entry["loss"] for entry in log_history if "loss" in entry]
  validation_losses = [entry["eval_loss"] for entry in log_history if "eval_loss" in entry]

  plt.plot(train_losses, color="blue")
  plt.plot(validation_losses, color="red")
  plt.ylim(-0.05, 2)
  plt.xlabel("Steps")
  plt.ylabel("loss")
  plt.legend(["train_loss", "validation_loss"])
  plt.title("Loss every 500 steps")
  plt.show()

# Fine-tuning the model with the stored (or manually set) hyperparameters
train_with_last_best_hyperparameters()

2.3.2.5 Generating results

Provides the confusion matrix and classification report for validation data and test data of the project

In [None]:
# Evaluating the fine-tuned model on the validation split; the predicted class
# is the index of the highest score in each prediction row
y_valid = np.array(dataset_encoded["valid"]["label"])
validation_predictions_all = trainer.predict(dataset_encoded["valid"])
y_preds_v = np.argmax(validation_predictions_all.predictions, axis=1)
print_classification_report(y_preds_v, y_valid, "project_dataset[\"valid\"] using finetuned " + model_checkpoint)

# Evaluating the fine-tuned model on the test split
y_test = np.array(dataset_encoded["test"]["label"])
test_predictions_all = trainer.predict(dataset_encoded["test"])
y_preds_t = np.argmax(test_predictions_all.predictions, axis=1)
print_classification_report(y_preds_t, y_test, "project_dataset[\"test\"] using finetuned " + model_checkpoint)

2.3.2.6 Loading saved model and testing performance on test data

In [None]:
def evaluate_test_data_using_saved_model(dataset):
  """Evaluates the saved best model of `model_checkpoint` on the given dataset.

  The tokenizer and the model are loaded from the best_models directory of
  the checkpoint. Errors are reported with a printed message instead of
  being raised.

  Args:
    dataset: Dataset split with the columns "text" and "label".
  """
  try:
    # Load model
    model_load_path = working_dir +  "/best_models/" + model_checkpoint
    try:
      loaded_tokenizer = AutoTokenizer.from_pretrained(model_load_path)
      loaded_m = (AutoModelForSequenceClassification.from_pretrained(model_load_path, num_labels=3).to(device))
    except OSError:
      # OSError also covers FileNotFoundError, so failures to read the saved
      # files are reported by this branch as well
      print("Could not load a model from the provided path.")
      return

    # Local names differ from the notebook-level trainer/dataset_encoded to
    # make clear that those are not touched
    saved_model_trainer = ProjectTrainer(model=loaded_m)
    test_encoded = encode_dataset(loaded_tokenizer, dataset)
    true_labels = np.array(test_encoded["label"])

    test_predictions_all = saved_model_trainer.predict(test_encoded)
    predicted_labels = np.argmax(test_predictions_all.predictions, axis=1)
    print_classification_report(predicted_labels, true_labels, "project_dataset[\"test\"] using finetuned " + model_checkpoint)
  except Exception as e:
    print("Could not complete the opration because of the following error: ", e)
# Evaluating the saved model on the test split of the project dataset
evaluate_test_data_using_saved_model(project_dataset["test"])