# BERT Training



## Parameters

In [None]:
# --- paths and reproducibility ---
DATASET_PATH = "/content/meld.csv"  # CSV file loaded in the training section
RANDOM_SEED = 42  # random_state handed to every train/test split

# --- data ---
SAMPLE = None  # optional row cap applied with DataFrame.head; None keeps all rows
X_LABEL = 'Utterance'  # text column to classify: 'Utterance' or 'Transcription'
Y_LABEL = 'Sentiment'  # label column
Y_CLASSES = ['negative', 'positive', "neutral"]  # label values kept for training
TRANSCRIPT_MATCH_THRESHOLD = 0.2  # rows must score above this utterance/transcription ratio

# --- model ---
MODEL_NAME = 'tiny_bert'  # 'bert', 'distil_bert', 'tiny_bert'
DROPOUT_PROB = 0.8  # dropout applied before the classification layer

# --- training ---
EPOCHS = 5
BATCH_SIZE = 16
LEARNING_RATE = 1e-5
MAX_LENGTH = 100  # token length every encoded text is padded to
WEIGHT_DECAY = 2e-4  # alternative value noted by the author: 2e-5

## Dependencies

In [None]:
!pip install transformers

In [None]:
!mkdir "models"

In [None]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from torch import nn
from transformers import BertTokenizer, BertModel, get_linear_schedule_with_warmup, AutoModel, AutoTokenizer, DistilBertTokenizer, DistilBertModel, PreTrainedTokenizerBase
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from torch.optim import AdamW
from tqdm import tqdm
from sklearn.preprocessing import OneHotEncoder
from difflib import SequenceMatcher
from matplotlib import pyplot as plt
from typing import Tuple
from abc import abstractmethod

## Dataset

In [None]:
class MeldDataset(Dataset):
    """Torch dataset yielding tokenized texts with one-hot encoded labels.

    The label column is one-hot encoded with an encoder fitted on ``df`` only;
    the column order of the encoded labels is exposed through
    ``self.categories`` (the encoder's ``categories_``).

    Args:
        df: Source rows.
        tokenizer: Object exposing ``encode_plus`` (a Hugging Face tokenizer).
        x_label: Name of the text column.
        y_label: Name of the label column.
        max_length: Token length every sample is padded *and truncated* to.
        augment: Optional callable applied to the raw text on every access.
    """

    def __init__(self, df: pd.DataFrame, tokenizer, x_label: str, y_label: str, max_length: int, augment=None):
        self.x_list: np.ndarray = df[x_label].to_numpy()
        # The encoder expects a 2-D (n_samples, n_features) array, so the
        # label column is reshaped to (n, 1) before fitting.
        labels = df[y_label].to_numpy().reshape(-1, 1)
        ohe = OneHotEncoder()
        self.y_list: np.ndarray = ohe.fit_transform(labels).toarray()
        self.categories = ohe.categories_
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.augment = augment

    def __len__(self):
        """Return the number of samples."""
        return len(self.x_list)

    def __getitem__(self, item):
        """Return ``(input_ids, attention_mask, one_hot_label, text)`` for one sample."""
        text = self.x_list[item]
        if self.augment:
            text = self.augment(text)
        encoded_dict: dict = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            # Without truncation a text longer than max_length yields a longer
            # tensor than its neighbours and the batch cannot be stacked.
            truncation=True,
            return_token_type_ids=False,
            return_attention_mask=True,
            return_tensors='pt',
        )
        # The tokenizer returns (1, max_length) tensors; flatten to 1-D.
        inputs_ids = encoded_dict['input_ids'].reshape(-1)
        attention_mask = encoded_dict['attention_mask'].reshape(-1)
        # The encoder output is float64; float32 matches the dtype of logits
        # produced by a default-precision model.
        y_tensor = torch.tensor(self.y_list[item], dtype=torch.float32)
        return inputs_ids, attention_mask, y_tensor, text

In [None]:
def create_data_loader(dataset: Dataset, batch_size: int, shuffle: bool = True):
    """Wrap ``dataset`` in a ``DataLoader``.

    Args:
        dataset: Dataset to iterate over.
        batch_size: Number of samples per batch.
        shuffle: Whether to reshuffle the samples each epoch. Defaults to
            ``True`` (the previous hard-coded behaviour); pass ``False`` for
            evaluation when a stable sample order is wanted.

    Returns:
        The configured ``DataLoader``.
    """
    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
    )

In [None]:
def prepare_dataset_based_on_class(df: pd.DataFrame, y_label: str, y_classes: list) -> pd.DataFrame:
    """Keep only the rows whose label is one of ``y_classes``.

    Args:
        df: Input rows.
        y_label: Name of the label column.
        y_classes: Label values to keep. With fewer than two entries the
            selection falls back to ``['negative', 'positive']``.

    Returns:
        ``df`` itself when its labels are exactly ``y_classes``; otherwise a
        filtered copy with a fresh 0..n-1 index.
    """
    present_classes = set(df[y_label].unique())
    # Compare the sets, not just their sizes: equal counts do not imply that
    # the same classes are present.
    if present_classes == set(y_classes):
        return df
    if len(y_classes) < 2:
        print("Minimal number of analyzed class is 2, setting analyzed class into two -> positive and negative")
        y_classes = ['negative', 'positive']
    # Drop every unwanted class at once. Also a no-op filter (instead of an
    # IndexError) when a configured class simply does not occur in the data.
    keep_mask = df[y_label].isin(list(y_classes))
    return df.loc[keep_mask].reset_index(drop=True)

In [None]:
def remove_junk_transcriptions(df_row):
    """Score how closely a row's transcription matches its utterance.

    Despite the name, nothing is removed here: the function only returns the
    ``difflib.SequenceMatcher`` ratio of the row's ``'Utterance'`` and
    ``'Transcription'`` values, which the caller compares to a threshold.
    """
    reference_text = df_row['Utterance']
    transcribed_text = df_row['Transcription']
    return SequenceMatcher(a=reference_text, b=transcribed_text).ratio()

In [None]:
def process_data(df: pd.DataFrame, y_label: str, y_classes: list, match_threshold: float, sample: int = None):
    """Clean the raw dataframe and restrict it to the requested classes.

    Steps, in order: drop rows containing missing values, keep rows whose
    utterance/transcription similarity is strictly above ``match_threshold``,
    filter the labels via ``prepare_dataset_based_on_class``, and finally keep
    only the first ``sample`` rows when ``sample`` is truthy.
    """
    complete_rows = df.dropna()
    similarity = complete_rows.apply(remove_junk_transcriptions, axis=1)
    matched_rows = complete_rows[similarity > match_threshold]
    selected = prepare_dataset_based_on_class(matched_rows, y_label=y_label, y_classes=y_classes)
    if not sample:
        return selected
    # limit dataframe length
    return selected.head(sample)

## Bert Model

In [None]:
class BertClassifier(nn.Module):
    """Common base for the classifier heads defined in this file.

    Stores the model name (as ``module_name``) and the shared dropout layer.
    Subclasses provide the encoder and the classification layer and must
    override ``forward``.
    """

    def __init__(self, model_name: str, dropout_prob: float):
        super().__init__()
        self.module_name = model_name
        self.dropout = nn.Dropout(p=dropout_prob)

    @abstractmethod
    def forward(self, input_ids, attention_mask):
        """Compute class logits; must be overridden by subclasses."""
        # This class does not use an ABC metaclass, so the decorator alone
        # does not prevent instantiation. Fail loudly rather than returning
        # None when a subclass forgets to override this method.
        raise NotImplementedError("Subclasses of BertClassifier must implement forward()")

In [None]:
class CustomBertClassifier(BertClassifier):
    """BERT encoder followed by dropout and a single linear layer."""

    def __init__(self, model_name, dropout_prob, n_classes):
        super().__init__(model_name=model_name, dropout_prob=dropout_prob)
        self.bert = BertModel.from_pretrained(model_name)
        hidden_size = self.bert.config.hidden_size
        self.classifier = nn.Linear(hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        """Return one score per class for each input sequence."""
        # return_dict=False makes the encoder return a plain tuple; it is
        # unpacked into exactly two values and only the second one is used.
        _sequence_output, pooled = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=False,
        )
        return self.classifier(self.dropout(pooled))

In [None]:
class CustomDistilBertClassifier(BertClassifier):
    """DistilBERT encoder followed by dropout and a single linear layer."""

    def __init__(self, model_name, dropout_prob, n_classes):
        super().__init__(model_name=model_name, dropout_prob=dropout_prob)
        self.bert = DistilBertModel.from_pretrained(model_name)
        hidden_size = self.bert.config.hidden_size
        self.classifier = nn.Linear(hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        """Return one score per class for each input sequence."""
        encoder_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        hidden_states = encoder_output[0]
        # Index 0 along the second axis is used as the sequence summary
        # (presumably the first, [CLS], token position -- TODO confirm).
        first_position = hidden_states[:, 0]
        return self.classifier(self.dropout(first_position))

In [None]:
class CustomTinyBertClassifier(BertClassifier):
    """Encoder loaded through ``AutoModel`` followed by dropout and a linear layer."""

    def __init__(self, model_name, dropout_prob, n_classes):
        super().__init__(model_name=model_name, dropout_prob=dropout_prob)
        self.bert = AutoModel.from_pretrained(model_name)
        hidden_size = self.bert.config.hidden_size
        self.classifier = nn.Linear(hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        """Return one score per class for each input sequence."""
        encoder_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        hidden_states = encoder_output[0]
        # Index 0 along the second axis is used as the sequence summary
        # (presumably the first, [CLS], token position -- TODO confirm).
        first_position = hidden_states[:, 0]
        return self.classifier(self.dropout(first_position))

In [None]:
class BertFactory:
    """Builds the tokenizer/classifier pair for a supported short model name."""

    POSSIBLE_MODEL_NAMES = ['bert', 'distil_bert', 'tiny_bert']

    @staticmethod
    def _resolve_name(model_name: str) -> str:
        """Return the lower-cased model name, raising if it is unsupported."""
        key = model_name.lower()
        if key not in BertFactory.POSSIBLE_MODEL_NAMES:
            raise ValueError(f"Received model is not supported, received: {model_name},"
                             f" supported: {BertFactory.POSSIBLE_MODEL_NAMES}")
        return key

    @staticmethod
    def create(model_name: str, dropout_prob: float,
               n_classes: int) -> "Tuple[PreTrainedTokenizerBase, BertClassifier]":
        """Create the tokenizer and classifier for ``model_name``.

        Args:
            model_name: One of ``POSSIBLE_MODEL_NAMES``, matched case-insensitively.
            dropout_prob: Dropout probability passed to the classifier.
            n_classes: Number of output classes.

        Returns:
            ``(tokenizer, classifier)``.

        Raises:
            ValueError: If ``model_name`` is not supported.
        """
        # Dispatch on the same normalised name that was validated; comparing
        # the raw name let e.g. "BERT" pass validation and then fall through
        # to the TinyBERT branch.
        key = BertFactory._resolve_name(model_name)
        if key == "bert":
            whole_model_name = "bert-base-uncased"
            return (BertTokenizer.from_pretrained(whole_model_name),
                    CustomBertClassifier(whole_model_name, dropout_prob, n_classes))
        if key == "distil_bert":
            whole_model_name = "distilbert-base-uncased"
            return (DistilBertTokenizer.from_pretrained(whole_model_name),
                    CustomDistilBertClassifier(whole_model_name, dropout_prob, n_classes))
        whole_model_name = "huawei-noah/TinyBERT_General_4L_312D"
        return (AutoTokenizer.from_pretrained(whole_model_name),
                CustomTinyBertClassifier(whole_model_name, dropout_prob, n_classes))


## Building model

In [None]:
class BertClassifierInterface:
    """Training, evaluation and inference wrapper around a classifier from ``BertFactory``.

    Attributes:
        device: CUDA device when available, otherwise CPU.
        n_classes: Number of output classes.
        tokenizer, model: Pair created by ``BertFactory.create``.
        loss_function: ``BCEWithLogitsLoss`` for ``distil_bert``, otherwise
            ``CrossEntropyLoss``.
        optimizer: ``AdamW`` over all model parameters.
        mapper: Label categories of the training split (set by ``train``).
        history: One ``[train_acc, train_loss, val_acc, val_loss]`` entry per epoch.
    """

    def __init__(self, model_name, dropout_prob, n_classes, lr=1e-5, wg=None):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.n_classes = n_classes
        self.tokenizer, self.model = BertFactory.create(model_name, dropout_prob, n_classes)
        self.model.to(self.device)
        # The factory validates model names case-insensitively; select the
        # loss the same way.
        if model_name.lower() == "distil_bert":
            self.loss_function = nn.BCEWithLogitsLoss().to(self.device)
        else:
            self.loss_function = nn.CrossEntropyLoss().to(self.device)
        # Test against None rather than truthiness so that an explicit
        # weight decay of 0 is honoured instead of falling back to AdamW's
        # own default.
        optimizer_kwargs = {"lr": lr}
        if wg is not None:
            optimizer_kwargs["weight_decay"] = wg
        self.optimizer = AdamW(self.model.parameters(), **optimizer_kwargs)
        self.mapper = []
        self.history = []

    @staticmethod
    def data_preprocess(df: pd.DataFrame, tokenizer, x_label: str, y_label: str,
                 max_length: int, batch_size=16, transform=None):
        """Split ``df`` 90/10 into training and validation loaders.

        Returns:
            ``(train_data_loader, val_data_loader, mapper)`` where ``mapper``
            is the label category array of the training split.

        Raises:
            ValueError: If the two splits do not contain the same label
                categories (their one-hot columns would not line up).
        """
        df_train, df_val = train_test_split(df, test_size=0.1, random_state=RANDOM_SEED)
        # NOTE(review): `transform` is applied to the validation split as well.
        train_dataset = MeldDataset(df_train, tokenizer, x_label, y_label, max_length, transform)
        val_dataset = MeldDataset(df_val, tokenizer, x_label, y_label, max_length, transform)
        mapper = train_dataset.categories[0]
        # Explicit check instead of `assert`, which is stripped under -O.
        if not np.array_equal(mapper, val_dataset.categories[0]):
            raise ValueError(
                "Training and validation splits contain different label categories: "
                f"{list(mapper)} vs {list(val_dataset.categories[0])}"
            )
        train_data_loader: DataLoader = create_data_loader(train_dataset, batch_size)
        val_data_loader: DataLoader = create_data_loader(val_dataset, batch_size)
        return train_data_loader, val_data_loader, mapper

    def train(self, df: pd.DataFrame, x_label: str, y_label: str,
                 max_length: int = 70, batch_size: int = 16, epochs: int = 10, transform=None):
        """Train for ``epochs`` epochs, validating after each one.

        Per-epoch metrics are appended to ``self.history``. Whenever the
        validation accuracy improves, the model weights are written to
        ``/content/models/best_model.bin``. The in-memory model is left in its
        last-epoch state.
        """
        train_data_loader, val_data_loader, self.mapper = self.data_preprocess(
            df, self.tokenizer, x_label, y_label, max_length, batch_size, transform)
        # The scheduler is stepped once per batch, so it needs the total
        # number of optimizer steps.
        total_steps: int = len(train_data_loader) * epochs
        scheduler = get_linear_schedule_with_warmup(
            optimizer=self.optimizer,
            num_warmup_steps=0,
            num_training_steps=total_steps
        )
        best_acc: float = 0
        for epoch_i in range(epochs):
            print("")
            print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))

            print('Training...')
            train_acc, train_loss = self._train(
                data_loader=train_data_loader,
                scheduler=scheduler
            )
            print("  Train accuracy: {0:.2f}".format(train_acc))
            print("  Train loss: {0:.2f}".format(train_loss))

            print('Running validation...')
            val_acc, val_loss = self.evaluate(
                data_loader=val_data_loader,
            )
            print("  Validation accuracy: {0:.2f}".format(val_acc))
            print("  Validation loss: {0:.2f}".format(val_loss))

            self.history.append([train_acc, train_loss, val_acc, val_loss])

            # save model state with best accuracy
            if val_acc > best_acc:
                best_acc = val_acc
                torch.save(self.model.state_dict(), '/content/models/best_model.bin')

    def _forward_batch(self, batch):
        """Run the model on one batch; return ``(logits, loss, n_correct)``.

        ``batch`` is indexed as ``(input_ids, attention_mask, one_hot_targets, ...)``.
        """
        input_ids = batch[0].to(self.device)
        attention_mask = batch[1].to(self.device)
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        # Align the target dtype with the logits so the loss never sees a
        # float64/float32 mix.
        targets = batch[2].to(self.device, dtype=outputs.dtype)

        _, predictions = torch.max(outputs, dim=1)
        _, expected = torch.max(targets, dim=1)
        # Reduce on the tensor and convert once, instead of a Python-level
        # sum over individual tensor elements.
        n_correct = int(torch.eq(predictions, expected).sum().item())

        loss = self.loss_function(outputs, targets)
        return outputs, loss, n_correct

    def _train(self, data_loader: DataLoader, scheduler):
        """Run one training epoch; return ``(accuracy, mean_loss)``."""
        self.model.train()

        losses = []
        correct_predictions: int = 0

        for batch in tqdm(data_loader):
            _, loss, n_correct = self._forward_batch(batch)
            correct_predictions += n_correct
            losses.append(loss.item())

            # Backward prop
            loss.backward()

            # Gradient Descent
            nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0)
            self.optimizer.step()
            scheduler.step()
            self.optimizer.zero_grad()

        return correct_predictions / len(data_loader.dataset), np.mean(losses)

    def evaluate(self, data_loader: DataLoader):
        """Return ``(accuracy, mean_loss)`` over ``data_loader`` without updating weights."""
        self.model.eval()

        losses = []
        correct_predictions: int = 0

        with torch.no_grad():
            for batch in tqdm(data_loader):
                _, loss, n_correct = self._forward_batch(batch)
                correct_predictions += n_correct
                losses.append(loss.item())

        return correct_predictions / len(data_loader.dataset), np.mean(losses)

    def predict(self, data_loader: DataLoader):
        """Run inference over ``data_loader``.

        Returns:
            ``(x_values, y_predictions, y_probabilities, y_actual)``: the raw
            texts, the predicted class indices, the raw model outputs (logits;
            no softmax is applied here) and the one-hot targets. The three
            tensors are on the CPU.
        """
        self.model.eval()

        x_values = []
        y_predictions = []
        y_probabilities = []
        y_actual = []

        with torch.no_grad():
            for batch in tqdm(data_loader):
                input_ids = batch[0].to(self.device)
                attention_mask = batch[1].to(self.device)

                outputs = self.model(
                    input_ids=input_ids,
                    attention_mask=attention_mask
                )
                _, preds = torch.max(outputs, dim=1)

                x_values.extend(batch[3])
                # Move results off the device per batch and concatenate once
                # at the end.
                y_predictions.append(preds.cpu())
                y_probabilities.append(outputs.cpu())
                y_actual.append(batch[2].cpu())

        y_predictions = torch.cat(y_predictions)
        y_probabilities = torch.cat(y_probabilities)
        y_actual = torch.cat(y_actual)

        return x_values, y_predictions, y_probabilities, y_actual

    def load(self, filename):
        """Load model weights from ``/content/models/<filename>.bin``."""
        # map_location lets a checkpoint saved on GPU be restored on a
        # CPU-only machine (and vice versa).
        state_dict = torch.load(f'/content/models/{filename}.bin', map_location=self.device)
        self.model.load_state_dict(state_dict)

    def save(self, filename="final_model"):
        """Write the current model weights to ``/content/models/<filename>.bin``."""
        torch.save(self.model.state_dict(), f'/content/models/{filename}.bin')


# Training

In [None]:
# Read the raw table from disk.
df: pd.DataFrame = pd.read_csv(DATASET_PATH)

# Clean the rows and restrict them to the configured classes / row cap.
df = process_data(
    df,
    y_label=Y_LABEL,
    y_classes=Y_CLASSES,
    match_threshold=TRANSCRIPT_MATCH_THRESHOLD,
    sample=SAMPLE,
)

# Hold out 10% of the rows as the test split.
df_train, df_test = train_test_split(df, random_state=RANDOM_SEED, test_size=0.1)

In [None]:
# Optional row cap: keep the first SAMPLE rows, or everything when SAMPLE is falsy.
df = df.head(SAMPLE) if SAMPLE else df

In [None]:
# Seeded 90/10 train/test split of the prepared rows.
df_train, df_test = train_test_split(
    df,
    random_state=RANDOM_SEED,
    test_size=0.1,
)

In [None]:
# Build the tokenizer, classifier, loss and optimizer from the notebook parameters.
model = BertClassifierInterface(
    model_name=MODEL_NAME,
    dropout_prob=DROPOUT_PROB,
    n_classes=len(Y_CLASSES),
    lr=LEARNING_RATE,
    wg=WEIGHT_DECAY,
)

In [None]:
# Fine-tune on the training split; a validation split is carved out inside train().
model.train(
    df_train,
    x_label=X_LABEL,
    y_label=Y_LABEL,
    max_length=MAX_LENGTH,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
)

In [None]:
# Each history entry is [train_acc, train_loss, val_acc, val_loss] for one epoch.
hist_time = model.history
epochs = list(range(1, len(hist_time) + 1))

train_acc = [nested[0] for nested in hist_time]
train_loss = [nested[1] for nested in hist_time]
val_acc = [nested[2] for nested in hist_time]
val_loss = [nested[3] for nested in hist_time]


# Plotting these values
plt.plot(epochs, train_acc, label='Training Accuracy')
plt.plot(epochs, train_loss, label='Training Loss')
plt.plot(epochs, val_acc, label='Validation Accuracy')
plt.plot(epochs, val_loss, label='Validation Loss')

# Adding a title. It is derived from the configured model so the plot is not
# labelled "DistilBert" when a different model was trained.
plt.title(f'{MODEL_NAME} Model Training')

# Adding x and y label
plt.xlabel('Epochs')
plt.ylabel('Loss and Accuracy')

# Add a legend
plt.legend()

# Displaying the plot
plt.show()

# Testing

In [None]:
# Encode the held-out split with the tokenizer the model was trained with.
test_dataset = MeldDataset(
    df_test,
    model.tokenizer,
    x_label=X_LABEL,
    y_label=Y_LABEL,
    max_length=MAX_LENGTH,
)
# The one-hot column order of the test split must match the training one,
# otherwise predicted and true class indices would not be comparable.
assert np.all(np.equal(test_dataset.categories[0], model.mapper))
mapper = test_dataset.categories
test_data_loader = create_data_loader(test_dataset, batch_size=BATCH_SIZE)

In [None]:
# Score the held-out split; the loss half of the result is not reported.
test_acc, _ = model.evaluate(data_loader=test_data_loader)

print("  Test accuracy: {0:.2f}".format(test_acc))

In [None]:
# Predictions and targets come from the same pass over the loader, so they
# stay aligned with each other.
x_val, y_pred, y_probs, y_test = model.predict(test_data_loader)

# y_test is one-hot; argmax turns it back into class indices.
print(classification_report(np.argmax(y_test, axis=1), y_pred))
# Legend mapping class indices to label names. It is built from however many
# categories exist, so a two-class run no longer raises IndexError; the
# three-class output is unchanged.
print(" , ".join(f"{index}: {label}" for index, label in enumerate(test_dataset.categories[0])))