# BERT Training



## Parameters

In [None]:
# Path of the CSV file that is loaded for training.
DATASET_PATH = '/content/meld.csv'

# Seed handed to the dataset splits and to the numpy / torch RNGs.
RANDOM_SEED = 42

# --- data ---
# Number of leading rows to keep; a falsy value keeps the whole frame.
SAMPLE = 1000
# Column holding the input text.
X_LABEL = 'Utterance'
# Column holding the target label.
Y_LABEL = 'Sentiment'
# Label values the classifier is trained on.
Y_CLASSES = ['negative', 'positive', 'neutral']

# --- model ---
# Identifier of the pretrained checkpoint used for tokenizer and encoder.
MODEL_NAME = 'bert-base-uncased'
# Dropout probability applied before the classification layer.
DROPOUT_PROB = 0.3

# --- training ---
EPOCHS = 4
BATCH_SIZE = 16
# `max_length` given to the tokenizer; samples are padded up to this length.
MAX_LENGTH = 44

## Dependencies

In [None]:
!pip install transformers

In [None]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from torch import nn
from transformers import BertTokenizer, BertModel, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from torch.optim import AdamW
from tqdm import tqdm
from sklearn.preprocessing import OneHotEncoder

## Dataset

In [None]:
class MeldDataset(Dataset):
    """Torch dataset yielding tokenized utterances with one-hot labels.

    Each item is an ``(input_ids, attention_mask, y_tensor)`` tuple. The first
    two are flattened 1-D tensors produced by the tokenizer (padded and
    truncated to ``max_length``); ``y_tensor`` is the one-hot label row.
    """

    def __init__(self, df: pd.DataFrame, tokenizer, x_label: str, y_label: str, max_length: int, augment=None,
                 y_classes=None):
        """Store the texts and one-hot encode the labels.

        Args:
            df: Frame holding the text and label columns.
            tokenizer: Object exposing ``encode_plus`` (e.g. ``BertTokenizer``).
            x_label: Name of the text column.
            y_label: Name of the label column.
            max_length: Token length every sample is padded/truncated to.
            augment: Optional callable applied to the raw text on every access.
            y_classes: Optional iterable with every possible label value. When
                given, the one-hot columns are fixed to ``sorted(y_classes)``
                so datasets built from different splits share one column
                layout even if a split happens to lack a class. When omitted,
                the columns are derived from the labels present in ``df``.
        """
        self.x_list: np.ndarray = df[x_label].to_numpy()
        if y_classes is None:
            ohe = OneHotEncoder()
        else:
            # Sorted to match the column order the encoder infers on its own.
            ohe = OneHotEncoder(categories=[sorted(y_classes)])
        # The encoder expects a 2-D (n_samples, n_features) array.
        codes = np.expand_dims(df[y_label].to_numpy(), axis=1)
        self.y_list: np.ndarray = ohe.fit_transform(codes).toarray()
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.augment = augment

    def __len__(self):
        """Return the number of samples."""
        return len(self.x_list)

    def __getitem__(self, item):
        """Tokenize sample ``item`` and return it together with its label."""
        text = self.x_list[item]
        if self.augment:
            text = self.augment(text)
        encoded_dict: dict = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            # Without truncation an utterance longer than max_length yields a
            # longer tensor and batches of unequal length cannot be collated.
            truncation=True,
            return_token_type_ids=False,
            return_attention_mask=True,
            return_tensors='pt',
        )
        # The tokenizer returns (1, seq_len) tensors; flatten to (seq_len,).
        inputs_ids = encoded_dict['input_ids'].reshape(-1)
        attention_mask = encoded_dict['attention_mask'].reshape(-1)
        y_tensor = torch.tensor(self.y_list[item])
        return inputs_ids, attention_mask, y_tensor

In [None]:
def create_data_loader(dataset: Dataset, batch_size: int, shuffle: bool = True):
    """Wrap ``dataset`` in a ``DataLoader``.

    Args:
        dataset: Dataset to iterate over.
        batch_size: Number of samples per batch.
        shuffle: Whether to reshuffle the samples on every pass. Defaults to
            ``True`` (the previous hard-coded behavior); pass ``False`` for
            validation/test loaders when a stable order is wanted.

    Returns:
        The configured ``DataLoader``.
    """
    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
    )

In [None]:
def preparing_dataset_based_on_class(df: pd.DataFrame, y_label: str, y_classes: list) -> pd.DataFrame:
    """Keep only the rows whose label belongs to ``y_classes``.

    Args:
        df: Frame containing the label column.
        y_label: Name of the label column.
        y_classes: Label values to keep. If fewer than two are given (and they
            do not already match the labels in ``df``), the selection falls
            back to ``['negative', 'positive']``.

    Returns:
        ``df`` itself when every label present is already wanted; otherwise a
        filtered copy with a fresh 0..n-1 index.
    """
    present_classes = set(df[y_label].unique())
    if present_classes == set(y_classes):
        return df
    if len(y_classes) < 2:
        print("Minimal number of analyzed class is 2, setting analyzed class into two -> positive and negative")
        y_classes = ['negative', 'positive']
    wanted_classes = set(y_classes)
    # Nothing to remove (e.g. y_classes lists a class absent from the data).
    if present_classes <= wanted_classes:
        return df
    # Drop *every* unwanted class, not just a single arbitrary one.
    keep_mask = df[y_label].isin(wanted_classes)
    return df.loc[keep_mask].reset_index(drop=True)

## Bert Model

In [None]:
class CustomBertClassifier(nn.Module):
    """Pretrained BERT encoder followed by dropout and one linear layer."""

    def __init__(self, model_name, dropout_prob, n_classes=2):
        """Load the encoder named ``model_name`` and build the head.

        Args:
            model_name: Checkpoint identifier passed to ``BertModel.from_pretrained``.
            dropout_prob: Dropout probability applied to the pooled output.
            n_classes: Number of output scores produced by the linear layer.
        """
        super().__init__()
        self.bert = BertModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(p=dropout_prob)
        hidden_size = self.bert.config.hidden_size
        self.classifier = nn.Linear(hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        """Return the classifier scores for a batch of token ids."""
        # With return_dict=False the encoder hands back a tuple; only the
        # second element (the pooled output) feeds the head.
        _sequence_output, pooled = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=False,
        )
        regularized = self.dropout(pooled)
        scores = self.classifier(regularized)
        return scores

## Building model

In [None]:
def train(model: nn.Module, data_loader: DataLoader, loss_fn, optim, dev: torch.device, sched, n_samples: int):
    """Run one training epoch.

    Args:
        model: Network called as ``model(input_ids=..., attention_mask=...)``.
        data_loader: Yields batches indexable as
            ``(input_ids, attention_mask, targets)``.
        loss_fn: Callable ``loss_fn(outputs, targets)`` returning a scalar loss.
        optim: Optimizer updating ``model``'s parameters.
        dev: Device the batch tensors are moved to.
        sched: Learning-rate scheduler, stepped once per batch.
        n_samples: Number of samples used as the accuracy denominator.

    Returns:
        Tuple ``(accuracy, mean_loss)`` for the epoch.
    """
    # set mode
    model = model.train()

    losses = []
    correct_predictions = 0

    for batch in tqdm(data_loader):
        input_ids = batch[0].to(dev)
        attention_mask = batch[1].to(dev)
        targets = batch[2].to(dev)

        # Clear gradients up front so nothing left over from an earlier
        # (e.g. interrupted) backward pass leaks into this update.
        optim.zero_grad()

        # get model outputs
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        # Predicted / true class = index of the largest score in each row.
        predictions = outputs.argmax(dim=1)
        correct = targets.argmax(dim=1)
        # Reduce on the tensor instead of iterating it with builtin sum().
        correct_predictions += (predictions == correct).sum().item()

        loss = loss_fn(outputs, targets)
        losses.append(loss.item())

        # Backward prop
        loss.backward()

        # Gradient Descent (with the gradient norm clipped to 1.0)
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optim.step()
        sched.step()

    return float(correct_predictions) / n_samples, np.mean(losses)


In [None]:
def evaluate(model: CustomBertClassifier, data_loader: DataLoader, loss_fn, dev: torch.device, n_samples: int):
    """Score ``model`` on ``data_loader`` without updating its weights.

    Args:
        model: Network called as ``model(input_ids=..., attention_mask=...)``.
        data_loader: Yields batches indexable as
            ``(input_ids, attention_mask, targets)``.
        loss_fn: Callable ``loss_fn(outputs, targets)`` returning a scalar loss.
        dev: Device the batch tensors are moved to.
        n_samples: Number of samples used as the accuracy denominator.

    Returns:
        Tuple ``(accuracy, mean_loss)`` over the whole loader.
    """
    # set mode
    model = model.eval()

    losses = []
    correct_predictions = 0

    # No gradients are needed while evaluating.
    with torch.no_grad():
        for batch in tqdm(data_loader):
            input_ids = batch[0].to(dev)
            attention_mask = batch[1].to(dev)
            targets = batch[2].to(dev)

            # get model outputs
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            # Predicted / true class = index of the largest score in each row.
            predictions = outputs.argmax(dim=1)
            correct = targets.argmax(dim=1)
            # Reduce on the tensor instead of iterating it with builtin sum().
            correct_predictions += (predictions == correct).sum().item()

            loss = loss_fn(outputs, targets)
            losses.append(loss.item())

    return float(correct_predictions) / n_samples, np.mean(losses)

# Training

In [None]:
# Load the CSV at DATASET_PATH, then filter it by label through
# preparing_dataset_based_on_class using the configured Y_LABEL / Y_CLASSES.
df: pd.DataFrame = pd.read_csv(DATASET_PATH)
df: pd.DataFrame = preparing_dataset_based_on_class(df, y_label=Y_LABEL, y_classes=Y_CLASSES)

In [None]:
# limit dataframe length: keep only the first SAMPLE rows (file order, not a
# random draw); a falsy SAMPLE (0 / None) keeps every row
if SAMPLE:
    df = df.head(SAMPLE)

In [None]:
# Hold out 20 % of the rows, then cut that hold-out in half:
# overall 80 % train, 10 % validation, 10 % test. Both splits use RANDOM_SEED.
df_train, df_test = train_test_split(df, test_size=0.2, random_state=RANDOM_SEED)
df_val, df_test = train_test_split(df_test, test_size=0.5, random_state=RANDOM_SEED)


In [None]:
# Seed the numpy and torch random number generators with RANDOM_SEED.
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Load the pretrained tokenizer for the checkpoint named by MODEL_NAME.
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

In [None]:
# One dataset per split; no augment callable is passed, so texts are
# tokenized as-is.
# NOTE(review): each MeldDataset fits its own OneHotEncoder on its split, so
# the label column layout only agrees across splits when every split contains
# the same set of classes — confirm for small SAMPLE values.
train_dataset: Dataset = MeldDataset(df_train, tokenizer, X_LABEL, Y_LABEL, MAX_LENGTH)
val_dataset: Dataset = MeldDataset(df_val, tokenizer, X_LABEL, Y_LABEL, MAX_LENGTH)
test_dataset: Dataset = MeldDataset(df_test, tokenizer, X_LABEL, Y_LABEL, MAX_LENGTH)

In [None]:
# Batch every split with the same BATCH_SIZE.
# NOTE(review): create_data_loader shuffles by default, so the validation and
# test loaders are shuffled too; this does not affect the accuracy / mean loss
# computed by evaluate().
train_data_loader: DataLoader = create_data_loader(train_dataset, BATCH_SIZE)
val_data_loader: DataLoader = create_data_loader(val_dataset, BATCH_SIZE)
test_data_loader: DataLoader = create_data_loader(test_dataset, BATCH_SIZE)


In [None]:
# NOTE(review): bert_model is not referenced anywhere else in the visible
# code; CustomBertClassifier loads its own BertModel, so this looks like a
# redundant second copy of the weights — confirm before removing.
bert_model = BertModel.from_pretrained(MODEL_NAME)
# Classifier with one output per entry of Y_CLASSES, moved to the chosen device.
custom_model = CustomBertClassifier(
    model_name=MODEL_NAME,
    dropout_prob=DROPOUT_PROB,
    n_classes=len(Y_CLASSES)
).to(device)


In [None]:
# NOTE(review): params is not used by the visible code (the optimizer below
# receives custom_model.parameters() directly) — presumably kept for inspection.
params: list[tuple] = list(custom_model.named_parameters())
# AdamW over every parameter of the classifier with a learning rate of 2e-5.
optimizer = AdamW(custom_model.parameters(), lr=2e-5)

In [None]:
# train() steps the scheduler once per batch, so the schedule length is
# (batches per epoch) * (number of epochs).
total_steps: int = len(train_data_loader) * EPOCHS
# Linear schedule without warmup steps, spanning all training steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

In [None]:
# Cross-entropy between the classifier scores and the one-hot target rows.
loss_function = nn.CrossEntropyLoss().to(device)
# Best validation accuracy seen so far; decides when a checkpoint is written.
best_acc: float = 0

In [None]:
for epoch_i in range(EPOCHS):
    # Epoch banner, preceded by an empty line.
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, EPOCHS))

    # --- training pass ---
    print('Training...')
    train_acc, train_loss = train(
        custom_model,
        train_data_loader,
        loss_function,
        optimizer,
        device,
        scheduler,
        len(df_train),
    )
    print("  Train accuracy: {0:.2f}".format(train_acc))
    print("  Train loss: {0:.2f}".format(train_loss))

    # --- validation pass ---
    print('Running validation...')
    val_acc, val_loss = evaluate(
        custom_model,
        val_data_loader,
        loss_function,
        device,
        len(df_val),
    )
    print("  Validation accuracy: {0:.2f}".format(val_acc))
    print("  Validation loss: {0:.2f}".format(val_loss))

    # Only checkpoint when validation accuracy strictly improved.
    if val_acc <= best_acc:
        continue
    torch.save(custom_model.state_dict(), 'best_model.bin')
    best_acc = val_acc