# NLP
## Description
The following notebook takes a pretrained BERT model and trains a new model (a 2-layer neural network) on top of it. For efficiency, we first run the pretrained BERT model over the data once and then use its outputs (one 1-D tensor per input) as the inputs to our network. Because BERT itself is not re-run during training, this leads to much faster training.

The motivation for using BERT was to transfer the performance of a strong NLP model to our problem of classifying the sentences in Government documents.

## BERT -> multilabel classification

In [None]:
# Maximum number of tokens per encoded sequence (passed to the tokenizer as max_length).
TOKEN_LENGTH = 512
# Batch size shared by every DataLoader in this notebook.
BATCH_SIZE = 16
# Seed for the Python, NumPy and PyTorch random number generators.
SEED = 42

In [None]:
import tensorflow as tf
import torch
import pandas as pd

# TensorFlow's view of the hardware: report whether it can see the first GPU.
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
    print('No GPU found.')
else:
    print('Found GPU at: {}'.format(device_name))


# PyTorch's view: pick the device that tensors and models are moved to below.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device('cpu')
else:
    device = torch.device('cuda')
    torch.cuda.empty_cache()
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

In [None]:
# Location of the translated, hand-labelled responses.
fpath = '../input/cdp-30-35-54-translated-with-labels/bert_translations_01122020_1.csv'

# The four binary target columns, plus the text column they describe.
label_cols = ['Employment', 'Inclusion', 'Health', 'Congestion']
required_columns = ['response_answer_translated'] + label_cols

# Keep only the needed columns and discard rows with a missing text or label.
df = pd.read_csv(fpath, header=0, encoding='ISO-8859-1')[required_columns].dropna()

# Labels become integers, then are gathered into one list per row.
df = df.astype({col: int for col in label_cols})
df['label'] = df[label_cols].values.tolist()

print(f"Loaded dataset with {df['label'].shape[0]} labelled examples.")
df.sample(5)

In [None]:
# Hold out 10% of the labelled rows as a test set. The split is seeded with the
# notebook-wide SEED so that the same rows are held out on every run.
# NOTE: `df` is rebound to the remaining 90%, so re-running this cell on its own
# shrinks it again; re-run the loading cell first.
from sklearn.model_selection import train_test_split
df, df_test = train_test_split(df, test_size=0.1, random_state=SEED)

In [None]:
import numpy as np
# Raw response text, in the same row order as the label matrix built below.
sequences = df['response_answer_translated'].to_numpy()
# One float16 vector per row, stacked into a single array.
labels = np.array([np.asarray(row, dtype=np.float16) for row in df['label']])

In [None]:
from transformers import BertTokenizer

# Load the BERT tokenizer.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Token count (special tokens included) of the longest sequence in the data.
max_len = max(
    (len(tokenizer.encode(text, add_special_tokens=True)) for text in sequences),
    default=0,
)

print('Max sequence length: ', max_len)

In [None]:
# Tokenize all of the sentences and map the tokens to their word IDs.
# Every encoding adds the special tokens, uses TOKEN_LENGTH as max_length with
# padding enabled, and returns PyTorch tensors plus an attention mask.
# NOTE(review): newer transformers releases flag `pad_to_max_length` as
# deprecated - confirm against the installed version before changing it.
encodings = [
    tokenizer.encode_plus(
        s,
        add_special_tokens=True,
        max_length=TOKEN_LENGTH,
        pad_to_max_length=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    for s in sequences
]

# Stack the per-sentence tensors into single tensors.
input_ids = torch.cat([enc['input_ids'] for enc in encodings], dim=0)
attention_masks = torch.cat([enc['attention_mask'] for enc in encodings], dim=0)  # mask padding
labels = torch.tensor(labels, dtype=torch.float)

# The per-sentence encodings are no longer needed once stacked.
del encodings

In [None]:
from torch.utils.data import TensorDataset, random_split

# Bundle token ids, attention masks and label vectors so they are batched together.
dataset = TensorDataset(input_ids, attention_masks, labels)

First run BERT on the whole dataset, so that we get encodings for all of our inputs.

In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# Iterate in dataset order (no shuffling) so that the encodings collected from
# this loader stay row-aligned with `attention_masks` and `labels`.
all_dataloader = DataLoader(
    dataset,
    sampler=SequentialSampler(dataset),
    batch_size=BATCH_SIZE,
)

In [None]:
from transformers.modeling_bert import BertPreTrainedModel, BertModel
from torch.nn import BCEWithLogitsLoss

# Modified from Kaushal Trivedi's multilabel example
# https://medium.com/huggingface/multi-label-text-classification-using-bert-the-mighty-transformer-69714fa3fb3d
# Source: https://github.com/kaushaltrivedi/bert-toxic-comments-multilabel/blob/master/toxic-bert-multilabel-classification.ipynb
class BertFromPreTrained(BertPreTrainedModel):
    """
    Pretrained BERT used purely as an encoder.

    ``forward`` returns BERT's pooled output for each input sequence; no
    classification head is attached.
    """
    def __init__(self, config):
        super().__init__(config)
        self.bert = BertModel(config)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
        """
        Encode a batch of token ids.

        Args:
            input_ids: token ids for the batch.
            token_type_ids: optional segment ids, forwarded to BERT.
            attention_mask: optional attention mask, forwarded to BERT.
            labels: accepted so existing call sites keep working; not used.

        Returns:
            The pooled output of the wrapped BERT model.
        """
        # Forward by keyword. This wrapper's parameter order is
        # (token_type_ids, attention_mask), whereas transformers'
        # BertModel.forward takes attention_mask before token_type_ids;
        # passing them positionally swapped the two, so the attention mask was
        # consumed as segment ids and padding tokens were never masked out.
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        # Index instead of tuple-unpacking so the pooled output is picked
        # whether the model returns a plain tuple or an output object.
        pooled_output = outputs[1]

        return pooled_output

    def freeze_bert_encoder(self):
        """Stop gradients from being computed for every BERT parameter."""
        for param in self.bert.parameters():
            param.requires_grad = False

    def unfreeze_bert_encoder(self):
        """Re-enable gradients for every BERT parameter."""
        for param in self.bert.parameters():
            param.requires_grad = True

In [None]:
# Load the pretrained weights into the encoder-only wrapper class.
pretrained_bert = BertFromPreTrained.from_pretrained(
    'bert-base-uncased',  # Use the 12-layer BERT model, with an uncased vocab.
    output_attentions=False,
    output_hidden_states=False,
)

# Move the encoder to the selected device, then switch off gradients for its
# parameters (freeze_bert_encoder sets requires_grad = False on each of them).
pretrained_bert.to(device)
pretrained_bert.freeze_bert_encoder()

In [None]:
# Run the frozen encoder over every batch once and keep its outputs.
bert_encodings = []
for batch in all_dataloader:
    # Copy the three tensors of the batch to the selected device.
    b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)

    # Forward pass (predict); no gradients are needed for encoding.
    with torch.no_grad():
        latents = pretrained_bert(b_input_ids, None, b_input_mask, b_labels)

    bert_encodings.append(latents)

# Join the per-batch outputs along the batch dimension.
bert_encodings = torch.cat(bert_encodings, axis=0)

In [None]:
from torch.utils.data import TensorDataset, random_split

# Same layout as `dataset`, with the token ids replaced by the BERT encodings.
encoded_dataset = TensorDataset(bert_encodings, attention_masks, labels)

In [None]:
# Split the encoded dataset 80/20 into training and validation subsets, using a
# generator with a fixed seed for the random assignment.
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
split_rng = torch.Generator().manual_seed(0)
train_dataset, val_dataset = random_split(
    encoded_dataset, [train_size, val_size], generator=split_rng
)
print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

# Randomly ordered batches for the optimisation steps.
train_dataloader = DataLoader(
    train_dataset, batch_size=BATCH_SIZE, sampler=RandomSampler(train_dataset)
)

# The same training rows in fixed order, used to score the model on the training set.
seq_train_dataloader = DataLoader(
    train_dataset, batch_size=BATCH_SIZE, sampler=SequentialSampler(train_dataset)
)

# Validation rows in fixed order.
validation_dataloader = DataLoader(
    val_dataset, batch_size=BATCH_SIZE, sampler=SequentialSampler(val_dataset)
)

In [None]:
import wandb
# Toggle for Weights & Biases logging; the code that reads it is currently commented out.
USE_WANDB = False

# if USE_WANDB:
#     wandb.login()

In [None]:
import torch
import torch.nn.functional as F
from sklearn.datasets import make_multilabel_classification
from torch import optim

class Network(torch.nn.Module):
    """
    Small classification head: dropout -> linear -> ReLU -> linear.

    The defaults reproduce the original fixed architecture (768 inputs, 32
    hidden units, 4 output logits, dropout 0.1).

    Args:
        in_features: size of each input vector.
        hidden_size: width of the hidden layer.
        num_labels: number of output logits (one per label).
        dropout: dropout probability applied to the input vector.
    """
    def __init__(self, in_features=768, hidden_size=32, num_labels=4, dropout=0.1):
        super().__init__()
        # Attribute names double as state_dict keys, so they are kept stable
        # to stay compatible with previously saved checkpoints.
        self.dropout = torch.nn.Dropout(dropout)
        self.layer1 = torch.nn.Linear(in_features, hidden_size)
        self.logits = torch.nn.Linear(hidden_size, num_labels)

    def forward(self, x):
        """Return raw (pre-sigmoid) logits for a batch of input vectors."""
        x = self.dropout(x)
        x = F.relu(self.layer1(x))
        return self.logits(x)


# Instantiate the classification head and move its parameters to the selected device.
model = Network()
model.to(device)

In [None]:
from transformers import get_linear_schedule_with_warmup
from transformers import AdamW

# Optimiser that updates the classification head's weights.
# Note: AdamW here is the class from the huggingface library (as opposed to pytorch).
# Other learning rates noted by the author: (5e-5, 3e-5, 2e-5).
lr, eps = 1e-3, 1e-8
epochs = 800

optimizer = AdamW(model.parameters(), lr=lr, eps=eps)

# The scheduler is stepped once per batch, so it spans batches * epochs steps.
total_steps = epochs * len(train_dataloader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

In [None]:
# # 1. Start a new run
# Run identifier; it is embedded in the checkpoint filenames written during training.
tag = 5

# if USE_WANDB:
#     wandb.init(project="cdp-abc", tags=list(str(tag)))
    
#     # 2. Save model inputs and hyperparameters
#     config = wandb.config
#     config.learning_rate = lr
#     config.eps = eps

#     # 3. Log gradients and model parameters
#     wandb.watch(model)

In [None]:
import numpy as np
import time
import datetime


def flat_accuracy(preds, labels):
    """Return the fraction of positions at which ``preds`` equals ``labels``."""
    matches = preds == labels
    return np.mean(matches)
    

def loss_fn(outputs, targets):
    """Mean binary cross-entropy between raw logits ``outputs`` and ``targets``."""
    return torch.nn.functional.binary_cross_entropy_with_logits(outputs, targets)


# Not yet implemented
# def f_beta(y_pred: Tensor, y_true: Tensor, thresh: float = 0.2, 
#            beta: float = 2, eps: float = 1e-9, sigmoid: bool = True):
#     beta2 = beta ** 2
#     if sigmoid: y_pred = y_pred.sigmoid()
#     y_pred = (y_pred>thresh).float()
#     y_true = y_true.float()
#     tp = (y_pred*y_true).sum(dim=1)
#     prec = TP/(y_pred.sum(dim=1)+eps)
#     rec = TP/(y_true.sum(dim=1)+eps)
#     res = (prec*rec)/(prec*beta2+rec+eps)*(1+beta2)
#     return res.mean().item()


def format_time(elapsed):
    """Round ``elapsed`` seconds to a whole second and render it as a timedelta string (h:mm:ss)."""
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [None]:
# # Load model
# tag = 3
# load_model = False
# if load_model:
#     chosen_tag = 2
#     chosen_epoch = 150
#     fpath = f'./model_{chosen_tag}_{chosen_epoch}'
#     print(f'Model fpath: {fpath}')
#     model.load_state_dict(torch.load(fpath))

In [None]:
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score

def validation(model, dataloader, criterion, threshold=0.5):
    """
    Evaluate ``model`` on every batch of ``dataloader``.

    Args:
        model: network mapping the first tensor of a batch to logits.
        dataloader: yields batches whose first element is the model input and
            whose third element is the label tensor (the second is ignored).
        criterion: loss, called as ``criterion(logits, labels)``.
        threshold: sigmoid probability above which a label is predicted as 1.

    Returns:
        dict with keys:
            'ava': accuracy averaged over batches,
            'avl': loss averaged over batches,
            'vt': elapsed time as a formatted string,
            'preds': thresholded predictions of the final batch only
                (unchanged from the previous behaviour),
            'recall', 'precision': micro-averaged over all labels
                (0.0 when the denominator is zero),
            'auc': ROC-AUC over all flattened label/probability pairs.
    """
    print('Running Validation...')
    t0 = time.time()

    model.eval()  # evaluation mode, change dropout behaviour

    total_eval_accuracy = 0
    total_eval_loss = 0

    tp = 0
    fp = 0
    fn = 0
    probs_l = []
    labels_l = []
    for batch in dataloader:
        b_x = batch[0].to(device)
        b_labels = batch[2].to(device)

        # Forward pass (predict); no gradients are needed for evaluation.
        with torch.no_grad():
            logits = model(b_x)
            loss = criterion(logits, b_labels)

        probs = torch.sigmoid(logits).cpu().numpy()
        label_ids = b_labels.cpu().numpy()
        probs_l.append(probs)
        labels_l.append(label_ids)

        preds = (probs > threshold).astype(float)
        tp += np.sum((label_ids == 1) * (preds == 1))
        fp += np.sum((label_ids == 0) * (preds == 1))
        fn += np.sum((label_ids == 1) * (preds == 0))

        total_eval_loss += loss.item()
        total_eval_accuracy += flat_accuracy(preds, label_ids)

    all_probs = np.concatenate(probs_l, axis=0)
    all_labels = np.concatenate(labels_l, axis=0)
    auc = roc_auc_score(y_true=all_labels.ravel(), y_score=all_probs.ravel())

    # Guard the ratios: early in training the model may predict no positives.
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    avg_val_accuracy = total_eval_accuracy / len(dataloader)
    # Average over the loader that was actually evaluated, not over the global
    # validation loader (which gave a wrong loss for any other loader).
    avg_val_loss = total_eval_loss / len(dataloader)
    validation_time = format_time(time.time() - t0)
    print('  Accuracy: {0:.2f}'.format(avg_val_accuracy))
    print('  Validation Loss: {0:.2f}'.format(avg_val_loss))
    print('  Validation took: {:}'.format(validation_time))
    return {
        'ava': avg_val_accuracy,
        'avl': avg_val_loss,
        'vt': validation_time,
        'preds': preds,
        'recall': recall,
        'precision': precision,
        'auc': auc,
    }

In [None]:
import random
import numpy as np

# This training code is based on the `run_glue.py` script here:
# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128

# Seed every random number generator used from here on (batch shuffling, dropout).
# NOTE(review): the model weights were initialised in an earlier cell, before
# these seeds are set, so initialisation is not covered by them.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)


training_stats = []
total_t0 = time.time()

criterion = torch.nn.BCEWithLogitsLoss()

for epoch_i in range(0, epochs):

    # TRAINING
    print('\n======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')
    t0 = time.time()
    total_train_accuracy = 0
    total_train_loss = 0

    model.train()  # training mode, change dropout behaviour

    for step, batch in enumerate(train_dataloader):
        if step % 25 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            msg = '  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'
            print(msg.format(step, len(train_dataloader), elapsed))

        # batch[0] holds the precomputed BERT encodings, batch[2] the labels;
        # the attention mask in batch[1] is not needed by the classifier.
        b_input_ids = batch[0].to(device)
        b_labels = batch[2].to(device)

        model.zero_grad()  # clear gradients before backwards pass

        # Forward pass (train)
        logits = model(b_input_ids)
        loss = criterion(logits, b_labels)

        total_train_loss += loss.item()  # collect training loss

        # Accuracy must compare thresholded predictions with the labels;
        # comparing the raw logits (as before) is almost never an exact match.
        _preds = (torch.sigmoid(logits.detach()) > 0.5).float().cpu().numpy()
        _label_ids = b_labels.cpu().numpy()
        total_train_accuracy += flat_accuracy(_preds, _label_ids)

        # Backward pass (calculate gradients)
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # prevent exploding grad
        optimizer.step()  # update parameters, take step using gradient
        scheduler.step()  # update learning rate

    avg_train_accuracy = total_train_accuracy / len(train_dataloader)
    avg_train_loss = total_train_loss / len(train_dataloader)
    training_time = format_time(time.time() - t0)
    print('  Average training loss: {0:.2f}'.format(avg_train_loss))
    print('  Average training accuracy: {0:.2f}'.format(avg_train_accuracy))
    print('  Training epcoh took: {:}'.format(training_time))

    # VALIDATION (scored on both the training rows and the validation rows)
    train_stat = validation(model, seq_train_dataloader, criterion)
    val_stat = validation(model, validation_dataloader, criterion)

    wandb_stats = {
        'loss_trn': train_stat['avl'],
        'loss_val': val_stat['avl'],
        'accuracy_trn': train_stat['ava'],
        'accuracy_val': val_stat['ava'],
        'recall_trn': train_stat['recall'],
        'recall_val': val_stat['recall'],
        'precision_trn': train_stat['precision'],
        'precision_val': val_stat['precision'],
        'auc_trn': train_stat['auc'],
        'auc_val': val_stat['auc'],
    }

    training_stats.append({
        'epoch': epoch_i + 1,
        'Training Time': training_time,
        'validation_time_trn': train_stat['vt'],
        'validation_time_val': val_stat['vt'],
        **wandb_stats,
    })

    # Checkpoint every 50 epochs; the filename uses the zero-based epoch index.
    if epoch_i % 50 == 0:
        torch.save(model.state_dict(), f'model_{tag}_{epoch_i}')

#     # Log to WandB
#     try:
#         if USE_WANDB:
#             wandb.log(wandb_stats)
#     except Exception as e:
#         print(e)

print('Training took {:} (h:mm:ss)'.format(format_time(time.time() - total_t0)))

In [None]:
import matplotlib.pyplot as plt

In [None]:
# One panel per validation metric; each legend shows the final epoch's value.
fig, axs = plt.subplots(2, 2, figsize=(20, 10))

panels = [
    ('auc_val', 'Validation AUC'),
    ('precision_val', 'Validation Precision'),
    ('recall_val', 'Validation Recall'),
    ('accuracy_val', 'Validation Accuracy'),
]
for ax, (metric, title) in zip(axs.flat, panels):
    history = [stats[metric] for stats in training_stats]
    ax.plot(history, label=f"{training_stats[-1][metric]:.02f}")
    ax.grid()
    ax.legend()
    ax.set_title(title)

We are able to achieve an AUC of 0.83 on the validation set, with precision of 0.68, accuracy of 0.85 and recall of 0.49. A random model would have obtained a precision of about 20%, since only about 20% of the labels in our validation set are 1s.

In [None]:
# criterion = torch.nn.BCEWithLogitsLoss()

# acc_l = []
# loss_l = []
# for epoch in range(2000):
#     optimizer.zero_grad()
#     logits = network(x)
#     loss = criterion(logits, y)
#     loss.backward()
#     optimizer.step()
#     acc_l.append(
#         flat_accuracy(
#             (F.sigmoid(logits) > 0.5).cpu().numpy(), 
#             y.cpu().numpy()
#         )
#     )
#     # print('Loss: {:.3f}'.format(loss.item()))
#     loss_l.append(loss.item())
    
# plt.plot(loss_l)
# plt.plot(acc_l)
# acc_l[-1]

In [None]:
# import pandas as pd

# df_stats = pd.DataFrame(data=training_stats)
# df_stats = df_stats.set_index('epoch')
# df_stats

In [None]:
# # Learning curve
# import matplotlib.pyplot as plt
# import seaborn as sns

# sns.set(style='darkgrid')
# plt.rcParams['figure.figsize'] = (12, 6)

# alpha = 0.6
# plt.plot(df_stats['Training Loss'], 'b-o', label='Training loss', alpha=alpha)
# plt.plot(df_stats['Valid. Loss'], 'g-o', label='Validation loss', alpha=alpha)
# plt.plot(df_stats['train_accuracy'], 'r-o', label='Training accuracy', alpha=alpha)
# plt.plot(df_stats['validation_accuracy'], 'y-o', label='Validation accuracy', alpha=alpha)
# plt.title('Loss + accuracy')
# plt.xlabel('Epoch')
# plt.ylabel('Loss')
# plt.legend()
# xs = []
# for e in range(epochs):
#     if e % 50 == 0:
#         xs.append(e)
# plt.xticks(xs)
# plt.show()

Prediction

In [None]:
# import pandas as pd

# # df_test = df_cheeky_test.copy()
# print('Number of test sequences: {:,}\n'.format(df_test.shape[0]))

# sequences = df_test['response_answer_translated'].values
# labels = np.array([np.array(l, dtype=np.float16) for l in df_test['label'].values])

# input_ids = []
# attention_masks = []
# for s in sequences:
#     encoded_dict = tokenizer.encode_plus(
#         s,
#         add_special_tokens = True,
#         max_length = TOKEN_LENGTH,
#         pad_to_max_length = True,
#         return_attention_mask = True,
#         return_tensors = 'pt',
#     )
     
#     input_ids.append(encoded_dict['input_ids'])
#     attention_masks.append(encoded_dict['attention_mask'])


# input_ids = torch.cat(input_ids, dim=0)
# attention_masks = torch.cat(attention_masks, dim=0)
# labels = torch.tensor(labels)

# prediction_data = TensorDataset(input_ids, attention_masks, labels)
# prediction_sampler = SequentialSampler(prediction_data)
# prediction_dataloader = DataLoader(
#     prediction_data, sampler=prediction_sampler, batch_size=BATCH_SIZE
# )

In [None]:
# # Prediction on test set
# print('Predicting labels for {} test sequences...'.format(len(input_ids)))

# model.eval()

# predictions = [] 
# true_labels = []
# for batch in prediction_dataloader:
#     batch = tuple(t.to(device) for t in batch)
#     b_input_ids, b_input_mask, b_labels = batch
    
#     # Forward pass, predict
#     with torch.no_grad():
#         b_token_type_ids = None
#         outputs = model(b_input_ids, b_token_type_ids, b_input_mask)  # no labels
#         m = torch.nn.Sigmoid()
#         outputs = m(outputs)

#     logits = outputs.detach().cpu().numpy()
#     label_ids = b_labels.to('cpu').numpy()
#     predictions.append(logits)
#     true_labels.append(label_ids)

# predictions = np.vstack(predictions)
# true_labels = np.vstack(true_labels)
# print('Completed predictions.')

In [None]:
# def precision_recall(preds, labels):
#     tp = np.sum((preds == 1) * (labels == 1))
#     fp = np.sum((preds == 1) * (labels == 0))
#     fn = np.sum((preds == 0) * (labels == 1))
#     precision = tp / (tp + fp)
#     recall = tp / (tp + fn)
#     return precision, recall


# xs = np.linspace(0, 1, 11)
# prec = [precision_recall((predictions > x), true_labels)[0] for x in xs]
# recs = [precision_recall((predictions > x), true_labels)[1] for x in xs]
# plt.plot(xs, prec, label='precision')
# plt.plot(xs, recs, label='recall')
# plt.legend()

In [None]:
# from sklearn.metrics import multilabel_confusion_matrix, roc_auc_score

# rocauc_micro = roc_auc_score(true_labels, predictions, average='micro')
# print('ROC-AUC: {:.3f}\n'.format(rocauc_micro))
# print('Confusion matrices:')
# print(multilabel_confusion_matrix(true_labels, (predictions > 0.5)))

In [None]:
# threshold = 0.5
# output_col_names = ['employment', 'inclusion', 'health', 'congestion']
# df_preds = pd.DataFrame(predictions, columns=output_col_names)
# _df_test = pd.concat([df_test.reset_index(), df_preds], axis=1)
# _df_test[['employment', 'inclusion', 'health', 'congestion']] = (
#     _df_test[['employment', 'inclusion', 'health', 'congestion']].apply(lambda x: (x > threshold).astype(float))
# )
# _df_test['label_preds'] = _df_test[['employment', 'inclusion', 'health', 'congestion']].apply(list, axis=1)
# _df_test

Miscellaneous

In [None]:
# import torch
# import torch.nn.functional as F
# from sklearn.datasets import make_multilabel_classification
# from torch import optim

# class Network(torch.nn.Module):
#     def __init__(self):
#         super().__init__()
#         self.layer1 = torch.nn.Linear(768, 32)
#         # self.layer2 = torch.nn.Linear(32, 32)
#         self.logits = torch.nn.Linear(32, 4)
#         self.dropout = torch.nn.Dropout(0.1)
#         # self.relu = torch.nn.ReLU
        
#     def forward(self, x):
#         x = self.dropout(x)
#         x = self.layer1(x)
#         x = F.relu(x)
#         #x = self.layer2(x)
#         #x = F.relu(x)
#         return self.logits(x)

    
# network = Network()
# network.to(device)

# # x = torch.randn(16, 4).cuda()
# x = model.forward(next(iter(train_dataloader))[0].cuda())
# _, y = make_multilabel_classification(n_samples=16, n_features=4, n_classes=4, n_labels=2)
# # y = torch.Tensor(y).cuda()
# y = torch.Tensor(labels).cuda()

# network.train()

# optimizer = optim.AdamW(network.parameters(), lr=1e-3)

# acc_l = []
# loss_l = []
# for epoch in range(2000):
#     optimizer.zero_grad()
#     logits = network(x)
#     loss = criterion(logits, y)
#     loss.backward()
#     optimizer.step()
#     acc_l.append(
#         flat_accuracy(
#             (F.sigmoid(logits) > 0.5).cpu().numpy(), 
#             y.cpu().numpy()
#         )
#     )
#     # print('Loss: {:.3f}'.format(loss.item()))
#     loss_l.append(loss.item())
    
# plt.plot(loss_l)
# plt.plot(acc_l)
# acc_l[-1]