1. Define Model 
    * Load a pre-trained BERT model. Add dense layers for classification
2. Create a Dataset object from the input data
3. Create a Dataloader to pass input in batches to the model
    * Return it in the format expected by the model (tokenizer)

# Import Libraries

In [1]:
import pandas as pd
import numpy as np

import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from transformers import AutoTokenizer, AutoModel, AdamW

import time
from tqdm import tqdm
import copy

from sklearn.metrics import classification_report, roc_auc_score

# Parameters

In [2]:
PROJECT_FOLDER = "d:/Documents/Work/PayPal/DS_Meet/CoLA_BERT/"
DATA_FOLDER = PROJECT_FOLDER + 'Data/'

EPOCHS = 300
BATCH_SIZE = 64
EARLY_STOPPING_ROUNDS = 15

# Devices Available

In [3]:
# setting device on GPU if available, else CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
print()

#Additional Info when using cuda
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')
    print(torch.cuda.get_device_properties(device))

Using device: cuda

GeForce RTX 2060
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
_CudaDeviceProperties(name='GeForce RTX 2060', major=7, minor=5, total_memory=6144MB, multi_processor_count=30)


# Functions and Classes

In [4]:
class CoLADataset(Dataset):
    def __init__(self, data, target_col, sentence_col, tokenizer):
        self.sentences = data[sentence_col].values
        self.labels = data[target_col].astype(float).values
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        target = self.labels[idx]
        sentence = self.sentences[idx]
        return sentence, target

    def pad_batches(self, batch):
        '''Receives batches of data in the format returned by __getitem__. 
        Here we receive a batch of sentences along with their labels'''

        batch_sentences = list(np.array(batch, dtype=object)[:, 0])
        batch_labels_tens = torch.tensor(np.array(batch, dtype=object)[:, 1].astype(float))

        batch_tokenized_sent_tens = self.tokenizer(batch_sentences,
                                                    padding=True,
                                                    truncation=True,
                                                    max_length=512,
                                                    return_tensors='pt')
        
        return batch_tokenized_sent_tens, batch_labels_tens



class BertCoLAClassifier(nn.Module):
    def __init__(self, pretrained_model_name_or_path):
        super().__init__()

        # Base model
        self.base_model = AutoModel.from_pretrained(pretrained_model_name_or_path=pretrained_model_name_or_path)
        self.base_model.eval()
        for param in self.base_model.parameters():
            param.requires_grad=False

        # FC layers to be trained
        self.lin1 = nn.Linear(768, 128)
        self.drop = nn.Dropout(p=0.1)
        self.lin2 = nn.Linear(128, 16)
        self.lin3 = nn.Linear(16, 1)


    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        embed = self.base_model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)[1]
        x = self.lin1(embed)
        x = F.relu(x)
        x = self.lin2(x)
        x = F.relu(x)
        output = self.lin3(x)

        return output



def get_preds(outputs, device, threshold=0.5):
    return outputs.cpu().detach().apply_(lambda x: float(x>threshold)).to(device)



def train_model(model, dataloaders, criterion, optimizer, num_epochs, device, early_stopping_rounds=None):
    total_begin = time.time()

    stop_flag=False
    best_model_wts = copy.deepcopy(model.state_dict())
    best_auc = 0.0

    for epoch in range(num_epochs):
        if stop_flag:
            break

        epoch_begin = time.time()
        
        print('\n')
        print('-' * 20)
        print(f'Epoch {epoch}/{num_epochs-1}')

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0
            all_predictions, all_actuals = [], []
            
            print('-' * 10)
            print(phase)
            print('-' * 10)
            # Iterate over data.
            for inputs, labels in tqdm(dataloaders[phase]):
                inputs = {key:input.to(device) for key, input in inputs.items()}
                labels = labels.to(device)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(**inputs)
                    outputs = outputs.reshape(-1)
                    loss = criterion(outputs, labels)
                    preds = get_preds(outputs, device)

                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()
                
                # statistics
                running_loss += loss.item() * labels.size(0)
                running_corrects += torch.sum(preds == labels.data)

                all_predictions = all_predictions + list(preds.cpu().numpy())
                all_actuals = all_actuals + list(labels.cpu().numpy())


            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects.double() / dataset_sizes[phase]
            epoch_auc = roc_auc_score(all_actuals, all_predictions)

            print(f'Loss: {round(epoch_loss, 4)} Acc: {round(epoch_acc.item(), 4)} AUC-ROC {round(epoch_auc, 2)}')

            # deep copy the model
            if phase == 'val':
                if epoch_auc > best_auc:
                    best_auc = epoch_auc
                    best_model_wts = copy.deepcopy(model.state_dict())
                    best_epoch = epoch

                if (early_stopping_rounds) and (epoch > best_epoch + early_stopping_rounds):
                    print('-----------------------EARLY STOPPING-----------------------')
                    stop_flag=True

                time_elapsed = time.time() - epoch_begin
                print('Epoch complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))

            del inputs, labels
            torch.cuda.empty_cache()

    time_elapsed = time.time() - total_begin
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    print('Best val AUC: {:4f}'.format(best_auc))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model

# Read CoLA Data

In [5]:
train_data = pd.read_table(DATA_FOLDER + "cola_public/raw/in_domain_train.tsv",  usecols=[1,3], header=None, names=['label', 'sentence'])
validation_data = pd.read_table(DATA_FOLDER + "cola_public/raw/in_domain_dev.tsv", usecols=[1,3], header=None, names=['label', 'sentence'])

train_data['label'] = train_data['label'].astype(float)
validation_data['label'] = validation_data['label'].astype(float)

dataset_sizes = {'train': train_data.shape[0], 'val': validation_data.shape[0]} 

print(f"Number of training records: {train_data.shape[0]}")
print(f"Number of validation records: {validation_data.shape[0]}")

print("\nTraining label distribution")
print(train_data['label'].value_counts()*100/ train_data.shape[0])

print("\nValidation label distribution")
print(validation_data['label'].value_counts()*100/ validation_data.shape[0])

Number of training records: 8551
Number of validation records: 527

Training label distribution
1.0    70.436206
0.0    29.563794
Name: label, dtype: float64

Validation label distribution
1.0    69.259962
0.0    30.740038
Name: label, dtype: float64


In [6]:
train_data.head()

Unnamed: 0,label,sentence
0,1.0,"Our friends won't buy this analysis, let alone..."
1,1.0,One more pseudo generalization and I'm giving up.
2,1.0,One more pseudo generalization or I'm giving up.
3,1.0,"The more we study verbs, the crazier they get."
4,1.0,Day by day the facts are getting murkier.


In [7]:
print("Training Sentence Length Distribution")
print(train_data['sentence'].apply(lambda x: len(x.split(' '))).describe())

print("\nValidation Sentence Length Distribution")
print(validation_data['sentence'].apply(lambda x: len(x.split(' '))).describe())

Training Sentence Length Distribution
count    8551.000000
mean        7.696059
std         3.622946
min         2.000000
25%         5.000000
50%         7.000000
75%         9.000000
max        42.000000
Name: sentence, dtype: float64

Validation Sentence Length Distribution
count    527.000000
mean       7.544592
std        3.440063
min        2.000000
25%        5.000000
50%        7.000000
75%        9.000000
max       27.000000
Name: sentence, dtype: float64


# Prepare Dataloaders

In [8]:
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path = 'bert-base-uncased')

In [9]:
print(tokenizer(train_data.loc[1, 'sentence']))
print(tokenizer.convert_ids_to_tokens( [101, 2028, 2062, 18404, 2236, 3989, 1998, 1045, 1005, 1049, 3228, 2039, 1012, 102]))

{'input_ids': [101, 2028, 2062, 18404, 2236, 3989, 1998, 1045, 1005, 1049, 3228, 2039, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
['[CLS]', 'one', 'more', 'pseudo', 'general', '##ization', 'and', 'i', "'", 'm', 'giving', 'up', '.', '[SEP]']


In [10]:
# load tokenizer - Normalize, Pre-tokenizations, Model, Post-processing
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path = 'bert-base-uncased')


# Create Dataset Object - Stores samples and their corresponding label

train_dataset = CoLADataset(data=train_data,
                            target_col='label',
                            sentence_col='sentence',
                            tokenizer=tokenizer)

validation_dataset = CoLADataset(data=validation_data,
                                target_col='label',
                                sentence_col='sentence',
                                tokenizer=tokenizer)

# Wrap with Dataloader to pass to the model - Wraps an iterable around the Dataset to enable easy access to the samples
train_dataloader = DataLoader(dataset=train_dataset,
                            batch_size=BATCH_SIZE,
                            collate_fn=train_dataset.pad_batches)

validation_dataloader = DataLoader( dataset=validation_dataset,
                                    batch_size=BATCH_SIZE,
                                    collate_fn=train_dataset.pad_batches,
                                    )

dataloaders = {'train': train_dataloader, 'val':validation_dataloader}

In [11]:
a = next(iter(train_dataloader))

# Model

In [12]:
## Model
model = BertCoLAClassifier(pretrained_model_name_or_path = 'bert-base-uncased')
# model.to(device)

print(f"Total Parameters: {sum(p.numel() for p in model.parameters())}")
print(f"Total Trainable Parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad)}")

Total Parameters: 109582753
Total Trainable Parameters: 100513


In [13]:
print(a[0]['input_ids'][0].shape)
a[0]

torch.Size([19])


{'input_ids': tensor([[  101,  2256,  2814,  ..., 16599,  1012,   102],
        [  101,  2028,  2062,  ...,     0,     0,     0],
        [  101,  2028,  2062,  ...,     0,     0,     0],
        ...,
        [  101,  2023,  2311,  ...,     0,     0,     0],
        [  101,  2023,  2311,  ...,     0,     0,     0],
        [  101,  2023,  2311,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [14]:
out = model.base_model(**a[0])
out

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[ 3.2538e-01,  2.4478e-01,  1.0296e-01,  ..., -2.6292e-01,
           2.7816e-01,  4.9933e-01],
         [ 5.4273e-01, -8.2028e-03, -3.3833e-01,  ..., -1.3425e-01,
           8.9056e-01,  5.7038e-01],
         [ 4.0858e-01, -3.6979e-02,  1.3732e-01,  ..., -4.3616e-01,
          -1.7655e-01,  1.9794e-01],
         ...,
         [ 4.9888e-01,  4.7405e-01, -6.0764e-02,  ..., -3.5368e-01,
           1.7316e-01,  4.5580e-01],
         [ 8.8303e-01,  2.3548e-01, -9.5102e-02,  ...,  1.2074e-01,
          -5.0536e-01, -4.5264e-01],
         [ 6.9930e-02,  6.1669e-01,  6.0433e-01,  ..., -1.7900e-01,
          -2.1171e-01,  1.4934e-01]],

        [[ 2.2020e-01,  2.1622e-01,  1.4586e-01,  ..., -6.5850e-04,
           2.5975e-01,  6.3853e-01],
         [ 3.9434e-01,  1.1485e-01,  1.1057e-01,  ...,  5.2229e-01,
           7.5671e-02,  9.5173e-01],
         [ 5.7084e-01,  4.8586e-03, -1.2215e-01,  ...,  4.4161e-01,
           5.

In [15]:
print(out[0].shape)
print(out[1].shape)

torch.Size([64, 19, 768])
torch.Size([64, 768])


In [16]:
# Define Loss Function
objective = nn.BCEWithLogitsLoss()

# Define Optimizer - Passing in trainable parameters
optimizer = torch.optim.AdamW([params for params in model.parameters() if params.requires_grad],
                              lr=0.001)

In [17]:
model = train_model(model,
                    dataloaders=dataloaders,
                    criterion=objective,
                    optimizer=optimizer,
                    num_epochs=EPOCHS,
                    device=device,
                    early_stopping_rounds=EARLY_STOPPING_ROUNDS)

  0%|          | 0/134 [00:00<?, ?it/s]



--------------------
Epoch 0/299
----------
train
----------


  0%|          | 0/134 [00:01<?, ?it/s]


RuntimeError: Input, output and indices must be on the current device

# Inference

In [11]:
model.eval()

BertCoLAClassifier(
  (base_model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_

In [12]:
all_preds, all_actual = [], []
pred_vs_actual = pd.DataFrame()

for inputs, labels in tqdm(train_dataloader):

    inputs = {key:input.to(device) for key, input in inputs.items()}
    labels = list(labels.numpy())

    with torch.no_grad():
        outputs = model(**inputs).reshape(-1)
        preds = list(get_preds(outputs, threshold=0.5, device=device).cpu().numpy())
    
    all_preds.extend(preds)
    all_actual.extend(labels)

pred_vs_actual['actual'] = all_actual
pred_vs_actual['predicted'] = all_preds

100%|██████████| 134/134 [00:11<00:00, 12.13it/s]


In [13]:
print(classification_report(pred_vs_actual['actual'], pred_vs_actual['predicted']))

              precision    recall  f1-score   support

         0.0       0.66      0.27      0.38      2528
         1.0       0.75      0.94      0.84      6023

    accuracy                           0.74      8551
   macro avg       0.71      0.61      0.61      8551
weighted avg       0.73      0.74      0.70      8551



In [None]:
test_data = pd.read_table(DATA_FOLDER + "cola_public/raw/out_of_domain_dev.tsv", usecols=[1,3], header=None, names=['label', 'sentence'])

test_data['label'] = test_data['label'].astype(float)

print(f"Number of records: {test_data.shape[0]}")

print("\nlabel distribution")
print(test_data['label'].value_counts()*100/ test_data.shape[0])

Number of records: 516

label distribution
1.0    68.604651
0.0    31.395349
Name: label, dtype: float64
