## How to quickly train a BERT model to solve a binary classification task

In [None]:
import torch.nn as nn
import torch
import transformers
import pandas as pd 
import numpy as np
from transformers import get_linear_schedule_with_warmup
from transformers import AdamW
from sklearn import model_selection
from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings("ignore")

from transformers import logging
logging.set_verbosity_warning()

In [None]:
# Relative path to the CSV file holding the reviews and their sentiment labels.
TRAINING_FILE = "../input/imdb-sentiment-10k-reviews-binary-classification/imdb_10K_sentimnets_reviews.csv"
# Every review is truncated/padded to exactly this many tokens.
MAX_LEN = 512
# Batch sizes handed to the PyTorch data loaders (training, validation).
TRAIN_BATCH_SIZE, VALID_BATCH_SIZE = 4, 2
# Number of full passes over the training data.
EPOCHS = 10

### Load pretrained BERT model
#### More models and docs here https://huggingface.co/models

In [None]:
# Module-wide tokenizer shared by every dataset instance; loaded from the
# 'bert-base-uncased' checkpoint with lower-casing enabled.
TOKENIZER = transformers.BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)

### Class that helps load data for Pytorch
##### more here https://pytorch.org/docs/stable/data.html#dataset-types

In [None]:
class BertDataSetPytorch:
    """Map-style dataset that tokenizes one review per item.

    Args:
        review: Indexable collection of review texts.
        sentiment: Indexable collection of labels aligned with ``review``.
        tokenizer: Object exposing ``encode_plus``; when omitted the
            module-level ``TOKENIZER`` is used.
        max_len: Token length every item is truncated/padded to; when
            omitted the module-level ``MAX_LEN`` is used.
    """

    def __init__(self, review, sentiment, tokenizer=None, max_len=None):
        self.review = review
        self.sentiment = sentiment
        # Fall back to the globals so existing two-argument callers are unaffected.
        self.tokenizer = TOKENIZER if tokenizer is None else tokenizer
        self.max_len = MAX_LEN if max_len is None else max_len

    def __len__(self):
        """Return the number of reviews."""
        return len(self.review)

    def __getitem__(self, item):
        """Tokenize review ``item`` and return its tensors.

        Returns:
            dict with ``ids``, ``mask`` and ``token_type_ids`` (long tensors)
            and ``targets`` (float tensor holding the label).
        """
        # Collapse any run of whitespace (newlines, tabs, ...) into one space.
        review = " ".join(str(self.review[item]).split())
        # `padding="max_length"` replaces the deprecated `pad_to_max_length=True`.
        inputs = self.tokenizer.encode_plus(
            review,
            None,
            add_special_tokens=True,
            truncation=True,
            max_length=self.max_len,
            padding="max_length",
        )
        return {
            "ids": torch.tensor(inputs["input_ids"], dtype=torch.long),
            "mask": torch.tensor(inputs["attention_mask"], dtype=torch.long),
            "token_type_ids": torch.tensor(inputs["token_type_ids"], dtype=torch.long),
            "targets": torch.tensor(self.sentiment[item], dtype=torch.float),
        }
# Smoke test: tokenize the first of three reviews and report how many token
# ids the dataset actually produced (rather than echoing the MAX_LEN constant).
test_data_ = pd.read_csv(TRAINING_FILE, nrows=3)  # just 3 reviews
test_bert_class_ = BertDataSetPytorch(test_data_.review, test_data_.sentiment)
n_tokens_ = len(test_bert_class_[0]['ids'])
print(f'The number of tokens in the sentence is {n_tokens_}')

### Class for BERT model
#### more here "Approaching (Almost) Any Machine Learning Problem" https://github.com/abhishekkrthakur/approachingalmost, by p.256

In [None]:
class BertPretrainedUncased(nn.Module):
    """Pretrained BERT encoder followed by dropout and a one-logit linear head."""

    def __init__(self):
        super().__init__()
        # return_dict=False makes the encoder return a plain tuple that
        # `forward` unpacks into exactly two elements.
        self.bert = transformers.BertModel.from_pretrained(
            "bert-base-uncased",
            return_dict=False,
        )
        self.drop = nn.Dropout(0.3)
        self.out = nn.Linear(768, 1)

    def forward(self, ids, mask, token_type_ids):
        """Return the raw (pre-sigmoid) output of the linear head.

        Only the second element of the encoder's output tuple is used
        (the pooled output, per the Hugging Face BertModel documentation).
        """
        _, pooled = self.bert(ids, attention_mask=mask, token_type_ids=token_type_ids)
        return self.out(self.drop(pooled))
# Smoke test: build the model once and report the encoder's parameter count.
test_bert_model_ = BertPretrainedUncased()
param_count_ = test_bert_model_.bert.num_parameters()
print(f'number of parameters {param_count_}')

### Loss Function

In [None]:
def loss(results, targets):
    """Binary cross-entropy computed directly on raw logits.

    Args:
        results: Model logits; must be shaped like the reshaped targets.
        targets: Labels; viewed as a single column, i.e. shape (-1, 1).

    Returns:
        The loss tensor (default mean reduction).
    """
    column_targets = targets.view(-1, 1)
    return nn.functional.binary_cross_entropy_with_logits(results, column_targets)

# Sanity-check `loss` on random logits and random 0/1 targets.
# Three random logits reshaped to a column, matching the (-1, 1) view that
# `loss` applies to the targets.
inp_ = torch.randn(3, requires_grad=True).view(-1,1)
# `random_(2)` fills the tensor in place with values drawn from {0, 1}.
out_ = torch.empty(3).random_(2)
ff_ = loss(inp_, out_)
print(ff_)

### Train and validate functions

In [None]:
def train(data_loader, model, optimizer, device, scheduler):
    """Run one pass over ``data_loader``, updating ``model`` after every batch.

    Args:
        data_loader: Iterable of dicts with "ids", "token_type_ids", "mask"
            and "targets" tensors.
        model: Module called with ``ids``, ``mask`` and ``token_type_ids``.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.
        scheduler: Learning-rate scheduler stepped once per batch.
    """
    model.train()
    for batch in data_loader:
        # Move the batch to the target device with the expected dtypes.
        ids = batch["ids"].to(device, dtype=torch.long)
        token_type_ids = batch["token_type_ids"].to(device, dtype=torch.long)
        mask = batch["mask"].to(device, dtype=torch.long)
        targets = batch["targets"].to(device, dtype=torch.float)

        # Clear gradients left over from the previous batch.
        optimizer.zero_grad()
        logits = model(ids=ids, mask=mask, token_type_ids=token_type_ids)
        batch_loss = loss(logits, targets)
        batch_loss.backward()

        # Weight update first, then the learning-rate schedule.
        optimizer.step()
        scheduler.step()
        
def validate(data_loader, model, device):
    """Score every batch of ``data_loader`` without tracking gradients.

    Args:
        data_loader: Iterable of dicts with "ids", "token_type_ids", "mask"
            and "targets" tensors.
        model: Module called positionally as ``model(ids, mask, token_type_ids)``.
        device: Device the batch tensors are moved to.

    Returns:
        ``(outputs, targets)``: the sigmoid of the model outputs and the
        labels, both converted to plain Python lists on the CPU.
    """
    model.eval()
    predictions, labels = [], []
    with torch.no_grad():
        for batch in data_loader:
            ids = batch["ids"].to(device, dtype=torch.long)
            token_type_ids = batch["token_type_ids"].to(device, dtype=torch.long)
            mask = batch["mask"].to(device, dtype=torch.long)
            targets = batch["targets"].to(device, dtype=torch.float)

            logits = model(ids, mask, token_type_ids)

            # Bring everything back to the CPU as plain Python values.
            labels += targets.cpu().tolist()
            predictions += torch.sigmoid(logits).cpu().tolist()
    return predictions, labels

### Prepare and split data

In [None]:
# Load the full dataset; missing values are replaced with the string 'none'.
data = pd.read_csv(TRAINING_FILE).fillna('none')
# Stratify on the label so both splits keep the same class proportions.
data_train, data_valid = model_selection.train_test_split(
    data,
    test_size=0.10,
    random_state=11,
    stratify=data.sentiment.values,
)
# reset_index returns a new frame rather than modifying in place, so the
# result has to be assigned back for the re-indexing to take effect.
data_train = data_train.reset_index(drop=True)
data_valid = data_valid.reset_index(drop=True)

print(len(data_train), len(data_valid))
data_train.head(2)

### Convert data into tensors after tokenization

In [None]:
# Wrap the raw review/label arrays of each split in a tokenizing dataset.
# NOTE: `test_data_set` is built from the validation split.
train_data_set = BertDataSetPytorch(
    review=data_train.review.values,
    sentiment=data_train.sentiment.values,
)
test_data_set = BertDataSetPytorch(
    review=data_valid.review.values,
    sentiment=data_valid.sentiment.values,
)

# Show the tokenizer configuration the datasets use (heading + blank line).
print('params of the data tokenized', end='\n\n')
print(train_data_set.tokenizer)

### Load torch dataloader

In [None]:
# Reshuffle the training set every epoch so the model does not see the same
# batches in the same order each time; validation order is left untouched.
train_data_loader = torch.utils.data.DataLoader(
    train_data_set,
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=True,
    num_workers=2,
)
test_data_loader = torch.utils.data.DataLoader(
    test_data_set,
    batch_size=VALID_BATCH_SIZE,
    num_workers=1,
)

# Peek at the first raw training example and its label.
print(train_data_loader.dataset.review[0], 'AND REVIEW IS ', train_data_loader.dataset.sentiment[0]) # negative?

### Set number of training steps

In [None]:
# The scheduler is stepped once per batch, and a final partial batch still
# counts as a step, so steps per epoch is the ceiling of len / batch size
# (written as negated floor division to stay in integer arithmetic).
steps_per_epoch = -(-len(data_train) // TRAIN_BATCH_SIZE)
num_train_steps = steps_per_epoch * EPOCHS
print(num_train_steps)

### Prepare model and set parameter space
#### more here "Approaching (Almost) Any Machine Learning Problem" https://github.com/abhishekkrthakur/approachingalmost, by p.270

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# Build the classifier and move its weights to the chosen device.
MODEL = BertPretrainedUncased()
MODEL.to(device)

# (name, parameter) pairs, used to split the parameters into two groups.
param_optimizer = list(MODEL.named_parameters())
# Parameters whose name contains any of these substrings get no weight decay.
no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
optimizer_parameters = [
    {
        # Names matching none of the no_decay substrings: decayed.
        "params": [
            param
            for name, param in param_optimizer
            if all(marker not in name for marker in no_decay)
        ],
        "weight_decay": 0.001,
    },
    {
        # Names matching at least one no_decay substring: not decayed.
        "params": [
            param
            for name, param in param_optimizer
            if any(marker in name for marker in no_decay)
        ],
        "weight_decay": 0.0,
    },
]

optimizer = AdamW(optimizer_parameters, lr=3e-5)
# Linear learning-rate schedule with no warm-up, spanning num_train_steps steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=num_train_steps,
)

# Show the first parameter tensor of the weight-decayed group.
print(optimizer_parameters[0]['params'][0])

### Run model with params 

In [None]:
accuracy = 0  # best validation accuracy seen so far
save_model = True  # whether to checkpoint the best model

for epoch in range(EPOCHS):
    # train
    print('start train')
    train(train_data_loader, MODEL, optimizer, device, scheduler)
    # evaluate
    print('start validate')
    outputs, targets = validate(test_data_loader, MODEL, device)
    # validate returns one [probability] list per sample: flatten to 1-D and
    # label everything at or above 0.5 as positive.
    outputs = np.array(outputs).ravel() >= 0.5
    # accuracy_score expects (y_true, y_pred).
    acc = accuracy_score(targets, outputs)
    print(f'accuracy for {epoch} is {acc}')
    # Keep (and optionally save) only the best-scoring weights.
    if acc > accuracy:
        accuracy = acc
        if save_model:
            torch.save(MODEL.state_dict(), './model_bert.bin')

# Report the best accuracy, not the last epoch's value.
print(f'best accuraccy is {accuracy}')