# Training BERT

This notebook uses the data from `pre_process_data.py` and trains BERT models to classify toxicity.

## Potential Extensions

The addition this method really needs is to use the extra columns, such as identity attack. If we did this way, we could build BERT models for each attack and then use some form of logistic regression on top.

## Configuration

#### Model

In [1]:
# model_type = 'bert-base-cased'
model_type = 'bert-base-uncased'
# model_type = 'bert-large-cased'
# model_type = 'bert-large-uncased'

In [2]:
dataset_size = None # set to None for full dataset
min_length = 140

#### Learning Parameters

In [3]:
epochs = 10
learning_rate = 2e-5
warmup = 0.05
batch_size = 32
accumulation_steps=2
seed = 0

## Variables to Not Change

In [4]:
max_sentence_length = 512
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']

In [5]:
if dataset_size == None:
    output_model_file = f'{model_type}.bin'
else:
    output_model_file = f'{dataset_size}_{model_type}.bin'

## Check Configuration

This is kept pretty naive. Mainly want to make sure that a model isn't overwritten.

In [6]:
import os

if not os.path.isdir('model'):
    os.mkdir('model')

model_output_path = os.path.join('model', output_model_file)
assert os.path.exists(model_output_path) == False

## Getting Data for BERT

In [7]:
from torch.utils.data import TensorDataset

import numpy as np
import pickle
import torch

In [8]:
data_path = f'{model_type}_{dataset_size}'

In [9]:
f = open(os.path.join('data', f'{data_path}_training_data.pkl'), 'rb')
x, y = pickle.load(f)
f.close()

In [10]:
y = torch.tensor([torch.tensor(_y, dtype=torch.float) for _y in y])

In [11]:
new_x = []
for row in x:
    while len(row) < min_length:
        row.append(0)
        
    new_x.append(x)

In [12]:
dataset = TensorDataset(torch.tensor(x, dtype=torch.long), y)

## Loading Bert

In [13]:
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM
import numpy as np

In [14]:
torch.backends.cudnn.deterministic = True
torch.cuda.manual_seed(seed)
torch.manual_seed(seed)
np.random.seed(seed)

In [15]:
tokenizer = BertTokenizer.from_pretrained(model_type)

## Load Pre-Trained BERT Model

In [16]:
from pytorch_pretrained_bert import BertForSequenceClassification,BertAdam

In [17]:
%%time

# num_labels can be updates so we could extend this to predict more than just the toxicity.
model = BertForSequenceClassification.from_pretrained(model_type,cache_dir=None,num_labels=1)

CPU times: user 3.75 s, sys: 667 ms, total: 4.42 s
Wall time: 4.55 s


## Fine-Tune BERT

In [18]:
from torch.nn import functional as F
from tqdm import tqdm_notebook

In [19]:
train_optimization_steps = int(epochs*len(dataset)/batch_size/accumulation_steps)

In [20]:
param_optimizer = list(model.named_parameters())

optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

In [21]:
optimizer = BertAdam(
    optimizer_grouped_parameters,
    lr=learning_rate,
    warmup=warmup,
    t_total=train_optimization_steps)

In [22]:
%%time

criterion = torch.nn.MSELoss()  
model = model.train()


for _ in tqdm_notebook(range(epochs), desc='epoch'):
    train_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)
    optimizer.zero_grad()

    for step, (x, y) in tqdm_notebook(enumerate(train_loader), desc='batch'):
        predictions = model(x)
        
        loss = criterion(predictions, y)
        
        loss.backward()
        optimizer.step()        
        optimizer.zero_grad()

HBox(children=(IntProgress(value=0, description='epoch', max=10, style=ProgressStyle(description_width='initia…

HBox(children=(IntProgress(value=1, bar_style='info', description='batch', max=1, style=ProgressStyle(descript…

  return F.mse_loss(input, target, reduction=self.reduction)


KeyboardInterrupt: 

In [23]:
# torch.save(model.state_dict(), model_output_path)