In [1]:
import random
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.legacy import data

from transformers import BertTokenizerFast, BertModel

from bert import BERTClassifier
from training import train_model, evaluate

In [2]:
SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [3]:
# BERT tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

def tokenize_and_cut(sentence):
    tokens = tokenizer.tokenize(sentence) 
    tokens = tokens[:max_input_length-2]
    return tokens


init_token_idx = tokenizer.cls_token_id
eos_token_idx = tokenizer.sep_token_id
pad_token_idx = tokenizer.pad_token_id
unk_token_idx = tokenizer.unk_token_id

max_input_length = tokenizer.max_model_input_sizes['bert-base-uncased']



In [6]:
# VOCABS
TEXT = data.Field(
    batch_first = True,
    use_vocab = False,
    tokenize = tokenize_and_cut,
    preprocessing = tokenizer.convert_tokens_to_ids,
    init_token = init_token_idx,
    eos_token = eos_token_idx,
    pad_token = pad_token_idx,
    unk_token = unk_token_idx
)

LABEL = data.LabelField()

# load train, validation and test data from corresponding CSV files
train_data, valid_data = data.TabularDataset.splits(
    path='../data', 
    train='spam-train.csv',
    validation='spam-valid.csv',
    format='csv', 
    skip_header=True,
    fields=[('label', LABEL),('text', TEXT)]
)

# Build vocabulary for labels
LABEL.build_vocab(train_data)


In [9]:
# Define iterators to iterate through different datasets 
# during training and testing model

BATCH_SIZE = 128

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_data, valid_data),
    batch_size=BATCH_SIZE, 
    sort_key=lambda x: len(x.text),
    device=device)

In [10]:
bert = BertModel.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [11]:
# model hyper-parameters
HIDDEN_DIM = 256
OUTPUT_DIM = len(LABEL.vocab)
N_LAYERS = 1
BIDIRECTIONAL = True
DROPOUT = 0.25

In [12]:
# Create the BERT model
model = BERTClassifier(bert, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT)

# freeze all bert parameters
for name, param in model.named_parameters():                
    if name.startswith('bert'):
        param.requires_grad = False

In [13]:
#################################
### DEFINE LOSS AND OPTIMIZER ###
#################################
optimizer = optim.Adam(model.parameters())
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.5)
criterion = nn.CrossEntropyLoss()

# move the model and the loss function to GPU if there is a GPU 
model = model.to(device)
criterion = criterion.to(device)

In [14]:
#############
### TRAIN ###
#############
fname=f'models/bert-{N_LAYERS}-{HIDDEN_DIM}.pt'

train_model(
    model, 
    device,
    train_iterator, 
    valid_iterator, 
    optimizer, 
    criterion, 
    scheduler, 
    n_epochs=5, 
    fname=fname)

Unnamed: 0,Epoch,Train Loss,Valid Loss,Train Acc,Valid Acc,Time
0,1,0.234,0.069,91.41,98.32,1m 6s
1,2,0.052,0.059,98.71,98.75,1m 10s
2,3,0.033,0.047,98.96,98.91,1m 11s
3,4,0.031,0.051,99.08,98.91,1m 21s
4,5,0.027,0.048,99.16,98.91,1m 32s


In [15]:
test_loss, test_acc = evaluate(model, valid_iterator, criterion)
print(f'| Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}% |')

[INFO] Testing on test dataset...


Validation: | Loss=0.0114 | Acc=1.0000 |: 100%|██████████| 5/5 [00:02<00:00,  1.88it/s]

| Test Loss: 0.047 | Test Acc: 98.91% |





In [16]:
# optimizer from hugging face transformers
from transformers import AdamW

# define the optimizer
optimizer = AdamW(model.parameters(), lr = 1e-3)

In [18]:
from sklearn.utils.class_weight import compute_class_weight
import pandas as pd

train_df = pd.read_csv('../data/spam-train.csv')
train_labels = train_df['label']
#compute the class weights
class_wts = compute_class_weight('balanced', np.unique(train_labels), train_labels)

print(class_wts)

[0.57831603 3.6921944 ]


1       ham
2       ham
3       ham
4       ham
       ... 
5009    ham
5010    ham
5011    ham
5012    ham
5013    ham
Name: label, Length: 5014, dtype: object as keyword args. From version 1.0 (renaming of 0.25) passing these as positional arguments will result in an error


In [19]:
# convert class weights to tensor
weights = torch.tensor(class_wts, dtype=torch.float)
weights = weights.to(device)

# loss function
# cross_entropy  = nn.NLLLoss(weight=weights) 
criterion = nn.CrossEntropyLoss(weight=weights)

# number of training epochs
epochs = 5

In [20]:
fname=f'models/bert-balanced-{N_LAYERS}-{HIDDEN_DIM}.pt'

train_model(
    model, 
    device,
    train_iterator, 
    valid_iterator, 
    optimizer, 
    criterion, 
    scheduler, 
    n_epochs=5, 
    fname=fname)

Unnamed: 0,Epoch,Train Loss,Valid Loss,Train Acc,Valid Acc,Time
0,1,0.089,0.106,97.56,98.16,1m 4s
1,2,0.045,0.084,98.48,98.44,1m 15s
2,3,0.028,0.099,99.34,97.66,1m 24s
3,4,0.026,0.1,99.18,97.03,1m 35s
4,5,0.012,0.117,99.57,95.78,1m 23s


In [21]:
test_loss, test_acc = evaluate(model, valid_iterator, criterion)
print(f'| Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}% |')

Validation: | Loss=0.0074 | Acc=1.0000 |: 100%|██████████| 5/5 [00:02<00:00,  1.96it/s]

| Test Loss: 0.084 | Test Acc: 98.44% |



