### BERT fine-tuning for document classification

In [14]:
import os
import re
import numpy as np 
from sklearn.metrics import accuracy_score

import transformers
from transformers import BertTokenizer, BertModel
from transformers import ElectraConfig, ElectraModel, ElectraTokenizer
import torch
from torch import cuda
from tqdm.notebook import tqdm
device = 'cuda' if cuda.is_available() else 'cpu'

device

'cuda'

- use X.txt and YL1.txt 

In [2]:
X = [line.strip() for line in open('X.txt').readlines()]
y = train_data = [int(line.strip()) for line in open('YL1.txt').readlines()]

len(X), len(y), max(y)

(46985, 46985, 6)

### An easy train/test split

In [3]:
train_X = X[:46000]
train_y = np.array(y[:46000])
test_X = X[46000:]
test_y = np.array(y[46000:])

len(train_X), len(train_y), len(test_X), len(test_y)

(46000, 46000, 985, 985)

In [4]:
# not needed for training or evaluation, but useful for mapping examples
labels = {
    0:'Computer Science',
    1:'Electrical Engineering',
    2:'Psychology',
    3:'Mechanical Engineering',
    4:'Civil Engineering',
    5:'Medical Science',
    6:'Biochemistry'
}

len(labels)

7

### Fine-tune BERT on the dataset

In [5]:
class MultiLabelDataset(torch.utils.data.Dataset):

    def __init__(self, text, labels, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.text = text
        self.targets = labels
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = self.text[index]
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

### BERT Class

In [6]:
class BERTClass(torch.nn.Module):
    def __init__(self, NUM_OUT):
        super(BERTClass, self).__init__()
                   
        self.l1 = BertModel.from_pretrained("bert-base-uncased")
#         self.pre_classifier = torch.nn.Linear(768, 256)
        self.classifier = torch.nn.Linear(768, NUM_OUT)
#         self.dropout = torch.nn.Dropout(0.5)
        #self.softmax = torch.nn.Softmax(dim=1)

    def forward(self, input_ids, attention_mask, token_type_ids):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
#         pooler = self.pre_classifier(pooler)
#         pooler = torch.nn.Tanh()(pooler)
#         pooler = self.dropout(pooler)
        output = self.classifier(pooler)

        # CrossEntropyLoss combines nn.LogSoftmax() and nn.NLLLoss() in one single class
        #output = self.softmax(output)
        return output

### Helpful Functions

In [7]:
def loss_fn(outputs, targets):
    # Change to cross entropy
    return torch.nn.CrossEntropyLoss()(outputs, targets)

def train(model, training_loader, optimizer):
    model.train()
    for data in tqdm(training_loader):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.long)

        outputs = model(ids, mask, token_type_ids)

        optimizer.zero_grad()
        loss = loss_fn(outputs, targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    return loss
    
def validation(model, testing_loader):
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        for data in tqdm(testing_loader):
            targets = data['targets']
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            outputs = model(ids, mask, token_type_ids)
            #outputs = torch.sigmoid(outputs).cpu().detach()
            fin_outputs.extend(outputs.cpu().detach())
            fin_targets.extend(targets)
    return torch.stack(fin_outputs), torch.stack(fin_targets)

### The Tokenizer

In [8]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# what does the tokenizer do?
print(train_X[5])
tokenizer.encode_plus(
            train_X[5],
            None,
            add_special_tokens=True,
            max_length=128,
            padding='max_length',
            truncation=True,
            return_token_type_ids=True
        )

(Objective) In order to increase classification accuracy of tea-category identification (TCI) system, this paper proposed a novel approach. (Method) The proposed methods first extracted 64 color histogram to obtain color information, and 16 wavelet packet entropy to obtain the texture information. With the aim of reducing the 80 features, principal component analysis was harnessed. The reduced features were used as input to generalized eigenvalue proximal support vector machine (GEPSVM). Winner-takes-all (WTA) was used to handle the multiclass problem. Two kernels were tested, linear kernel and Radial basis function (RBF) kernel. Ten repetitions of 10-fold stratified cross validation technique were used to estimate the out-of-sample errors. We named our method as GEPSVM + RBF + WTA and GEPSVM + WTA. (Result) The results showed that PCA reduced the 80 features to merely five with explaining 99.90% of total variance. The recall rate of GEPSVM + RBF + WTA achieved the highest overall reca

{'input_ids': [101, 1006, 7863, 1007, 1999, 2344, 2000, 3623, 5579, 10640, 1997, 5572, 1011, 4696, 8720, 1006, 22975, 2072, 1007, 2291, 1010, 2023, 3259, 3818, 1037, 3117, 3921, 1012, 1006, 4118, 1007, 1996, 3818, 4725, 2034, 15901, 4185, 3609, 2010, 3406, 13113, 2000, 6855, 3609, 2592, 1010, 1998, 2385, 4400, 7485, 14771, 23077, 2000, 6855, 1996, 14902, 2592, 1012, 2007, 1996, 6614, 1997, 8161, 1996, 3770, 2838, 1010, 4054, 6922, 4106, 2001, 17445, 2098, 1012, 1996, 4359, 2838, 2020, 2109, 2004, 7953, 2000, 18960, 1041, 29206, 10175, 5657, 4013, 9048, 9067, 2490, 9207, 3698, 1006, 16216, 4523, 2615, 2213, 1007, 1012, 3453, 1011, 3138, 1011, 2035, 1006, 21925, 1007, 2001, 2109, 2000, 5047, 1996, 4800, 26266, 3291, 1012, 2048, 16293, 2015, 2020, 7718, 1010, 7399, 16293, 1998, 15255, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

### Fine Tuning

In [20]:
MAX_LEN = 128
BATCH_SIZE = 16
EPOCHS = 3
NUM_OUT = 7
LEARNING_RATE = 2e-05

training_data = MultiLabelDataset(train_X, torch.from_numpy(train_y), tokenizer, MAX_LEN)
test_data = MultiLabelDataset(test_X, torch.from_numpy(test_y), tokenizer, MAX_LEN)

train_params = {'batch_size': BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }    

training_loader = torch.utils.data.DataLoader(training_data, **train_params)
testing_loader = torch.utils.data.DataLoader(test_data, **test_params)

### Train + Evaluate

In [10]:
torch.cuda.empty_cache()

In [12]:
model = BERTClass(NUM_OUT)
model.to(device)    

optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

for epoch in range(EPOCHS):
    loss = train(model, training_loader, optimizer)
    print(f'Epoch: {epoch}, Loss: {loss.mean().item()}')
    guess, targs = validation(model, testing_loader)
    guesses = torch.max(guess, dim=1).indices  
    targets = targs  
    print('Accuracy on test set: {}'.format(accuracy_score(guesses, targets)))

  0%|          | 0/2875 [00:00<?, ?it/s]

  'targets': torch.tensor(self.targets[index], dtype=torch.float)


Epoch: 0, Loss: 0.45446404814720154


  0%|          | 0/62 [00:00<?, ?it/s]

Accuracy on test set: 0.8365482233502538


  0%|          | 0/2875 [00:00<?, ?it/s]

  'targets': torch.tensor(self.targets[index], dtype=torch.float)


Epoch: 1, Loss: 0.3859385848045349


  0%|          | 0/62 [00:00<?, ?it/s]

Accuracy on test set: 0.8406091370558376


  0%|          | 0/2875 [00:00<?, ?it/s]

  'targets': torch.tensor(self.targets[index], dtype=torch.float)


Epoch: 2, Loss: 0.08846022933721542


  0%|          | 0/62 [00:00<?, ?it/s]

Accuracy on test set: 0.8467005076142132


  0%|          | 0/2875 [00:00<?, ?it/s]

  'targets': torch.tensor(self.targets[index], dtype=torch.float)


Epoch: 3, Loss: 0.03497802093625069


  0%|          | 0/62 [00:00<?, ?it/s]

Accuracy on test set: 0.8324873096446701


  0%|          | 0/2875 [00:00<?, ?it/s]

  'targets': torch.tensor(self.targets[index], dtype=torch.float)


Epoch: 4, Loss: 0.0783449038863182


  0%|          | 0/62 [00:00<?, ?it/s]

Accuracy on test set: 0.8406091370558376


### Electra Tokenizer + Class

In [16]:
e_tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')

In [26]:
class ELECTRAClass(torch.nn.Module):
    def __init__(self, NUM_OUT):
        super(ELECTRAClass, self).__init__()
                   
        self.l1 = ElectraModel.from_pretrained("google/electra-small-discriminator")
        # ELECTRA small has 256 hidden units
        self.classifier = torch.nn.Linear(256, NUM_OUT)

    def forward(self, input_ids, attention_mask, token_type_ids):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        output = self.classifier(pooler)

        return output

In [27]:
e_training_data = MultiLabelDataset(train_X, torch.from_numpy(train_y), e_tokenizer, MAX_LEN)
e_test_data = MultiLabelDataset(test_X, torch.from_numpy(test_y), e_tokenizer, MAX_LEN)

e_train_params = {'batch_size': BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

e_test_params = {'batch_size': BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }    

e_training_loader = torch.utils.data.DataLoader(e_training_data, **e_train_params)
e_testing_loader = torch.utils.data.DataLoader(e_test_data, **e_test_params)

### Train + Evaluate

In [28]:
e_model = ELECTRAClass(NUM_OUT)
e_model.to(device)

optimizer = torch.optim.Adam(params=e_model.parameters(), lr=LEARNING_RATE)

for epoch in range(EPOCHS):
    loss = train(e_model, e_training_loader, optimizer)
    print(f'Epoch: {epoch}, Loss: {loss.mean().item()}')
    guess, targs = validation(e_model, e_testing_loader)
    guesses = torch.max(guess, dim=1).indices
    targets = targs  
    print('Accuracy on test set: {}'.format(accuracy_score(guesses, targets)))

  0%|          | 0/2875 [00:00<?, ?it/s]

  'targets': torch.tensor(self.targets[index], dtype=torch.float)


Epoch: 0, Loss: 0.38243016600608826


  0%|          | 0/62 [00:00<?, ?it/s]

Accuracy on test set: 0.7928934010152284


  0%|          | 0/2875 [00:00<?, ?it/s]

  'targets': torch.tensor(self.targets[index], dtype=torch.float)


Epoch: 1, Loss: 0.16275136172771454


  0%|          | 0/62 [00:00<?, ?it/s]

Accuracy on test set: 0.8182741116751269


  0%|          | 0/2875 [00:00<?, ?it/s]

  'targets': torch.tensor(self.targets[index], dtype=torch.float)


Epoch: 2, Loss: 0.7416286468505859


  0%|          | 0/62 [00:00<?, ?it/s]

Accuracy on test set: 0.8355329949238579


### Questions & Answers
1) What does the Bert Tokenizer do?</br>
The BERT Tokenizer breaks the train_X data into tokens, then for each token it generates an ID that corresponds to the model's vocabulary.  It then pads or truncates the tokens to match the "maximum length."  It also generates token type ids and attention masks (distinguishes which tokens are real vs padding).
2) What loss function did you use? Why did you choose that loss function?</br>
I chose Categorical Cross Entropy as my loss function because it is better equipped to be used on data that has more than just 2 (binary) categories.
3) Try different batch sizes (e.g., 8 vs 16 vs 32). How does that affect your results?</br>
The batch size affected the training time first, the higher the batch size, the shorter the training time (at least for me).  The lower batch size required around 1-2 epochs more to get the same accuracy with the high batch size.  The difference between the batch sizes were usually around 1-1.5% with the highest accuracy being 16 with 5 epochs.
4) How do the results compare to BERT? </br>
I tried the small ELECTRA model because it was the fastest to tune.  With 3 epochs, I was able to get within 1% accuracy of the original BERT results.  My conclusion is that they performed relatively similar and maybe given 2 more epochs, the ELECTRA model could surpass it.
5) What is the power of fine-tuning (as opposed to pre-training)? </br>
The power of fine-tuning is that it allows us to use much less time, power and resources to achieve a similar-ish result as opposed to pre-training.  Pre-training is expensive and it costs companies millions of dollars to complete, while fine-tuning can be achieved on Jimmy's chromebook laptop.