In [1]:
import os
import seaborn as sns
import numpy as np
import random
from sklearn.utils import shuffle
import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
import transformers
import tokenizers
import datetime
import time
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, matthews_corrcoef

from datasets import load_dataset
import torch
from transformers import BertTokenizer, TrainingArguments
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup

import import_ipynb
from MyTrainer import MyTrainer

# Saving best practices: if you use the default names for the model, you can reload it using from_pretrained()

In [None]:
# Saving best practices: if you use the default names for the model, you can reload it using from_pretrained()
def save_my_model_and_tokenizer (model, tokenizer=None, output_dir='./model_save/'):
    """
    Save a model (and optionally its tokenizer) into `output_dir` using `save_pretrained ()`.

    To load the saved model just do:
    model     = BertForSequenceClassification.from_pretrained (output_dir)
    tokenizer = BertTokenizer.from_pretrained (output_dir, do_lower_case=False)

    Args:
        model:      object exposing `cpu ()`, `to ()`, `parameters ()` and `save_pretrained ()`;
                    if it has a `module` attribute (parallel wrapper), the wrapped module is saved.
        tokenizer:  optional object exposing `save_pretrained ()`; skipped when None.
        output_dir: target directory, created (including parents) when missing.

    Returns:
        None. The model is moved to the CPU while saving and is put back on the
        device it was on when the function was called.
    """

    # exist_ok=True creates the directory if needed without a separate (racy) existence check.
    os.makedirs (output_dir, exist_ok=True)

    print ("Saving model to %s" % output_dir)

    # Remember where the model currently lives so it can be restored afterwards.
    # (Calling `model.cuda ()` unconditionally would crash on a CPU-only machine.)
    first_param     = next (iter (model.parameters ()), None)
    original_device = first_param.device if first_param is not None else None

    # Save a trained model, configuration and tokenizer using `save_pretrained()`.
    # They can then be reloaded using `from_pretrained()`
    model.cpu ()
    try:
        model_to_save = model.module if hasattr (model, 'module') else model  # Take care of distributed/parallel training
        model_to_save.save_pretrained (output_dir)
        if tokenizer is not None:
            tokenizer.save_pretrained (output_dir)
    finally:
        # Restore the original placement even if saving failed, so training can continue.
        if original_device is not None:
            model.to (original_device)

    # Good practice: save your training arguments together with the trained model
    # torch.save(args, os.path.join(output_dir, 'training_args.bin'))
    return

# Load the saved model and tokenizer

In [2]:
# model = BertForSequenceClassification.from_pretrained ('./model_save/0/')
# tokenizer = BertTokenizer.from_pretrained ('./model_save/0/', do_lower_case=False)
# model

# My Trainer

In [None]:
def format_time (elapsed):
    '''
    Convert a duration given in seconds into a clock-style string.

    The duration is rounded to the nearest whole second and rendered the way
    `datetime.timedelta` prints itself, e.g. 3661.4 -> '1:01:01'.
    '''
    whole_seconds = int (round (elapsed))
    duration      = datetime.timedelta (seconds=whole_seconds)
    return str (duration)

In [None]:
def compute_metrics (labels, pred_logits):
    """
    Score predictions against reference labels.

    The predicted class of each sample is the argmax of `pred_logits` over its
    last axis. Precision, recall and F1 are macro-averaged.

    Returns a dict with the keys 'mcc', 'accuracy', 'f1', 'precision' and 'recall'.
    """
    predicted_classes = pred_logits.argmax (-1)

    macro_precision, macro_recall, macro_f1, _support = precision_recall_fscore_support (
        labels, predicted_classes, average='macro'
    )
    accuracy = accuracy_score (labels, predicted_classes)
    # Matthews correlation coefficient
    matthews = matthews_corrcoef (labels, predicted_classes)

    return {
        'mcc'      : matthews,
        'accuracy' : accuracy,
        'f1'       : macro_f1,
        'precision': macro_precision,
        'recall'   : macro_recall
    }

In [1]:
class MyTrainer:
    """
    Small hand-written training loop around a Hugging Face style model.

    The trainer builds the optimizer, the data loaders and a linear warmup
    learning-rate schedule in `__init__`, trains with `train ()`, scores the
    evaluation set with `evaluate ()` and produces logits with `predict ()`.
    Batches are expected to be dicts of tensors that can be passed to the model
    as keyword arguments; the model output must expose `.loss` and `.logits`.

    `args` is read for: learning_rate, adam_epsilon, num_train_epochs,
    per_device_train_batch_size, per_device_eval_batch_size, max_steps,
    eval_steps and num_warmup_steps / warmup_steps.
    """

    def __init__(self, model, args, train_dataset, eval_dataset, tokenizer, compute_metrics=compute_metrics):
        """
        Args:
            model:           model to train; it is moved to the detected device.
            args:            training arguments (see class docstring for the fields used).
            train_dataset:   dataset used for training.
            eval_dataset:    dataset used for validation.
            tokenizer:       only used to save the tokenizer next to the model checkpoints.
            compute_metrics: callable (labels, logits) -> dict with the keys
                             'mcc', 'f1', 'precision', 'recall' and 'accuracy'.
        """

        self.model           = model
        self.args            = args
        self.train_dataset   = train_dataset
        self.eval_dataset    = eval_dataset
        self.tokenizer       = tokenizer
        self.compute_metrics = compute_metrics
        self.isTrained       = False
        self.device          = self.get_device_type ()

        # Every batch is moved to self.device, so the model has to live there too.
        self.model.to (self.device)

        # Note: AdamW is a class from the huggingface library (as opposed to pytorch)
        # I believe the 'W' stands for 'Weight Decay fix"
        self.optimizer = AdamW (model.parameters (),
                           lr  = args.learning_rate,
                           eps = args.adam_epsilon # default is 1e-8, "a very small number to prevent any division by zero"
        )

        # Number of training epochs. The BERT authors recommend between 2 and 4.
        # `num_train_epochs` may be a float (e.g. 3.0); range () needs an int,
        # so fractional epochs are truncated.
        self.epochs = int (self.args.num_train_epochs)
        self.train_dataloader, self.validation_dataloader, self.lr_scheduler, self.num_training_steps = self.get_dataLoaders ()
        return

    def get_device_type (self):
        """Return the torch device to use: CUDA when available, otherwise the CPU."""

        if torch.cuda.is_available ():

            # Tell PyTorch to use the GPU.
            device = torch.device ("cuda")

            print('There are %d GPU(s) available.' % torch.cuda.device_count ())

            print('We will use the GPU:', torch.cuda.get_device_name(0))
        else:
            print('No GPU available, using the CPU instead.')
            device = torch.device ("cpu")
        return device

    def get_dataLoaders (self):
        """
        Build the data loaders and the learning-rate scheduler.

        Returns:
            (train_dataloader, validation_dataloader, lr_scheduler, num_training_steps)
        """

        # Create the DataLoaders for our training and validation sets.
        # Iterable datasets cannot be combined with a sampler.
        if isinstance (self.train_dataset, torch.utils.data.IterableDataset):
            train_sampler = None
        else:
            train_sampler = SequentialSampler (self.train_dataset)
        train_dataloader = DataLoader (
                    self.train_dataset,  # The training samples.
                    sampler = train_sampler,
                    batch_size = self.args.per_device_train_batch_size  # Trains with this batch size.
        )

        # For validation the order doesn't matter, so we'll just read them sequentially.
        validation_dataloader = DataLoader (
                    self.eval_dataset,             # The validation/dev samples.
                    sampler = SequentialSampler (self.eval_dataset),
                    batch_size = self.args.per_device_eval_batch_size  # Evaluate with this batch size.
        )

        # Total number of training steps is [number of batches] x [number of epochs],
        # capped by max_steps when that is positive (train () stops at max_steps).
        # (Note that this is not the same as the number of training samples).
        num_training_steps = len (train_dataloader) * self.epochs
        if self.args.max_steps > 0:
            num_training_steps = min (num_training_steps, self.args.max_steps)

        # `TrainingArguments` names this field `warmup_steps`; `num_warmup_steps`
        # is still honoured for custom argument objects that define it.
        num_warmup_steps = getattr (self.args, 'num_warmup_steps', None)
        if num_warmup_steps is None:
            num_warmup_steps = getattr (self.args, 'warmup_steps', 0)

        # Create the learning rate scheduler.
        lr_scheduler = get_linear_schedule_with_warmup (self.optimizer,
                                                        num_warmup_steps   = num_warmup_steps,
                                                        num_training_steps = num_training_steps)
        return train_dataloader, validation_dataloader, lr_scheduler, num_training_steps


    def test_iterate_dataloader (self):
        """Debug helper: print every (step, batch) pair of the training data loader."""

        for step, batch in enumerate (self.train_dataloader):
            print (step, batch)
        return


    def _evaluate_and_checkpoint (self, epoch, avg_train_loss, training_time, training_stats, min_val_loss):
        """
        Run one validation pass, append its statistics to `training_stats` and
        save the model when the validation loss improved.

        Returns the (possibly updated) minimum validation loss.
        """

        avg_val_loss, avg_val_f1, avg_val_mcc, avg_val_precision, avg_val_recall, avg_val_accuracy, validation_time = self.evaluate ()
        training_stats.append ({
                'epoch'         : epoch,
                'training_loss' : avg_train_loss,
                'eval_loss'     : avg_val_loss,
                'eval_f1'       : avg_val_f1,
                'eval_mcc'      : avg_val_mcc,
                'eval_precision': avg_val_precision,
                'eval_recall'   : avg_val_recall,
                'eval_accuracy' : avg_val_accuracy,
                'training_time' : training_time,
                'eval_time'     : validation_time
        })
        # save this model if the eval loss decreases from the minimum so far
        if avg_val_loss < min_val_loss:
            min_val_loss = avg_val_loss
            save_my_model_and_tokenizer (self.model, self.tokenizer, output_dir='./model_save/')
        return min_val_loss


    def train (self, is_train_base_encoding_model=True):
        """
        Train the model and return the list of per-evaluation statistics.

        Args:
            is_train_base_encoding_model: when False, the parameters of
                `self.model.base_model` are frozen so only the remaining layers
                are tuned. Periodic mid-epoch evaluation (every `args.eval_steps`
                steps) only runs when this is True.

        Behaviour:
            * The model is evaluated at the end of every epoch, and it is saved
              to './model_save/' whenever the validation loss reaches a new minimum.
            * When `args.max_steps` is positive, training stops after that many
              optimizer steps in total (a final evaluation is still performed).

        Returns:
            list of dicts, one per evaluation run (see `_evaluate_and_checkpoint`).
        """

        # for tuning only the outer classifier layer and not the base bert encoding layer
        if not is_train_base_encoding_model:
            for param in self.model.base_model.parameters ():
                param.requires_grad = False

        # This training code is based on the `run_glue.py` script here:
        # https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128

        # Set the seed value all over the place to make this reproducible.
        seed_val = 42
        random.seed (seed_val)
        np.random.seed (seed_val)
        torch.manual_seed (seed_val)
        torch.cuda.manual_seed_all (seed_val)

        # Training and validation loss, validation metrics and timings.
        training_stats = []
        # Measure the total training time for the whole run.
        total_t0 = time.time ()
        # Start at +inf so the first evaluation always counts as an improvement.
        min_val_loss = float ('inf')

        max_steps  = self.args.max_steps
        # eval_steps may be None on the arguments object; treat that as "disabled".
        eval_steps = self.args.eval_steps or 0
        # Optimizer steps performed over the whole run (not reset per epoch).
        global_step       = 0
        reached_max_steps = False

        # For each epoch...
        for epoch_i in range (0, self.epochs):

            # ========================================
            #               Training
            # ========================================

            print("")
            print('======== Epoch {:} / {:} ========'.format (epoch_i + 1, self.epochs))
            print('Training...')

            # Measure how long the training epoch takes.
            t0 = time.time ()

            # Reset the total loss and step counter for this epoch.
            total_train_loss = 0
            steps_this_epoch = 0

            # Put the model into training mode. Don't be misled--the call to
            # `train` just changes the *mode*, it doesn't *perform* the training.
            # `dropout` and `batchnorm` layers behave differently during training
            # vs. test (source: https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch)
            self.model.train ()

            # For each batch of training data...
            for step, batch in enumerate (self.train_dataloader):

                # Progress update every 50 batches.
                if step % 50 == 0 and not step == 0:
                    elapsed = format_time (time.time() - t0)
                    print ('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len (self.train_dataloader), elapsed))

                # Periodic mid-epoch evaluation; `step` batches have been processed so far.
                if is_train_base_encoding_model and eval_steps > 0 and step > 0 and step % eval_steps == 0:

                    avg_train_loss = total_train_loss / step
                    training_time = format_time (time.time () - t0)
                    print ("Running Validation...")
                    min_val_loss = self._evaluate_and_checkpoint (epoch_i + 1, avg_train_loss, training_time,
                                                                  training_stats, min_val_loss)
                    # evaluate () switched the model to eval mode; switch back,
                    # otherwise dropout stays disabled for the rest of the epoch.
                    self.model.train ()

                # Always clear any previously calculated gradients before performing a
                # backward pass. PyTorch doesn't do this automatically because
                # accumulating the gradients is "convenient while training RNNs".
                # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
                self.model.zero_grad ()

                # Copy every tensor of the batch to the training device, then run a
                # forward pass. The output carries the loss (labels are part of the
                # batch) and the "logits"--the model outputs prior to activation.
                for k in batch:
                    batch[k] = batch[k].to (self.device)
                output = self.model (**batch)
                loss   = output.loss

                # Accumulate the training loss over all of the batches so that we can
                # calculate the average loss at the end. `.item()` returns the
                # Python value of the single-element loss tensor.
                total_train_loss += loss.item ()

                # Perform a backward pass to calculate the gradients.
                loss.backward ()

                # Clip the norm of the gradients to 1.0.
                # This is to help prevent the "exploding gradients" problem.
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)

                # Update parameters and take a step using the computed gradient.
                self.optimizer.step ()

                # Update the learning rate.
                self.lr_scheduler.step ()

                steps_this_epoch += 1
                global_step      += 1
                # max_steps > 0 limits the total number of optimizer steps.
                if max_steps > 0 and global_step >= max_steps:
                    reached_max_steps = True
                    break

            # At the end of each epoch measure stats and eval.
            # Average over the batches actually processed (the epoch may have been
            # cut short by max_steps); max () guards against an empty data loader.
            avg_train_loss = total_train_loss / max (steps_this_epoch, 1)
            # Measure how long this epoch took.
            training_time = format_time (time.time () - t0)
            print("  Average training loss: {0:.2f}".format (avg_train_loss))
            print("  Training epcoh took: {:}".format (training_time))
            print ("\n  Running Validation...")

            # Record all statistics from this epoch and save the model if the
            # eval loss decreased from the minimum so far.
            min_val_loss = self._evaluate_and_checkpoint (epoch_i + 1, avg_train_loss, training_time,
                                                          training_stats, min_val_loss)

            if reached_max_steps:
                print ("  Reached max_steps = {:}, stopping training.".format (max_steps))
                break

        print ("")
        print ("Training complete!")
        print ("Total training took {:} (h:mm:ss)".format (format_time (time.time ()-total_t0)))
        self.isTrained = True
        self.plot_train_stats (training_stats)
        return training_stats

    def evaluate (self):
        """
        Score the model on the validation data loader.

        Metrics are computed per batch with `self.compute_metrics` and then
        averaged over the number of batches (they are not computed once over
        all predictions). The model is left in evaluation mode.

        Returns:
            (avg_val_loss, avg_val_f1, avg_val_mcc, avg_val_precision,
             avg_val_recall, avg_val_accuracy, validation_time)
        """

        t0 = time.time ()
        # Put the model in evaluation mode--the dropout layers behave differently
        # during evaluation.
        self.model.eval ()

        # Running sums over all validation batches.
        metric_totals   = {'mcc': 0, 'f1': 0, 'precision': 0, 'recall': 0, 'accuracy': 0}
        total_eval_loss = 0

        # Evaluate data for one epoch
        for batch in self.validation_dataloader:

            # Copy each tensor of the batch to the evaluation device.
            for k in batch:
                batch[k] = batch[k].to (self.device)

            # Tell pytorch not to bother with constructing the compute graph during
            # the forward pass, since this is only needed for backprop (training).
            with torch.no_grad ():
                output = self.model (**batch)
            loss   = output.loss
            logits = output.logits

            # Accumulate the validation loss.
            total_eval_loss += loss.item ()

            # Move logits and labels to CPU
            logits    = logits.detach ().cpu ().numpy ()
            label_ids = batch['labels'].to ('cpu').numpy ()

            # Calculate the metrics for this batch and accumulate them.
            metrics = self.compute_metrics (label_ids, logits)
            for name in metric_totals:
                metric_totals[name] += metrics[name]

        num_batches = len (self.validation_dataloader)

        # Report the averaged metrics for this validation run.
        avg_val_f1 = metric_totals['f1'] / num_batches
        print ("  F1: {0:.3f}".format (avg_val_f1))
        avg_val_mcc = metric_totals['mcc'] / num_batches
        print ("  MCC: {0:.3f}".format (avg_val_mcc))
        avg_val_precision = metric_totals['precision'] / num_batches
        print ("  Precision: {0:.3f}".format (avg_val_precision))
        avg_val_recall = metric_totals['recall'] / num_batches
        print ("  Recall: {0:.3f}".format (avg_val_recall))
        avg_val_accuracy = metric_totals['accuracy'] / num_batches
        print ("  Accuracy: {0:.3f}".format (avg_val_accuracy))
        # Calculate the average loss over all of the batches.
        avg_val_loss = total_eval_loss / num_batches
        # Measure how long the validation run took.
        validation_time = format_time (time.time () - t0)
        print ("  Validation Loss: {0:.2f}".format (avg_val_loss))
        print ("  Validation took: {:}".format (validation_time))
        return avg_val_loss, avg_val_f1, avg_val_mcc, avg_val_precision, avg_val_recall, avg_val_accuracy, validation_time

    def plot_train_stats (self, training_stats):
        """
        Print and plot the validation metrics stored in `training_stats`.

        There is one point per evaluation run. With mid-epoch evaluations there
        are more runs than epochs, so the x axis counts evaluation runs.
        """

        if not training_stats:
            print ('No training statistics to plot.')
            return

        # Classification Report curve
        mccs       = [e['eval_mcc'] for e in training_stats]
        accuracies = [e['eval_accuracy'] for e in training_stats]
        f1_scores  = [e['eval_f1'] for e in training_stats]
        precisions = [e['eval_precision'] for e in training_stats]
        recalls    = [e['eval_recall'] for e in training_stats]
        losses     = [e['eval_loss'] for e in training_stats]
        # x must have exactly one entry per recorded evaluation.
        eval_runs  = np.arange (1, len (training_stats) + 1)

        print ('mccs:',       mccs)
        print ('accuracies:', accuracies)
        print ('precisions:', precisions)
        print ('recalls:',    recalls)
        print ('f1_scores:',  f1_scores)
        print ('losses:',     losses)

        sns.lineplot (x=eval_runs, y=mccs,       label='val_mcc')
        sns.lineplot (x=eval_runs, y=accuracies, label='val_accuracy')
        sns.lineplot (x=eval_runs, y=precisions, label='val_precision')
        sns.lineplot (x=eval_runs, y=recalls,    label='val_recall')
        sns.lineplot (x=eval_runs, y=f1_scores,  label='val_f1')
        plt.xlabel ('evaluation run')
        plt.ylabel ('score')
        plt.show ()
        return


    def getTrainedModel (self):
        """Return the model once `train ()` has completed, otherwise None."""

        if self.isTrained:
            return self.model
        return None


    def predict (self, prediction_dataset, isRemoveLabels=True):
        """
        Run the model over `prediction_dataset` and collect its logits.

        Args:
            prediction_dataset: dataset whose batches are dicts of tensors.
            isRemoveLabels:     when True, a 'labels' entry (if any) is removed
                                from each batch before the forward pass.

        return: pred_logits, true_labels, metrics
            true_labels and metrics are None unless the batches of
            prediction_dataset contain a 'labels' entry; metrics is the result
            of `self.compute_metrics (true_labels, pred_logits)`.
        """

        prediction_sampler    = SequentialSampler (prediction_dataset)
        prediction_dataloader = DataLoader (prediction_dataset, sampler=prediction_sampler, batch_size=self.args.per_device_eval_batch_size)
        print ('Predicting labels for {:,} test sentences...'.format (len (prediction_dataset)))

        # Put model in evaluation mode
        self.model.eval ()

        # Tracking variables
        predictions = []
        true_labels = []

        # Predict
        for batch in prediction_dataloader:

            # Move the batch to the device the model runs on.
            batch = {t:batch[t].to (self.device) for t in batch}

            # Keep the reference labels (when present) before they are possibly removed.
            if 'labels' in batch:
                true_labels.append (batch['labels'].detach ().cpu ().numpy ())
            if isRemoveLabels:
                batch.pop ('labels', None)

            # Telling the model not to compute or store gradients, saving memory and
            # speeding up prediction
            with torch.no_grad ():
                outputs = self.model (**batch)

            # Read the logits by name: when labels are passed to the model the
            # first output element is the loss, not the logits.
            logits = getattr (outputs, 'logits', None)
            if logits is None:
                logits = outputs[0]

            # Move logits to CPU and store them
            predictions.append (logits.detach ().cpu ().numpy ())
            print ('Done predictions for ', len(predictions), '/', len(prediction_dataloader), 'batches')
        print ('Done prediction')

        # Combine the results across all batches to get the predicted logits
        pred_logits = np.concatenate (predictions, axis=0)

        if not true_labels:
            return pred_logits, None, None

        label_ids = np.concatenate (true_labels, axis=0)
        metrics   = self.compute_metrics (label_ids, pred_logits)
        return pred_logits, label_ids, metrics

# sample training_args and trainer

# NOTE: `model`, `tokenizer`, `train_dataset` and `eval_dataset` are not created in
# this sample; they must already be defined (e.g. loaded in an earlier cell).

# Tell pytorch to run this model on the GPU when one is available, otherwise on the CPU.
# (An unconditional `model.cuda()` fails on machines without CUDA.)
model.to (torch.device ("cuda" if torch.cuda.is_available () else "cpu"))

training_args = TrainingArguments (
    
    output_dir='./results',          # output directory
    num_train_epochs=1,              # total # of training epochs
    warmup_steps=1000,               # for lr scheduling
    eval_steps=5000,                 # Number of update steps between two evaluations, if <=0 then eval at end of each epoch
    max_steps=5000,                  # If set to a positive number, the total number of training steps to perform. Overrides num_train_epochs
    # learning_rate=2e-5             # args.learning_rate - default is 5e-5, our notebook has 2e-5
    # adam_epsilon=1e-8              # - default is 1e-8 is “a very small number to prevent any division by zero"
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=16,   # batch size for evaluation
)

trainer = MyTrainer (
    
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=eval_dataset,           # evaluation dataset
    tokenizer=tokenizer,                 # this is used only to save the tokenizer along with the model during training
    compute_metrics=compute_metrics,
)