## Imports & Prerequisites

In [None]:
import os
import sys
import shutil 
from distutils.dir_util import copy_tree # Shutil doesn't preserve meta
import random
import time
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import HTML
%matplotlib inline

  import pandas.util.testing as tm


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
plt.style.use("ggplot")

In [None]:
pd.set_option('display.max_columns', None)  
pd.set_option('display.expand_frame_repr', False)
pd.set_option('max_colwidth', None)

In [None]:
import tensorflow as tf
import torch

Install Nvidia Apex library

In [None]:
# %%writefile setup.sh

# git clone https://github.com/NVIDIA/apex
# pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./apex

Writing setup.sh


In [None]:
# !sh setup.sh

In [None]:
from apex import amp

Install HuggingFace Transformers library

In [None]:
#!pip install transformers

Torch Dataset parent and utility classes

In [None]:
from torch.utils.data.dataset import Dataset
from torch.utils.tensorboard import SummaryWriter

Get interfaces from Transformers

In [None]:
from transformers import Trainer, HfArgumentParser, TrainingArguments, EvalPrediction, set_seed

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig

In [None]:
from transformers import PreTrainedTokenizer, default_data_collator, PreTrainedModel

Support for Data Classes and Annotations

In [None]:
from dataclasses import dataclass, field
from typing import Dict, Optional, Any
from typing import List

## Data download & summary

### Download all tasks

In [None]:
gdrive_tasks_dir = '/content/drive/My Drive/model_csv_files/bert/' # Change this

In [None]:
task1_filenames = ['task1_pres_elg_train.csv', 'task1_pres_elg_val.csv', 'task1_pres_elg_test.csv']

In [None]:
task2_filenames = ['task2_cond_elg_train.csv', 'task2_cond_elg_val.csv', 'task2_cond_elg_test.csv']

In [None]:
task3_filenames = ['task3_cond_intv_train.csv', 'task3_cond_intv_val.csv', 'task3_cond_intv_test.csv']

In [None]:
copy_tree(gdrive_tasks_dir, '/content/data/')

['/content/data/task1_pres_elg_train.csv',
 '/content/data/task1_pres_elg_val.csv',
 '/content/data/task1_pres_elg_test.csv',
 '/content/data/task2_cond_elg_train.csv',
 '/content/data/task2_cond_elg_test.csv',
 '/content/data/task2_cond_elg_val.csv',
 '/content/data/task3_cond_intv_val.csv',
 '/content/data/task3_cond_intv_test.csv',
 '/content/data/task3_cond_intv_train.csv']

## Objective/s
Finetune BERT for 3 separate tasks:
- **`Task 1`** Input: Prescription -> Output: Eligibility (Binary)
- **`Task 2`** Input: Condition -> Output: Eligibility (Binary)
- **`Task 3`** Input: Condition ->  Output: Intervention (Multi class)

BERT Finetuning task: Sequence Classification

Using [Trainer](https://huggingface.co/transformers/main_classes/trainer.html) and ['BertForSequenceClassification'](https://huggingface.co/transformers/model_doc/bert.html#tfbertforsequenceclassification) from [huggingface](https://huggingface.co/)

## Define implementation

### Classes

In [None]:
@dataclass
class Features:
  """Encoded form of a single example: token ids, attention mask and label."""
  # Token ids of the encoded input text
  input_ids: List[int]
  # Attention mask, one entry per token id
  attention_mask: List[int]
  # Target class of the example
  label: int

In [None]:
@dataclass
class ModelParameters:
  """Model-side options; each field's `help` metadata describes the option."""
  # Name or path of the pretrained checkpoint to start from
  model_name: str = field(default = None, metadata = {'help': 'specify pretrained `model name` or `path`'})
  # Upper bound on the tokenized sequence length
  max_seq_len: Optional[int] = field(default = None, metadata = {'help': 'maximum seq len'})
  # Pad per batch instead of padding every example to `max_seq_len`
  dynamic_padding: bool = field(default = False, metadata = {'help': 'limit pad size at batch level'})
  # Group examples of similar length into the same batch
  smart_batching: bool = field(default = False, metadata = {'help': 'build batch of similar sizes'})

In [None]:
# Dataset sub-class that tokenizes DataFrame rows lazily, one row per request
class SequenceDataset(Dataset):
  """Serve the rows of a DataFrame, in their stored order, as encoded `Features`.

  Rows are handed out sequentially through an internal cursor; the `idx`
  passed to `__getitem__` is intentionally ignored so the row order of
  `data_df` is what reaches the model. The cursor wraps around once every
  row has been served.
  """

  def __init__(self, tokenizer: "PreTrainedTokenizer", pad_to_max_length: bool, max_len: int,
               data_df: pd.DataFrame, input_column_name: str, output_column_name: str) -> None:
    """Store the tokenizer, encoding options and the data to serve.

    Args:
      tokenizer: tokenizer whose `encode_plus` is used to encode each row.
      pad_to_max_length: forwarded to `encode_plus` as `pad_to_max_length`.
      max_len: forwarded to `encode_plus` as `max_length` (truncation limit).
      data_df: rows to serve; their positional order is the serving order.
      input_column_name: column holding the input text.
      output_column_name: column holding the label.
    """
    self.tokenizer = tokenizer
    self.max_len = max_len
    self.pad_to_max_length = pad_to_max_length
    self.data_df: pd.DataFrame = data_df
    # Position of the next row to serve
    self.current_row_index: int = 0
    self.input_column_name: str = input_column_name
    self.output_column_name: str = output_column_name

  def encode_sequence(self, sequence: pd.Series) -> "Features":
    """Tokenize the input column of one row and bundle it with the row's label."""
    # encode_plus returns a dictionary
    encode_dict = self.tokenizer.encode_plus(text = sequence[self.input_column_name],
                                             add_special_tokens = True,
                                             max_length = self.max_len,
                                             truncation = True, # Uses max_length to shorten the sequence to 'self.max_len'
                                             pad_to_max_length = self.pad_to_max_length,
                                             return_token_type_ids = False,
                                             return_attention_mask = True,
                                             return_overflowing_tokens = False,
                                             return_special_tokens_mask = False
                                             )
    return Features(input_ids = encode_dict['input_ids'],
                    attention_mask = encode_dict['attention_mask'],
                    label = sequence[self.output_column_name])

  def __getitem__(self, idx) -> "Features":
    """Return the next row in stored order, encoded; `idx` is ignored."""
    # Once every row has been served, rewind the cursor to the beginning
    if self.current_row_index == self.data_df.shape[0]:
      self.current_row_index = 0
    # Positional lookup: the frame may have been sorted or shuffled without
    # its index being reset, and a label lookup (`.loc`) would then return
    # rows in the original file order instead of the stored order.
    sequence = self.data_df.iloc[self.current_row_index]
    # Advance the cursor only after the row has been extracted
    self.current_row_index += 1
    return self.encode_sequence(sequence)

  def __len__(self) -> int:
    """Return the number of rows in the underlying DataFrame."""
    return self.data_df.shape[0]


In [None]:
def dynamic_data_collator(batch: List[Features]) -> Dict[str, torch.Tensor]:
  """Pad every example of a batch to the batch's longest input and tensorize it.

  Args:
    batch: encoded examples belonging to one batch.

  Returns:
    Dict with 'input_ids', 'attention_mask' and 'labels' as `torch.long` tensors.
  """
  # Length of the longest input in this batch; everything is padded up to it
  longest = max([len(example.input_ids) for example in batch])
  # Both token ids and attention masks are padded with 0
  padded_ids = [pad_sequence(example.input_ids, longest, 0) for example in batch]
  padded_masks = [pad_sequence(example.attention_mask, longest, 0) for example in batch]
  # Labels are taken as they are
  labels = [example.label for example in batch]

  return {
      'input_ids': torch.tensor(padded_ids, dtype=torch.long),
      'attention_mask': torch.tensor(padded_masks, dtype=torch.long),
      'labels': torch.tensor(labels, dtype=torch.long)
  }

### Helper Functions

In [None]:
def convert_args_dict_to_arg_list(arg_dict: Dict[str, Any]) -> List[str]:
  """Flatten an {option: value} mapping into a command-line style token list.

  Boolean values act as switches: the option name alone is emitted when the
  value is True and nothing is emitted when it is False. Any other value
  yields the option name followed by the value formatted as a string.
  """
  cli_tokens = []
  for flag, value in arg_dict.items():
    if isinstance(value, bool):
      if value:
        cli_tokens.append(f"{flag}")
      continue
    cli_tokens.extend([f"{flag}", f"{value}"])
  return cli_tokens

In [None]:
def load_data(path: str, X_col_name: str, sort: Optional[bool] = False) -> pd.DataFrame:
  """Read a CSV file and optionally order its rows by input length.

  Args:
    path: location of the CSV file.
    X_col_name: text column whose whitespace-separated word count drives the sort.
    sort: when truthy, rows are returned in ascending word-count order
      (preparation for smart batching). The original index labels are kept.

  Returns:
    The loaded (and possibly reordered) DataFrame.
  """
  frame = pd.read_csv(path)
  if not sort:
    return frame
  # Words per row, then the row labels in ascending word-count order
  word_counts = frame[X_col_name].str.split().str.len().rename("count")
  ordered_labels = word_counts.sort_values().index
  return frame.loc[ordered_labels]

In [None]:
def organize_data(data_df: pd.DataFrame, batch_size: int, smart_batching: bool, forTrain: bool = True) -> pd.DataFrame:
  """Arrange the rows of `data_df` in the order they should be fed to the model.

  Args:
    data_df: rows to arrange (already length-sorted when smart batching is used).
    batch_size: batch size used to derive how many chunks the data is cut into.
    smart_batching: when True, consecutive rows are kept together in chunks and
      only the order of the chunks is shuffled; when False, rows are shuffled
      individually.
    forTrain: when False the data is returned untouched.

  Returns:
    The rearranged DataFrame. Index labels are preserved, not reset.
  """
  # Evaluation/test data keeps its order; empty data has nothing to arrange
  if not forTrain or data_df.shape[0] == 0:
    return data_df

  if not smart_batching:
    # Random batching: a plain row-level shuffle
    return data_df.sample(frac=1)

  # Approximate number of batches
  n_batches = np.ceil(data_df.shape[0] / batch_size).astype(int)
  # Split row *positions* rather than the DataFrame itself: np.array_split on a
  # DataFrame relies on the deprecated DataFrame.swapaxes. The chunk boundaries
  # are the same as before.
  # NOTE(review): chunks are near-equal in size, so they only approximate
  # batches of exactly `batch_size` rows.
  chunk_positions = np.array_split(np.arange(data_df.shape[0]), n_batches)
  chunks = [data_df.iloc[positions] for positions in chunk_positions]
  # Shuffle the order of the chunks while leaving each chunk's rows intact
  np.random.shuffle(chunks)
  return pd.concat(chunks, axis=0)

In [None]:
def load_pretrained_model(pretrained_model_name_or_path: str,
                          use_cuda: bool,
                          mixed_precision: bool,
                          num_labels: int) -> PreTrainedModel:
  """Load a pretrained sequence-classification model.

  Args:
    pretrained_model_name_or_path: model name or path passed to `from_pretrained`.
    use_cuda: move the model to the GPU when one is available.
    mixed_precision: initialize the model with Apex AMP at opt level 'O1'.
    num_labels: number of target classes, set on the model config.

  Returns:
    The loaded (and optionally GPU-resident / AMP-initialized) model.

  Raises:
    ValueError: if mixed-precision initialization fails; the underlying
      error is attached as the cause.
  """
  # Load the model config with the requested number of labels
  model_config = AutoConfig.from_pretrained(
      pretrained_model_name_or_path = pretrained_model_name_or_path,
      num_labels = num_labels)
  # Load the model using that config
  model = AutoModelForSequenceClassification.from_pretrained(
      pretrained_model_name_or_path = pretrained_model_name_or_path,
      config = model_config
  )
  # Utilize GPU if specified and available
  if use_cuda and torch.cuda.is_available():
    model.to(torch.device('cuda'))

  # Utilize mixed precision if specified
  if mixed_precision:
    try:
      model = amp.initialize(model, opt_level='O1')
    except Exception as err:
      # Catch Exception only, so KeyboardInterrupt/SystemExit still propagate,
      # and chain the original error so its traceback is not lost.
      raise ValueError('Trouble initializing `mixed precision` on the initialized model') from err

  return model

In [None]:
def pad_sequence(seq: List[int], max_input_size: int, pad_value: int) -> List[int]:
  """Return a new list: `seq` right-padded with `pad_value` up to `max_input_size`.

  A sequence that is already at least `max_input_size` long comes back
  with its contents unchanged (no truncation).
  """
  missing = max_input_size - len(seq)
  # A non-positive count yields an empty padding list
  padding = [pad_value] * missing
  return seq + padding

In [None]:
def compute_acc_metric(p: EvalPrediction) -> Dict:
    """Compute accuracy: the share of argmax predictions that equal the labels.

    Returns:
        A dict with the single key "acc".
    """
    # Predicted class = position of the largest score along axis 1
    predicted_classes = np.argmax(p.predictions, axis=1)
    hits = predicted_classes == p.label_ids
    return {"acc": hits.mean()}

### Train

In [None]:
def train(finetuning_args_dict: Dict[str, Any],
          train_file_path: str, X_col_name: str, y_col_name: str, num_labels: int,
          validation_file_path: Optional[str] = None,):
  """Fine-tune a pretrained sequence-classification model and save it.

  Args:
    finetuning_args_dict: option -> value mapping, parsed into
      `TrainingArguments` and `ModelParameters`.
    train_file_path: CSV file with the training data.
    X_col_name: name of the input text column.
    y_col_name: name of the label column.
    num_labels: number of target classes.
    validation_file_path: CSV file with validation data. Required only when
      `evaluate_during_training` is enabled; may be omitted otherwise.

  Returns:
    Tuple `(trainer, eval_result)`; `eval_result` is None unless
    `evaluate_during_training` is enabled.

  Raises:
    ValueError: if validation is requested without a validation file, or a
      supplied data file does not exist.
  """
  # Trainer needs Training Arguments to access all the points of customization during training.
  # HfArgumentParser is subclass of ArgumentParser, uses type hints on dataclasses to generate arguments.
  parser = HfArgumentParser((TrainingArguments, ModelParameters))
  trainer_args, model_args = parser.parse_args_into_dataclasses(convert_args_dict_to_arg_list(finetuning_args_dict))

  validate = trainer_args.evaluate_during_training

  # If validating, a validation file must have been supplied
  if validation_file_path is None and validate:
    raise ValueError("'validation_file_path' must be a supplied, when 'evaluate_during_training == True'.")
  # Check that the data files exist. The validation path is only checked when
  # one was given: os.path.exists(None) raises TypeError, which used to make
  # training without a validation file impossible.
  train_missing = not os.path.exists(train_file_path)
  validation_missing = validation_file_path is not None and not os.path.exists(validation_file_path)
  if train_missing or validation_missing:
    raise ValueError("Please make sure specified data files exist at their location and rerun.")

  # Load train data (sorted by length when smart batching is enabled)
  train_data_df = load_data(train_file_path, X_col_name, model_args.smart_batching)

  # Auto load respective tokenizer (Eg: name = bert-base-uncased)
  tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path = model_args.model_name)

  # Organize Train data
  organized_train_df = organize_data(data_df = train_data_df,
                                     batch_size = trainer_args.per_gpu_train_batch_size,
                                     smart_batching = model_args.smart_batching,
                                     forTrain = True)

  # Construct Train Dataset; padding to max length is disabled when dynamic padding is on
  train_dataset = SequenceDataset(tokenizer = tokenizer,
                                  max_len = model_args.max_seq_len,
                                  pad_to_max_length = not model_args.dynamic_padding,
                                  data_df = organized_train_df,
                                  input_column_name = X_col_name,
                                  output_column_name = y_col_name)

  # Load the pretrained model
  model = load_pretrained_model(pretrained_model_name_or_path = model_args.model_name,
                                use_cuda = True,
                                mixed_precision = trainer_args.fp16,
                                num_labels = num_labels)

  validation_dataset = None
  # We validate (if requested)
  if validate:
    # Load Validation dataset
    val_data_df = load_data(validation_file_path, X_col_name, model_args.smart_batching)
    # Organize Validation batches (forTrain=False leaves the order untouched)
    organized_val_df = organize_data(data_df=val_data_df,
                                     batch_size=trainer_args.per_gpu_train_batch_size,
                                     smart_batching = model_args.smart_batching,
                                     forTrain=False)
    # Construct Validation Dataset
    validation_dataset = SequenceDataset(tokenizer = tokenizer,
                                         max_len = model_args.max_seq_len,
                                         pad_to_max_length = not model_args.dynamic_padding,
                                         data_df = organized_val_df,
                                         input_column_name = X_col_name,
                                         output_column_name = y_col_name)

  # Define and initialize a Trainer
  # https://huggingface.co/transformers/main_classes/trainer.html
  trainer = Trainer(
      model = model,
      args = trainer_args,
      train_dataset = train_dataset,
      eval_dataset = validation_dataset,
      data_collator = dynamic_data_collator,
      compute_metrics = compute_acc_metric,
      tb_writer = SummaryWriter(log_dir='logs', flush_secs=10)
  )

  tick = time.time() # Trainer start time
  trainer.train()
  tock = time.time() # Trainer end time
  print('Training Complete.')
  print(f"Total training time: {time.strftime('%H:%M:%S', time.gmtime(tock-tick))}")

  print('Saving Model')
  trainer.save_model()

  eval_result = None
  if validate:
    print('Evaluating Model')
    eval_result = trainer.evaluate()
    print('Evaluation Complete.')

  return trainer, eval_result

### Predict

In [None]:
def predict(finetuning_args_dict: Dict[str, Any], test_file_path: str, X_col_name: str, y_col_name: str, num_labels: int, trained_trainer: Trainer):
  """Run an already trained Trainer on a test CSV file.

  Args:
    finetuning_args_dict: option -> value mapping, parsed into
      `TrainingArguments` and `ModelParameters`.
    test_file_path: CSV file with the test data.
    X_col_name: name of the input text column.
    y_col_name: name of the label column.
    num_labels: accepted for symmetry with `train`; not used here.
    trained_trainer: Trainer whose `predict` method is called.

  Returns:
    Tuple `(predictions, label_ids, metrics)` as returned by `Trainer.predict`.
  """
  # Rebuild the argument dataclasses from the same options used for training
  cli_args = convert_args_dict_to_arg_list(finetuning_args_dict)
  arg_parser = HfArgumentParser((TrainingArguments, ModelParameters))
  trainer_args, model_args = arg_parser.parse_args_into_dataclasses(cli_args)

  use_smart_batching = model_args.smart_batching

  # Test data (length-sorted when smart batching is enabled)
  raw_test_df = load_data(test_file_path, X_col_name, use_smart_batching)

  # Tokenizer matching the model (Eg: name = bert-base-uncased)
  tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path = model_args.model_name)

  # forTrain=False: the rows keep the order produced by load_data
  ordered_test_df = organize_data(data_df=raw_test_df,
                                  batch_size=trainer_args.per_gpu_train_batch_size,
                                  smart_batching = use_smart_batching,
                                  forTrain=False)

  # Wrap the rows in a Dataset the Trainer can consume
  test_dataset = SequenceDataset(tokenizer = tokenizer,
                                 max_len = model_args.max_seq_len,
                                 pad_to_max_length = not model_args.dynamic_padding,
                                 data_df = ordered_test_df,
                                 input_column_name = X_col_name,
                                 output_column_name = y_col_name)

  print('Testing Model')
  predictions, label_ids, metrics = trained_trainer.predict(test_dataset)
  print('Testing Complete.')

  return predictions, label_ids, metrics

## Finetune

In [None]:
seed_value = 128

In [None]:
set_seed(seed_value)

In [None]:
model_save_root_directory = "model_save"

Name or the path of the pretrained model

In [None]:
# Name or the path of the pretrained model
model_base_name = 'bert-base-uncased'

Token length Analysis

[https://colab.research.google.com/drive/1QSVCtQlDnhW_ico8rgULE2kETyABiGN_?usp=sharing](https://colab.research.google.com/drive/1QSVCtQlDnhW_ico8rgULE2kETyABiGN_?usp=sharing)

In [None]:
# Maximum token length for prescription
pres_max_token_len = 75

In [None]:
# Maximum token length for condition
cond_max_token_len = 50

Some utility functions for colab

In [None]:
def del_directory_tree(folder_name):
  """Recursively delete `folder_name` and report once it no longer exists."""
  target = f"{folder_name}"
  shutil.rmtree(target)
  # Stay silent if the folder is somehow still there
  if os.path.exists(folder_name):
    return
  print(f"{folder_name} tree deleted.")

In [None]:
def create_directory_tree(tree_path):
  """Make sure `tree_path` exists, creating intermediate directories as needed.

  The confirmation message is printed whenever the path exists afterwards,
  including when it was already present.
  """
  missing = not os.path.exists(tree_path)
  if missing:
    os.makedirs(tree_path)
  if os.path.exists(tree_path):
    print(f"{tree_path} tree/path created.")

In [None]:
def save_to_drive(from_dir_path='model_save', to_path="/content/drive/My Drive/trained_model_files", folder_prefix="bert", from_is_file=False):
  """Copy a file or a directory tree to a destination folder (Google Drive by default).

  Args:
    from_dir_path: source file or directory.
    to_path: destination root folder.
    folder_prefix: for directory copies, the destination folder is
      `{to_path}/{folder_prefix}_{from_dir_path}`.
    from_is_file: when True, `from_dir_path` is a single file that is copied
      straight into `to_path`.

  Copy failures are reported on stdout instead of being raised (best effort).
  """
  print('Saving model on google drive...')
  from_dir = f"{from_dir_path}"
  to_dir = f"{to_path}/{folder_prefix}_{from_dir}"
  try:
    if from_is_file:
      # Single files go directly into the destination root
      to_dir = f"{to_path}/"
      if not os.path.exists(to_dir):
        os.makedirs(to_dir)
      shutil.copy2(from_dir, to_dir)
    else:
      if not os.path.exists(to_dir):
        os.makedirs(to_dir)
      copy_tree(from_dir, to_dir)
    print('Model saved.')
  except Exception:
    # Exception rather than a bare except, so KeyboardInterrupt/SystemExit
    # are not swallowed while a long copy is running.
    print(f"Copy to Google drive failed: {sys.exc_info()[-2:]}")

In [None]:
#del_directory_tree('logs')

### Task 1: Prescription -> Eligibility

In [None]:
# Task 1 file names
task1_filenames

['task1_pres_elg_train.csv',
 'task1_pres_elg_val.csv',
 'task1_pres_elg_test.csv']

In [None]:
task1_save_directory_name = "task1_pres_elg_mixed_dynamic_smart_batch_16_seed_128"

In [None]:
# Options for Task 1; train()/predict() turn them into a command-line style argument list
task1_finetuning_args = {
  # Where the fine-tuned model is written
  '--output_dir': f'./{model_save_root_directory}/{task1_save_directory_name}',
  '--overwrite_output_dir': True,
  '--save_steps': 0,
  '--seed': seed_value,
  # Optimization schedule
  '--num_train_epochs': 1,
  '--learning_rate': 5e-5,
  '--per_gpu_train_batch_size': 16,
  '--gradient_accumulation_steps': 1,
  '--per_gpu_eval_batch_size': 16,
  '--evaluate_during_training': False, # Enable if logs are needed
  # Model / batching options
  '--max_seq_len': pres_max_token_len, # 75 here for prescription
  '--dynamic_padding': True,
  '--smart_batching': True,
  '--fp16': True,
  '--model_name': model_base_name
}

In [None]:
# Create model output dir
create_directory_tree(task1_finetuning_args['--output_dir'])

./model_save/task1_pres_elg_mixed_dynamic_smart_batch_16_seed_128 tree/path created.


Train

In [None]:
task1_trainer, task1_eval_result = train(finetuning_args_dict = task1_finetuning_args,
                                         train_file_path = f'/content/data/{task1_filenames[0]}',
                                         X_col_name = 'prescription',
                                         y_col_name = 'label',
                                         num_labels = 2,
                                         validation_file_path = f'/content/data/{task1_filenames[1]}')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights       

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=1.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=26853.0, style=ProgressStyle(description_…



Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 32768.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 16384.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8192.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 16384.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8192.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8192.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4096.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8192.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4096.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4096.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 2048.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 2048.0


Saving model to Google Drive

In [None]:
save_to_drive(from_dir_path='model_save', to_path="/content/drive/My Drive/trained_model_files", folder_prefix="bert", from_is_file=False)

Saving model on google drive...
Model saved.


Saving logs

In [None]:
save_to_drive(from_dir_path='logs', to_path="/content/drive/My Drive/trained_model_files", folder_prefix="Logs_bert", from_is_file=False)

Saving model on google drive...
Model saved.


Predict

In [None]:
predictions, label_ids, metrics = predict(finetuning_args_dict = task1_finetuning_args,
                                          test_file_path = f'/content/data/{task1_filenames[2]}',
                                          X_col_name = 'prescription',
                                          y_col_name = 'label',
                                          num_labels = 2,
                                          trained_trainer = task1_trainer)

Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.


Testing Model


HBox(children=(FloatProgress(value=0.0, description='Prediction', max=7459.0, style=ProgressStyle(description_…


Testing Complete.


In [None]:
metrics # Show the metrics returned by predict(); needs the predict cell to have run in this session

NameError: ignored

Resources I used:

**[Slow approach]**
**Native [Pytorch + Huggingface] approach (2hrs/epoch)**:: base approach **(4.5hrs)** + gradient accumulation (-0) + mixed precision **(-2.5hrs)**
https://colab.research.google.com/drive/1TJmhr-n9_Ynrb6eusAim-7DmuqY34OcC?usp=sharing

**Resources:**
**(Google Tensorflow)** https://colab.research.google.com/github/tensorflow/models/blob/master/official/colab/fine_tuning_bert.ipynb#scrollTo=lXsXev5MNr20
**(Glue)** https://mccormickml.com/2019/11/05/GLUE/
**(Finetuning BERT Native)** https://mccormickml.com/2019/07/22/BERT-fine-tuning/
**(Barebones Word and Sentence Embedding extraction strategy)** http://mccormickml.com/2019/05/14/BERT-word-embeddings-tutorial/
**(Bert Concept)** https://mccormickml.com/2019/11/11/bert-research-ep-1-key-concepts-and-sources/
**(Bio_Discharge BERT)** https://huggingface.co/emilyalsentzer/Bio_Discharge_Summary_BERT

**Model execution improvement techniques:**
**Grad Accumulation**
- https://towardsdatascience.com/how-to-break-gpu-memory-boundaries-even-with-large-batch-sizes-7a9c27a400ce
- https://towardsdatascience.com/what-is-gradient-accumulation-in-deep-learning-ec034122cfa
- https://towardsdatascience.com/how-to-easily-use-gradient-accumulation-in-keras-models-fa02c0342b60
- (Grad Accum + Distributed) https://medium.com/huggingface/training-larger-batches-practical-tips-on-1-gpu-multi-gpu-distributed-setups-ec88c3e51255

**Grad Accumulation + Mixed Precision + Dynamic Batching + Smart Batching**
- (Deep Explanations) https://towardsdatascience.com/divide-hugging-face-transformers-training-time-by-2-or-more-21bf7129db9q-21bf7129db9e
- (Performance overview & visualizations) https://app.wandb.ai/pommedeterresautee/speed_training/reports/Train-HuggingFace-models-twice-as-fast-with-dynamic-padding-and-uniform-length-batching--VmlldzoxMDgzOTI
- (Lightning Library approaches) https://towardsdatascience.com/9-tips-for-training-lightning-fast-neural-networks-in-pytorch-8e63a502f565
- (Brief implementation overview): https://huggingface.co/transformers/training.html
- (Trainer class overview): https://huggingface.co/transformers/main_classes/trainer.html

**Nvidia Apex Presentation for Mixed Precision comparison:** https://developer.download.nvidia.com/video/gputechconf/gtc/2019/presentation/s9998-automatic-mixed-precision-in-pytorch.pdf

**[Fast approach]**
**Trainer [Huggingface] approach (1hr/epoch)**:: base approach (unknown) + gradient accumulation + dynamic padding + smart batching + mixed precision
https://colab.research.google.com/drive/1tWk9BFsdANr25yXANdalPP3wXhUBWRk3?usp=sharing

In [None]:
# WARNING: this loop never terminates and grows `d` without bound, so running
# the cell keeps consuming memory until the runtime runs out.
# NOTE(review): presumably a deliberate way to exhaust the Colab runtime's RAM — confirm before running.
d=[]
while(1):
  d.append('1')