# Fine Tuning T5 Models

## One Time Setup

### Install Dependencies

In [None]:
# Install the notebook's dependencies with pip via shell escapes.
# No versions are pinned, so each run installs whatever is current.
!pip install transformers
!pip install sentencepiece
# BLEURT is installed straight from its GitHub repository.
!pip install git+https://github.com/google-research/bleurt.git
!pip install setuptools accelerate nvidia-ml-py3

### Connect to Google Drive
We load the data from Google Drive and also save the trained models to Google Drive, so let's mount Google Drive first.

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive.
drive.mount('/content/drive')

### Imports and Constants

In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
import transformers
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
from pynvml import *
import os,sys,humanize,psutil
import gc
from torch.utils.data.dataloader import DataLoader
import torch
import time

# Random seed; used as the default `test_seed` of load_data() and passed to it by train().
SEED = 42
# Passed to the trainer as `save_total_limit`.
CHECKPOINTS_TO_SAVE = 1
# Text prepended to every model input, both for training and for inference.
PROMPT = 'Generate next line: '
# Output directory for a tuned model; formatted with (model name, dataset name).
SAVED_MODEL_PATH_FORMAT = 'drive/MyDrive/MIDS/w266/project/saved_models/final/{}-{}-finetuned'
# Directory holding the CSV data files; the file names below are appended to it.
DATA_FILES_BASE_PATH = 'drive/MyDrive/MIDS/w266/project/datasci-w266-2023-spring-team-story-bot/data/'
# CSV file names; each is formatted with the dataset name.
# NOTE(review): "posptproc" looks like a misspelling of "postproc" - confirm it matches the files on disk before renaming anything.
MAIN_DATA_FILE_FORMAT = 'posptproc_corpus_spacy_{}.csv'
TRAIN_VAL_FILE_FORMAT = 'posptproc_corpus_spacy_{}_train_val.csv'
TEST_FILE_FORMAT = 'posptproc_corpus_spacy_{}_test.csv'

def print_utilization():
    """Print current CPU RAM usage and, when running on CUDA, GPU memory usage.

    Reads the module-level ``device`` (a ``torch.device``). NVML is only
    queried when that device is CUDA, so the function also works on a
    CPU-only runtime, where initialising NVML would raise.
    """
    ram = psutil.virtual_memory()
    print("CPU RAM Used: " + humanize.naturalsize(ram.used))
    print("CPU RAM Free: " + humanize.naturalsize(ram.available))

    on_gpu = device.type == 'cuda'
    if on_gpu:
        # NVML reports memory in use on GPU 0 as seen by the driver.
        nvmlInit()
        handle = nvmlDeviceGetHandleByIndex(0)
        info = nvmlDeviceGetMemoryInfo(handle)
        print(f"GPU memory occupied: {info.used//1024**2} MB.")
    print('Using device:', device)
    print()
    if on_gpu:
        # These figures come from PyTorch's allocator for CUDA device 0.
        print(torch.cuda.get_device_name(0))
        print('Memory Usage:')
        print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
        print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')

def print_summary(result):
    """Print the training runtime and throughput found in ``result.metrics``.

    Args:
        result: object whose ``metrics`` mapping contains the keys
            ``'train_runtime'`` and ``'train_samples_per_second'``.
    """
    metrics = result.metrics
    runtime = metrics['train_runtime']
    throughput = metrics['train_samples_per_second']
    print("Time: {:.2f}".format(runtime))
    print("Samples/second: {:.2f}".format(throughput))

# Display details about the environment.
print(f'torch.__version__: {torch.__version__}')
# Report the installed CUDA compiler version.
!nvcc --version
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
# print_utilization() reads this module-level `device`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print('Utilization at the beginning:')
print_utilization()
# Show the driver-level GPU status.
!nvidia-smi

In [None]:
# Helper Methods and classes
# Create torch dataset
class T5InputDataset(torch.utils.data.Dataset):
    """Torch dataset pairing tokenized inputs with tokenized targets.

    ``inputs`` must be indexable by ``'input_ids'`` and ``'attention_mask'``
    and ``targets`` by ``'input_ids'``; each of those is indexed per example.
    """

    def __init__(self, inputs, targets):
        self.inputs = inputs
        self.targets = targets

    def __len__(self):
        # The number of examples is taken from the target ids.
        label_ids = self.targets["input_ids"]
        return len(label_ids)

    def __getitem__(self, index):
        """Return one example as a dict with input ids, attention mask and labels."""
        source = self.inputs
        return {
            'input_ids': source["input_ids"][index].squeeze(),
            'attention_mask': source['attention_mask'][index].squeeze(),
            'labels': self.targets["input_ids"][index].squeeze(),
        }

class TuningConfig:
  """Settings for one fine-tuning run: model, dataset, sizes and derived file paths.

  Args:
    model_name: name of the pretrained model to fine-tune.
    data_files_base_path: directory containing the dataset CSV files
      (with or without a trailing separator).
    dataset: dataset name substituted into the data file names and the
      saved-model path.
    max_len: maximum sequence length used when tokenizing.
    epochs: number of training epochs.
    training_samples: number of training examples to keep.
    val_samples: number of validation examples to keep.
    batch_size: per-device training batch size.
    val_batch_size: per-device evaluation batch size (defaults to 8).
  """

  def __init__(self, model_name, data_files_base_path, dataset, max_len, epochs, training_samples, val_samples, batch_size, val_batch_size=8):
    self.model_name = model_name
    self.dataset = dataset
    self.max_len = max_len
    self.epochs = epochs
    self.training_samples = training_samples
    self.val_samples = val_samples
    self.train_batch_size = batch_size
    self.val_batch_size = val_batch_size
    # os.path.join yields the same path as plain concatenation when the base
    # path ends with a separator, and inserts one when it does not.
    self.main_data_file = os.path.join(data_files_base_path, MAIN_DATA_FILE_FORMAT.format(dataset))
    self.train_val_data_file = os.path.join(data_files_base_path, TRAIN_VAL_FILE_FORMAT.format(dataset))
    self.test_data_file = os.path.join(data_files_base_path, TEST_FILE_FORMAT.format(dataset))
    self.tuned_model_path = SAVED_MODEL_PATH_FORMAT.format(model_name, dataset)

def load_data(main_file, train_val_file, test_file, test_seed=SEED, load_splits_from_file=False, prompt='', include_test=False, train_size=-1, val_size=-1):
  """Load the corpus and return train/validation (and optionally test) splits.

  Args:
    main_file: CSV with 'variable' (input) and 'label' (target) columns; read
      only when ``load_splits_from_file`` is False.
    train_val_file: CSV holding the combined train+validation rows. Read when
      ``load_splits_from_file`` is True, otherwise (over)written.
    test_file: CSV holding the test rows. Read when ``load_splits_from_file``
      is True, otherwise (over)written.
    test_seed: random state for both the test split and the train/validation
      split, so that every split is reproducible.
    load_splits_from_file: reuse the saved train+validation/test files instead
      of re-splitting ``main_file``.
    prompt: text prepended to every input; ``None`` or ``''`` leaves inputs as is.
    include_test: also return the test split.
    train_size: keep only the first ``train_size`` training rows when > 0.
    val_size: keep only the first ``val_size`` validation rows when > 0.

  Returns:
    ``(x_train, x_val, y_train, y_val)``, followed by ``x_test, y_test`` when
    ``include_test`` is True.
  """
  def save_to(x, y, file_name):
    xy = {'variable': x, 'label': y}
    df = pd.DataFrame(xy)
    df.to_csv(file_name, index=False)

  def load_from(file_name):
    df = pd.read_csv(file_name)
    df = df.astype({'variable':'string', 'label':'string'})
    return df['variable'], df['label']

  if load_splits_from_file:
    x_train_val, y_train_val = load_from(train_val_file)
    x_test, y_test = load_from(test_file)
  else:
    x, y = load_from(main_file)
    # Split the dataset into train (80%), validation (10%) and test (10%) datasets.
    # The test split must be deterministic, hence the fixed random state.
    x_train_val, x_test, y_train_val, y_test = train_test_split(x, y, train_size=0.9, random_state=test_seed)
    # Save train-val and test data separately.
    save_to(x_train_val, y_train_val, train_val_file)
    save_to(x_test, y_test, test_file)

  # Split train and validation datasets. Seeded as well: without a random
  # state the validation rows would differ between runs, so validation
  # results would not be comparable across runs.
  x_train, x_val, y_train, y_val = train_test_split(
      x_train_val, y_train_val, train_size=0.88, random_state=test_seed)

  if train_size > 0:
    x_train = x_train[:train_size]
    y_train = y_train[:train_size]
  if val_size > 0:
    x_val = x_val[:val_size]
    y_val = y_val[:val_size]

  if prompt is not None and len(prompt) > 0:
    x_train = prompt + x_train
    x_val = prompt + x_val
    x_test = prompt + x_test

  if include_test:
    return x_train, x_val, y_train, y_val, x_test, y_test
  else:
    return x_train, x_val, y_train, y_val



In [None]:
TRAINING_SAMPLES = 100000
VAL_SAMPLES = 1000

# One configuration per dataset; only the dataset name and the maximum
# sequence length differ between the runs.
tuning_configs = [
    TuningConfig('google/t5-v1_1-base', data_files_base_path=DATA_FILES_BASE_PATH,
                 dataset=dataset_name, max_len=dataset_max_len, epochs=3,
                 training_samples=TRAINING_SAMPLES, val_samples=VAL_SAMPLES,
                 batch_size=32)
    for dataset_name, dataset_max_len in (('s1', 65), ('s2', 110), ('s3', 150))
]

# DATA_NAME = "s2"
# T5_MODEL_NAME = "t5-small"
# T5_MODEL_NAME = "t5-base"
# T5_MODEL_NAME = "t5-large" - colab instances do not have enough memory for T5 large.
# T5_MODEL_NAME = 'google/t5-v1_1-small'
# T5_MODEL_NAME = 'google/t5-v1_1-base'

# MAIN_DATA_FILE = f'drive/MyDrive/MIDS/w266/project/datasci-w266-2023-spring-team-story-bot/posptproc_corpus_spacy_{DATA_NAME}.csv'
# TRAIN_DATA_FILE = f'posptproc_corpus_spacy_{DATA_NAME}_train.csv'
# VAL_DATA_FILE = f'posptproc_corpus_spacy_{DATA_NAME}_val.csv'

# NUM_TRAIN_SAMPLES = 100000
# # NUM_TRAIN_SAMPLES = 25000
# # NUM_VAL_SAMPLES = 45000
# NUM_VAL_SAMPLES = 1000
# # MAX_LOAD_AT_ONCE = 10000
# SRC_MAX_LENGTH=512
# TARGET_MAX_LENGTH=128

# MODEL_CKPT_FOLDER = 'drive/MyDrive/MIDS/w266/project/checkpoints/'
# MODEL_CKPT_FILE = MODEL_CKPT_FOLDER + f'{T5_MODEL_NAME}-finetuned-02'
# TUNED_T5_SAVED = f'drive/MyDrive/MIDS/w266/project/saved_models/final/{T5_MODEL_NAME}-data{DATA_NAME}-finetuned'
# BATCH_SIZE = 16

### Train Model

In [None]:
def train(config, device):
  """Fine-tune the model described by ``config`` and save it to ``config.tuned_model_path``.

  Loads the saved train/validation split, tokenizes it, trains with
  ``Seq2SeqTrainer``, saves the model together with its tokenizer, and then
  releases GPU memory so the next configuration can be trained.

  Args:
    config: a ``TuningConfig`` describing the model, data files and sizes.
    device: ``torch.device`` the model is moved to before training.
  """
  def tokenize(tokenizer, data, max_length):
    """Tokenize ``data`` into PyTorch tensors padded/truncated to ``max_length``."""
    tokenized = tokenizer(
      list(data),
      max_length=max_length,
      padding='max_length',
      truncation=True,
      return_attention_mask=True,
      return_tensors='pt')
    return tokenized

  def tokenize_labels(tokenizer, data, max_length):
    """Tokenize targets and mask padding so it is ignored by the loss."""
    tokenized = tokenize(tokenizer, data, max_length)
    label_ids = tokenized['input_ids']
    # -100 is the index the loss ignores; without this every padded position
    # would be trained as a "predict the pad token" target.
    label_ids[label_ids == tokenizer.pad_token_id] = -100
    return tokenized

  # Load the data
  x_train, x_val, y_train, y_val = load_data(
      config.main_data_file, config.train_val_data_file, 
      config.test_data_file, test_seed=SEED, 
      load_splits_from_file=True, prompt=PROMPT, include_test=False,
      train_size=config.training_samples, val_size=config.val_samples)
  print(f'x-train shape: {x_train.shape}, x-val shape: {x_val.shape}, y-train shape: {y_train.shape}, y-val shape: {y_val.shape}')

  # Load the model
  tokenizer = T5Tokenizer.from_pretrained(config.model_name)
  model = T5ForConditionalGeneration.from_pretrained(config.model_name).to(device)
  print('Utilization after loading model:')
  print_utilization()

  # Tokenize data
  x_train_tokenized = tokenize(tokenizer, x_train, config.max_len)
  y_train_tokenized = tokenize_labels(tokenizer, y_train, config.max_len)
  x_val_tokenized = tokenize(tokenizer, x_val, config.max_len)
  y_val_tokenized = tokenize_labels(tokenizer, y_val, config.max_len)

  training_set = T5InputDataset(x_train_tokenized, y_train_tokenized)
  validation_set = T5InputDataset(x_val_tokenized, y_val_tokenized)

  args = Seq2SeqTrainingArguments(
      output_dir='checkpoints',
      evaluation_strategy='epoch',
      save_strategy='epoch',
      per_device_train_batch_size=config.train_batch_size,
      per_device_eval_batch_size=config.val_batch_size,
      num_train_epochs=config.epochs,
      load_best_model_at_end=True,
      save_total_limit=CHECKPOINTS_TO_SAVE,
      learning_rate=3e-4,
      optim='adamw_torch',
      # gradient_accumulation_steps=4,
      # fp16=True,
      bf16=True,
      tf32=True
  )

  # Define the trainer, passing in the model, training args, and data generators
  trainer = Seq2SeqTrainer(
      model,
      args,
      train_dataset=training_set,
      eval_dataset=validation_set
  )

  print(f'{"*"*25}Training model {config.model_name} on {config.dataset} {"*"*25}')
  st = time.time()
  result = trainer.train()
  et = time.time()

  # get the execution time
  elapsed_time = et - st
  print_summary(result)
  print('Utilization after training: ')
  print_utilization()  
  
  # Save the tuned model, plus its tokenizer so the output directory is
  # self-contained and can be loaded for inference on its own.
  trainer.save_model(config.tuned_model_path)
  tokenizer.save_pretrained(config.tuned_model_path)

  # Post training cleanup
  trainer = None
  model = None
  # Collect garbage first so the dropped model/trainer tensors are actually
  # freed; only then can empty_cache() hand their memory back to the GPU.
  gc.collect()
  torch.cuda.empty_cache()
  os.system('nvidia-smi -caa')
  print('Utilization after post training cleanup: ')
  print_utilization()  
  print(f'{"*"*25}Training took {elapsed_time} seconds {"*"*25}')


In [None]:
# Train all model configurations.
# The runs are sequential; train() saves each tuned model and frees GPU memory before returning.
for config in tuning_configs:
  train(config, device)


## Inference

In [None]:
# # Final test list for model trained against s2 dataset.
# FINAL_TEST_LIST = ['Princess Leia lay upon her bed all the night. She could not sleep at all.',
#                    'He stopped himself for a minute and thought if it was the right thing to do. It did seem like a good thing to do.',
#                    'There once lived king named Rama. He was very wise and just.',
#                    'Once upon a time, an old owl lived in the forest. He was very wise.']

# Final test list for model trained against s1 dataset.
# Each entry is a single sentence; evaluate() prepends the prompt to it before generation.
FINAL_TEST_LIST = ['Princess Leia lay upon her bed all the night.',
                   'He stopped himself for a minute and thought if it was the right thing to do.',
                   'There once lived king named Rama.',
                   'Once upon a time, an old owl lived in the forest.']

def evaluate(model, tokenizer, lines, prompt):
  """Generate and print five continuations for each line in ``lines``.

  Args:
    model: generation model exposing ``generate`` and a ``device`` attribute.
    tokenizer: tokenizer used to encode the inputs and decode the outputs.
    lines: iterable of input sentences.
    prompt: text prepended to each sentence before tokenizing.

  Note: this sets the transformers logging verbosity to "error" globally.
  Sampling is enabled, so outputs vary between calls.
  """
  transformers.logging.set_verbosity_error()
  for test_input_text in lines:
    test_inputs = tokenizer([prompt + test_input_text], return_tensors='pt')
    # Put the inputs on whatever device the model lives on instead of
    # assuming CUDA, so this also works on a CPU-only runtime.
    input_ids = test_inputs['input_ids'].to(model.device)
    test_output_ids = model.generate(
        input_ids,
        num_beams=5,
        no_repeat_ngram_size=3,
        num_return_sequences=5,
        max_new_tokens=100,
        do_sample=True,
        top_k=0)
    print(f'Input: {test_input_text}')
    decoded = [tokenizer.decode(out_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False) for out_ids in test_output_ids]
    print(f'Output: {decoded}')

In [None]:
## Untrained T5 model
# evaluate(T5ForConditionalGeneration.from_pretrained("t5-large").cuda(), t5_tokenizer, FINAL_TEST_LIST, "Continue the next sentence of the story: ")

In [None]:
## Fine tuned T5 model
# FINAL_TEST_LIST targets the s1 dataset, so evaluate the first tuning
# configuration (s1). Defining the model path and tokenizer here lets the
# inference cells run on a fresh kernel.
inference_config = tuning_configs[0]
TUNED_T5_SAVED = inference_config.tuned_model_path
t5_tokenizer = T5Tokenizer.from_pretrained(inference_config.model_name)
evaluate(T5ForConditionalGeneration.from_pretrained(TUNED_T5_SAVED).to(device), t5_tokenizer, FINAL_TEST_LIST, PROMPT)


In [None]:
# Display the path the fine-tuned model is loaded from.
TUNED_T5_SAVED

In [None]:
# Run the fine-tuned model on the test sentences again; evaluate() samples during generation, so the outputs can differ between runs.
evaluate(T5ForConditionalGeneration.from_pretrained(TUNED_T5_SAVED).cuda(), t5_tokenizer, FINAL_TEST_LIST, PROMPT)
