# Fine Tuning T5 Models

## One Time Setup

### Install Dependencies

In [1]:
!pip install transformers
!pip install sentencepiece
!pip install git+https://github.com/google-research/bleurt.git
!pip install setuptools accelerate nvidia-ml-py3

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m93.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m107.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 KB[0m [31m28.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.4 tokenizers-0.13.3 transformers-4.27.4
Looking in indexes: https://pypi.org/simple, https://u

### Connect to Google Drive
We will be loading data from google drive and also save trained models to google drive. So lets mount google drive.

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Imports and Constants

In [3]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
import transformers
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
from pynvml import *
import os,sys,humanize,psutil
import gc
from torch.utils.data.dataloader import DataLoader
import torch
import time

SEED = 42
CHECKPOINTS_TO_SAVE = 1
PROMPT = 'Generate next line: '
SAVED_MODEL_PATH_FORMAT = 'drive/MyDrive/MIDS/w266/project/saved_models/final/{}-{}-finetuned'
DATA_FILES_BASE_PATH = 'drive/MyDrive/MIDS/w266/project/datasci-w266-2023-spring-team-story-bot/data/'
MAIN_DATA_FILE_FORMAT = 'posptproc_corpus_spacy_{}.csv'
TRAIN_VAL_FILE_FORMAT = 'posptproc_corpus_spacy_{}_train_val.csv'
TEST_FILE_FORMAT = 'posptproc_corpus_spacy_{}_test.csv'

def print_utilization():
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)
    info = nvmlDeviceGetMemoryInfo(handle)
    print("CPU RAM Used: " + humanize.naturalsize( psutil.virtual_memory().used))
    print("CPU RAM Free: " + humanize.naturalsize( psutil.virtual_memory().available))

    print(f"GPU memory occupied: {info.used//1024**2} MB.")
    print('Using device:', device)
    print()
    if device.type == 'cuda':
        print(torch.cuda.get_device_name(0))
        print('Memory Usage:')
        print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
        print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')

def print_summary(result):
    print(f"Time: {result.metrics['train_runtime']:.2f}")
    print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}")

# Display details about the environment.
print(f'torch.__version__: {torch.__version__}')
!nvcc --version
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print('Utilization at the beginning:')
print_utilization()
!nvidia-smi

torch.__version__: 2.0.0+cu118
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Sep_21_10:33:58_PDT_2022
Cuda compilation tools, release 11.8, V11.8.89
Build cuda_11.8.r11.8/compiler.31833905_0
Utilization at the beginning:
CPU RAM Used: 1.9 GB
CPU RAM Free: 86.9 GB
GPU memory occupied: 449 MB.
Using device: cuda

NVIDIA A100-SXM4-40GB
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
Thu Apr  6 18:54:24 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off 

In [4]:
# Helper Methods and classes
# Create torch dataset
class T5InputDataset(torch.utils.data.Dataset):
    def __init__(self, inputs, targets):
        self.inputs = inputs
        self.targets = targets
    
    def __len__(self):
        return len(self.targets["input_ids"])
    
    def __getitem__(self, index):
        input_ids = self.inputs["input_ids"][index].squeeze()
        target_ids = self.targets["input_ids"][index].squeeze()
        attention_mask = self.inputs['attention_mask'][index].squeeze()
        return {'input_ids': input_ids,
                'attention_mask': attention_mask,
                'labels': target_ids}

class TuningConfig:
  def __init__(self, model_name, data_files_base_path, dataset, max_len, epochs, training_samples, val_samples, batch_size):
    self.model_name = model_name
    self.dataset = dataset
    self.max_len = max_len
    self.epochs = epochs
    self.training_samples = training_samples
    self.val_samples = val_samples
    self.train_batch_size = batch_size
    self.val_batch_size = 8
    self.main_data_file = data_files_base_path + MAIN_DATA_FILE_FORMAT.format(dataset)
    self.train_val_data_file = data_files_base_path + TRAIN_VAL_FILE_FORMAT.format(dataset)
    self.test_data_file = data_files_base_path + TEST_FILE_FORMAT.format(dataset)
    self.tuned_model_path = SAVED_MODEL_PATH_FORMAT.format(model_name, dataset)

def load_data(main_file, train_val_file, test_file, test_seed=SEED, load_splits_from_file=False, prompt='', include_test=False, train_size=-1, val_size=-1):
  def save_to(x, y, file_name):
    xy = {'variable': x, 'label': y}
    df = pd.DataFrame(xy)
    df.to_csv(file_name, index=False)

  def load_from(file_name):
    df = pd.read_csv(file_name)
    df = df.astype({'variable':'string', 'label':'string'})
    return df['variable'], df['label']

  if load_splits_from_file:
    x_train_val, y_train_val = load_from(train_val_file)
    x_test, y_test = load_from(test_file)
  else:
    x, y = load_from(main_file)
    # Split the dataset into train (80%), validation (10%) and test (10%) datasets.
    # Test data should be determinable.
    x_train_val, x_test, y_train_val, y_test = train_test_split(x, y, train_size=0.9, random_state=test_seed)
    # Save train-val and test data separately.
    save_to(x_train_val, y_train_val, train_val_file)
    save_to(x_test, y_test, test_file)

  # Split train and validation datasets.
  x_train, x_val, y_train, y_val = train_test_split(x_train_val, y_train_val, train_size=0.88)

  if train_size > 0:
    x_train = x_train[:train_size]
    y_train = y_train[:train_size]
  if val_size > 0:
    x_val = x_val[:val_size]
    y_val = y_val[:val_size]

  if prompt is not None and len(prompt) > 0:
    x_train = prompt + x_train
    x_val = prompt + x_val
    x_test = prompt + x_test

  if include_test:
    return x_train, x_val, y_train, y_val, x_test, y_test
  else:
    return x_train, x_val, y_train, y_val



In [12]:
TRAINING_SAMPLES = 100000
TRAINING_SAMPLES = -1
VAL_SAMPLES = 1000

tuning_configs = [
    TuningConfig('google/t5-v1_1-base', data_files_base_path=DATA_FILES_BASE_PATH, 
                 dataset='s1', max_len=65, epochs=3, training_samples=TRAINING_SAMPLES,
                 val_samples=VAL_SAMPLES, batch_size=128),
    TuningConfig('google/t5-v1_1-base', data_files_base_path=DATA_FILES_BASE_PATH, 
                 dataset='s2', max_len=110, epochs=3, training_samples=TRAINING_SAMPLES,
                 val_samples=VAL_SAMPLES, batch_size=64),
    TuningConfig('google/t5-v1_1-base', data_files_base_path=DATA_FILES_BASE_PATH,
                 dataset='s3', max_len=150, epochs=3, training_samples=TRAINING_SAMPLES,
                 val_samples=VAL_SAMPLES, batch_size=64)
]

# DATA_NAME = "s2"
# T5_MODEL_NAME = "t5-small"
# T5_MODEL_NAME = "t5-base"
# T5_MODEL_NAME = "t5-large" - colab instances do not have enough memory for T5 large.
# T5_MODEL_NAME = 'google/t5-v1_1-small'
# T5_MODEL_NAME = 'google/t5-v1_1-base'

# MAIN_DATA_FILE = f'drive/MyDrive/MIDS/w266/project/datasci-w266-2023-spring-team-story-bot/posptproc_corpus_spacy_{DATA_NAME}.csv'
# TRAIN_DATA_FILE = f'posptproc_corpus_spacy_{DATA_NAME}_train.csv'
# VAL_DATA_FILE = f'posptproc_corpus_spacy_{DATA_NAME}_val.csv'

# NUM_TRAIN_SAMPLES = 100000
# # NUM_TRAIN_SAMPLES = 25000
# # NUM_VAL_SAMPLES = 45000
# NUM_VAL_SAMPLES = 1000
# # MAX_LOAD_AT_ONCE = 10000
# SRC_MAX_LENGTH=512
# TARGET_MAX_LENGTH=128

# MODEL_CKPT_FOLDER = 'drive/MyDrive/MIDS/w266/project/checkpoints/'
# MODEL_CKPT_FILE = MODEL_CKPT_FOLDER + f'{T5_MODEL_NAME}-finetuned-02'
# TUNED_T5_SAVED = f'drive/MyDrive/MIDS/w266/project/saved_models/final/{T5_MODEL_NAME}-data{DATA_NAME}-finetuned'
# BATCH_SIZE = 16

### Train Model

In [6]:
def train(config, device):
  def tokenize(tokenizer, data, max_length):
    tokenized = tokenizer(
      list(data),
      max_length=max_length,
      padding='max_length',
      truncation=True,
      return_attention_mask=True,
      return_tensors='pt')
    return tokenized

  # Load the data
  x_train, x_val, y_train, y_val = load_data(
      config.main_data_file, config.train_val_data_file, 
      config.test_data_file, test_seed=SEED, 
      load_splits_from_file=True, prompt=PROMPT, include_test=False,
      train_size=config.training_samples, val_size=config.val_samples)
  print(f'x-train shape: {x_train.shape}, x-val shape: {x_val.shape}, y-train shape: {y_train.shape}, y-val shape: {y_val.shape}')

  # Load the model
  tokenizer = T5Tokenizer.from_pretrained(config.model_name)
  model = T5ForConditionalGeneration.from_pretrained(config.model_name).to(device)
  print('Utilization after loading model:')
  print_utilization()

  # Tokenize data
  x_train_tokenized = tokenize(tokenizer, x_train, config.max_len)
  y_train_tokenized = tokenize(tokenizer, y_train, config.max_len)
  x_val_tokenized = tokenize(tokenizer, x_val, config.max_len)
  y_val_tokenized = tokenize(tokenizer, y_val, config.max_len)

  training_set = T5InputDataset(x_train_tokenized, y_train_tokenized)
  validation_set = T5InputDataset(x_val_tokenized, y_val_tokenized)

  args = Seq2SeqTrainingArguments(
      output_dir='checkpoints',
      evaluation_strategy='epoch',
      save_strategy='epoch',
      per_device_train_batch_size=config.train_batch_size,
      per_device_eval_batch_size=config.val_batch_size,
      num_train_epochs=config.epochs,
      load_best_model_at_end=True,
      save_total_limit=CHECKPOINTS_TO_SAVE,
      learning_rate=3e-4,
      optim='adamw_torch',
      # gradient_accumulation_steps=4,
      # fp16=True,
      bf16=True,
      tf32=True
  )

  # Define the trainer, passing in the model, training args, and data generators
  trainer = Seq2SeqTrainer(
      model,
      args,
      train_dataset=training_set,
      eval_dataset=validation_set
  )

  st = time.time()
  result = trainer.train()
  et = time.time()

  # get the execution time
  elapsed_time = et - st
  print_summary(result)
  print('Utilization after training: ')
  print_utilization()  
  
  # Save the tuned model
  trainer.save_model(config.tuned_model_path)

  # Post training cleanup
  trainer = None
  model = None
  with torch.no_grad():
      torch.cuda.empty_cache()
  gc.collect()
  os.system('nvidia-smi -caa')
  print('Utilization after post training cleanup: ')
  print_utilization()  
  print(f'{"*"*25}Training took {elapsed_time} seconds {"*"*25}')


In [7]:
%%time
print(f'{"*"*25}Training model {tuning_configs[0].model_name} on {tuning_configs[0].dataset} {"*"*25}')
train(tuning_configs[0], device)


*************************Training model google/t5-v1_1-base on s1 *************************
x-train shape: (163302,), x-val shape: (1000,), y-train shape: (163302,), y-val shape: (1000,)


Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.86k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Utilization after loading model:
CPU RAM Used: 4.1 GB
CPU RAM Free: 84.7 GB
GPU memory occupied: 2443 MB.
Using device: cuda

NVIDIA A100-SXM4-40GB
Memory Usage:
Allocated: 0.9 GB
Cached:    1.0 GB


Epoch,Training Loss,Validation Loss
1,1.5325,1.357081
2,1.4164,1.30509
3,1.3676,1.274744


Time: 1159.56
Samples/second: 422.49
Utilization after training: 
CPU RAM Used: 4.8 GB
CPU RAM Free: 83.9 GB
GPU memory occupied: 29267 MB.
Using device: cuda

NVIDIA A100-SXM4-40GB
Memory Usage:
Allocated: 2.8 GB
Cached:    26.8 GB
Utilization after post training cleanup: 
CPU RAM Used: 4.8 GB
CPU RAM Free: 83.9 GB
GPU memory occupied: 1845 MB.
Using device: cuda

NVIDIA A100-SXM4-40GB
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
*************************Training took 1159.5720477104187 seconds *************************
CPU times: user 18min 19s, sys: 2min 16s, total: 20min 36s
Wall time: 20min 51s


In [8]:
%%time
print(f'{"*"*25}Training model {tuning_configs[1].model_name} on {tuning_configs[1].dataset} {"*"*25}')
train(tuning_configs[1], device)


*************************Training model google/t5-v1_1-base on s2 *************************
x-train shape: (162917,), x-val shape: (1000,), y-train shape: (162917,), y-val shape: (1000,)
Utilization after loading model:
CPU RAM Used: 5.9 GB
CPU RAM Free: 82.8 GB
GPU memory occupied: 2839 MB.
Using device: cuda

NVIDIA A100-SXM4-40GB
Memory Usage:
Allocated: 0.9 GB
Cached:    1.0 GB


Epoch,Training Loss,Validation Loss
1,0.8461,0.730506
2,0.8084,0.708829
3,0.7851,0.701837


Time: 2065.85
Samples/second: 236.59
Utilization after training: 
CPU RAM Used: 6.1 GB
CPU RAM Free: 82.7 GB
GPU memory occupied: 26669 MB.
Using device: cuda

NVIDIA A100-SXM4-40GB
Memory Usage:
Allocated: 2.8 GB
Cached:    24.3 GB
Utilization after post training cleanup: 
CPU RAM Used: 6.0 GB
CPU RAM Free: 82.7 GB
GPU memory occupied: 1845 MB.
Using device: cuda

NVIDIA A100-SXM4-40GB
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
*************************Training took 2065.863163471222 seconds *************************
CPU times: user 32min 57s, sys: 3min 9s, total: 36min 7s
Wall time: 36min 3s


In [13]:
%%time
print(f'{"*"*25}Training model {tuning_configs[2].model_name} on {tuning_configs[2].dataset} {"*"*25}')
train(tuning_configs[2], device)


*************************Training model google/t5-v1_1-base on s3 *************************
x-train shape: (162536,), x-val shape: (1000,), y-train shape: (162536,), y-val shape: (1000,)
Utilization after loading model:
CPU RAM Used: 7.6 GB
CPU RAM Free: 81.1 GB
GPU memory occupied: 8693 MB.
Using device: cuda

NVIDIA A100-SXM4-40GB
Memory Usage:
Allocated: 4.7 GB
Cached:    6.7 GB


Epoch,Training Loss,Validation Loss
1,0.6521,0.597351
2,0.628,0.580931
3,0.61,0.573198


Time: 2761.52
Samples/second: 176.57
Utilization after training: 
CPU RAM Used: 8.3 GB
CPU RAM Free: 80.4 GB
GPU memory occupied: 39595 MB.
Using device: cuda

NVIDIA A100-SXM4-40GB
Memory Usage:
Allocated: 6.5 GB
Cached:    36.9 GB
Utilization after post training cleanup: 
CPU RAM Used: 8.3 GB
CPU RAM Free: 80.4 GB
GPU memory occupied: 8693 MB.
Using device: cuda

NVIDIA A100-SXM4-40GB
Memory Usage:
Allocated: 3.8 GB
Cached:    6.7 GB
*************************Training took 2761.535127401352 seconds *************************
CPU times: user 41min 20s, sys: 6min 49s, total: 48min 10s
Wall time: 47min 59s


In [11]:
with torch.no_grad():
    torch.cuda.empty_cache()
gc.collect()
os.system('nvidia-smi -caa')
print('Utilization after post training cleanup: ')
print_utilization()  


Utilization after post training cleanup: 
CPU RAM Used: 6.9 GB
CPU RAM Free: 81.8 GB
GPU memory occupied: 8693 MB.
Using device: cuda

NVIDIA A100-SXM4-40GB
Memory Usage:
Allocated: 3.8 GB
Cached:    6.7 GB


In [16]:
# # Final test list for model trained against s2 dataset.
# FINAL_TEST_LIST = ['Princess Leia lay upon her bed all the night. She could not sleep at all.',
#                    'He stopped himself for a minute and thought if it was the right thing to do. It did seem like a good thing to do.',
#                    'There once lived king named Rama. He was very wise and just.',
#                    'Once upon a time, an old owl lived in the forest. He was very wise.']

# Final test list for model trained against s1 dataset.
FINAL_TEST_LIST = ['Princess Leia lay upon her bed all the night.',
                   'He stopped himself for a minute and thought if it was the right thing to do.',
                   'There once lived king named Rama.',
                   'Once upon a time, an old owl lived in the forest.']


def evaluate(model, tokenizer, lines, prompt):
  transformers.logging.set_verbosity_error()
  for test_input_text in lines:
      test_inputs = tokenizer([prompt + test_input_text], return_tensors='pt')
      test_output_ids = model.generate(
          test_inputs['input_ids'].cuda(),
          num_beams=5,
          no_repeat_ngram_size=3,
          num_return_sequences=5,
          max_new_tokens=100,
          do_sample=True,
          top_k=0)
      print(f'Input: {test_input_text}')
      decoded = [tokenizer.decode(out_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False) for out_ids in test_output_ids]
      print(f'Output: {decoded}')

In [17]:
## Untrained T5 model
# evaluate(T5ForConditionalGeneration.from_pretrained("t5-large").cuda(), t5_tokenizer, FINAL_TEST_LIST, "Continue the next sentence of the story: ")

In [18]:
for config in tuning_configs:
  tokenizer = T5Tokenizer.from_pretrained(config.model_name)
  print(f'Evaluating {config.model_name} tuned on {config.dataset} dataset')
  evaluate(T5ForConditionalGeneration.from_pretrained(config.tuned_model_path).cuda(), tokenizer, FINAL_TEST_LIST, PROMPT)


Evaluating google/t5-v1_1-base tuned on s1 dataset
Input: Princess Leia lay upon her bed all the night.
Output: ['“I have a dream,” said she, “and I am going to tell you a story, and I will tell you all about it.”', 'Then she sat down on the sofa, and slept for a long time.', '"It\'s a great deal of work," she said.', 'Then he went to bed, and when she had slept a long time, he sat down on the bed.', '"It is a long time since I heard the sound of a noise," she said, "and I have been thinking of it for some time, and I don\'t know what to do with it.']
Input: He stopped himself for a minute and thought if it was the right thing to do.
Output: ['Then he said, “It is a matter of business,” and he sat down to rest.', 'Then he said, “I am going to make a good deal of money.”', 'Then he said to himself: “It will be a pleasure to see you, and I will take care of you.”', 'Then he said, “I am going to tell you something, and I will show you what I have done, and what I can do for you, and how I