# Fine Tuning Models

## Working With This Notebook
This notebook needs the following setup before it can run:
- A project folder set up in Google Drive
- A data folder under this project folder that has the train-val and test splits of all three datasets (6 CSV files in total)
- common.py, with common code and constants, copied to the project folder on Google Drive
- GDRIVE_BASE below updated to point to the project folder on Google Drive
- A GPU for efficient training and inference

### Install Dependencies

In [None]:
# Project folder on Google Drive, written as a path relative to the working directory.
# Update this to match your own Drive layout before running the notebook.
GDRIVE_BASE = 'drive/MyDrive/MIDS/w266/project/'

# Install the third-party packages this notebook imports (no versions are pinned here).
!pip install transformers sentencepiece
!pip install git+https://github.com/google-research/bleurt.git
!pip install setuptools accelerate nvidia-ml-py3
!pip install bertviz 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m83.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.97-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m71.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m49.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 KB[0m 

### Connect to Google Drive
We will be loading data from Google Drive and also saving trained models to it, so let's mount Google Drive.

In [None]:
import sys
from google.colab import drive
# Mount Google Drive under /content/drive so the project folder is reachable from the runtime.
drive.mount('/content/drive')
# Put the project folder first on the import path so that `import common` finds common.py there.
sys.path.insert(0, GDRIVE_BASE)


Mounted at /content/drive


### Imports and Constants

In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
import transformers
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import AutoModel, AutoTokenizer, OPTForCausalLM, Trainer, TrainingArguments
from pynvml import *
import os,sys,humanize,psutil
import gc
from torch.utils.data.dataloader import DataLoader
import torch
import time
import common # Imported from common.py on google drive.
from bertviz import model_view, head_view, neuron_view, transformers_neuron_view

def free_gpu_ram():
  """Release memory that is no longer referenced, on the Python side and in the CUDA cache.

  Garbage collection runs first so that tensors kept alive only by reference
  cycles are destroyed before the CUDA caching allocator is asked to give its
  unused blocks back; emptying the cache first would leave those blocks cached.
  """
  gc.collect()
  torch.cuda.empty_cache()
  # Kept from the original cleanup routine; the command's exit status is ignored.
  os.system('nvidia-smi -caa')

def print_utilization():
    """Print CPU RAM usage, NVML-reported GPU memory and torch CUDA allocator statistics.

    Reads the module-level `device`; the torch CUDA details are printed only
    when that device is of type 'cuda'.
    """
    nvmlInit()
    gpu_memory = nvmlDeviceGetMemoryInfo(nvmlDeviceGetHandleByIndex(0))

    ram_used = humanize.naturalsize(psutil.virtual_memory().used)
    print("CPU RAM Used: " + ram_used)
    ram_free = humanize.naturalsize(psutil.virtual_memory().available)
    print("CPU RAM Free: " + ram_free)

    occupied_mb = gpu_memory.used//1024**2
    print(f"GPU memory occupied: {occupied_mb} MB.")
    print('Using device:', device)
    print()

    # Nothing more to report unless we are running on a CUDA device.
    if device.type != 'cuda':
        return

    bytes_per_gb = 1024**3
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/bytes_per_gb,1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_reserved(0)/bytes_per_gb,1), 'GB')

def print_summary(result):
    """Print the training runtime and throughput stored in `result.metrics`, two decimals each."""
    metrics = result.metrics
    report_lines = (
        ("Time", 'train_runtime'),
        ("Samples/second", 'train_samples_per_second'),
    )
    for label, metric_key in report_lines:
        print(f"{label}: {metrics[metric_key]:.2f}")

# Display details about the environment.
print(f'common.__version__: {common.__version__}')
print(f'torch.__version__: {torch.__version__}')
!nvcc --version
# Module-level device: print_utilization() reads it and the training cells pass it to train().
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print('Utilization at the beginning:')
print_utilization()
!nvidia-smi

common.__version__: 1.3
torch.__version__: 2.0.0+cu118
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Sep_21_10:33:58_PDT_2022
Cuda compilation tools, release 11.8, V11.8.89
Build cuda_11.8.r11.8/compiler.31833905_0
Utilization at the beginning:
CPU RAM Used: 1.7 GB
CPU RAM Free: 87.1 GB
GPU memory occupied: 449 MB.
Using device: cuda

NVIDIA A100-SXM4-40GB
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
Sun Apr  9 00:08:41 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off 

In [None]:
# Helper Methods and classes
# Create torch dataset
class T5InputDataset(torch.utils.data.Dataset):
    """Dataset pairing tokenized inputs with tokenized targets.

    `inputs` and `targets` are indexed first by key ('input_ids',
    'attention_mask') and then by row. The dataset length is the number of
    rows in `targets["input_ids"]`.
    """

    def __init__(self, inputs, targets):
        self.inputs, self.targets = inputs, targets

    def __len__(self):
        target_rows = self.targets["input_ids"]
        return len(target_rows)

    def __getitem__(self, index):
        """Return one example as a dict with 'input_ids', 'attention_mask' and 'labels'."""
        source_ids = self.inputs["input_ids"][index].squeeze()
        label_ids = self.targets["input_ids"][index].squeeze()
        source_mask = self.inputs['attention_mask'][index].squeeze()
        return dict(input_ids=source_ids, attention_mask=source_mask, labels=label_ids)

class OptInputDataset(torch.utils.data.Dataset):
    """Dataset that serves one row of every encoding field plus the matching label row.

    The dataset length is the number of rows in `encodings['input_ids']`.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        """Return row `idx` of each encoding field, with the label row under 'labels'."""
        sample = {}
        for field_name, column in self.encodings.items():
            sample[field_name] = column[idx]
        sample['labels'] = self.labels[idx]
        return sample

def load_data(main_file, train_val_file, test_file, test_seed=common.SEED, load_splits_from_file=False, prompt='', include_test=False, train_size=-1, val_size=-1):
  """Load the corpus and return the train / validation (and optionally test) splits.

  Args:
    main_file: CSV with 'variable' and 'label' columns; read only when
      `load_splits_from_file` is False.
    train_val_file: CSV holding the train+validation rows (read or written).
    test_file: CSV holding the test rows (read or written).
    test_seed: Seed for the random splits. It is used for the test split and
      also for the train/validation split, so repeated calls return the same rows.
    load_splits_from_file: When True, read the existing `train_val_file` and
      `test_file`; when False, split `main_file` 90/10 and write both files.
    prompt: Optional text prepended to every input string.
    include_test: When True, the test split is returned as well.
    train_size: Keep only the first `train_size` training rows when > 0.
    val_size: Keep only the first `val_size` validation rows when > 0.

  Returns:
    (x_train, x_val, y_train, y_val), followed by (x_test, y_test) when
    `include_test` is True.
  """
  def save_to(x, y, file_name):
    df = pd.DataFrame({'variable': x, 'label': y})
    df.to_csv(file_name, index=False)
    # Report only after the write has actually succeeded.
    print(f'Saved {df.shape[0]} rows to {file_name}')

  def load_from(file_name):
    df = pd.read_csv(file_name)
    df = df.astype({'variable':'string', 'label':'string'})
    print(f'Loaded {df.shape[0]} rows from {file_name}')
    return df['variable'], df['label']

  if load_splits_from_file:
    x_train_val, y_train_val = load_from(train_val_file)
    x_test, y_test = load_from(test_file)
  else:
    x, y = load_from(main_file)
    # Hold out 10% of the data as the test set; the seed keeps the test set reproducible.
    x_train_val, x_test, y_train_val, y_test = train_test_split(x, y, train_size=0.9, random_state=test_seed)
    # Save train-val and test data separately.
    save_to(x_train_val, y_train_val, train_val_file)
    save_to(x_test, y_test, test_file)

  # Split the remaining 90% into train (88% of it, ~79% overall) and validation
  # (12% of it, ~11% overall). The split is seeded so the validation rows, and the
  # rows kept by the train_size / val_size truncation below, are the same on every run.
  x_train, x_val, y_train, y_val = train_test_split(
      x_train_val, y_train_val, train_size=0.88, random_state=test_seed)

  if train_size > 0:
    x_train = x_train.iloc[:train_size]
    y_train = y_train.iloc[:train_size]
  if val_size > 0:
    x_val = x_val.iloc[:val_size]
    y_val = y_val.iloc[:val_size]

  if prompt:
    x_train = prompt + x_train
    x_val = prompt + x_val
    x_test = prompt + x_test

  if include_test:
    return x_train, x_val, y_train, y_val, x_test, y_test
  return x_train, x_val, y_train, y_val

def t5_datasets_provider(config, x_train, y_train, x_val, y_val):
  """Tokenize the text splits and wrap them in T5InputDataset objects.

  Args:
    config: Object providing `base_model` (tokenizer checkpoint) and `max_len`
      (length every sequence is padded or truncated to).
    x_train, y_train: Training inputs and targets (iterables of strings).
    x_val, y_val: Validation inputs and targets (iterables of strings).

  Returns:
    (training_set, validation_set)
  """
  tokenizer = AutoTokenizer.from_pretrained(config.base_model)

  def tokenize(data):
    return tokenizer(
      list(data),
      max_length=config.max_len,
      padding='max_length',
      truncation=True,
      return_attention_mask=True,
      return_tensors='pt')

  def tokenize_targets(data):
    # Padding positions in the labels are set to -100 so the loss ignores them;
    # otherwise the model is also trained (and evaluated) on predicting padding.
    encoded = tokenize(data)
    label_ids = encoded['input_ids']
    label_ids[label_ids == tokenizer.pad_token_id] = -100
    return encoded

  # Tokenize data
  x_train_tokenized = tokenize(x_train)
  y_train_tokenized = tokenize_targets(y_train)
  x_val_tokenized = tokenize(x_val)
  y_val_tokenized = tokenize_targets(y_val)
  # Create and return datasets
  training_set = T5InputDataset(x_train_tokenized, y_train_tokenized)
  validation_set = T5InputDataset(x_val_tokenized, y_val_tokenized)
  return training_set, validation_set

def opt_datasets_provider(config, x_train, y_train, x_val, y_val):
  """Tokenize the input texts and wrap them in OptInputDataset objects.

  The labels are a copy of the input ids (causal language modelling) with the
  padding positions set to -100 so that the loss ignores them.

  NOTE(review): `y_train` and `y_val` are accepted for signature parity with
  t5_datasets_provider but are not used; only the input texts are trained on.
  Confirm this is intended.

  Args:
    config: Object providing `base_model` (tokenizer checkpoint) and `max_len`.
    x_train, x_val: Training and validation texts (iterables of strings).
    y_train, y_val: Unused.

  Returns:
    (training_set, validation_set)
  """
  tokenizer = AutoTokenizer.from_pretrained(config.base_model)

  def tokenize(data):
    return tokenizer(
      list(data),
      padding="max_length",
      truncation=True,
      max_length=config.max_len,
      return_attention_mask=True,
      return_tensors='pt')

  def build_labels(encodings):
    # Clone so that masking the labels does not also overwrite the model inputs.
    labels = encodings['input_ids'].clone()
    labels[encodings['attention_mask'] == 0] = -100
    return labels

  x_train_tokenized = tokenize(x_train)
  x_val_tokenized = tokenize(x_val)
  training_set = OptInputDataset(x_train_tokenized, build_labels(x_train_tokenized))
  validation_set = OptInputDataset(x_val_tokenized, build_labels(x_val_tokenized))
  return training_set, validation_set

def t5_trainer_provider(config, training_set, validation_set, device):
  """Load the T5 checkpoint `config.model_name` onto `device` and return a Seq2SeqTrainer for it.

  Batch sizes and the number of epochs come from `config`; the remaining
  training options are fixed below.
  """
  t5_model = T5ForConditionalGeneration.from_pretrained(config.model_name)
  t5_model = t5_model.to(device)
  print(f'Utilization after loading model {config.model_name}:')
  print_utilization()

  training_options = dict(
      output_dir='checkpoints',
      # Evaluate and checkpoint once per epoch.
      evaluation_strategy='epoch',
      save_strategy='epoch',
      per_device_train_batch_size=config.train_batch_size,
      per_device_eval_batch_size=config.val_batch_size,
      num_train_epochs=config.epochs,
      load_best_model_at_end=True,
      save_total_limit=common.CHECKPOINTS_TO_SAVE,
      optim='adamw_torch',
      learning_rate=3e-4,
      # bf16 and tf32 are switched on unconditionally.
      bf16=True,
      tf32=True,
  )

  return Seq2SeqTrainer(
      model=t5_model,
      args=Seq2SeqTrainingArguments(**training_options),
      train_dataset=training_set,
      eval_dataset=validation_set,
  )

def opt_trainer_provider(config, training_set, validation_set, device):
  """Load the OPT checkpoint `config.model_name` onto `device` and return a Trainer for it.

  Batch sizes and the number of epochs come from `config`; learning rate and
  precision settings are left at the TrainingArguments defaults.
  """
  opt_model = OPTForCausalLM.from_pretrained(config.model_name)
  opt_model = opt_model.to(device)

  training_options = dict(
      output_dir='checkpoints',
      # Evaluate and checkpoint once per epoch.
      evaluation_strategy="epoch",
      save_strategy="epoch",
      per_device_train_batch_size=config.train_batch_size,
      per_device_eval_batch_size=config.val_batch_size,
      num_train_epochs=config.epochs,
      load_best_model_at_end=True,
      save_total_limit=common.CHECKPOINTS_TO_SAVE,
      optim='adamw_torch',
  )

  return Trainer(
      model=opt_model,
      args=TrainingArguments(**training_options),
      train_dataset=training_set,
      eval_dataset=validation_set,
      compute_metrics=None,
  )

## Train Model

In [None]:
# Build the tuning configs, handing in the trainer and dataset providers defined above.
# get_config() below looks entries up under '<model_family>_<dataset>' keys.
tuning_configs = common.create_configs(GDRIVE_BASE, t5_trainer_provider, t5_datasets_provider, opt_trainer_provider, opt_datasets_provider)

def train(config, device):
  """Fine-tune the model described by `config` and save it to `config.tuned_model_path`.

  Loads the saved train-val split, builds the datasets and the trainer through
  the providers stored on `config`, trains, prints timing and memory usage,
  saves the tuned model and finally releases the memory used for training.
  """
  # Read the stored train-val split from file; the test split is not needed for training.
  x_train, x_val, y_train, y_val = load_data(
      config.main_data_file,
      config.train_val_data_file,
      config.test_data_file,
      test_seed=common.SEED,
      load_splits_from_file=True,
      prompt=config.prompt,
      include_test=False,
      train_size=config.training_samples,
      val_size=config.val_samples)
  print(f'x-train shape: {x_train.shape}, x-val shape: {x_val.shape}, y-train shape: {y_train.shape}, y-val shape: {y_val.shape}')

  # Datasets and trainer come from the model-family specific providers.
  training_set, validation_set = config.datasets_provider(config, x_train, y_train, x_val, y_val)
  trainer = config.trainer_provider(config, training_set, validation_set, device)

  # Time the training run itself.
  started_at = time.time()
  result = trainer.train()
  elapsed_time = time.time() - started_at

  print_summary(result)
  print('Utilization after training: ')
  print_utilization()

  # Persist the tuned model.
  trainer.save_model(config.tuned_model_path)

  # Drop every reference to the training objects before asking for memory back.
  del trainer, training_set, validation_set, x_train, x_val, y_train, y_val
  free_gpu_ram()
  print('Utilization after post training cleanup: ')
  print_utilization()
  banner = "*"*25
  print(f'{banner}Training took {elapsed_time} seconds {banner}')

def get_config(model_family, dataset):
  """Return the tuning config registered for `model_family` and `dataset`.

  Args:
    model_family: One of `common.VALID_MODEL_FAMILIES`.
    dataset: One of `common.VALID_DATASETS`.

  Raises:
    ValueError: If either argument is not among the valid values. ValueError
      is a subclass of Exception, so existing `except Exception` handlers
      keep working.
    KeyError: If no config exists for this family/dataset combination.
  """
  if model_family not in common.VALID_MODEL_FAMILIES:
    raise ValueError(f'Model family {model_family} is invalid')
  if dataset not in common.VALID_DATASETS:
    raise ValueError(f'Dataset {dataset} is invalid')
  return tuning_configs[f'{model_family}_{dataset}']

In [None]:
%%time
# Fine-tune the 't5' configuration for dataset 's1'; %%time reports the run time of the whole cell.
print(f'{"*"*25}Training model T5 on S1 {"*"*25}')
train(get_config('t5', 's1'), device)


*************************Training model T5 on S1 *************************
Loaded 185571 rows from drive/MyDrive/MIDS/w266/project/data/posptproc_corpus_spacy_s1_train_val.csv
Loaded 20619 rows from drive/MyDrive/MIDS/w266/project/data/posptproc_corpus_spacy_s1_test.csv
x-train shape: (163302,), x-val shape: (10000,), y-train shape: (163302,), y-val shape: (10000,)


Downloading (…)okenizer_config.json:   0%|          | 0.00/1.86k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Utilization after loading model google/t5-v1_1-base:
CPU RAM Used: 7.6 GB
CPU RAM Free: 81.2 GB
GPU memory occupied: 2443 MB.
Using device: cuda

NVIDIA A100-SXM4-40GB
Memory Usage:
Allocated: 0.9 GB
Cached:    1.0 GB


Epoch,Training Loss,Validation Loss
1,1.5269,1.347923
2,1.427,1.307112
3,1.3866,1.277318


Time: 1299.28
Samples/second: 377.06
Utilization after training: 
CPU RAM Used: 7.8 GB
CPU RAM Free: 81.0 GB
GPU memory occupied: 29263 MB.
Using device: cuda

NVIDIA A100-SXM4-40GB
Memory Usage:
Allocated: 2.8 GB
Cached:    26.8 GB
Utilization after post training cleanup: 
CPU RAM Used: 7.8 GB
CPU RAM Free: 80.9 GB
GPU memory occupied: 5573 MB.
Using device: cuda

NVIDIA A100-SXM4-40GB
Memory Usage:
Allocated: 0.0 GB
Cached:    3.7 GB
*************************Training took 1299.3015291690826 seconds *************************
CPU times: user 20min 31s, sys: 2min 22s, total: 22min 53s
Wall time: 23min 44s


In [None]:
%%time
# Fine-tune the 't5' configuration for dataset 's2'; %%time reports the run time of the whole cell.
print(f'{"*"*25}Training model T5 on S2 {"*"*25}')
train(get_config('t5', 's2'), device)


*************************Training model T5 on S2 *************************
Loaded 185134 rows from drive/MyDrive/MIDS/w266/project/data/posptproc_corpus_spacy_s2_train_val.csv
Loaded 20571 rows from drive/MyDrive/MIDS/w266/project/data/posptproc_corpus_spacy_s2_test.csv
x-train shape: (162917,), x-val shape: (10000,), y-train shape: (162917,), y-val shape: (10000,)
Utilization after loading model google/t5-v1_1-base:
CPU RAM Used: 11.4 GB
CPU RAM Free: 77.3 GB
GPU memory occupied: 5573 MB.
Using device: cuda

NVIDIA A100-SXM4-40GB
Memory Usage:
Allocated: 0.9 GB
Cached:    3.7 GB


Epoch,Training Loss,Validation Loss
1,0.8858,0.806105
2,0.8227,0.752668
3,0.7951,0.740657


Time: 2202.92
Samples/second: 221.87
Utilization after training: 
CPU RAM Used: 11.4 GB
CPU RAM Free: 77.3 GB
GPU memory occupied: 26715 MB.
Using device: cuda

NVIDIA A100-SXM4-40GB
Memory Usage:
Allocated: 2.8 GB
Cached:    24.3 GB
Utilization after post training cleanup: 
CPU RAM Used: 11.5 GB
CPU RAM Free: 77.3 GB
GPU memory occupied: 6029 MB.
Using device: cuda

NVIDIA A100-SXM4-40GB
Memory Usage:
Allocated: 0.0 GB
Cached:    4.1 GB
*************************Training took 2202.9289886951447 seconds *************************
CPU times: user 35min 6s, sys: 3min 17s, total: 38min 24s
Wall time: 37min 28s


In [None]:
%%time
# Fine-tune the 't5' configuration for dataset 's3'; %%time reports the run time of the whole cell.
print(f'{"*"*25}Training model T5 on S3 {"*"*25}')
train(get_config('t5', 's3'), device)


*************************Training model T5 on S3 *************************
Loaded 184700 rows from drive/MyDrive/MIDS/w266/project/data/posptproc_corpus_spacy_s3_train_val.csv
Loaded 20523 rows from drive/MyDrive/MIDS/w266/project/data/posptproc_corpus_spacy_s3_test.csv
x-train shape: (162536,), x-val shape: (10000,), y-train shape: (162536,), y-val shape: (10000,)


Downloading (…)okenizer_config.json:   0%|          | 0.00/1.86k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Utilization after loading model google/t5-v1_1-base:
CPU RAM Used: 11.4 GB
CPU RAM Free: 77.4 GB
GPU memory occupied: 2443 MB.
Using device: cuda

NVIDIA A100-SXM4-40GB
Memory Usage:
Allocated: 0.9 GB
Cached:    1.0 GB


Epoch,Training Loss,Validation Loss
1,0.6201,0.559855
2,0.5896,0.54522
3,0.5694,0.540078


Time: 2902.53
Samples/second: 167.99
Utilization after training: 
CPU RAM Used: 12.1 GB
CPU RAM Free: 76.6 GB
GPU memory occupied: 35517 MB.
Using device: cuda

NVIDIA A100-SXM4-40GB
Memory Usage:
Allocated: 2.8 GB
Cached:    32.9 GB
Utilization after post training cleanup: 
CPU RAM Used: 11.6 GB
CPU RAM Free: 77.1 GB
GPU memory occupied: 5103 MB.
Using device: cuda

NVIDIA A100-SXM4-40GB
Memory Usage:
Allocated: 0.0 GB
Cached:    3.2 GB
*************************Training took 2902.5436878204346 seconds *************************
CPU times: user 43min 39s, sys: 6min 56s, total: 50min 35s
Wall time: 49min 39s


In [None]:
%%time
# Fine-tune the 'opt' configuration for dataset 's2'; %%time reports the run time of the whole cell.
print(f'{"*"*25}Training model OPT on S2 {"*"*25}')
train(get_config('opt', 's2'), device)


*************************Training model OPT on S2 *************************
Loaded 185134 rows from drive/MyDrive/MIDS/w266/project/data/posptproc_corpus_spacy_s2_train_val.csv
Loaded 20571 rows from drive/MyDrive/MIDS/w266/project/data/posptproc_corpus_spacy_s2_test.csv
x-train shape: (162917,), x-val shape: (10000,), y-train shape: (162917,), y-val shape: (10000,)


Downloading (…)okenizer_config.json:   0%|          | 0.00/685 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/644 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/441 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/663M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss
1,1.3948,1.34823
2,1.2321,1.271752
3,1.1193,1.242142


Time: 2250.43
Samples/second: 217.18
Utilization after training: 
CPU RAM Used: 11.2 GB
CPU RAM Free: 77.5 GB
GPU memory occupied: 26089 MB.
Using device: cuda

NVIDIA A100-SXM4-40GB
Memory Usage:
Allocated: 3.7 GB
Cached:    23.7 GB
Utilization after post training cleanup: 
CPU RAM Used: 11.2 GB
CPU RAM Free: 77.5 GB
GPU memory occupied: 1835 MB.
Using device: cuda

NVIDIA A100-SXM4-40GB
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
*************************Training took 2250.4350945949554 seconds *************************
CPU times: user 37min 37s, sys: 6min 5s, total: 43min 42s
Wall time: 38min 25s


In [None]:
%%time
# Fine-tune the 'opt' configuration for dataset 's3'; %%time reports the run time of the whole cell.
print(f'{"*"*25}Training model OPT on S3 {"*"*25}')
train(get_config('opt', 's3'), device)

*************************Training model OPT on S3 *************************
Loaded 184700 rows from drive/MyDrive/MIDS/w266/project/data/posptproc_corpus_spacy_s3_train_val.csv
Loaded 20523 rows from drive/MyDrive/MIDS/w266/project/data/posptproc_corpus_spacy_s3_test.csv
x-train shape: (162536,), x-val shape: (10000,), y-train shape: (162536,), y-val shape: (10000,)


Downloading (…)okenizer_config.json:   0%|          | 0.00/685 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/644 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/441 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/663M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss
1,1.4349,1.378
2,1.2439,1.254261
3,1.1213,1.201115


Time: 10086.05
Samples/second: 48.34
Utilization after training: 
CPU RAM Used: 8.2 GB
CPU RAM Free: 80.5 GB
GPU memory occupied: 31741 MB.
Using device: cuda

NVIDIA A100-SXM4-40GB
Memory Usage:
Allocated: 3.7 GB
Cached:    29.2 GB
Utilization after post training cleanup: 
CPU RAM Used: 7.7 GB
CPU RAM Free: 81.0 GB
GPU memory occupied: 1819 MB.
Using device: cuda

NVIDIA A100-SXM4-40GB
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB
*************************Training took 10086.063322544098 seconds *************************
CPU times: user 2h 8min 8s, sys: 47min 5s, total: 2h 55min 14s
Wall time: 2h 48min 44s


## Inference

In [None]:
# Rebuild the configs for inference; the four provider arguments are passed as None here.
tuning_configs = common.create_configs(GDRIVE_BASE, None, None, None, None)

# # Final test list for model trained against s2 dataset.
# FINAL_TEST_LIST = ['Princess Leia lay upon her bed all the night. She could not sleep at all.',
#                    'He stopped himself for a minute and thought if it was the right thing to do. It did seem like a good thing to do.',
#                    'There once lived king named Rama. He was very wise and just.',
#                    'Once upon a time, an old owl lived in the forest. He was very wise.']

# Final test list for model trained against s1 dataset.
# These single-sentence inputs are what generate_for() and test() below read from.
FINAL_TEST_LIST = ['Princess Leia lay upon her bed all the night.',
                   'He stopped himself for a minute and thought if it was the right thing to do.',
                   'There once lived king named Rama.',
                   'Once upon a time, an old owl lived in the forest.']

def generate_next_line(model_family, model, tokenizer, lines, prompt, device, viz=False):
  """Generate and print candidate continuations for every text in `lines`.

  Args:
    model_family: 't5' selects the encoder-decoder generation settings; any
      other value selects the settings used for the decoder-only model.
    model: Model to generate with; expected to already be on `device`.
    tokenizer: Tokenizer matching `model`.
    lines: Iterable of input strings.
    prompt: Text prepended to each input before tokenization.
    device: Device the tokenized inputs are moved to.
    viz: When True (and `model_family` is 't5'), show a bertviz model view of
      the attentions for the first generated sequence.
  """
  transformers.logging.set_verbosity_error()

  if model_family == 't5':
    generation_options = dict(
        num_beams=5,
        no_repeat_ngram_size=3,
        num_return_sequences=5,
        max_new_tokens=100,
        do_sample=True,
        top_k=0)
  else:
    generation_options = dict(
        num_beams=4,
        no_repeat_ngram_size=2,
        num_return_sequences=3,
        max_length=50,
        do_sample=True,
        top_k=0,
        early_stopping=True)

  for test_input_text in lines:
    # Keep the whole encoding on the model's device; it is reused by the viz forward pass.
    test_inputs = tokenizer([prompt + test_input_text], return_tensors='pt').to(device)
    # Attentions are not requested here: the visualization recomputes them below.
    test_outputs = model.generate(
        test_inputs['input_ids'],
        return_dict_in_generate=True,
        **generation_options)
    test_output_ids = test_outputs['sequences']

    decoded = [
        tokenizer.decode(out_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False).replace('\n', ' ')
        for out_ids in test_output_ids]

    if viz:
      if model_family != 't5':
        # The encoder/decoder/cross-attention view only exists for encoder-decoder models.
        print('Attention visualization is only available for the t5 model family; skipping.')
      else:
        # The encoder input has batch size 1, so pair it with one generated sequence only.
        first_sequence = test_output_ids[:1]
        with torch.no_grad():
          out = model(**test_inputs, decoder_input_ids=first_sequence, output_attentions=True, return_dict=True)
        # bertviz expects the full per-layer tuples for all three attention types.
        model_view(
            encoder_attention=out.encoder_attentions,
            decoder_attention=out.decoder_attentions,
            cross_attention=out.cross_attentions,
            encoder_tokens=tokenizer.convert_ids_to_tokens(test_inputs['input_ids'][0]),
            decoder_tokens=tokenizer.convert_ids_to_tokens(first_sequence[0]))

    print(f'Input: {test_input_text}')
    joined = '\n\t'.join(decoded)
    print(f'Output: {joined}')

def generate_for(config, device, viz=False):
  """Load the fine-tuned model for `config` and print continuations for the first test sentence.

  Both model families are loaded from `config.tuned_model_path`, the location
  train() saves to, so the output matches the "tuned on" banner printed below.
  The tokenizer is loaded from `config.model_name`.

  Args:
    config: Tuning config providing `model_name`, `model_family`, `dataset`,
      `tuned_model_path` and `prompt`.
    device: Device to run generation on.
    viz: Forwarded to generate_next_line().
  """
  tokenizer = AutoTokenizer.from_pretrained(config.model_name)
  print('*' *50)
  print(f'Evaluating {config.model_name} tuned on {config.dataset} dataset')
  if config.model_family == 't5':
    model_class = T5ForConditionalGeneration
  else:
    model_class = OPTForCausalLM
  model = model_class.from_pretrained(config.tuned_model_path).to(device)
  generate_next_line(config.model_family, model, tokenizer, [FINAL_TEST_LIST[0]], config.prompt, device, viz)


In [None]:
## Untrained T5 model
# evaluate(T5ForConditionalGeneration.from_pretrained("t5-large").cuda(), t5_tokenizer, FINAL_TEST_LIST, "Continue the next sentence of the story: ")

In [None]:
# Print generated continuations from the T5 model tuned on the s2 dataset.
generate_for(tuning_configs['t5_s2'], torch.device("cuda"), viz=False)

# NOTE(review): `evaluate_with_config` is not defined anywhere in the visible notebook.
# for name, config in tuning_configs.items():
#   evaluate_with_config(config, device, viz=False)

**************************************************
Evaluating google/t5-v1_1-base tuned on s2 dataset
odict_keys(['sequences', 'encoder_attentions', 'decoder_attentions', 'cross_attentions'])
Input: Princess Leia lay upon her bed all the night.
Output: She sat down and slept.
	She had been scolded by her mother, who had been so cruel to her that she could not bear it any longer.
	Then she woke up and went to the palace, where she saw a beautiful princess sitting on a throne, with a black robe on her head and a golden crown on her forehead.
	She had slept a long time, and was very tired.
	She was very frightened, and shook her head.


In [None]:
# Print generated continuations using the OPT configuration for the s3 dataset.
generate_for(tuning_configs['opt_s3'], torch.device("cuda"), viz=False)

**************************************************
Evaluating facebook/opt-350m tuned on s3 dataset
Input: Princess Leia lay upon her bed all the night.
Output: Princess Leia lay upon her bed all the night. She didn't sleep all night, she went to sleep in her room, but she woke up at 5: 00 a. m. to go to the bathroom to wash her hair.
	Princess Leia lay upon her bed all the night. She must have been so tired.
	Princess Leia lay upon her bed all the night. I'm not sure if you're being serious or not, but I'm pretty sure that's not true.


In [None]:
def test():
  """Show a bertviz head view of the attentions of the T5 model tuned on the s2 dataset.

  Generates a continuation for the first test sentence, then runs one forward
  pass over the input and the generated sequence to collect the encoder,
  decoder and cross attentions.

  Reference: https://www.aclweb.org/anthology/P19-3007.pdf
  """
  config = tuning_configs['t5_s2']
  # Take the tokenizer from the same checkpoint family as the model rather than a fixed name.
  tokenizer = T5Tokenizer.from_pretrained(config.model_name)
  model = T5ForConditionalGeneration.from_pretrained(config.tuned_model_path)

  inputs = tokenizer(
      f"{config.prompt}{FINAL_TEST_LIST[0]}",
      return_tensors="pt",
  )

  # Attentions are collected by the forward pass below, so generate() only returns sequences.
  sequences = model.generate(**inputs, return_dict_in_generate=True).sequences

  # Inference only: no gradients are needed for the visualization.
  with torch.no_grad():
    out = model(**inputs, decoder_input_ids=sequences, output_attentions=True, return_dict=True)

  decoded = [tokenizer.decode(out_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False).replace('\n', ' ') for out_ids in sequences]
  print(f'Output: {decoded}')

  head_view(
      cross_attention = out.cross_attentions,
      encoder_attention = out.encoder_attentions,
      decoder_attention = out.decoder_attentions,
      encoder_tokens = tokenizer.convert_ids_to_tokens(inputs.input_ids[0]),
      decoder_tokens = tokenizer.convert_ids_to_tokens(sequences[0]))

test()

Output hidden; open in https://colab.research.google.com to view.