# One Time Setup

## Install Dependencies

In [1]:
!pip install transformers
!pip install sentencepiece
!pip install git+https://github.com/google-research/bleurt.git
!pip install setuptools


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m53.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m104.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m27.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.3 tokenizers-0.13.2 transformers-4.27.4
Looking in indexes: https://pypi.org/simple, htt

In [2]:
import torch
print(f'torch.__version__: {torch.__version__}')
!nvcc --version
# print(f'lsb_release -a: {lsb_release -a}')

torch.__version__: 1.13.1+cu116
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Sep_21_10:33:58_PDT_2022
Cuda compilation tools, release 11.8, V11.8.89
Build cuda_11.8.r11.8/compiler.31833905_0


In [3]:
# # !git clone https://github.com/NVIDIA/apex
# !pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./apex
# import torch
# print(f'torch.__version__: {torch.__version__}')
# torch.randn(1, 1, 32000).to(device='cuda:0')

## Connect to Google Drive
We will be loading data from google drive and also save trained models to google drive. So lets mount google drive.

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Imports and Constants

In [15]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
import transformers
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

DATA_NAME = "s2"
# T5_MODEL_NAME = "t5-small"
T5_MODEL_NAME = "t5-base"
# T5_MODEL_NAME = "t5-large" - colab instances do not have enough memory for T5 large.
MAIN_DATA_FILE = f'drive/MyDrive/MIDS/w266/project/datasci-w266-2023-spring-team-story-bot/posptproc_corpus_spacy_{DATA_NAME}.csv'
TRAIN_DATA_FILE = f'posptproc_corpus_spacy_{DATA_NAME}_train.csv'
VAL_DATA_FILE = f'posptproc_corpus_spacy_{DATA_NAME}_val.csv'
NUM_TRAIN_SAMPLES = 110000
NUM_TRAIN_SAMPLES = 50000
NUM_VAL_SAMPLES = 45000
NUM_VAL_SAMPLES = 1000
MAX_LOAD_AT_ONCE = 1000
SRC_MAX_LENGTH=512
TARGET_MAX_LENGTH=128

# MODEL_CKPT_FOLDER = 'drive/MyDrive/MIDS/w266/project/checkpoints/'
# MODEL_CKPT_FILE = MODEL_CKPT_FOLDER + f'{T5_MODEL_NAME}-finetuned-02'
TUNED_T5_SAVED = f'drive/MyDrive/MIDS/w266/project/saved_models/{T5_MODEL_NAME}-data{DATA_NAME}-finetuned'
PROMPT = 'Generate next line: '
BATCH_SIZE = 16
SEED = 42
CHECKPOINTS_TO_SAVE = 3

# Final test list for model trained against s2 dataset.
FINAL_TEST_LIST = ['Princess Leia lay upon her bed all the night. She could not sleep at all.',
                   'He stopped himself for a minute and thought if it was the right thing to do. It did seem like a good thing to do.',
                   'There once lived king named Rama. He was very wise and just.',
                   'Once upon a time, an old owl lived in the forest. He was very wise.']

# Final test list for model trained against s1 dataset.
# FINAL_TEST_LIST = ['Princess Leia lay upon her bed all the night.',
#                    'He stopped himself for a minute and thought if it was the right thing to do.',
#                    'There once lived king named Rama.',
#                    'Once upon a time, an old owl lived in the forest.']

## Split Data File

In [6]:
def split_datafile(main_file, train_file, val_file):
  data_df = pd.read_csv(main_file)
  data_wc = data_df[(data_df['variable'].str.split(' ').str.len() > 3) & 
     (data_df['variable'].str.split(' ').str.len() < 50) &
     (data_df['label'].str.split(' ').str.len() > 3) &
     (data_df['label'].str.split(' ').str.len() < 50)]
  x_train, x_val, y_train, y_val = train_test_split(data_wc['variable'], data_wc['label'], train_size=0.7, random_state=SEED)
  xy_train = {'variable': [PROMPT + x for x in x_train], 'label': y_train}
  xy_val = {'variable': [PROMPT + x for x in x_val], 'label': y_val}

  df_train = pd.DataFrame(xy_train)
  df_val = pd.DataFrame(xy_val)
  df_train.to_csv(train_file, index=False)
  df_val.to_csv(val_file, index=False)
  print(f'Split {data_df.shape[0]} entires to {df_train.shape[0]} and {df_val.shape[0]}')

split_datafile(MAIN_DATA_FILE, TRAIN_DATA_FILE, VAL_DATA_FILE)

Split 205705 entires to 95649 and 40993


In [7]:
class StoryDataIterator:    
    def __init__(self,
                 tokenizer,
                 n_examples,
                 max_load_at_once,
                 data_filename,
                 src_max_length=512,
                 target_max_length=128,
                 shuffle=True):
        
        self.tokenizer = tokenizer
        self.n_examples = n_examples
        self.max_load_at_once = max_load_at_once
        self.data_filename = data_filename
        self.src_max_length = src_max_length
        self.target_max_length = target_max_length
        self.shuffle = shuffle
        
        # Initialize row order, call on_epoch_end to shuffle row indices
        self.row_order = np.arange(1, self.n_examples+1)
        self.on_epoch_end()

        # Load first chunk of max_load_at_once examples
        self.df_curr_loaded = self._load_next_chunk(0)
        self.curr_idx_in_load = 0

    def preprocess_data(self, text_pair):
        orig_text, target_text = text_pair
        orig_encoded = self.tokenizer.batch_encode_plus(
            [orig_text],
            max_length=self.src_max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        orig_input_ids = orig_encoded['input_ids'][0]
        orig_attention_mask = orig_encoded['attention_mask'][0]
        
        target_encoded = self.tokenizer.batch_encode_plus(
            [target_text],
            max_length=self.target_max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        
        label_ids = target_encoded['input_ids'][0]
        
        return {'input_ids': orig_input_ids,
                'attention_mask': orig_attention_mask,
                'labels': label_ids}

    def _load_next_chunk(self, idx):
        load_start = idx
        load_end = idx + self.max_load_at_once

        # Indices to skip are the ones in the shuffled row_order before and
        # after the chunk we'll use for this chunk
        load_idx_skip = self.row_order[:load_start] + self.row_order[load_end:]
        self.df_curr_loaded = pd.read_csv(self.data_filename, skiprows=load_idx_skip)
        self.df_curr_loaded = self.df_curr_loaded.sample(frac=1)
    
    def __len__(self):
        return self.n_examples
    
    def __getitem__(self, idx):
        if self.df_curr_loaded is None or self.curr_idx_in_load >= len(self.df_curr_loaded):
            self._load_next_chunk(idx)
            self.curr_idx_in_load = 0
        
        text_pair = self.df_curr_loaded[['variable', 'label']].values.astype(str)[self.curr_idx_in_load]
        self.curr_idx_in_load += 1
        
        item_data = self.preprocess_data(text_pair)        
        return item_data
    
    def __call__(self):
        for i in range(self.__len__()):
            yield self.__getitem__(i)
            
            if i == self.__len__()-1:
                self.on_epoch_end()
    
    def on_epoch_end(self):
        if self.shuffle:
            self.row_order = list(np.random.permutation(self.row_order))

In [8]:
t5_tokenizer = T5Tokenizer.from_pretrained(T5_MODEL_NAME)
t5_model = T5ForConditionalGeneration.from_pretrained(T5_MODEL_NAME)

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


Downloading pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [9]:
train_data_iterator = StoryDataIterator(
    tokenizer=t5_tokenizer,
    n_examples=NUM_TRAIN_SAMPLES,
    max_load_at_once=MAX_LOAD_AT_ONCE,
    data_filename=TRAIN_DATA_FILE,
    src_max_length=SRC_MAX_LENGTH,
    target_max_length=TARGET_MAX_LENGTH
)

val_data_iterator = StoryDataIterator(
    tokenizer=t5_tokenizer,
    n_examples=NUM_VAL_SAMPLES,
    max_load_at_once=MAX_LOAD_AT_ONCE,
    data_filename=VAL_DATA_FILE,
    src_max_length=SRC_MAX_LENGTH,
    target_max_length=TARGET_MAX_LENGTH
)

In [10]:
# def print_n(it, n=5):
#   for i in range(n):
#     print(f'{i+1}: {next(it)}')

# print_n(train_data_iterator(), n=1)
# print_n(val_data_iterator(), n=1)


# Train Model

In [11]:
%%time
args = Seq2SeqTrainingArguments(
    output_dir='checkpoints',
    evaluation_strategy='epoch',
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=1,
    learning_rate=3e-4,
)

# Define the trainer, passing in the model, training args, and data generators

trainer = Seq2SeqTrainer(
    t5_model,
    args,
    train_dataset=train_data_iterator,
    eval_dataset=val_data_iterator
)

trainer.train()



Epoch,Training Loss,Validation Loss
1,0.6482,0.591211


CPU times: user 1h 12min 32s, sys: 1h 9min 2s, total: 2h 21min 34s
Wall time: 2h 20min 53s


TrainOutput(global_step=3125, training_loss=0.6759006884765625, metrics={'train_runtime': 8447.2396, 'train_samples_per_second': 5.919, 'train_steps_per_second': 0.37, 'total_flos': 3.0447894528e+16, 'train_loss': 0.6759006884765625, 'epoch': 1.0})

In [12]:
trainer.save_model(TUNED_T5_SAVED)

# Inference

In [13]:
def evaluate(model, tokenizer, lines, prompt):
  transformers.logging.set_verbosity_error()
  for test_input_text in lines:
      test_inputs = tokenizer([prompt + test_input_text], return_tensors='pt')
      test_output_ids = model.generate(
          test_inputs['input_ids'].cuda(),
          num_beams=5,
          no_repeat_ngram_size=3,
          num_return_sequences=5,
          max_new_tokens=100,
          do_sample=True,
          top_k=0)
      print(f'Input: {test_input_text}')
      decoded = [tokenizer.decode(out_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False) for out_ids in test_output_ids]
      print(f'Output: {decoded}')

In [None]:
## Untrained T5 model
# evaluate(T5ForConditionalGeneration.from_pretrained("t5-large").cuda(), t5_tokenizer, FINAL_TEST_LIST, "Continue the next sentence of the story: ")

In [14]:
## Fine tuned T5 model
evaluate(T5ForConditionalGeneration.from_pretrained(TUNED_T5_SAVED).cuda(), t5_tokenizer, FINAL_TEST_LIST, PROMPT)


Input: Princess Leia lay upon her bed all the night.
Output: ['She had slept so long that she could not sleep for a long time.', 'She was a very good girl, and she slept very well all the night.', 'She was a frightened little girl, and she was so ill that she could not sleep.', 'She was so dreadful that she could not sleep.', 'She sat down in her bed and slept all night.']
Input: He stopped himself for a minute and thought if it was the right thing to do.
Output: ['He did not think it would be the right thing to do.', '"It is the right thing to do," he said.', 'He shook his head and said, “I don’t know what to do, but I know it’s the right thing to do.”', '“It is a good thing,” he said.', 'He shook his head, and said, “I don’t know what to do.']
Input: There once lived king named Rama.
Output: ['Rama was a great king, and he was king of India.', 'Rama was a king of a thousand years, and he had a great wealth of wealth.', 'Rama was a king of the gods.', 'Rama was a king of a great kingd

In [None]:
TUNED_T5_SAVED

'drive/MyDrive/MIDS/w266/project/saved_models/t5-base-datas1-finetuned'

In [16]:
evaluate(T5ForConditionalGeneration.from_pretrained(TUNED_T5_SAVED).cuda(), t5_tokenizer, FINAL_TEST_LIST, PROMPT)


Input: Princess Leia lay upon her bed all the night. She could not sleep at all.
Output: ['She slept in a slumber, but she did not know how to get out of bed.', 'She was very ill, and she could not sleep for a long time.', 'She was so tired that she could not sleep at all.', 'She could not sleep at all.', 'She could not sleep at all.']
Input: He stopped himself for a minute and thought if it was the right thing to do. It did seem like a good thing to do.
Output: ['He went out to eat a little, and then he went to bed, and he sat down with a cup of tea, and said, “It’s a good thing to do.”', 'It was a good thing to do.', 'It was a good thing to do.', 'It was a good thing to do.', 'He thought it was the right thing to do.']
Input: There once lived king named Rama. He was very wise and just.
Output: ['Rama was a king of India, and he had a great wealth of wealth.', 'Rama was a very good king, and he was very good to his people.', 'Rama was a good king, and he had a great wealth of wealth.'