# One Time Setup

## Install Dependencies

In [None]:
!pip install transformers
!pip install sentencepiece
!pip install git+https://github.com/google-research/bleurt.git

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m60.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m104.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m28.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.3 tokenizers-0.13.2 transformers-4.27.4
Looking in indexes: https://pypi.org/simple, https://u

## Connect to Google Drive
In order to avoid disk space issues, we will use google drive for saving model snapshots during training.

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Imports and Constants

In [23]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
import transformers
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

MAIN_DATA_FILE = 'posptproc_corpus_spacy_s1.csv'
TRAIN_DATA_FILE = 'posptproc_corpus_spacy_s1_train.csv'
VAL_DATA_FILE = 'posptproc_corpus_spacy_s1_val.csv'
NUM_TRAIN_SAMPLES = 110000 # Start with a small sample size.
NUM_VAL_SAMPLES = 45000 # Start with a small sample size.
MAX_LOAD_AT_ONCE = 2500
MAX_TOKEN_LENGTH = 128

MODEL_CKPT_FOLDER = 'drive/MyDrive/MIDS/w266/project/checkpoints/'
MODEL_CKPT_FILE = MODEL_CKPT_FOLDER + 't5base-finetuned'
TUNED_T5_SAVED = 'drive/MyDrive/MIDS/w266/project/saved_models/t5base_finetuned'
PROMPT = 'generate next line: '
BATCH_SIZE = 16
SEED = 42

## Split Data File

In [None]:
def split_datafile(main_file, train_file, val_file):
  data_df = pd.read_csv(main_file)
  data_wc = data_df[(data_df['variable'].str.split(' ').str.len() > 3) & 
     (data_df['variable'].str.split(' ').str.len() < 50) &
     (data_df['label'].str.split(' ').str.len() > 3) &
     (data_df['label'].str.split(' ').str.len() < 50)]
  x_train, x_val, y_train, y_val = train_test_split(data_wc['variable'], data_wc['label'], train_size=0.7)
  xy_train = {'variable': [PROMPT + x for x in x_train], 'label': y_train}
  xy_val = {'variable': [PROMPT + x for x in x_val], 'label': y_val}

  df_train = pd.DataFrame(xy_train)
  df_val = pd.DataFrame(xy_val)
  df_train.to_csv(train_file, index=False)
  df_val.to_csv(val_file, index=False)
  print(f'Split {data_df.shape[0]} entires to {df_train.shape[0]} and {df_val.shape[0]}')

split_datafile(MAIN_DATA_FILE, TRAIN_DATA_FILE, VAL_DATA_FILE)

Split 206190 entires to 114486 and 49066


In [None]:
class StoryDataIterator:    
    def __init__(self,
                 tokenizer,
                 n_examples,
                 max_load_at_once,
                 data_filename,
                 max_length=128,
                 shuffle=True):
        
        self.tokenizer = tokenizer
        self.n_examples = n_examples
        self.max_load_at_once = max_load_at_once
        self.data_filename = data_filename
        self.max_length = max_length
        self.shuffle = shuffle
        
        # Initialize row order, call on_epoch_end to shuffle row indices
        self.row_order = np.arange(1, self.n_examples+1)
        self.on_epoch_end()

        # Load first chunk of max_load_at_once examples
        self.df_curr_loaded = self._load_next_chunk(0)
        self.curr_idx_in_load = 0

    def preprocess_data(self, text_pair, max_length=128):
        orig_text, target_text = text_pair
        orig_encoded = self.tokenizer.batch_encode_plus(
            [orig_text],
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        orig_input_ids = orig_encoded['input_ids'][0]
        orig_attention_mask = orig_encoded['attention_mask'][0]
        
        target_encoded = self.tokenizer.batch_encode_plus(
            [target_text],
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        
        label_ids = target_encoded['input_ids'][0]
        
        return {'input_ids': orig_input_ids,
                'attention_mask': orig_attention_mask,
                'labels': label_ids}

    def _load_next_chunk(self, idx):
        load_start = idx
        load_end = idx + self.max_load_at_once

        # Indices to skip are the ones in the shuffled row_order before and
        # after the chunk we'll use for this chunk
        load_idx_skip = self.row_order[:load_start] + self.row_order[load_end:]
        self.df_curr_loaded = pd.read_csv(self.data_filename, skiprows=load_idx_skip)
        self.df_curr_loaded = self.df_curr_loaded.sample(frac=1)
    
    def __len__(self):
        return self.n_examples
    
    def __getitem__(self, idx):
        if self.df_curr_loaded is None or self.curr_idx_in_load >= len(self.df_curr_loaded):
            self._load_next_chunk(idx)
            self.curr_idx_in_load = 0
        
        text_pair = self.df_curr_loaded[['variable', 'label']].values.astype(str)[self.curr_idx_in_load]
        self.curr_idx_in_load += 1
        
        item_data = self.preprocess_data(
            text_pair,
            self.max_length
        )
        
        return item_data
    
    def __call__(self):
        for i in range(self.__len__()):
            yield self.__getitem__(i)
            
            if i == self.__len__()-1:
                self.on_epoch_end()
    
    def on_epoch_end(self):
        if self.shuffle:
            self.row_order = list(np.random.permutation(self.row_order))

In [None]:
t5_tokenizer = T5Tokenizer.from_pretrained("t5-small")
t5_model = T5ForConditionalGeneration.from_pretrained("t5-small")

# MAX_SRC_LEN = 512
# MAX_TARGET_LEN = 128


# input_ids = t5_tokenizer("continue the story: The house is wonderful.", return_tensors="pt").input_ids
# labels = t5_tokenizer("It is small but warm and welcoming.", return_tensors="pt").input_ids

# # the forward function automatically creates the correct decoder_input_ids
# loss = t5_model(input_ids=input_ids, labels=labels).loss
# loss.item()

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [None]:
train_data_iterator = StoryDataIterator(
    tokenizer=t5_tokenizer,
    n_examples=NUM_TRAIN_SAMPLES,
    max_load_at_once=MAX_LOAD_AT_ONCE,
    data_filename=TRAIN_DATA_FILE,
    max_length=MAX_TOKEN_LENGTH
)

val_data_iterator = StoryDataIterator(
    tokenizer=t5_tokenizer,
    n_examples=NUM_VAL_SAMPLES,
    max_load_at_once=MAX_LOAD_AT_ONCE,
    data_filename=VAL_DATA_FILE,
    max_length=MAX_TOKEN_LENGTH
)

In [None]:
# def print_n(it, n=5):
#   for i in range(n):
#     print(f'{i+1}: {next(it)}')

# print_n(train_data_iterator(), n=1)
# print_n(val_data_iterator(), n=1)


# Train Model

In [20]:
args = Seq2SeqTrainingArguments(
    MODEL_CKPT_FILE,
    evaluation_strategy='epoch',
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=3,
)

# Define the trainer, passing in the model, training args, and data generators

trainer = Seq2SeqTrainer(
    t5_model,
    args,
    train_dataset=train_data_iterator,
    eval_dataset=val_data_iterator
)

trainer.train()



Epoch,Training Loss,Validation Loss
1,0.7884,0.769117
2,0.7594,0.762675


Epoch,Training Loss,Validation Loss
1,0.7884,0.769117
2,0.7594,0.762675
3,0.7519,0.762736


TrainOutput(global_step=20625, training_loss=0.7788078920306581, metrics={'train_runtime': 5351.1225, 'train_samples_per_second': 61.669, 'train_steps_per_second': 3.854, 'total_flos': 1.116569862144e+16, 'train_loss': 0.7788078920306581, 'epoch': 3.0})

In [26]:
trainer.save_model(TUNED_T5_SAVED)

# Inference

In [28]:
## Inference
t5_model_loaded = T5ForConditionalGeneration.from_pretrained(TUNED_T5_SAVED).cuda()
transformers.logging.set_verbosity_error()
for test_input_text in ['The princess lay upon her bed all the night.',
                        'He stopped himself for a minute and thought if it was the right thing to do.',
                        'There once lived king named Rama.',
                        'Once upon a time, an old owl lived in the forest.']:
    test_inputs = t5_tokenizer([PROMPT + test_input_text], return_tensors='pt')
    test_output_ids = t5_model_loaded.generate(test_inputs['input_ids'].cuda())

    print([t5_tokenizer.decode(out_ids, skip_special_tokens=True, 
                               clean_up_tokenization_spaces=False) for out_ids in test_output_ids])



['She was a very happy woman, and she was very happy to be a princess.']
['“It is not the right thing,” he said.']
['Rama was a king, and he was a king.']
['Then he sat down and sat down, and he s']
