## OPT
##### Fine-Tune OPT Model using Transformers trainer()
##### Huggingface Documentation: https://huggingface.co/docs/transformers/training

In [35]:
!pip install transformers
!pip install evaluate
#!pip3 install torch torchvision

In [3]:
import torch
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, OPTForCausalLM, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import evaluate

In [4]:
from google.colab import drive
drive.mount('/content/drive')

checkpoint_folder = 'drive/MyDrive/W266/checkpoints/'
checkpoint_file = checkpoint_folder + 'opt_storybot_s1_e3'

Mounted at /content/drive


In [6]:
data = pd.read_csv("/content/drive/My Drive/W266/StoryBots Datasets/posptproc_corpus_spacy_s1.csv")
data.head()

Unnamed: 0,variable,label
0,"HIGH above the city, on a tall column, stood t...",He was gilded all over with thin leaves of fin...
1,He was gilded all over with thin leaves of fin...,He was very much admired indeed.
2,He was very much admired indeed.,"“He is as beautiful as a weathercock,” remarke..."
3,"“He is as beautiful as a weathercock,” remarke...",“Why can’t you be like the Happy Prince?”
4,“Why can’t you be like the Happy Prince?”,asked a sensible mother of her little boy who ...


In [7]:
data.shape

# filter for the first XXX rows to help with performance
#data = data[:2000]

(206190, 2)

In [8]:
print("Stats on number of words in variable:")
print(data['variable'].str.split(' ').str.len().describe())

Stats on number of words in variable:
count    206190.000000
mean         18.310946
std          14.334988
min           1.000000
25%           8.000000
50%          15.000000
75%          25.000000
max         467.000000
Name: variable, dtype: float64


In [9]:
# filter out rows that have less than 4 words or more than 50 words in either the variable or label columns
data_wc = data[(data['variable'].str.split(' ').str.len() > 3) & 
     (data['variable'].str.split(' ').str.len() < 50)]

In [10]:
print("Stats on number of words in variable:")
print(data_wc['variable'].str.split(' ').str.len().describe())

Stats on number of words in variable:
count    182504.000000
mean         17.975847
std          10.840116
min           4.000000
25%           9.000000
50%          16.000000
75%          25.000000
max          49.000000
Name: variable, dtype: float64


In [11]:
# split data into training and remaining dataset
x_train, x_test, y_train, y_test = train_test_split(data_wc['variable'],data_wc['label'], train_size=0.7)

In [12]:
print("length of x_train:", len(x_train))
print("length of x_test:", len(x_test))

length of x_train: 127752
length of x_test: 54752


In [13]:
model = OPTForCausalLM.from_pretrained("facebook/opt-350m")
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")

Downloading (…)lve/main/config.json:   0%|          | 0.00/644 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/663M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/685 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/441 [00:00<?, ?B/s]

In [14]:
# tokenize lists of training and test variables and labels 
max_length = 50
def tokenize_function(variables):
    return tokenizer(variables, padding="max_length", truncation=True, max_length=max_length)


x_train_tk = tokenize_function(list(x_train))
x_test_tk = tokenize_function(list(x_test))

In [22]:
# training arguments for model fine tuning
training_args = TrainingArguments(output_dir=checkpoint_file, 
                                  save_total_limit = 1, 
                                  load_best_model_at_end=True,
                                  save_strategy="epoch", 
                                  evaluation_strategy="epoch")

In [None]:
# SKIP FOR NOW - install, load bleurt model, and create compute metrics function for fine tuning model
#!pip install git+https://github.com/google-research/bleurt.git
# metric = evaluate.load("bleurt", module_type="metric")

# def compute_metrics(eval_pred):
#     pred, labels = eval_pred
#     pred = np.argmax(pred, axis=-1)
#     labels_string = [tokenizer.decode(i) for i in labels]
#     pred_string = [tokenizer.decode(i) for i in pred]
#     return metric.compute(predictions=pred_string, references=labels_string)

In [23]:
# Create torch datasets for train and test data for fine tuning model
# original source: https://stackoverflow.com/questions/67691530/key-error-while-fine-tunning-t5-for-summarization-with-huggingface
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings['input_ids']) 

dataset_train = Dataset(x_train_tk, x_train_tk['input_ids'])#, y_train_tk['input_ids'])
dataset_test = Dataset(x_test_tk, x_test_tk['input_ids'])#, y_test_tk['input_ids'])

In [17]:
dataset_train[0]

{'input_ids': tensor([    2,   133,  1086, 16699,     9,    84,   665, 11505,     6,    11,
            63,  6441,     8,  1468,  5580,  6659,     6,    16,   144,  1153,
          7678, 24417,     4,    17,    46,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0]),
 'labels': tensor([    2,   133,  1086, 16699,     9,    84,   665, 11505,     6,    11,
            63,  6441,     8,  1468,  5580,  6659,     6,    16,   144,  1153,
          7678, 24417,     4,    17,    46,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,   

In [24]:
# training model function
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_test,
    compute_metrics=None,
)

In [25]:
# train model
trainer.train()



Epoch,Training Loss,Validation Loss
1,1.7829,1.757684
2,1.5666,1.696562
3,1.3351,1.703699


TrainOutput(global_step=47907, training_loss=1.5821914406896924, metrics={'train_runtime': 5706.4818, 'train_samples_per_second': 67.162, 'train_steps_per_second': 8.395, 'total_flos': 3.48791265755136e+16, 'train_loss': 1.5821914406896924, 'epoch': 3.0})

In [26]:
#trainer.save_model("opt_storybot_s1_e3")
trainer.save_model('/content/drive/My Drive/W266/opt_storybot_s1_e3')

In [31]:
# fine tuned model
#model_s1_path = "opt_storybot_s1_e3"
model_s1_path = "/content/drive/My Drive/W266/opt_storybot_s1_e3"
model_s1 = OPTForCausalLM.from_pretrained(model_s1_path)

In [32]:
story_prompts = ['The princess lay upon her bed all the night.',
                        'He stopped himself for a minute and thought if it was the right thing to do.',
                        'There once lived king named Rama.',
                        'Once upon a time, an old owl lived in the forest.']

In [34]:
i = 0
for story_prompt in story_prompts:
  i += 1
  story_tokens = tokenizer(story_prompt, return_tensors="pt")
  model_s1_output = model_s1.generate(
      story_tokens.input_ids,
      num_beams=4,
      no_repeat_ngram_size=2,
      num_return_sequences=3,
      max_length = 50,
      do_sample=True,
      top_k=0,
      early_stopping=True
  )
  print("-------------------------")
  print("Story Prompt", i)
  for o in model_s1_output:
    print(tokenizer.decode(o, skip_special_tokens=True))
    print()
  print("-------------------------")

-------------------------
Story Prompt 1
The princess lay upon her bed all the night. and, till morning, when she was lying in bed, she said, “Dear father, why art thou weeping?”; and so she died, and the

The princess lay upon her bed all the night. and the next morning, when she awoke, she said to her mother, “Mother, I have found the ring,” and she ran to the King’s daughter

The princess lay upon her bed all the night. and the giant got up and went out to see if he could find out where she was, but he couldn’t find her, so he said, “Go away,

-------------------------
-------------------------
Story Prompt 2
He stopped himself for a minute and thought if it was the right thing to do. and then he said, “Oh, yes,” and went on his way, and he was so frightened that he fell down to the

He stopped himself for a minute and thought if it was the right thing to do.” “Now I’ve got to go to bed, and you must be in bed all night, so I must go too

He stopped himself for a minute and thought 