## Setup

In [31]:
!pip install -q transformers 
!pip install -q evaluate
#!pip3 install torch torchvision

In [32]:
import torch
import numpy as np
import pandas as pd
import transformers
from transformers import AutoTokenizer, OPTForCausalLM, Trainer, TrainingArguments, pipeline
from sklearn.model_selection import train_test_split
import evaluate
import re
import datasets
from google.colab import drive
drive.mount('/content/drive')

checkpoint_folder = 'drive/MyDrive/W266/checkpoints/'
checkpoint_file = checkpoint_folder + 'opt_storybot_s2_e3'
seed = 42
transformers.set_seed(seed) # sets seed for numpy, random, torch (if installed), and tf (if installed)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Evaluation

In [34]:
!pip install -q git+https://github.com/google-research/bleurt.git
!pip install -q rouge_score

  Preparing metadata (setup.py) ... [?25l[?25hdone


#### Helper Functions

In [35]:
def gen_output_batched(batch):
  '''Generate text for a batch of inputs and store the decoded strings in a new column.

  The texts in ``batch['variable']`` are tokenized (padded/truncated to ``max_tk``
  tokens), passed to ``model.generate`` together with ``gen_params``, and the
  decoded sequences (special tokens skipped) are written to ``batch['gen_text']``.

  Relies on the notebook-level names ``tokenizer``, ``model``, ``device``,
  ``max_tk`` and ``gen_params``.

  Returns the same ``batch`` with the extra ``gen_text`` column.
  '''
  encoded = tokenizer(
      batch['variable'],
      padding="max_length",
      truncation=True,
      max_length=max_tk,
      return_tensors="pt",
  ).to(device)

  # inference only, so gradient tracking is switched off
  with torch.no_grad():
    generated = model.generate(
        encoded.input_ids,
        attention_mask=encoded.attention_mask,
        **gen_params,
    )

  batch["gen_text"] = tokenizer.batch_decode(generated, skip_special_tokens=True)
  return batch

def add_prompt(example):
  '''Prefix the example's "variable" text with the notebook-level ``prompt`` string.

  The example is updated in place and returned.
  '''
  prefixed_text = prompt + example["variable"]
  example["variable"] = prefixed_text
  return example

def remove_input(example):
  ''' Removes the echoed input from the generated output for opt.

  A causal LM returns the prompt followed by its continuation, so the text in
  example["variable"] is stripped from example["gen_text"]. Only the FIRST
  occurrence is removed: if the model legitimately repeats the input sentence
  later in its continuation, that repeat is part of the generation and is kept.

  If "variable" is empty or None, "gen_text" is left untouched.

  NOTE(review): if the input was truncated by the tokenizer, the decoded
  output presumably no longer contains the full input string and nothing is
  removed -- confirm whether inputs can exceed the token limit.

  The example is updated in place and returned.
  '''
  source_text = example["variable"]
  generated = example["gen_text"]
  if source_text:
    # count=1 -> strip just the echoed prompt, not later repeats
    generated = generated.replace(source_text, "", 1)
  example["gen_text"] = generated
  return example

def clean_output(example):
  """Normalize whitespace in the text generated by the model.

  Newlines become spaces, leading/trailing whitespace is stripped, and every
  remaining run of whitespace is collapsed to a single space. The example is
  updated in place and returned.
  """
  text = example["gen_text"]
  text = text.replace('\n', ' ').strip()
  example["gen_text"] = re.sub(r'\s+', ' ', text)
  return example

def calculate_metrics(predictions, references):
  ''' Given the predictions and references, calculates the bleu, rouge and
      bleurt scores (all three are always computed).

      Args:
        predictions: generated texts.
        references: reference texts, aligned with ``predictions``.

      Returns:
        A tuple ``(scores_df, bleurt_scores)``:
          * ``scores_df`` - one-row DataFrame (index 'scores') holding the
            'bleu' score, the values returned by the rouge metric, and
            'bleurt_score' (mean of the per-example bleurt scores).
          * ``bleurt_scores`` - the raw result of the bleurt metric, whose
            'scores' entry holds the per-example values.
  '''
  ## bleu score
  bleu = evaluate.load('bleu')
  bleu_scores = bleu.compute(predictions=predictions, references=references)
  # column was previously misspelled as 'blue'
  bleu_score = {'bleu' : bleu_scores['bleu']}

  ## rouge score
  rouge = evaluate.load('rouge')
  rouge_score = rouge.compute(predictions=predictions, references=references)

  ## bleurt scores
  #bleurt new download_config issue: https://github.com/huggingface/evaluate/issues/440
  bleurt = evaluate.load('bleurt', 'bleurt-large-512', download_config=datasets.download.DownloadConfig())
  bleurt_scores = bleurt.compute(predictions=predictions, references=references)
  bleurt_score = {'bleurt_score' : np.average(bleurt_scores['scores'])}

  # dict unpacking instead of `|` so the merge also works on Python < 3.9
  all_scores = {**bleu_score, **rouge_score, **bleurt_score}
  return pd.DataFrame(all_scores, index=['scores']), bleurt_scores

#### Evaluation Function

In [36]:
def evaluate_model(model, 
                   test_data_url,
                   tokenizer,
                   batch_size=64,
                   model_type='decoder', 
                   max_tk=65,
                   gen_params=None,
                   prompt='',
                   split='train[:1%]'):
  ''' Evaluates the fine tuned model on a given csv of the test set.

  Generation uses the ``model``, ``tokenizer``, ``max_tk``, ``gen_params`` and
  ``prompt`` passed to this function (not notebook-level globals), so several
  models can be evaluated from the same kernel without re-binding globals.

  Args:
    model: model exposing ``generate`` and ``device``.
    test_data_url: path/URL of a csv with 'variable' (input) and 'label'
      (reference) columns.
    tokenizer: tokenizer matching ``model``.
    batch_size: number of examples generated per batch.
    model_type: 'opt' (the echoed input is removed from the output), 't5'
      (``prompt`` is prepended to every input); any other value, including
      the default, takes the plain generation path used for bert2bert.
    max_tk: ``max_length`` used when tokenizing the inputs.
    gen_params: dict of keyword arguments for ``model.generate``;
      ``None`` means no extra arguments.
    prompt: text prepended to each input when ``model_type == 't5'``.
    split: split expression passed to ``datasets.load_dataset``; the default
      keeps the original behaviour of using only the first 1% of the rows.

  Returns:
    ``(variables, gen_text, metrics, bleurt_scores)`` - the 'variable' column
    of the test dataset, the dataset with the cleaned 'gen_text' column, the
    metrics DataFrame and the raw bleurt scores from ``calculate_metrics``.
  '''
  gen_kwargs = dict(gen_params or {})
  # send inputs to wherever the model actually lives
  target_device = model.device

  def _generate(batch):
    ''' Generates decoded output for a batch using this call's arguments '''
    inputs = tokenizer(batch['variable'], padding="max_length", truncation=True,
                       max_length=max_tk, return_tensors="pt").to(target_device)
    with torch.no_grad():
      outputs = model.generate(inputs.input_ids,
                               attention_mask=inputs.attention_mask,
                               **gen_kwargs)
    batch["gen_text"] = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return batch

  def _prepend_prompt(example):
    ''' Adds this call's prompt to the variable field '''
    example["variable"] = prompt + example["variable"]
    return example

  ## Load data into huggingface dataset (default split: first 1% of the rows, for testing)
  test_dataset = datasets.load_dataset("csv", data_files=test_data_url, split=split)

  if model_type == 'opt':
    ## if opt, generate output text using batched inputs,
    ## then remove the echoed input from the generated output
    gen_dataset = test_dataset.map(_generate, batched=True, batch_size=batch_size)
    gen_text = gen_dataset.map(remove_input, remove_columns=["variable"])

  elif model_type == 't5':
    ## if encoder-decoder, add the specified prompt,
    ## then generate output text using batched inputs
    test_dataset = test_dataset.map(_prepend_prompt)
    gen_text = test_dataset.map(_generate, batched=True, batch_size=batch_size, remove_columns=["variable"])

  else:
    ## if bert2bert (or any other type), generate output text using batched input
    gen_text = test_dataset.map(_generate, batched=True, batch_size=batch_size, remove_columns=["variable"])

  gen_text = gen_text.map(clean_output)
  # calculate bleu, rouge, and bleurt metrics
  metrics, bleurt_scores = calculate_metrics(gen_text['gen_text'], gen_text['label'])

  return test_dataset['variable'], gen_text, metrics, bleurt_scores

#### Evaluation Data Path, Model Path, & Parameters
Update the paths and parameters below to point at the desired test set and model before running the evaluation.

In [37]:
# --- data and model locations (Google Drive) ---
test_data_url = "/content/drive/My Drive/W266/StoryBots Datasets/posptproc_corpus_spacy_s1_test.csv"
model_path = "/content/drive/My Drive/W266/opt_storybot_s2_e3"

# --- model: run on the GPU when one is available ---
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
model = OPTForCausalLM.from_pretrained(model_path).to(device)
model_type = 'opt' # 'opt' vs 't5' vs 'bert2bert'

# --- tokenizer: pad on the left and reuse the EOS token as the padding token ---
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
tokenizer.padding_side = 'left'
tokenizer.pad_token = tokenizer.eos_token

# --- evaluation settings ---
prompt = '' # '' for opt and bert2bert
max_tk = 110 # max_length used when tokenizing the inputs

# keyword arguments forwarded to model.generate
gen_params = {
    "max_new_tokens": 50,
    "num_beams": 4,
    "no_repeat_ngram_size": 2,
    "early_stopping": True,
    "num_return_sequences": 1,
}

In [38]:
# run the evaluation pipeline with the settings defined in the previous cell
variable, gen_text, eval_metrics, bleurt_scores = evaluate_model(
    model,
    test_data_url,
    tokenizer,
    batch_size=64,
    model_type=model_type,
    max_tk=max_tk,
    gen_params=gen_params,
    prompt=prompt,
)



In [39]:
eval_metrics

Unnamed: 0,blue,rouge1,rouge2,rougeL,rougeLsum,bleurt_score
scores,0.006443,0.131382,0.01194,0.096429,0.096296,-0.953089


In [40]:
print(gen_text['label'][0])
print(gen_text['gen_text'][0])
print(bleurt_scores['scores'][0])

The obedient children arrived at the forest and, oh, wonder!
"Now, papa," she said to them, "I am going to tell you a story that will make your heart rise.,!!" they went on and on, till they came to a great forest, where
-0.6725409626960754


## Prepare Data (archive)

In [33]:
data = pd.read_csv("/content/drive/My Drive/W266/StoryBots Datasets/posptproc_corpus_spacy_s2.csv")
data.head()

ParserError: ignored

In [None]:
data.shape

# filter for the first XXX rows to help with performance
#data = data[:2000]

(205705, 2)

In [None]:
print("Stats on number of words in variable:")
print(data['variable'].str.split(' ').str.len().describe())

Stats on number of words in variable:
count    205705.000000
mean         36.610491
std          22.987693
min           2.000000
25%          19.000000
50%          32.000000
75%          49.000000
max         500.000000
Name: variable, dtype: float64


In [None]:
# keep only rows whose 'variable' text has between 4 and 49 space-separated words
# (only the 'variable' column is checked; 'label' is not filtered)
data_wc = data[data['variable'].str.split(' ').str.len().between(4, 49)]

In [None]:
print("Stats on number of words in variable:")
print(data_wc['variable'].str.split(' ').str.len().describe())

Stats on number of words in variable:
count    154658.000000
mean         26.473309
std          12.093781
min           4.000000
25%          16.000000
50%          26.000000
75%          36.000000
max          49.000000
Name: variable, dtype: float64


In [None]:
# 70/30 split of the inputs ('variable') and targets ('label') into training and test sets
x_train, x_test, y_train, y_test = train_test_split(
    data_wc['variable'],
    data_wc['label'],
    train_size=0.7,
    random_state=42,
)

In [None]:
print("length of x_train:", len(x_train))
print("length of x_test:", len(x_test))

length of x_train: 108260
length of x_test: 46398


## OPT (archive)
##### Fine-tune the OPT model using the Transformers `Trainer`
##### Huggingface Documentation: https://huggingface.co/docs/transformers/training

In [None]:
model = OPTForCausalLM.from_pretrained("facebook/opt-350m")
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")

Downloading (…)lve/main/config.json:   0%|          | 0.00/644 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/663M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/685 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/441 [00:00<?, ?B/s]

In [None]:
# tokenize the training and test inputs ('variable' texts) to fixed-length sequences
max_length = 50

def tokenize_function(variables):
    """Tokenize a list of strings, padding/truncating each one to ``max_length`` tokens."""
    encoding_options = dict(padding="max_length", truncation=True, max_length=max_length)
    return tokenizer(variables, **encoding_options)


x_train_tk = tokenize_function(list(x_train))
x_test_tk = tokenize_function(list(x_test))

In [None]:
# training arguments for model fine tuning:
# evaluate and save a checkpoint once per epoch, limit how many checkpoints are
# retained, and load the best model when training ends
training_args = TrainingArguments(
    output_dir=checkpoint_file,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
)

In [None]:
# SKIP FOR NOW - install, load bleurt model, and create compute metrics function for fine tuning model
#!pip install git+https://github.com/google-research/bleurt.git
# metric = evaluate.load("bleurt", module_type="metric")

# def compute_metrics(eval_pred):
#     pred, labels = eval_pred
#     pred = np.argmax(pred, axis=-1)
#     labels_string = [tokenizer.decode(i) for i in labels]
#     pred_string = [tokenizer.decode(i) for i in pred]
#     return metric.compute(predictions=pred_string, references=labels_string)

In [None]:
# Create torch datasets for train and test data for fine tuning model
# original source: https://stackoverflow.com/questions/67691530/key-error-while-fine-tunning-t5-for-summarization-with-huggingface
class Dataset(torch.utils.data.Dataset):
    """Torch dataset pairing tokenizer encodings with per-example labels.

    ``encodings`` maps field names (it must include 'input_ids') to per-example
    sequences; ``labels`` is indexed in parallel with the encodings.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # the number of examples is taken from the 'input_ids' field
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        """Return one example as a dict of tensors: every encoding field plus 'labels'."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# causal-LM objective: the labels are the tokenized inputs themselves
# (y_train / y_test are not used as targets here)
# NOTE(review): padded positions are kept in the labels as-is (not set to -100),
# so they presumably contribute to the loss -- confirm this is intended
dataset_train = Dataset(encodings=x_train_tk, labels=x_train_tk['input_ids'])
dataset_test = Dataset(encodings=x_test_tk, labels=x_test_tk['input_ids'])

In [None]:
dataset_train[0]

{'input_ids': tensor([    2,  6179,   593,    24,    21,    11,     5,  8421,   328,    20,
          2859, 33522,     7, 15544, 15644,     5, 31021,  2923, 10080,  4567,
             8,  1690,  1545,   542,  1755,  7913,   352,     5,    44,    48,
           119,  5999,   873,  4715,    17,    46,   396,  8401,  6158,     8,
          7458, 22597,    11,    49,  3581,     4,     1,     1,     1,     1]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
         0, 0]),
 'labels': tensor([    2,  6179,   593,    24,    21,    11,     5,  8421,   328,    20,
          2859, 33522,     7, 15544, 15644,     5, 31021,  2923, 10080,  4567,
             8,  1690,  1545,   542,  1755,  7913,   352,     5,    44,    48,
           119,  5999,   873,  4715,    17,    46,   396,  8401,  6158,     8,
          7458, 22597,    11,    49,  3581,     4,     1,     1,   

In [None]:
# training model function
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_test,
    compute_metrics=None,
)

In [None]:
# train model
trainer.train()



Epoch,Training Loss,Validation Loss
1,2.3815,2.347988
2,2.0438,2.184473
3,1.6513,2.120432


TrainOutput(global_step=40599, training_loss=2.0948828039458474, metrics={'train_runtime': 4861.8444, 'train_samples_per_second': 66.802, 'train_steps_per_second': 8.351, 'total_flos': 2.9557378695168e+16, 'train_loss': 2.0948828039458474, 'epoch': 3.0})

In [None]:
#trainer.save_model("opt_storybot_s1_e3")
trainer.save_model('/content/drive/My Drive/W266/opt_storybot_s2_e3')

In [None]:
# fine tuned model
#model_s1_path = "opt_storybot_s1_e3"
model_path = "/content/drive/My Drive/W266/opt_storybot_s2_e3"
model = OPTForCausalLM.from_pretrained(model_path)

In [None]:
story_prompts = ['The princess lay upon her bed all the night.',
                        'He stopped himself for a minute and thought if it was the right thing to do.',
                        'There once lived king named Rama.',
                        'Once upon a time, an old owl lived in the forest.']

In [None]:
# generate three sampled continuations for every story prompt and print them
for i, story_prompt in enumerate(story_prompts, start=1):
  # keep the inputs on the same device as the model
  story_tokens = tokenizer(story_prompt, return_tensors="pt").to(model.device)
  model_output = model.generate(
      story_tokens.input_ids,
      # pass the mask explicitly instead of letting generate() infer it
      attention_mask=story_tokens.attention_mask,
      num_beams=4,
      no_repeat_ngram_size=2,
      num_return_sequences=3,
      max_length = 50,
      do_sample=True,
      top_k=0,
      early_stopping=True
  )
  print("-------------------------")
  print("Story Prompt", i)
  for o in model_output:
    print(tokenizer.decode(o, skip_special_tokens=True))
    print()
  print("-------------------------")

-------------------------
Story Prompt 1
The princess lay upon her bed all the night. In the morning when she awoke, she found herself once more in the princess's room, where she had been so long asleep, and she was so happy that she could not wake up.

The princess lay upon her bed all the night. She was so tired that she fell asleep, and when she woke up the next morning she found herself lying on the bed of the King’s son.” there was a great

The princess lay upon her bed all the night. In the morning when she awoke, she was surprised to find herself in the garden of the Epanchins, where the prince had been waiting for her for a long time.!

-------------------------
-------------------------
Story Prompt 2
He stopped himself for a minute and thought if it was the right thing to do. Then he said, "I will do it." and then he went on and on till he came to a brook, and there he sat down and

He stopped himself for a minute and thought if it was the right thing to do. Then he said: "I 