## Setup

In [1]:
!pip install -q transformers 
!pip install -q evaluate
!pip install sentencepiece
#!pip3 install torch torchvision

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import torch
import numpy as np
import pandas as pd
import transformers
from transformers import AutoTokenizer, OPTForCausalLM, T5ForConditionalGeneration, Trainer, TrainingArguments, pipeline
from sklearn.model_selection import train_test_split
import evaluate
import re
import datasets
from google.colab import drive
# Fix the random seed up front so later cells run from a known RNG state.
seed = 42
transformers.set_seed(seed) # sets seed for numpy, random, torch (if installed), and tf (if installed)

# Mount Google Drive; the data, model and output paths used below live under it.
drive.mount('/content/drive')

# Checkpoint location (relative path, so it resolves against the current working directory).
checkpoint_folder = 'drive/MyDrive/W266/checkpoints/'
checkpoint_file = f'{checkpoint_folder}opt_storybot_s2_e3'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Evaluation

In [3]:
!pip install -q git+https://github.com/google-research/bleurt.git
!pip install -q rouge_score

  Preparing metadata (setup.py) ... [?25l[?25hdone


#### Helper Functions

In [4]:
def gen_output_batched(batch):
  ''' Generate text for one batch and store the decoded strings under "gen_text".

      Relies on the notebook-level names tokenizer, model, device, max_tk and
      gen_params, so those must be defined before this function is called.

      batch:   mapping whose 'variable' entry holds the input texts
      returns: the same batch object, with a "gen_text" entry added
  '''
  # Pad/truncate every input to max_tk tokens and move the tensors to `device`
  encoded = tokenizer(
      batch['variable'],
      padding="max_length",
      truncation=True,
      max_length=max_tk,
      return_tensors="pt"
  ).to(device)

  # Generation only, so gradient tracking is switched off
  with torch.no_grad():
    generated = model.generate(
        encoded.input_ids,
        attention_mask=encoded.attention_mask,
        **gen_params
    )

  batch["gen_text"] = tokenizer.batch_decode(generated, skip_special_tokens=True)
  return batch

def add_prompt(example):
  ''' Prefix the example's "variable" text with the notebook-level `prompt` string.
      The example is modified in place and also returned. '''
  original_text = example["variable"]
  example["variable"] = prompt + original_text
  return example

def remove_input(example):
  ''' Strip the input text out of the generated text (used for opt).

      Every exact occurrence of example["variable"] inside example["gen_text"]
      is deleted; if the input does not appear verbatim, the text is unchanged.
      The example is modified in place and also returned. '''
  source_text = example["variable"]
  generated = example["gen_text"]
  example["gen_text"] = generated.replace(source_text, "")
  return example

def clean_output(example):
  """Normalise whitespace in the generated text.

  Newlines become spaces, leading/trailing whitespace is trimmed, and any run
  of whitespace is collapsed to a single space. The example is modified in
  place and also returned.
  """
  flattened = example["gen_text"].replace('\n', ' ').strip()
  example["gen_text"] = re.sub(r'\s+', ' ', flattened)
  return example

def calculate_metrics(predictions, references):
  ''' Score predictions against references with bleu, rouge and bleurt.

      predictions: generated texts
      references:  reference texts, aligned with predictions

      returns: (scores, bleurt_scores) where
        scores        is a one-row DataFrame (index 'scores') holding the bleu
                      score, every entry returned by the rouge metric, and the
                      average bleurt score
        bleurt_scores is the raw bleurt result, whose 'scores' entry holds the
                      per-example values
  '''
  ## bleu
  bleu_metric = evaluate.load('bleu')
  bleu_result = bleu_metric.compute(predictions=predictions, references=references)

  ## rouge
  rouge_metric = evaluate.load('rouge')
  rouge_result = rouge_metric.compute(predictions=predictions, references=references)

  ## bleurt
  # download_config is passed explicitly to work around
  # https://github.com/huggingface/evaluate/issues/440
  # (plain form: evaluate.load('bleurt', 'bleurt-large-512'))
  bleurt_metric = evaluate.load(
      'bleurt',
      'bleurt-large-512',
      download_config=datasets.download.DownloadConfig()
  )
  bleurt_scores = bleurt_metric.compute(predictions=predictions, references=references)

  # NOTE(review): the 'blue' key is spelled this way in the returned frame;
  # kept as-is so existing column names do not change.
  summary = {
      'blue' : bleu_result['bleu'],
      **rouge_result,
      'bleurt_score' : np.average(bleurt_scores['scores']),
  }
  return pd.DataFrame(summary, index=['scores']), bleurt_scores

#### Evaluation Function

In [5]:
def evaluate_model(model, 
                   test_data_url,
                   tokenizer,
                   batch_size=64,
                   model_type='decoder', 
                   max_tk=65,
                   gen_params=None,
                   prompt=''):
  ''' Generates text with a fine tuned model for every row of a test-set csv.

      The model, tokenizer, max_tk, gen_params and prompt arguments are the
      ones actually used for generation (they are bound into local helpers),
      so the function no longer depends on same-named notebook globals.

      model:         model exposing generate(); inputs are moved to the device
                     of its first parameter
      test_data_url: path/URL of the csv to load; must contain a 'variable' column
      tokenizer:     tokenizer used to encode the inputs and decode the outputs
      batch_size:    number of rows generated per batch
      model_type:    'opt' -> the input text is removed from the generated text
                     't5'  -> prompt is prepended to each input before generation
                     anything else -> plain batched generation
      max_tk:        inputs are padded/truncated to this many tokens
      gen_params:    dict of extra keyword arguments for model.generate();
                     None means no extra arguments
      prompt:        text prepended to each input when model_type is 't5'

      returns: (variable, gen_text) where variable is the 'variable' column of
               the test set (including the prompt when model_type is 't5') and
               gen_text is the dataset with the cleaned "gen_text" column and
               without the 'variable' column
  '''
  # None default -> no extra generation options (avoids `**None` failing)
  gen_kwargs = dict(gen_params) if gen_params is not None else {}
  # Put the inputs wherever the model's weights live
  target_device = next(model.parameters()).device

  def _generate_batch(batch):
    # Encode a batch, generate, and store the decoded text under "gen_text"
    encoded = tokenizer(batch['variable'], padding="max_length", truncation=True, max_length=max_tk, return_tensors="pt").to(target_device)
    with torch.no_grad():
      generated = model.generate(encoded.input_ids, attention_mask=encoded.attention_mask, **gen_kwargs)
    batch["gen_text"] = tokenizer.batch_decode(generated, skip_special_tokens=True)
    return batch

  def _prepend_prompt(example):
    # Prefix the input text with the prompt passed to evaluate_model
    example["variable"] = prompt + example["variable"]
    return example

  ## Load data into huggingface dataset
  ## (the split currently takes 100% of the rows; lower the percentage for a quick test run)
  test_dataset = datasets.load_dataset("csv", data_files=test_data_url, split='train[:100%]')

  if model_type == 'opt':
  ## if opt, generate output text using batched inputs
  ## Remove input from generated output
    gen_dataset = test_dataset.map(_generate_batch, batched=True, batch_size=batch_size)
    gen_text = gen_dataset.map(remove_input, remove_columns=["variable"])

  elif model_type == 't5':
  ## if encoder-decoder, add the specified prompt,
  ## generate output text using batched inputs
    test_dataset = test_dataset.map(_prepend_prompt)
    gen_text = test_dataset.map(_generate_batch, batched=True, batch_size=batch_size, remove_columns=["variable"])

  else:
  ## if bert2bert, generate output text using batched input
    gen_text = test_dataset.map(_generate_batch, batched=True, batch_size=batch_size, remove_columns=["variable"])

  ## normalise whitespace in the generated text
  gen_text = gen_text.map(clean_output)

  return test_dataset['variable'], gen_text

#### Evaluation Data Path, Model Path, & Parameters
Update the paths and parameters in the cell below to match the model and test set you want to evaluate.

In [16]:
# --- Data and model locations ---
test_data_url = "/content/drive/My Drive/W266/StoryBots Datasets/posptproc_corpus_spacy_s3_test.csv"
model_path = "/content/drive/My Drive/W266/Models/t5-v1_1-base-s3-finetuned"

# --- Run settings ---
model_type = 't5' # 'opt' vs 't5' vs 'bert2bert'
prompt = '' # '' for opt and bert2bert
max_tk = 150 # 65 for s1, 110 for s2, 150 for s3
gen_params = {
    'max_new_tokens': 50,
    'num_beams': 4,
    'no_repeat_ngram_size': 2,
    'early_stopping': True,
    'num_return_sequences': 1,
}

# --- Model and tokenizer ---
if torch.cuda.is_available():
  device = 'cuda'
else:
  device = 'cpu'
# Swap the two lines below to evaluate an OPT checkpoint instead of T5
#model = OPTForCausalLM.from_pretrained(model_path).to(device)
model = T5ForConditionalGeneration.from_pretrained(model_path).to(device)
tokenizer = AutoTokenizer.from_pretrained("google/t5-v1_1-base")
# NOTE(review): left padding with EOS as the pad token is applied for every
# model type here; presumably carried over from the decoder-only setup, so
# confirm it is intended for T5.
tokenizer.padding_side = 'left'
tokenizer.pad_token = tokenizer.eos_token

#### Run evaluation and save results

In [17]:
# Generate predictions for the test set using the settings from the configuration cell
variable, gen_text = evaluate_model(
    model=model,
    test_data_url=test_data_url,
    tokenizer=tokenizer,
    batch_size=64,
    model_type=model_type,
    max_tk=max_tk,
    gen_params=gen_params,
    prompt=prompt
)
# Score the generated text against the labels (bleu, rouge, and bleurt)
eval_metrics, bleurt_scores = calculate_metrics(
    predictions=gen_text['gen_text'],
    references=gen_text['label']
)

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-f1915f1e2c18d63c/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-f1915f1e2c18d63c/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.


Map:   0%|          | 0/20523 [00:00<?, ? examples/s]

Map:   0%|          | 0/20523 [00:00<?, ? examples/s]

Map:   0%|          | 0/20523 [00:00<?, ? examples/s]

In [20]:
# First example, one item per line: label, generated text, bleurt score, input variable
print(
    gen_text['label'][0],
    gen_text['gen_text'][0],
    bleurt_scores['scores'][0],
    variable[0],
    sep='\n'
)

The King said, “It is not possible, the gates lie quite a mile apart.”
Then the hare said, “Thou wilt be the King’s son, and if thou dost not know what to do, I will give thee some of the root of life.”
-1.1381477117538452
Then his hare came and offered to go and bring some of the root of life, and bounded away and brought it while yet there was time, and the dead man was brought to life again, and knew nothing about the wound. After this they journeyed onwards, and the youngest said, “Thou lookest like me, hast royal apparel on as I have, and the animals follow thee as they do me; we will go in by opposite gates, and arrive at the same time from the two sides in the aged King’s presence.” So they separated, and at the same time came the watchmen from the one door and from the other, and announced that the young King and the animals had returned from the chase.


In [21]:
# Write inputs, labels, predictions and per-example bleurt scores to a csv on Drive.
# save_path is relative, so it resolves against the current working directory.
save_path = "drive/My Drive/W266/Evaluation/"
result_dict = {
    'variable': variable,
    'label': gen_text['label'],
    'generated text': gen_text['gen_text'],
    'bleurt scores' : bleurt_scores['scores'],
}
result_df = pd.DataFrame(result_dict)
result_df.to_csv(save_path + "t5-v1_1-base-s3-finetuned.csv")

### Evaluation Scores

In [19]:
# T5 S3: metrics table (bleu, rouge, bleurt) returned by calculate_metrics
eval_metrics

Unnamed: 0,blue,rouge1,rouge2,rougeL,rougeLsum,bleurt_score
scores,0.013134,0.130802,0.020352,0.109915,0.109953,-1.08875


In [13]:
# T5 S2
# NOTE(review): this cell's execution count is lower than the configuration
# cell's, so the saved output presumably comes from an earlier run configured
# for S2; re-run with that configuration to reproduce it.
eval_metrics

Unnamed: 0,blue,rouge1,rouge2,rougeL,rougeLsum,bleurt_score
scores,0.011892,0.132282,0.017951,0.110494,0.110561,-1.107098


In [22]:
# T5 S1

Unnamed: 0,blue,rouge1,rouge2,rougeL,rougeLsum,bleurt_score
scores,0.007994,0.125251,0.010452,0.101473,0.101447,-1.108518


In [15]:
# OPT S3
# NOTE(review): this cell's execution count is lower than the configuration
# cell's, so the saved output presumably comes from an earlier run configured
# for the OPT S3 model; re-run with that configuration to reproduce it.
eval_metrics

Unnamed: 0,blue,rouge1,rouge2,rougeL,rougeLsum,bleurt_score
scores,0.009127,0.140313,0.015348,0.103572,0.103603,-0.924052


In [11]:
# OPT S2
# NOTE(review): this cell's execution count is lower than the configuration
# cell's, so the saved output presumably comes from an earlier run configured
# for the OPT S2 model; re-run with that configuration to reproduce it.
eval_metrics

Unnamed: 0,blue,rouge1,rouge2,rougeL,rougeLsum,bleurt_score
scores,0.008313,0.137038,0.013192,0.101051,0.101071,-0.949812


In [23]:
# OPT Baseline (Untuned)

Unnamed: 0,blue,rouge1,rouge2,rougeL,rougeLsum,bleurt_score
scores,0.005272,0.085343,0.007274,0.065647,0.065646,-0.997209
