# General Imports and Utilities

In [0]:
import os

def quote(t):
    return '"{}"'.format(t)

# Model Evaluation

This notebook computes perplexity and performs text generation to evaluate our models.

In [0]:
# MODEL_TYPE="gpt2" # The type of model we are using
# CHECKPOINT="/content/gdrive/Shared drives/CS263/models/pres_speeches/" # location of saved weights
# CHECKPOINT_PATH='"/content/gdrive/Shared drives/CS263/models/pres_speeches/"' # same as above but with double quotes for bash

# TRAIN_DATA="/content/gdrive/Shared drives/CS263/data/pres_speeches/pres_speech_train.txt"
# EVAL_DATA="/content/gdrive/Shared drives/CS263/data/pres_speeches/pres_speech_val.txt"
# TEST_DATA="/content/gdrive/Shared drives/CS263/data/pres_speeches/pres_speech_test.txt"

# TRAIN_DATA_PATH='"/content/gdrive/Shared drives/CS263/data/pres_speeches/pres_speech_train.txt"'
# EVAL_DATA_PATH='"/content/gdrive/Shared drives/CS263/data/pres_speeches/pres_speech_val.txt"'
# TEST_DATA_PATH='"/content/gdrive/Shared drives/CS263/data/pres_speeches/pres_speech_test.txt"'

In [0]:
GDRIVE_MOUNT_PATH = '/content/gdrive'
NLP_DRIVE_PATH = os.path.join(GDRIVE_MOUNT_PATH, 'Shared drives/CS263')

# EXP_PARENT_DIR points to the directory where all experiments are stored. Can be updated.
EXP_PARENT_DIR = os.path.join(NLP_DRIVE_PATH, "models/american_rhetoric")
EXP_NAME = "tones_watson-summarized_bart_large_xsm/converted-summary_tone" # TODO: change this as necessary before each experiment
EXP_DIR = os.path.join(EXP_PARENT_DIR, EXP_NAME) # Points to the current experiment dir

# Update this to point to your desired dataset
DATASET_DIR = os.path.join(NLP_DRIVE_PATH, 'data/american_rhetoric/speech_bank/tones_watson-summarized_bart_large_xsm/converted-summary_tone')

OUTPUT_DIR = quote(os.path.join(EXP_DIR, 'eval_output'))
OUTPUT_DIR_TRAIN = quote(os.path.join(EXP_DIR, 'output_train'))
OUTPUT_DIR_VAL = quote(os.path.join(EXP_DIR, 'output_val'))
OUTPUT_DIR_TEST = quote(os.path.join(EXP_DIR, 'output_test'))

# TRAIN_DATA = os.path.join(DATASET_DIR, 'parsed-summarized-converted-train.txt')
# EVAL_DATA = os.path.join(DATASET_DIR, 'parsed-summarized-converted-val.txt')
# TEST_DATA = os.path.join(DATASET_DIR, 'parsed-summarized-converted-test.txt')
TRAIN_DATA = os.path.join(DATASET_DIR, 'data-train.txt')
EVAL_DATA = os.path.join(DATASET_DIR, 'data-val.txt')
TEST_DATA = os.path.join(DATASET_DIR, 'data-test.txt')

TRAIN_DATA_PATH=quote(TRAIN_DATA)
EVAL_DATA_PATH=quote(EVAL_DATA)
TEST_DATA_PATH=quote(TEST_DATA)

MODEL_TYPE="gpt2-medium" # The type of model we are using
CHECKPOINT=os.path.join(NLP_DRIVE_PATH, "models/american_rhetoric/tones_watson-summarized_bart_large_xsm/converted-summary_tone/output") # location of saved weights
CHECKPOINT_PATH=quote(CHECKPOINT) # same as above but with double quotes for bash

In [0]:
print(GDRIVE_MOUNT_PATH)
print(NLP_DRIVE_PATH)
print(EXP_PARENT_DIR)
print(EXP_NAME)
print(EXP_DIR)
print(DATASET_DIR)

print(OUTPUT_DIR)
print(OUTPUT_DIR_TRAIN)
print(OUTPUT_DIR_TEST)
print(OUTPUT_DIR_VAL)


print(TRAIN_DATA)
print(EVAL_DATA)
print(TEST_DATA)

print(TRAIN_DATA_PATH)
print(EVAL_DATA_PATH)
print(TEST_DATA_PATH)

print(MODEL_TYPE)
print(CHECKPOINT)
print(CHECKPOINT_PATH)

/content/gdrive
/content/gdrive/Shared drives/CS263
/content/gdrive/Shared drives/CS263/models/american_rhetoric
tones_watson-summarized_bart_large_xsm/converted-summary_tone
/content/gdrive/Shared drives/CS263/models/american_rhetoric/tones_watson-summarized_bart_large_xsm/converted-summary_tone
/content/gdrive/Shared drives/CS263/data/american_rhetoric/speech_bank/tones_watson-summarized_bart_large_xsm/converted-summary_tone
"/content/gdrive/Shared drives/CS263/models/american_rhetoric/tones_watson-summarized_bart_large_xsm/converted-summary_tone/eval_output"
"/content/gdrive/Shared drives/CS263/models/american_rhetoric/tones_watson-summarized_bart_large_xsm/converted-summary_tone/output_train"
"/content/gdrive/Shared drives/CS263/models/american_rhetoric/tones_watson-summarized_bart_large_xsm/converted-summary_tone/output_test"
"/content/gdrive/Shared drives/CS263/models/american_rhetoric/tones_watson-summarized_bart_large_xsm/converted-summary_tone/output_val"
/content/gdrive/Share

# Setup

We still need the hugging face transformers library. Clone it here and install dependencies.

In [0]:
# remove the capture to show output from this cell
%%capture

import os

# remove previous installations and clone the repo
!rm -rm /content/transformers
!git clone https://github.com/huggingface/transformers

# make sure all dependencies are installed
os.chdir('/content/transformers')

!pip install .
!pip install -r ./examples/requirements.txt

# navigate to folder with the language modeling training script
os.chdir('/content/transformers/examples/language-modeling')

In [6]:
# connect to google drive so we can access our data set
from google.colab import drive
drive.mount(GDRIVE_MOUNT_PATH)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


# Generate Text

The next two blocks generate text based on the input prompt.

In [0]:
# comment the line below to show output from this cell
# %%capture

# this cell just loads our pretrained model
from transformers import AutoConfig
from transformers import AutoTokenizer
from transformers import AutoModelWithLMHead
import torch

config = AutoConfig.from_pretrained(CHECKPOINT)
tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)

model = AutoModelWithLMHead.from_pretrained(CHECKPOINT, config=config)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1024)
    (wpe): Embedding(1024, 1024)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): Block(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): Block(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2):

In [0]:
# change this prompt to get different speeches
# PROMPT = '<title=\"Government Response to the COVID-19 Pandemic\">\n<president=\"kennedy\">\n<date=\"April 10, 1929\">'
PROMPT = """
<title="Government Response to the COVID-19 Pandemic">
<speaker="Barack Obama">
<year="1990">
<summary="Hopsitals will need more personal protective equipment. The stock market is volatile. Everyone should adhere to social distancing and general hygiene to stay healthy.">
"""
print(PROMPT)

MAX_LENGTH=512
TOP_K=50
TOP_P=0.95
NUM_OUTPUTS=3

input_ids = tokenizer.encode(PROMPT, return_tensors="pt").to(device)

sample_outputs = model.generate(
    input_ids,
    do_sample=True, 
    max_length=MAX_LENGTH, 
    top_k=TOP_K, 
    top_p=TOP_P, 
    num_return_sequences=NUM_OUTPUTS,
    repetition_penalty=None,
    early_stopping=True
)

output_txt = ""
print("Output:\n" + 50 * '-')
for i, sample_output in enumerate(sample_outputs):
  output_txt += "{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True))

print(output_txt)

with open(os.path.join(EXP_DIR, "sample_generation.txt"), "w") as fp:
  fp.writelines(output_txt)

Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence



<title="Government Response to the COVID-19 Pandemic">
<speaker="Barack Obama">
<year="1990">
<summary="Hopsitals will need more personal protective equipment. The stock market is volatile. Everyone should adhere to social distancing and general hygiene to stay healthy.">

Output:
--------------------------------------------------
0: 
<title="Government Response to the COVID-19 Pandemic">
<speaker="Barack Obama">
<year="1990">
<summary="Hopsitals will need more personal protective equipment. The stock market is volatile. Everyone should adhere to social distancing and general hygiene to stay healthy.">
<tone="analytical, joy">
Good afternoon.
So, last weekend, we got a couple of cases of the coronavirus -- the coronavirus -- from China. They're the largest number that we've seen in history.
So we're going to be sending hundreds of thousands of samples in order to find out who did this. And it's likely that somebody had contaminated somewhere, and had come into contact with infected in

# Compute Perplexity

We can use the transformers library to compute perplexity. Note that it might produce different scores from training if the block size is not set to the same value as the training phase.

In [9]:
!python run_language_modeling.py \
    --output_dir=$OUTPUT_DIR_TRAIN \
    --model_type=$MODEL_TYPE \
    --model_name_or_path=$CHECKPOINT_PATH \
    --do_eval \
    --eval_data_file=$TRAIN_DATA_PATH \

print("Train data perplexity complete")

2020-05-31 18:31:07.854495: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.1
05/31/2020 18:31:09 - INFO - transformers.training_args -   PyTorch: setting up devices
05/31/2020 18:31:09 - INFO - __main__ -   Training/evaluation parameters TrainingArguments(output_dir='/content/gdrive/Shared drives/CS263/models/american_rhetoric/tones_watson-summarized_bart_large_xsm/converted-summary_tone/output_train', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, evaluate_during_training=False, per_device_train_batch_size=8, per_device_eval_batch_size=8, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, learning_rate=5e-05, weight_decay=0.0, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, warmup_steps=0, logging_dir=None, logging_first_step=False, logging_steps=500, save_steps=500, save_total_limit=None, no_cuda=False, seed=42, fp16=Fal

In [2]:
!python run_language_modeling.py \
    --output_dir=$OUTPUT_DIR_VAL \
    --model_type=$MODEL_TYPE \
    --model_name_or_path=$CHECKPOINT_PATH \
    --do_eval \
    --eval_data_file=$EVAL_DATA_PATH

print("Eval data perplexity complete")

python3: can't open file 'run_language_modeling.py': [Errno 2] No such file or directory
Eval data perplexity complete


In [10]:
!python run_language_modeling.py \
    --output_dir=$OUTPUT_DIR_TEST \
    --model_type=$MODEL_TYPE \
    --model_name_or_path=$MODEL_TYPE \
    --do_eval \
    --eval_data_file=$TEST_DATA_PATH \
    --block_size 256

print("Test data perplexity complete")

2020-05-31 18:31:34.293866: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.1
05/31/2020 18:31:36 - INFO - transformers.training_args -   PyTorch: setting up devices
05/31/2020 18:31:36 - INFO - __main__ -   Training/evaluation parameters TrainingArguments(output_dir='/content/gdrive/Shared drives/CS263/models/american_rhetoric/tones_watson-summarized_bart_large_xsm/converted-summary_tone/output_test', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, evaluate_during_training=False, per_device_train_batch_size=8, per_device_eval_batch_size=8, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=1, learning_rate=5e-05, weight_decay=0.0, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, warmup_steps=0, logging_dir=None, logging_first_step=False, logging_steps=500, save_steps=500, save_total_limit=None, no_cuda=False, seed=42, fp16=Fals

We also have a custom script for computing perplexity, but it is likely less accurate than the transformers library.

In [0]:
import torch
import math
import numpy as np

SPEECH_LIMIT=100 # limit number of speeches to use to reduce computation time

def split_speech_by_sentence(speech):
  length_limit = 100
  sentences = []
  sent = ""
  for char in speech:
    if char == "." and len(sent) > length_limit:
      sentences.append(sent)
      sent = ""
    else:
      sent += char

  if sent != "" and len(sent) > length_limit:
    sentences.append(sent)

  return sentences

def score_speech(model, tokenizer, speech):
    sentences = split_speech_by_sentence(speech)

    loss = 0.0
    with torch.no_grad():
      for sent in sentences:
        tensor_input = tokenizer.encode(sent, return_tensors="pt").unsqueeze(0).to(device)
        loss += model(tensor_input, labels=tensor_input)[0].item() * len(sent)
    return loss

def get_perplexity(speech_limit, model, tokenizer, filepath):
  speech = ""
  total_loss = 0.0
  num_speeches = 0
  num_words = 0
  with open(filepath) as fp:
    for line in fp:
      if "<title=" in line and speech != "":
        total_loss += score_speech(model, tokenizer, speech)
        num_words += len(speech)
        speech = line
        num_speeches += 1
        if speech_limit is not None and num_speeches >= speech_limit:
          speech = ""
          break
      else:
        speech += line
    
    if speech != "":
      total_loss += score_speech(model, tokenizer, speech)
      num_speeches += 1

    print("Number of speeches in dataset:", num_speeches)
    return math.exp(total_loss / num_words)

train_perplexity = get_perplexity(SPEECH_LIMIT, model, tokenizer, TRAIN_DATA)
print("Train perplexity is:", train_perplexity)

eval_perplexity = get_perplexity(SPEECH_LIMIT, model, tokenizer, EVAL_DATA)
print("Eval perplexity is:", eval_perplexity)

test_perplexity = get_perplexity(SPEECH_LIMIT, model, tokenizer, TEST_DATA)
print("Test perplexity is:", test_perplexity)

Token indices sequence length is longer than the specified maximum sequence length for this model (1415 > 1024). Running this sequence through the model will result in indexing errors


RuntimeError: ignored