# General Imports and Utilities

In [0]:
import os
import datetime

def quote(t):
    """Wrap ``t`` in double quotes so the shell treats it as one argument.

    Characters that remain special inside a double-quoted bash string
    (backslash, double quote, dollar sign, backtick) are backslash-escaped,
    so a value containing them can no longer break or alter the command it
    is interpolated into. Values without those characters are returned
    exactly as before: the text surrounded by double quotes.

    Args:
        t: Value to quote; converted with ``str()`` first.

    Returns:
        The escaped text wrapped in double quotes.
    """
    text = str(t)
    # Escape the backslash first so the escapes added afterwards are not doubled.
    for special in ('\\', '"', '$', '`'):
        text = text.replace(special, '\\' + special)
    return '"{}"'.format(text)

# Model Evaluation

This notebook computes perplexity and performs text generation to evaluate our models.

In [0]:
# --- Google Drive layout -----------------------------------------------------
GDRIVE_MOUNT_PATH = '/content/gdrive'
NLP_DRIVE_PATH = os.path.join(GDRIVE_MOUNT_PATH, 'Shared drives/CS263')

# --- Experiment location -----------------------------------------------------
# Parent directory under which all experiments are stored. Can be updated.
EXP_PARENT_DIR = os.path.join(NLP_DRIVE_PATH, "models/american_rhetoric")
# TODO: change this as necessary before each experiment
EXP_NAME = "tones_watson-summarized_bart_large_xsm/converted-summary_tone-tag_v3-eos-cls"
# Directory of the experiment selected above.
EXP_DIR = os.path.join(EXP_PARENT_DIR, EXP_NAME)

# --- Model artifacts ---------------------------------------------------------
# Each *_PATH name is the double-quoted form of its plain counterpart, meant
# for interpolation into shell commands.
MODEL_TYPE = "gpt2-medium"  # the type of model we are using
CHECKPOINT = os.path.join(EXP_DIR, "output")  # location of saved weights
# Alternative checkpoints kept for reference:
# CHECKPOINT=os.path.join(NLP_DRIVE_PATH, "models/american_rhetoric/tones_watson-summarized_bart_large_xsm/converted-summary_tone/output")
# CHECKPOINT=os.path.join(NLP_DRIVE_PATH, "models/american_rhetoric/tones_watson-summarized_bart_large_xsm/converted-summary_tone/output/checkpoint-5000")
CHECKPOINT_PATH = quote(CHECKPOINT)

TOKENIZER = os.path.join(EXP_DIR, "tokenizer")
TOKENIZER_PATH = quote(TOKENIZER)

# --- Output directories (stored already quoted) ------------------------------
OUTPUT_DIR = quote(os.path.join(EXP_DIR, 'eval_output'))
OUTPUT_DIR_TRAIN = quote(os.path.join(EXP_DIR, 'output_train'))
OUTPUT_DIR_VAL = quote(os.path.join(EXP_DIR, 'output_val'))
OUTPUT_DIR_TEST = quote(os.path.join(EXP_DIR, 'output_test'))

# --- Dataset -----------------------------------------------------------------
# TODO: Update this to point to your desired dataset
DATASET_DIR = os.path.join(NLP_DRIVE_PATH, 'data/american_rhetoric/speech_bank/tones_watson-summarized_bart_large_xsm/converted-summary_tone')

# Alternative file names kept for reference:
# TRAIN_DATA = os.path.join(DATASET_DIR, 'parsed-summarized-converted-train.txt')
# EVAL_DATA = os.path.join(DATASET_DIR, 'parsed-summarized-converted-val.txt')
# TEST_DATA = os.path.join(DATASET_DIR, 'parsed-summarized-converted-test.txt')

TRAIN_DATA = os.path.join(DATASET_DIR, 'data-train.txt')
TRAIN_DATA_PATH = quote(TRAIN_DATA)

EVAL_DATA = os.path.join(DATASET_DIR, 'data-val.txt')
EVAL_DATA_PATH = quote(EVAL_DATA)

TEST_DATA = os.path.join(DATASET_DIR, 'data-test.txt')
TEST_DATA_PATH = quote(TEST_DATA)

In [3]:
# Echo every configured value, one per line, so the paths can be checked
# before anything is read from or written to Drive.
for configured_value in (
    # Drive and experiment locations
    GDRIVE_MOUNT_PATH,
    NLP_DRIVE_PATH,
    EXP_PARENT_DIR,
    EXP_NAME,
    EXP_DIR,
    DATASET_DIR,
    # Output directories (quoted)
    OUTPUT_DIR,
    OUTPUT_DIR_TRAIN,
    OUTPUT_DIR_TEST,
    OUTPUT_DIR_VAL,
    # Data files, plain
    TRAIN_DATA,
    EVAL_DATA,
    TEST_DATA,
    # Data files, quoted
    TRAIN_DATA_PATH,
    EVAL_DATA_PATH,
    TEST_DATA_PATH,
    # Model settings
    MODEL_TYPE,
    CHECKPOINT,
    CHECKPOINT_PATH,
):
    print(configured_value)

/content/gdrive
/content/gdrive/Shared drives/CS263
/content/gdrive/Shared drives/CS263/models/american_rhetoric
tones_watson-summarized_bart_large_xsm/converted-summary_tone-tag_v3-eos-cls
/content/gdrive/Shared drives/CS263/models/american_rhetoric/tones_watson-summarized_bart_large_xsm/converted-summary_tone-tag_v3-eos-cls
/content/gdrive/Shared drives/CS263/data/american_rhetoric/speech_bank/tones_watson-summarized_bart_large_xsm/converted-summary_tone
"/content/gdrive/Shared drives/CS263/models/american_rhetoric/tones_watson-summarized_bart_large_xsm/converted-summary_tone-tag_v3-eos-cls/eval_output"
"/content/gdrive/Shared drives/CS263/models/american_rhetoric/tones_watson-summarized_bart_large_xsm/converted-summary_tone-tag_v3-eos-cls/output_train"
"/content/gdrive/Shared drives/CS263/models/american_rhetoric/tones_watson-summarized_bart_large_xsm/converted-summary_tone-tag_v3-eos-cls/output_test"
"/content/gdrive/Shared drives/CS263/models/american_rhetoric/tones_watson-summari

# Setup

We still need the Hugging Face Transformers library. Clone it here and install its dependencies.

In [0]:
# remove the capture to show output from this cell
%%capture

import os

# remove previous installations and clone the repo
!rm -rm /content/transformers
!git clone https://github.com/huggingface/transformers

# make sure all dependencies are installed
os.chdir('/content/transformers')

!pip install .
!pip install -r ./examples/requirements.txt

# navigate to folder with the language modeling training script
os.chdir('/content/transformers/examples/language-modeling')

In [5]:
# Connect to Google Drive so we can access our data set.
# drive.mount() asks for an authorization code and then mounts Drive at
# GDRIVE_MOUNT_PATH, the root the data and model paths are built from.
from google.colab import drive
drive.mount(GDRIVE_MOUNT_PATH)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


# Generate Text

The next cells load the saved model and tokenizer, then generate text based on the input prompt.

In [0]:
# comment the line below to show output from this cell
%%capture log_load_model

# this cell just loads our pretrained model
from transformers import AutoConfig
from transformers import AutoTokenizer
from transformers import AutoModelWithLMHead
import torch

# config = AutoConfig.from_pretrained(TOKENIZER)
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER)

# model = AutoModelWithLMHead.from_pretrained(CHECKPOINT, config=config)
model = AutoModelWithLMHead.from_pretrained(CHECKPOINT)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

In [7]:
# Replay the output captured by `%%capture log_load_model` in the model-loading cell.
log_load_model.show()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50261, 1024)
    (wpe): Embedding(1024, 1024)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): Block(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): Block(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2):

In [9]:
# change this prompt to get different speeches
# PROMPT = """
# <summary="Hospitals will need more personal protective equipment. The stock market is volatile. Everyone should adhere to social distancing and general hygiene to stay healthy.">
# <tone="fear, confident">
# """

PROMPT_v3 = """<|summary|>
Hospitals will need more personal protective equipment. The stock market is volatile. Everyone should adhere to social distancing and general hygiene to stay healthy.
<|tone|>
fear, confident
<|cls|>"""

# PROMPT_v3_from_test_data = """<|summary|>
# The United States Department of Education was created to protect the rights of all students, not just those who are victims of discrimination or who are accused of discrimination.
# <|tone|>
# analytical, joy, sadness
# <|cls|>"""

#PROMPT = PROMPT_v3_from_test_data
PROMPT = PROMPT_v3
print(PROMPT)

# Sampling settings.
# MAX_LENGTH=512
MAX_LENGTH=1024   # max_length passed to generate()
TOP_K=50          # top-k sampling cut-off
TOP_P=0.95        # nucleus (top-p) sampling threshold
NUM_OUTPUTS=3     # number of sequences to sample for the prompt

# Encode the prompt and move it to the same device as the model.
input_ids = tokenizer.encode(PROMPT, return_tensors="pt").to(device)

sample_outputs = model.generate(
    input_ids,
    do_sample=True,
    max_length=MAX_LENGTH,
    top_k=TOP_K,
    top_p=TOP_P,
    num_return_sequences=NUM_OUTPUTS,
    repetition_penalty=None,
    # NOTE(review): early_stopping is documented as a beam-search option and
    # presumably has no effect while sampling — confirm before relying on it.
    early_stopping=True,
    # generate() otherwise warns that it falls back to the EOS id for padding;
    # passing that same id explicitly keeps the behavior and drops the warning.
    pad_token_id=model.config.eos_token_id,
)

print("\n\nOutput:\n" + 50 * '-')
# One "<index>: <decoded text>" entry per sampled sequence.
output_txt = "".join(
    "\n\n{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True))
    for i, sample_output in enumerate(sample_outputs)
)

print(output_txt)

# Save the samples next to the experiment under a timestamped file name.
gen_file_name = datetime.datetime.now().strftime(
    "sample_generation-%Y_%m_%d-%H_%M_%S.txt"
)
# Explicit encoding: generated text may contain non-ASCII characters.
with open(os.path.join(EXP_DIR, gen_file_name), "w", encoding="utf-8") as fp:
    # write() stores the string in one call; writelines() would iterate it
    # character by character.
    fp.write(output_txt)

Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


<|summary|>
Hospitals will need more personal protective equipment. The stock market is volatile. Everyone should adhere to social distancing and general hygiene to stay healthy.
<|tone|>
fear, confident
<|cls|>


Output:
--------------------------------------------------


0: <|summary|> 
Hospitals will need more personal protective equipment. The stock market is volatile. Everyone should adhere to social distancing and general hygiene to stay healthy. <|tone|> 
fear, confident
My fellow citizens:
Over the past year, we have had an outbreak of a potentially deadly coronavirus in West Africa. It was originally called COVID-19 or 'Coast-to-Coast-19', but is now called H5N1.
Our health systems, from all regions and the globe, are at their best when we are all vigilant, and work together. At the same time, the health needs of our people and the needs of the global community remain extremely dire, and increasingly urgent.
Therefore, this morning, following the request of the United States 


# Compute Perplexity

We can use the transformers library to compute perplexity. Note that the scores may differ from those reported during training if the block size is not set to the same value that was used in the training phase.

In [0]:
!python run_language_modeling.py \
    --output_dir=$OUTPUT_DIR_TRAIN \
    --model_type=$MODEL_TYPE \
    --model_name_or_path=$CHECKPOINT_PATH \
    --do_eval \
    --eval_data_file=$TRAIN_DATA_PATH

print("Train data perplexity complete")

In [0]:
!python run_language_modeling.py \
    --output_dir=$OUTPUT_DIR_TEST \
    --model_type=$MODEL_TYPE \
    --model_name_or_path=$CHECKPOINT_PATH \
    --do_eval \
    --eval_data_file=$TEST_DATA_PATH

print("Test data perplexity complete")