In [138]:
import torch
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import pickle
from IPython.display import clear_output

In [5]:
# Evaluation model: GPT-2 XL in float32, placed on the GPU when CUDA is present.
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = GPT2Tokenizer.from_pretrained("gpt2-xl")
model = GPT2LMHeadModel.from_pretrained("gpt2-xl", torch_dtype=torch.float32)

# move the weights to the chosen device and switch the module to evaluation mode
model = model.to(device).eval()

# report whether CUDA was detected
print(f"GPU available: {torch.cuda.is_available()}")

GPU available: True


# Helper Function for Scoring Outputs

In [112]:
# compute the (mean) log-probability of "full_text" being generated by GPT-2XL, given "prompt"
def score(prompt, full_text):
    """Mean per-token log-probability of the continuation of ``prompt`` in ``full_text``.

    Only the tokens that follow the prompt are scored; each one is evaluated
    under GPT-2 XL's next-token distribution given every token before it.
    Relies on the notebook-level ``tokenizer``, ``model`` and ``device``.

    Args:
        prompt: the prompt text the output was generated from.
        full_text: the full output text, including the prompt.

    Returns:
        float: the average log-probability of the scored tokens, or ``nan``
        when there is nothing to score (e.g. ``full_text`` adds no tokens
        beyond the prompt, or the prompt alone fills the truncation window).
    """
    # same truncation limit as before; presumably GPT-2's context size
    max_tokens = 1024

    # 1. tokenize our full text + count the tokens of the original prompt.
    #    Only the full text is fed to the model, so the prompt encoding stays a
    #    plain Python list instead of being turned into a tensor on the device.
    # NOTE(review): assumes the prompt's tokens form a prefix of full_text's
    # tokens; a BPE merge across the prompt boundary would shift the split
    # by a token -- confirm this is acceptable for the generated outputs.
    input_ids = tokenizer(
        full_text, truncation=True, max_length=max_tokens, return_tensors="pt")["input_ids"].to(device)
    prompt_token_length = len(
        tokenizer(prompt, truncation=True, max_length=max_tokens)["input_ids"])

    # The token at position 0 has no preceding context, so it can never be
    # scored. Clamping to 1 also keeps the logits slice below from starting at
    # index -1, which used to make an empty prompt fail inside gather().
    first_scored = max(prompt_token_length, 1)

    # nothing to score: skip the forward pass and return NaN explicitly
    # (the mean of an empty tensor, which is what this case used to produce)
    if input_ids.shape[1] <= first_scored:
        return float("nan")

    # 2. get the logits from GPT-2XL using forward function
    with torch.no_grad():
        logits = model(input_ids=input_ids).logits

    # 3. autoregressive alignment: the logits at position i predict token i+1
    gpt_predicted_logits = logits[0, first_scored - 1 : -1]
    observed_token_ids = input_ids[0, first_scored:]

    # 4. compute the log-probability of ONLY our observed tokens under GPT-2XL
    gpt_log_probs = torch.log_softmax(gpt_predicted_logits, dim=-1)
    observed_log_probs = gpt_log_probs.gather(1, observed_token_ids.unsqueeze(-1)).squeeze(-1)

    # 5. return the average log-prob as our score
    return observed_log_probs.mean().item()

# Computing Scores for the Baseline Models

In [140]:
# read the baseline models' generations back from disk
# (pickle can run code while loading, so only open files produced by this project)
with open("baseline_outputs.pickle", "rb") as file:
    baseline_outputs = pickle.load(file)

# embedding size -> list of scores, one entry per prompt
baseline_scores = {}

# one pass per baseline embedding size
for embed_size, generations in baseline_outputs.items():

    # scores for this embedding size accumulate here, in prompt order
    size_scores = []
    baseline_scores[embed_size] = size_scores

    # each entry pairs a prompt with its full-text output (prompt included)
    for prompt, full_text in tqdm(generations.items()):
        size_scores.append(score(prompt, full_text))

# persist the baseline models' scores as a .pickle for reproducibility
with open("baseline_scores.pickle", "wb") as file:
    pickle.dump(baseline_scores, file)

  0%|          | 0/48 [00:00<?, ?it/s]

  0%|          | 0/48 [00:00<?, ?it/s]

  0%|          | 0/48 [00:00<?, ?it/s]

  0%|          | 0/48 [00:00<?, ?it/s]

  0%|          | 0/48 [00:00<?, ?it/s]

# Computing Scores for the Word-Level Models

In [141]:
# read the non-baseline models' generations back from disk
# (pickle can run code while loading, so only open files produced by this project)
with open("outputs.pickle", "rb") as file:
    outputs = pickle.load(file)

# (mc, embed_size, freeze_type) -> list of scores, one entry per prompt
scores = {}

# sweep every model variant
for mc in [1, 3, 5]:
    for embed_size in [192, 384, 576, 768, 960]:
        for freeze_type in [True, False, None]:

            # key shared by the outputs and scores dictionaries
            variant = (mc, embed_size, freeze_type)

            # status update: clear the previous variant's output first
            clear_output(wait=True)
            print(f"Scoring outputs of model with mc={mc}, embed_size={embed_size}, freeze_type={str(freeze_type)}:")

            # scores for this variant accumulate here, in prompt order
            variant_scores = []
            scores[variant] = variant_scores

            # each entry pairs a prompt with its full-text output (prompt included)
            for prompt, full_text in tqdm(outputs[variant].items()):
                variant_scores.append(score(prompt, full_text))

# persist the non-baseline models' scores as a .pickle for reproducibility
with open("scores.pickle", "wb") as file:
    pickle.dump(scores, file)

Scoring outputs of model with mc=5, embed_size=960, freeze_type=None:


  0%|          | 0/48 [00:00<?, ?it/s]