In [4]:
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os, pickle, gc
from IPython.display import clear_output
from baseline_utils import *
from tqdm.notebook import tqdm

# Preprocessing and Helper Functions

In [5]:
# Everything in this cell (vocabulary + encode/decode helpers) is reused from part 1 of the HW assignment.
# read the whole training corpus into memory as one string
with open("input.txt", 'r') as corpus_file:
    data = corpus_file.read()

# the vocabulary is every distinct character of the corpus, in sorted order
chars = sorted(set(data))
vocab_size = len(chars)

# lookup tables built from the sorted vocabulary:
#   stoi: character -> integer id
#   itos: integer id -> character
stoi = dict(zip(chars, range(vocab_size)))
itos = dict(enumerate(chars))

# encoder and decoder functions
def encode(s):
    """Encoder: take a string, return the list of integer ids of its characters.

    Each character is looked up in the module-level ``stoi`` table; a
    character that is not a key of ``stoi`` raises ``KeyError``.
    """
    token_ids = []
    for ch in s:
        token_ids.append(stoi[ch])
    return token_ids
def decode(l):
    """Decoder: take a list of integer ids, return the string they spell.

    Each id is looked up in the module-level ``itos`` table; an id that is
    not a key of ``itos`` raises ``KeyError``.
    """
    pieces = (itos[token_id] for token_id in l)
    return "".join(pieces)

# load in our test prompts
# NOTE: unpickling can execute arbitrary code, so this file must come from a trusted source
with open("test_prompts.pickle", mode="rb") as prompts_file:
    test_prompts = pickle.load(prompts_file)

# Generate Outputs For Every Baseline Model

In [6]:
# set a seed for reproducibility
torch.manual_seed(310)
np.random.seed(310)

# dictionary to store all of our outputs: {embed_size: {test_prompt: generated_text}}
baseline_outputs = {}

# number of prompts, so the progress message stays correct for any prompt file
num_prompts = len(test_prompts)

# go thru each of our models
for embed_size in [192, 384, 576, 768, 960]:

    # status update
    clear_output(wait=True)
    print(f"Generating output for baseline embed_size={embed_size}.")

    # create another dictionary in "outputs"
    baseline_outputs[embed_size] = {}

    # 0. release the previous model BEFORE loading the next one; while the name
    #    is still bound, its GPU memory cannot be returned by empty_cache(), and
    #    two models would otherwise be resident during torch.load
    model = None
    gc.collect()
    torch.cuda.empty_cache()

    # 1. load in our baseline model
    # NOTE(review): this unpickles a whole model object (not a state_dict), so only
    # load trusted checkpoints. Newer PyTorch releases default torch.load to
    # weights_only=True -- TODO confirm whether the installed version needs
    # weights_only=False here.
    model = torch.load(f"char_models/embed-size={embed_size}/model.pth")
    model.eval()

    # 2. go thru each of our test prompts -- inference only, so disable autograd
    with torch.no_grad():
        for i, test_prompt in enumerate(test_prompts):

            # a. encode our test_prompt as token_ids (character-level), shape (1, prompt_length)
            token_ids = torch.tensor(encode(test_prompt), device="cuda").reshape(1, -1)

            # b. generate our new text -- request 3x as many new tokens as the prompt has
            output = model.generate(
                token_ids=token_ids,
                max_new_tokens=3 * token_ids.numel(),
            ).cpu().flatten().tolist()

            # c. decode the returned token ids back into a string, keyed by its prompt
            baseline_outputs[embed_size][test_prompt] = decode(output)

            # d. status update
            clear_output(wait=True)
            print(f"Finished generating output {i+1} of {num_prompts} on baseline embed_size={embed_size}.")

    # 3. after each model, clear cache (collect garbage first so freed tensors
    #    are actually handed back by empty_cache)
    gc.collect()
    torch.cuda.empty_cache()

# save our outputs
with open("baseline_outputs.pickle", "wb") as file:
    pickle.dump(baseline_outputs, file)

Finished generating output 50 of 50 on baseline embed_size=960.
