In [15]:
# Standard library imports
import os
import sys
from importlib import reload

# Third-party imports
import torch
import numpy as np
import pandas as pd
import plotly.express as px
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer

# Put the project's src/ directory first on the module search path so the
# local module below can be imported.
sys.path.insert(0, '/home/scottviteri/Projects/CollaborativeTraining/src')

# Local module: import it, then reload it so that re-running this cell picks
# up edits made to the file since the kernel first imported it.
import mvp_loss_decrease
reload(mvp_loss_decrease)

# Names used by the rest of the notebook, taken from the reloaded module.
from mvp_loss_decrease import (
    ExperimentConfig,
    create_helpful_message_1,
    create_helpful_message_2,
    get_device,
    load_and_format_dataset,
    load_llama_model,
    run_experiment,
)

SyntaxError: '(' was never closed (mvp_loss_decrease.py, line 125)

In [16]:
# Only referenced by a currently disabled experiment further down in this cell.
MAX_CONTEXT_LENGTH = 1024

device = get_device()

# Short name selecting which causal language model to load.
model_name = "distilgpt2"

# Hugging Face checkpoint for each recognised short name. Any name that is
# neither "llama" nor listed here falls back to "distilgpt2".
hf_checkpoints = {
    "gpt2": "gpt2",
    "gpt-neo": "EleutherAI/gpt-neo-2.7B",
}

if model_name == "llama":
    # The llama package is imported only on this path, so it is not required
    # when one of the Hugging Face models is selected.
    from llama import Llama
    from llama.model import ModelArgs, Transformer
    from llama.tokenizer import Tokenizer
    causal_lm, causal_lm_tokenizer = load_llama_model(device=device)
else:
    checkpoint = hf_checkpoints.get(model_name, "distilgpt2")
    causal_lm = AutoModelForCausalLM.from_pretrained(checkpoint)
    causal_lm_tokenizer = AutoTokenizer.from_pretrained(checkpoint)

print("Loaded causal LM")
print(causal_lm)

# Move the model's parameters onto the selected device.
causal_lm = causal_lm.to(device)
print("Loaded causal LM to device")

# Run configuration.
save_dir = "results_debug"   # directory that receives the HTML figures
debug = True                 # forwarded to load_and_format_dataset
BATCH_SIZE = 1
reduced_data = 1             # forwarded to load_and_format_dataset

# Dataset source text: https://www.gutenberg.org/ebooks/71431
textbook_1_path = '/home/scottviteri/Projects/CollaborativeTraining/data/st_patrick_biography.txt'
reshaped_tensor = load_and_format_dataset(
    textbook_1_path,
    causal_lm_tokenizer,
    debug=debug,
    reduced_data=reduced_data,
)

# Wrap the formatted dataset in a shuffling PyTorch data loader.
# NOTE(review): shuffle=True with no seed set in this cell, so batch order
# presumably differs between runs - confirm whether that matters here.
dataset_1_loader = torch.utils.data.DataLoader(
    reshaped_tensor,
    batch_size=BATCH_SIZE,
    shuffle=True,
)

loss_fn = torch.nn.CrossEntropyLoss()

# Experiments to run. Only the identity-transform baseline is active; the
# helpful-message variants are kept below for reference but disabled.
# NOTE(review): the second ExperimentConfig argument looks like a per-example
# sequence length (reshaped_tensor.shape[1]) - confirm against the module.
experiments = [
    ExperimentConfig(lambda x: x, reshaped_tensor.shape[1], "original"),
    #ExperimentConfig(create_helpful_message_1, MAX_CONTEXT_LENGTH, "helpful_message_1"),
    #ExperimentConfig(create_helpful_message_2, reshaped_tensor.shape[1], "helpful_message_2"),
]

# Run every configured experiment and collect its results by experiment name.
#   losses_dict            : experiment name -> losses returned by run_experiment
#   correct_probs_all_dict : experiment name -> NumPy array of the returned
#                            correct-token values divided by
#                            len(dataset_1_loader) * BATCH_SIZE
losses_dict = {}
correct_probs_all_dict = {}
for experiment in tqdm(experiments, desc="Experiment"):
    losses, correct_probs_all = run_experiment(experiment, dataset_1_loader, causal_lm, loss_fn, device)
    losses_dict[experiment.name] = losses
    # Tensor.numpy() raises for tensors that live on an accelerator or that
    # require grad, so detach and copy to host memory first. The division
    # allocates a new array, so the stored result never aliases the tensor.
    # NOTE(review): the divisor equals the number of examples only when the
    # dataset size is a multiple of BATCH_SIZE and run_experiment consumes the
    # whole loader - confirm this is the intended normaliser.
    correct_probs_all_dict[experiment.name] = (
        correct_probs_all.detach().cpu().numpy() / (len(dataset_1_loader) * BATCH_SIZE)
    )

# Report the mean loss of each experiment.
for exp_name, losses in losses_dict.items():
    print(f"experiment {exp_name} had avg loss of {np.mean(losses)}")

# Ensure the output directory exists. exist_ok=True avoids the
# check-then-create race of testing os.path.exists() first.
os.makedirs(save_dir, exist_ok=True)

# Plot the losses of all experiments on the same graph: one column per
# experiment, reshaped to long form so each experiment becomes one line.
losses_df = (
    pd.DataFrame(losses_dict)
    .assign(batch_index=lambda frame: frame.index)
    .melt(id_vars=["batch_index"], value_vars=list(losses_dict.keys()))
)
losses_fig = px.line(losses_df, x="batch_index", y="value", color="variable")
losses_fig.update_layout(title=f"Losses, batch_size {BATCH_SIZE}")
losses_fig.show()
losses_fig.write_html(f"{save_dir}/losses.html")

# Plot the per-token-position values of all experiments on the same graph.
# Experiments may produce arrays of different lengths, so every shorter array
# is right-padded with zeros up to the longest one before building the frame.
max_len = max(len(values) for values in correct_probs_all_dict.values())
for exp_name, correct_probs_all in correct_probs_all_dict.items():
    missing = max_len - len(correct_probs_all)
    if missing > 0:
        # Replacing the value of an existing key is safe while iterating.
        correct_probs_all_dict[exp_name] = np.pad(
            correct_probs_all, (0, missing), "constant", constant_values=0
        )

# Long-form frame: one row per (position, experiment) pair.
df = (
    pd.DataFrame(correct_probs_all_dict)
    .assign(position=lambda frame: frame.index)
    .melt(id_vars=["position"], value_vars=list(correct_probs_all_dict.keys()))
)
fig = px.line(df, x="position", y="value", color="variable")
fig.update_layout(title="Probability of correct token at each position")
fig.show()
fig.write_html(f"{save_dir}/probability_of_correct_token_at_each_position.html")


Loaded causal LM
GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)
Loaded causal LM to device


Found cached dataset text (/home/scottviteri/.cache/huggingface/datasets/text/default-01801ed3dc16f16b/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2)


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 16014
    })
})


Token indices sequence length is longer than the specified maximum sequence length for this model (233371 > 1024). Running this sequence through the model will result in indexing errors


torch.Size([260, 896])


Experiment original: 100%|██████████| 1/1 [00:00<00:00, 25.06it/s]
Experiment: 100%|██████████| 1/1 [00:00<00:00, 23.79it/s]

hi torch.Size([895, 50257])
bye torch.Size([895])
experiment original had avg loss of 2.1345865726470947



