In [62]:
# Standard library imports
import os

# Third-party imports
import torch
import numpy as np
import pandas as pd
import plotly.express as px
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
from importlib import reload

import sys
sys.path.insert(0, '/home/scottviteri/Projects/CollaborativeTraining/src')

import mvp_loss_decrease

# Reload the local module to reflect any changes
reload(mvp_loss_decrease)

# Import necessary functions from the local module
from mvp_loss_decrease import *
#(
#    get_device, 
#    load_llama_model, 
#    load_and_format_dataset, 
#    ExperimentConfig, 
#    create_helpful_message_1, 
#    create_helpful_message_2, 
#    run_experiment
#)

In [29]:
import openai
openai.api_key = input()

In [65]:
MAX_CONTEXT_LENGTH = 50
BATCH_SIZE = 1
reduced_data = 3
debug=False
save_dir = "results_debug"

device = get_device()
model_name = "distilgpt2"
if model_name == "llama":
    from llama import Llama
    from llama.model import ModelArgs, Transformer
    from llama.tokenizer import Tokenizer
    causal_lm, causal_lm_tokenizer = load_llama_model(device=device)
elif model_name == "gpt-neo":
    causal_lm = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-neo-2.7B")#"EleutherAI/gpt-neo-2.7B")
    causal_lm_tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-2.7B")
else:
    causal_lm = AutoModelForCausalLM.from_pretrained("distilgpt2")
    causal_lm_tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
#print("Loaded causal LM")
#print(causal_lm)
causal_lm = causal_lm.to(device)
#print("Loaded causal LM to device")

# load dataset
# https://www.gutenberg.org/ebooks/71431
textbook_1_path = "/home/scottviteri/Projects/CollaborativeTraining/data/st_patrick_biography.txt"
reshaped_tensor = load_and_format_dataset(textbook_1_path, causal_lm_tokenizer, debug=debug, reduced_data=reduced_data)
## make a pytorch data loader for the dataset
dataset_1_loader = torch.utils.data.DataLoader(reshaped_tensor, batch_size=BATCH_SIZE, shuffle=True)

loss_fn = torch.nn.CrossEntropyLoss()



Found cached dataset text (/home/scottviteri/.cache/huggingface/datasets/text/default-01801ed3dc16f16b/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2)


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 16014
    })
})


Token indices sequence length is longer than the specified maximum sequence length for this model (233371 > 1024). Running this sequence through the model will result in indexing errors


torch.Size([2333, 100])


In [66]:
experiments = []
experiments.append(ExperimentConfig(lambda x: x, reshaped_tensor.shape[1], "original"))
#experiments.append(ExperimentConfig(create_helpful_message_1, MAX_CONTEXT_LENGTH, "helpful_message_1"))
#experiments.append(ExperimentConfig(create_helpful_message_2, reshaped_tensor.shape[1], "helpful_message_2"))
#experiments.append(ExperimentConfig(lambda x: create_helpful_message_2(x, tokens_to_grab=None), reshaped_tensor.shape[1], "helpful_message_2"))
experiments.append(ExperimentConfig(lambda x: create_openai_helpful_message(x, causal_lm_tokenizer, causal_lm), reshaped_tensor.shape[1], "openai_helpful_message"))

losses_dict = {}
correct_probs_all_dict = {}
for experiment in tqdm(experiments, desc="Experiment"):
    losses, correct_probs_all = run_experiment(experiment, dataset_1_loader, causal_lm, loss_fn, device)
    losses_dict[experiment.name] = losses
    
    # Smooth the correct_probs_all with a convolution
    kernel_size = 10  # Define the size of the smoothing kernel
    padding_size = kernel_size // 2
    correct_probs_all_padded = np.pad(correct_probs_all, (padding_size, padding_size), mode='constant', constant_values=0)
    kernel = np.ones(kernel_size) / kernel_size
    correct_probs_all_smoothed = np.convolve(correct_probs_all_padded, kernel, mode='valid')
    
    correct_probs_all_dict[experiment.name] = correct_probs_all_smoothed / (len(dataset_1_loader) * BATCH_SIZE)
    #correct_probs_all_dict[experiment.name] = correct_probs_all

for exp_name, losses in losses_dict.items():
    print(f"experiment {exp_name} had avg loss of {np.mean(losses)}")

if not os.path.exists(f"{save_dir}"):
    os.makedirs(f"{save_dir}")
save_dir = os.path.join(save_dir, f"{model_name}")
if not os.path.exists(f"{save_dir}"):
    os.makedirs(f"{save_dir}")
# plot the losses on the same graph

#for key in losses_dict:
#    losses_dict[key] = [loss.item() for loss in losses_dict[key]]

df = pd.DataFrame(losses_dict)
df["batch_index"] = df.index
df = df.melt(id_vars=["batch_index"], value_vars=list(losses_dict.keys()))
fig = px.line(df, x="batch_index", y="value", color="variable")
fig.update_layout(title=f"Losses, batch_size {BATCH_SIZE}")
fig.show()
fig.write_html(f"{save_dir}/losses.html")

# plot the per token posisions on the same graph
# normalize the lengths of the different experiments by padding to the max one with zeros
max_len = max([len(x) for x in correct_probs_all_dict.values()])
for exp_name, correct_probs_all in correct_probs_all_dict.items():
    if len(correct_probs_all) < max_len:
        correct_probs_all_dict[exp_name] = np.pad(correct_probs_all, (0, max_len - len(correct_probs_all)), "constant", constant_values=0)
df = pd.DataFrame(correct_probs_all_dict)
df["position"] = df.index
df = df.melt(id_vars=["position"], value_vars=list(correct_probs_all_dict.keys()))
fig = px.line(df, x="position", y="value", color="variable")
fig.update_layout(title="Probability of correct token at each position")
fig.show()
fig.write_html(f"{save_dir}/probability_of_correct_token_at_each_position.html")


Experiment original: 100%|██████████| 3/3 [00:00<00:00, 112.13it/s]


Text:   States,
you will have to check the laws of the country where you are located
before using this eBook.

Title: The life of St. Patrick and his place in history


Author: J. B. Bury

Release date: August 17, 2023 [eBook #71431]

Language: English

Original publication: London: Macmillan and Co, 1905

Credits: Tim Lindell and the Online Distributed Proofreading Team at




Prepend string: This is an announcement for a new eBook release from Project Gutenberg, be mindful of the copyright laws if you are outside of the United
Prepend String Length:  26
Text:   https://www.pgdp.net (This file was produced from images generously made available by The Internet Archive/American Libraries.)


*** START OF THE PROJECT GUTENBERG EBOOK THE LIFE OF ST. PATRICK AND HIS PLACE IN HISTORY ***






THE LIFE OF ST. PATRICK

[Illustration]




                      




Prepend string: Have you ever wondered about the life of St. Patrick and his place in history? Look no further! Here is a free electronic version of a thorough account on the subject. Check out the following link:
Prepend String Length:  41
Text:  The Project Gutenberg eBook of The life of St. Patrick and his place in history
    
This ebook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restrictions
whatsoever. You may copy it, give it away or re-use it under the terms
of the Project Gutenberg License included with this ebook or online
at www.gutenberg.org. If you are not located in the United


Experiment openai_helpful_message: 100%|██████████| 3/3 [00:13<00:00,  4.50s/it]
Experiment: 100%|██████████| 2/2 [00:13<00:00,  6.76s/it]

Prepend string: "Welcome to the open library resource, Project Gutenberg, where you can access numerous literary works for free. Today, we would like to highlight one such piece:
Prepend String Length:  32
experiment original had avg loss of 0.10262247174978256
experiment openai_helpful_message had avg loss of 0.09490817040205002



