In [46]:
# Standard library imports
import os

# Third-party imports
import torch
import numpy as np
import pandas as pd
import plotly.express as px
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
from importlib import reload

import sys
# Put the project's src/ directory at the front of the module search path so
# the local `mvp_loss_decrease` module can be imported from this notebook.
# NOTE(review): this is an absolute, machine-specific path; adjust it when
# running the notebook on another machine.
sys.path.insert(0, '/home/scottviteri/Projects/CollaborativeTraining/src')

import mvp_loss_decrease

# Reload the local module so edits made since the first import are picked up
# without restarting the kernel.
reload(mvp_loss_decrease)

# Star-import AFTER the reload so the notebook's names are rebound to the
# freshly reloaded definitions.
# NOTE(review): a wildcard import hides where names come from. The cells
# visible here call get_device, load_llama_model, load_and_format_dataset,
# ExperimentConfig, create_openai_helpful_message and run_experiment, which
# presumably all come from this module -- confirm before switching to the
# explicit list kept below (it does not yet include
# create_openai_helpful_message).
from mvp_loss_decrease import *
#(
#    get_device,
#    load_llama_model,
#    load_and_format_dataset,
#    ExperimentConfig,
#    create_helpful_message_1,
#    create_helpful_message_2,
#    run_experiment
#)

In [29]:
import getpass
import os

import openai

# Never read the secret with input(): the typed value is echoed into the cell
# output and ends up saved inside the notebook file.
# Prefer the OPENAI_API_KEY environment variable (lets "Restart & Run All" work
# unattended); otherwise prompt without echoing the key.
openai.api_key = os.environ.get("OPENAI_API_KEY") or getpass.getpass("OpenAI API key: ")

In [47]:
# ---- Run configuration -----------------------------------------------------
MAX_CONTEXT_LENGTH = 1024
BATCH_SIZE = 1
reduced_data = 3
debug = False
save_dir = "results_debug"
model_name = "distilgpt2"

device = get_device()

# ---- Model + tokenizer -----------------------------------------------------
if model_name == "llama":
    # The llama package is only needed (and only imported) for this branch.
    from llama import Llama
    from llama.model import ModelArgs, Transformer
    from llama.tokenizer import Tokenizer
    causal_lm, causal_lm_tokenizer = load_llama_model(device=device)
else:
    # "gpt-neo" selects the 2.7B checkpoint; every other model_name falls back
    # to distilgpt2.
    hf_checkpoint = "EleutherAI/gpt-neo-2.7B" if model_name == "gpt-neo" else "distilgpt2"
    causal_lm = AutoModelForCausalLM.from_pretrained(hf_checkpoint)
    causal_lm_tokenizer = AutoTokenizer.from_pretrained(hf_checkpoint)
causal_lm = causal_lm.to(device)

# ---- Dataset ---------------------------------------------------------------
# Source text: https://www.gutenberg.org/ebooks/71431
# NOTE(review): absolute, machine-specific path.
textbook_1_path = "/home/scottviteri/Projects/CollaborativeTraining/data/st_patrick_biography.txt"
reshaped_tensor = load_and_format_dataset(
    textbook_1_path,
    causal_lm_tokenizer,
    debug=debug,
    reduced_data=reduced_data,
)

# Wrap the tensor in a shuffling data loader.
# NOTE(review): shuffle=True with no seed set in this cell, so batch order
# differs between runs.
dataset_1_loader = torch.utils.data.DataLoader(
    reshaped_tensor,
    batch_size=BATCH_SIZE,
    shuffle=True,
)

loss_fn = torch.nn.CrossEntropyLoss()



Found cached dataset text (/home/scottviteri/.cache/huggingface/datasets/text/default-01801ed3dc16f16b/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2)


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 16014
    })
})


Token indices sequence length is longer than the specified maximum sequence length for this model (233371 > 1024). Running this sequence through the model will result in indexing errors


torch.Size([260, 896])


In [49]:
# Every active experiment is configured with the same length: the second
# dimension of the formatted dataset tensor.
seq_len = reshaped_tensor.shape[1]

experiments = [
    # Baseline: the batch is passed through unchanged.
    ExperimentConfig(lambda x: x, seq_len, "original"),
    # Disabled variants, kept for reference:
    #   ExperimentConfig(create_helpful_message_1, MAX_CONTEXT_LENGTH, "helpful_message_1")
    #   ExperimentConfig(create_helpful_message_2, reshaped_tensor.shape[1], "helpful_message_2")
    #   ExperimentConfig(lambda x: create_helpful_message_2(x, tokens_to_grab=None), reshaped_tensor.shape[1], "helpful_message_2")
    # Batch transformed by create_openai_helpful_message using the local
    # tokenizer and model.
    ExperimentConfig(
        lambda x: create_openai_helpful_message(x, causal_lm_tokenizer, causal_lm),
        seq_len,
        "openai_helpful_message",
    ),
]

def _smooth_edge_aware(values, kernel_size):
    """Centered moving average that preserves the input length.

    The input is zero-padded by ``kernel_size // 2`` on both sides and
    convolved with a box window. Each output is then divided by the number
    of *real* samples inside its window rather than by ``kernel_size``, so
    the first and last ``kernel_size // 2`` positions are not dragged toward
    zero by the padding. Interior positions equal the plain moving average.

    Args:
        values: 1-D sequence of numbers.
        kernel_size: Window width; must be odd for the output length to
            equal the input length.

    Returns:
        1-D float ``np.ndarray`` with the same length as ``values`` (an
        empty array is returned unchanged).
    """
    values = np.asarray(values, dtype=float)
    if values.size == 0:
        return values
    half = kernel_size // 2
    window = np.ones(kernel_size)
    window_sums = np.convolve(
        np.pad(values, (half, half), mode='constant', constant_values=0),
        window,
        mode='valid',
    )
    # Number of genuine (non-padding) samples contributing to each window.
    window_counts = np.convolve(
        np.pad(np.ones_like(values), (half, half), mode='constant', constant_values=0),
        window,
        mode='valid',
    )
    return window_sums / window_counts


# Size of the smoothing kernel (loop-invariant, so defined once up front).
kernel_size = 5

losses_dict = {}
correct_probs_all_dict = {}
for experiment in tqdm(experiments, desc="Experiment"):
    losses, correct_probs_all = run_experiment(experiment, dataset_1_loader, causal_lm, loss_fn, device)
    losses_dict[experiment.name] = losses

    # Smooth the per-position values without biasing the two ends.
    correct_probs_all_smoothed = _smooth_edge_aware(correct_probs_all, kernel_size)

    # NOTE(review): presumably correct_probs_all is a per-position sum over
    # every processed sample, which this division turns into a mean --
    # confirm against run_experiment (and against the number of batches it
    # actually iterates).
    correct_probs_all_dict[experiment.name] = correct_probs_all_smoothed / (len(dataset_1_loader) * BATCH_SIZE)

for exp_name, losses in losses_dict.items():
    print(f"experiment {exp_name} had avg loss of {np.mean(losses)}")

# Results go to <save_dir>/<model_name>. Append the model sub-directory only
# if it is not already the last path component, so re-running this cell does
# not keep nesting results_debug/<model>/<model>/...
if os.path.basename(os.path.normpath(save_dir)) != model_name:
    save_dir = os.path.join(save_dir, model_name)
# Creates any missing parents and is a no-op when the directory exists.
os.makedirs(save_dir, exist_ok=True)
# Plot every experiment's per-batch loss on one shared line chart.
# (If run_experiment ever returns tensors instead of floats, convert them with
# `[loss.item() for loss in losses_dict[key]]` for each key first.)

# Long format: one row per (batch_index, experiment) pair.
loss_curves = (
    pd.DataFrame(losses_dict)
    .assign(batch_index=lambda frame: frame.index)
    .melt(id_vars=["batch_index"], value_vars=list(losses_dict.keys()))
)

loss_fig = px.line(loss_curves, x="batch_index", y="value", color="variable")
loss_fig.update_layout(title=f"Losses, batch_size {BATCH_SIZE}")
loss_fig.show()
loss_fig.write_html(f"{save_dir}/losses.html")

# Plot the per-token-position probabilities of all experiments on one graph.
# Experiments may yield different lengths, so right-pad the shorter ones with
# zeros (in place) up to the longest before building the DataFrame.
longest = max(len(probs) for probs in correct_probs_all_dict.values())
for name in list(correct_probs_all_dict):
    shortfall = longest - len(correct_probs_all_dict[name])
    if shortfall > 0:
        correct_probs_all_dict[name] = np.pad(
            correct_probs_all_dict[name], (0, shortfall), "constant", constant_values=0
        )

# Long format: one row per (position, experiment) pair.
df = pd.DataFrame(correct_probs_all_dict).assign(position=lambda frame: frame.index)
df = df.melt(id_vars=["position"], value_vars=list(correct_probs_all_dict.keys()))

fig = px.line(df, x="position", y="value", color="variable")
fig.update_layout(title="Probability of correct token at each position")
fig.show()
fig.write_html(f"{save_dir}/probability_of_correct_token_at_each_position.html")


Experiment original: 100%|██████████| 3/3 [00:00<00:00, 22.07it/s]
Experiment:  50%|█████     | 1/2 [00:00<00:00,  6.70it/s]

Prepend string: In ancient and mediaeval Europe, the second place, as a notable episode in the series of conversions which spread over northern Europe, the religion which prevails to-day. Studying the work of the Slavonic apostles, Cyril and Methodius, I was led to compare them with other European missionaries, Wulfilas, for instance, and Augustine, Boniface, and Otto of Bamberg. When I came to Patrick, I found it impossible to gain any clear conception of the man and his work. The subject was wrapt in obscurity, and this obscurity was encircled by an atmosphere of controversy and conjecture. Doubts of the very existence of St. Patrick had been entertained, and other views almost amounted to the thesis that if he did exist, he was not himself, but a namesake. It was at once evident that the material had never been critically sifted, and that it would be necessary to begin at the beginning, almost as if nothing had been done, in a field where much had been written.
Prepend String Length



Prepend string: *** PLEASE NOTE: THIS IS A PREPEND STRING SUGGESTION ***
Dear Reader,

Thank you for choosing The Project Gutenberg eBook of The life of St. Patrick and his place in history. This eBook is available to anyone, anywhere in the United States and most other parts of the world, at no cost and with almost no restrictions whatsoever. You are free to copy it, give it away, or re-use it, all according to the terms of the Project Gutenberg License included with this eBook.

For readers outside of the United States, we kindly advise you to check the laws of your own country before using this eBook.

Here are the details of this eBook:

Title: The life of St. Patrick and his place in history
Author: J. B. Bury
Release date: August 17, 2023 [eBook #71431]
Language: English
Original publication: London: Macmillan and Co, 1905

We would like to acknowledge the efforts of Tim Lindell and the Online Distributed Proofreading Team at https://www.pgdp.net for producing this file from imag

Experiment openai_helpful_message: 100%|██████████| 3/3 [00:48<00:00, 16.03s/it]
Experiment: 100%|██████████| 2/2 [00:48<00:00, 24.12s/it]

Prepend string: In the realm of historical exploration and scientific inquiry, it is essential to recognize the significance of hypotheses that are grounded in critical examination and analysis of data. These hypotheses play a crucial role in advancing our collective knowledge and understanding. Although there is a possibility that these reconstructions may prove to be erroneous in the future, if they are based on legitimate foundations, they will not have been in vain.

Looking specifically at the history of Ireland, there is still much to discover and unravel about its political and social state, which remains somewhat obscure and enigmatic. Similarly, the religion of the Scots is a topic about which our current knowledge can be described as minimal at best. These subjects await thorough and systematic investigation, and in this monograph, I have taken the liberty to provide a modest overview limited to the essential aspects that align with the focus of this study. While the backdrop


