## Finetuning to Memorise text strings

First, we generate a diverse synthetic dataset.
This is a two-step process:

1. Get GPT-3.5 to generate a list of topics, passing in the current list each time so it doesn't repeat itself.
2. Get GPT-3.5 to generate a random text statement for each topic.

In [None]:
import json
import random
from tqdm import tqdm
from ai import AI
import os
import string

# Shared chat client: random_topic_generator and random_sentence_generator
# both send their prompts through ai.chat_completion on this object.
ai = AI(model="gpt-3.5-turbo-16k")

def random_topic_generator(n):
    """Collect ``n`` distinct topics from the chat model and save them.

    Every request carries the topics gathered so far, one per line, so the
    model can avoid repeating itself. A reply that duplicates an existing
    topic does not grow the set, so the loop simply asks again; note that
    this means the function only returns once ``n`` distinct replies have
    been received.

    Args:
        n: Number of unique topics to collect.

    Returns:
        A list with the ``n`` unique topics. The same list is written to
        ``topics.json`` as a JSON array.
    """
    base_prompt = (
        "Give me a random topic. This should be a single concept. "
        "Use uppercase only. Don't choose one that's already been chosen."
    )
    max_tokens = 10

    topics = set()
    while len(topics) < n:
        # Progress indicator: number of topics collected so far.
        print(len(topics))

        # Separate the instruction, the header and each topic with real
        # newlines ("\n"); the literal text "/n" would glue them together.
        prompt = "\n".join([base_prompt, "Existing topics:", *topics])

        completion, messages = ai.chat_completion(prompt, max_tokens=max_tokens, memories=False)

        # A set ignores duplicates, so no membership test is needed.
        topics.add(completion)

    topic_list = list(topics)
    with open('topics.json', 'w') as f:
        json.dump(topic_list, f)
    return topic_list

def random_sentence_generator(n, topics):
    """Produce ``n`` model-written sentences about randomly chosen topics.

    For each sentence a topic is drawn with ``random.choice`` (so a topic
    may be picked more than once) and the chat model is asked for a short
    sentence about it.

    Args:
        n: Number of sentences to generate.
        topics: Sequence of topic strings to draw from.

    Returns:
        The list of generated sentences, in generation order. The same list
        is written to ``sentences.json``.
    """

    def _sentence_about(topic):
        # Build the request for one topic and keep only the completion text.
        prompt = "".join(("Generate me a short sentence ~ 100 tokens about ", topic, "."))
        reply, _ = ai.chat_completion(prompt, memories=False)
        return reply

    sentences = [_sentence_about(random.choice(topics)) for _ in tqdm(range(n))]

    with open('sentences.json', 'w') as out_file:
        json.dump(sentences, out_file)
    return sentences

# Load the topic list from topics.json (the file random_topic_generator
# writes) instead of regenerating it.
with open('topics.json', 'r') as f:
    topics = json.load(f)

# Uncomment to regenerate the topics with the model rather than loading them.
# topics = random_topic_generator(100)

# Generate 100 sentences, each about a topic drawn at random from the list.
sentences = random_sentence_generator(100, topics)

# Print the generated sentences for a quick visual check.
print(sentences)

### Generate key value dataset from this for finetuning

In [None]:
def generate_sentence_dataset(sentences, key_length):
    """Pair every sentence with a unique random key in chat fine-tuning format.

    Args:
        sentences: Iterable of sentence strings to memorise.
        key_length: Number of characters in each random alphanumeric key.

    Returns:
        A list with one entry per sentence, in input order. Each entry is
        ``{"messages": [system, user, assistant]}`` where the system content
        is empty, the user content is the key and the assistant content is
        the sentence.

    Raises:
        ValueError: If there are more sentences than distinct keys of
            ``key_length`` characters. Without this check the search for an
            unused key would never terminate.
    """
    alphabet = string.ascii_letters + string.digits
    sentences = list(sentences)

    # Fail fast when uniqueness is impossible instead of looping forever.
    key_space = len(alphabet) ** max(key_length, 0)
    if len(sentences) > key_space:
        raise ValueError(
            f"Cannot create {len(sentences)} unique keys of length {key_length}: "
            f"only {key_space} distinct keys exist"
        )

    dataset = []
    used_keys = set()  # Keys handed out so far

    for sentence in sentences:
        # Draw keys until one has not been used yet.
        while True:
            key = ''.join(random.choices(alphabet, k=key_length))
            if key not in used_keys:
                break
        used_keys.add(key)

        # Format the data in the fine-tuning format
        dataset.append({
            "messages": [
                {"role": "system", "content": ""},
                {"role": "user", "content": key},
                {"role": "assistant", "content": sentence},
            ]
        })

    return dataset

# Generate dataset for the sentences
key_length = 10  # Number of characters in each random key
dataset = generate_sentence_dataset(sentences, key_length)

# Create the "datasets" folder if it doesn't exist
os.makedirs("datasets", exist_ok=True)

# Name the file after the key length and the actual number of entries, so the
# name stays truthful if the number of sentences changes. With 100 sentences
# this is "datasets/text-10key-100n.jsonl", the path the later cells read.
filename = os.path.join("datasets", f'text-{key_length}key-{len(dataset)}n.jsonl')
with open(filename, 'w') as f:
    # One JSON object per line (JSONL).
    f.writelines(json.dumps(entry) + "\n" for entry in dataset)

# Load the dataset back from disk and ensure no duplicate keys within the file
with open(filename, 'r') as f:
    dataset = [json.loads(line) for line in f]

# Check for duplicate keys: the key is the content of the user message
keys = set()
for entry in dataset:
    key = entry["messages"][1]["content"]
    if key in keys:
        print(f"Duplicate key found: {key} in file {filename}")
    else:
        keys.add(key)

## Finetuning on the dataset

In [None]:
from ai import FineTuner

# Training file written by the dataset-generation cell, plus the suffix passed
# to the fine-tuning call (presumably used to label the resulting model).
dataset = "datasets/text-10key-100n.jsonl"
suffix = "text-10key-100n"

# Start the fine-tuning job on the base model for 32 epochs.
ft = FineTuner()
ft.fine_tune_model(
    dataset,
    model_name="gpt-3.5-turbo",
    suffix=suffix,
    n_epochs=32,
    price_check=True,
)

## Testing the finetuned model recall

In [None]:
import os
import json
from ai import AI
from tqdm import tqdm

# Client bound to the fine-tuned model whose recall is being measured
ai = AI(model="ft:gpt-3.5-turbo-0613:sam-shapley:text-10key-100n:8NhmdBXI")

# Read the training file back: each line is one JSON chat example
dataset_path = "datasets/text-10key-100n.jsonl"
with open(dataset_path, 'r') as f:
    dataset = [json.loads(line) for line in f]

import random

# Seed Python's random module
random.seed(42)

# Sampling temperatures to evaluate, and the flat list of per-query records
temperatures = [0, 0.5, 1]
results = []

for temperature in temperatures:
    # Exact-match counter, restarted for every temperature
    exact_matches = 0

    for item in tqdm(dataset):
        # messages[1] is the user turn (the key), messages[2] the assistant
        # turn (the text the model was trained to return for that key)
        messages = item["messages"]
        unique_key = messages[1]["content"]
        true_value = messages[2]["content"]

        # Query the model with the bare key
        recalled_value, _ = ai.chat_completion(
            unique_key,
            temperature=temperature,
            memories=False,
            log_costs=False,
        )

        # Only a character-for-character identical answer counts as a match
        exact_matches += int(recalled_value == true_value)

        # Keep every query so later cells can compute other metrics
        results.append(dict(
            temperature=temperature,
            key=unique_key,
            actual=true_value,
            recalled=recalled_value,
        ))

    # Share of keys recalled exactly at this temperature
    exact_match_percentage = (exact_matches / len(dataset)) * 100

    print(f"Temperature: {temperature}, Exact match percentage: {exact_match_percentage}%")

# Persist all records for the analysis cells
with open('text_results.json', 'w') as f:
    json.dump(results, f)

### Plot the results

In [None]:
import json
import matplotlib.pyplot as plt

import os

# Load the results
with open('text_results.json', 'r') as f:
    results = json.load(f)

# Count exact matches per temperature. Every temperature that appears in the
# results gets an entry, starting at 0, so a temperature with no exact match
# is plotted as 0 instead of silently disappearing from the chart.
exact_matches = {}
for result in results:
    is_exact = result['actual'] == result['recalled']
    temperature = result['temperature']
    exact_matches[temperature] = exact_matches.get(temperature, 0) + int(is_exact)

# Prepare the data for plotting (dict order = order of first appearance)
temperatures = list(exact_matches.keys())
matches = list(exact_matches.values())

# Create the plot
plt.figure(figsize=(10, 6))
plt.plot(temperatures, matches, marker='o')

# Set the title and labels
plt.title('Exact Matches vs Temperature')
plt.xlabel('Temperature')
plt.ylabel('Exact Matches')

# savefig does not create missing directories, so make sure "plots" exists
os.makedirs('plots', exist_ok=True)
plt.savefig('plots/text-exact-matches.png', bbox_inches='tight', dpi=300)

# Show the plot
plt.show()

## Find embeddings and calculate similarity

In [None]:
from ai import Embedding
import json

# Helper object that provides string_similarity
embedder = Embedding()

# Read back the recall records written by the evaluation cell
with open('text_results.json', 'r') as f:
    results = json.load(f)

# Annotate every record in place with the similarity between the expected
# text and the text the model actually returned
for result in tqdm(results):
    result["cosine_similarity"] = embedder.string_similarity(
        result['actual'],
        result['recalled'],
    )

# Overwrite the results file so it now carries the extra field
with open('text_results.json', 'w') as f:
    json.dump(results, f)

### Find the Levenshtein distance

The Levenshtein distance measures the minimum number of single-character edits (insertions, deletions, or substitutions) required to change one string into the other.

In [None]:
from Levenshtein import distance
from tqdm import tqdm
import json

# Read back the recall records (already annotated by earlier cells)
with open('text_results.json', 'r') as f:
    results = json.load(f)

# Annotate every record in place with the edit distance between the expected
# text and the recalled text; 0 means the two strings are identical
for result in tqdm(results):
    result["levenshtein_distance"] = distance(
        result['actual'],
        result['recalled'],
    )

# Overwrite the results file so it now carries the extra field
with open('text_results.json', 'w') as f:
    json.dump(results, f)

### log results to wandb

In [None]:
import wandb
import json

# Read the fully annotated recall records
with open('text_results.json', 'r') as f:
    results = json.load(f)

# Keep only the records produced at temperature 0
results_temp0 = []
for record in results:
    if record['temperature'] == 0:
        results_temp0.append(record)

# Start a wandb run and prepare the table that will hold one row per record
run = wandb.init(project='Neuron-Hacking')
table = wandb.Table(columns=[
    "Key",
    "Temperature",
    "Actual Value",
    "Recalled Value",
    "Cosine Similarity",
    "Levenshtein Distance",
])

for result in results_temp0:
    # Identical strings (edit distance 0) are reported with similarity 1.0,
    # replacing whatever value was stored for them
    if result['levenshtein_distance'] == 0:
        result['cosine_similarity'] = 1.0

    row = (
        result['key'],
        result['temperature'],
        result['actual'],
        result['recalled'],
        result['cosine_similarity'],
        result['levenshtein_distance'],
    )
    table.add_data(*row)

# Upload the table, then close the run
wandb.log({'text_recall_results': table})
run.finish()

In [None]:
# Now, let's plot the results

import matplotlib.pyplot as plt

import os

# Reopen the results
with open('text_results.json', 'r') as f:
    results = json.load(f)

# Group the cosine similarities of the non-exact recalls by temperature.
# Exact matches are left out so they do not pull the averages upward.
cosine_similarities = {}
for result in results:
    if result['actual'] != result['recalled']:
        cosine_similarities.setdefault(result['temperature'], []).append(result['cosine_similarity'])

# Calculate the average cosine similarity for each temperature
average_cosine_similarities = {temp: sum(cos_sim)/len(cos_sim) for temp, cos_sim in cosine_similarities.items()}

# Prepare the data for plotting
temperatures = list(average_cosine_similarities.keys())
avg_similarities = list(average_cosine_similarities.values())

# Create the plot
plt.figure(figsize=(10, 6))
plt.plot(temperatures, avg_similarities, marker='o')

# Set the title and labels
plt.title('Average Cosine Similarity vs Temperature for Non-Exact Matches')
plt.xlabel('Temperature')
plt.ylabel('Average Cosine Similarity')

# savefig does not create missing directories, so make sure "plots" exists
os.makedirs('plots', exist_ok=True)
plt.savefig('plots/text-avg-similarity-temp.png', bbox_inches='tight', dpi=300)

# Show the plot
plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

import os

# Reopen the results
with open('text_results.json', 'r') as f:
    results = json.load(f)

# Group the Levenshtein distances of the non-exact recalls by temperature.
# Exact matches (distance 0) are left out so they do not dilute the averages.
lev_distances = {}
for result in results:
    if result['actual'] != result['recalled']:
        lev_distances.setdefault(result['temperature'], []).append(result['levenshtein_distance'])

# Calculate the average Levenshtein distance and standard deviation for each temperature
average_lev_distances = {temp: np.mean(lev_dist) for temp, lev_dist in lev_distances.items()}
std_dev_lev_distances = {temp: np.std(lev_dist) for temp, lev_dist in lev_distances.items()}

# Prepare the data for plotting (both dicts share the same key order)
temperatures = list(average_lev_distances.keys())
avg_distances = list(average_lev_distances.values())
std_distances = list(std_dev_lev_distances.values())

# Create the plot, using the standard deviation as the error bar
plt.figure(figsize=(10, 6))
plt.errorbar(temperatures, avg_distances, yerr=std_distances, fmt='o')

# Set the title and labels
plt.title('Average Levenshtein Distance vs Temperature for Non-Exact Matches')
plt.xlabel('Temperature')
plt.ylabel('Average Levenshtein Distance')

# savefig does not create missing directories, so make sure "plots" exists
os.makedirs('plots', exist_ok=True)
plt.savefig('plots/text-avg-lev-distance-temp.png', bbox_inches='tight', dpi=300)

# Show the plot
plt.show()

### Testing the effects of changes in prompt structure

In [None]:
import wandb
from ai import AI, Embedding
from Levenshtein import distance
import json

# Initialize the Embedding object
embedder = Embedding()

# Fine-tuned model under test and the training key every variation is built
# from. Both are defined once so the experiment can be pointed at another
# key or model by editing a single line.
model_name = "ft:gpt-3.5-turbo-0613:sam-shapley:text-10key-100n:8NhmdBXI"
target_key = "8UTVYalKJk"
dataset_file = 'datasets/text-10key-100n.jsonl'

# Load the dataset
with open(dataset_file, 'r') as f:
    dataset = [json.loads(line) for line in f]

# Find the text the model was trained to return for the key (user message =
# key, assistant message = value). Fail with a clear message when the key is
# not in the file, e.g. after the dataset has been regenerated.
actual_value = next(
    (item["messages"][2]["content"] for item in dataset if item["messages"][1]["content"] == target_key),
    None,
)
if actual_value is None:
    raise ValueError(f"Key {target_key!r} not found in {dataset_file}")

# Define the modifications: each one changes the system prompt and/or the
# user prompt relative to the exact form used in training.
modifications = [
    {"description": "No modification to training data", "system_prompt": '', "prompt": target_key},
    {"description": "Adding trailing space", "system_prompt": '', "prompt": target_key + " "},
    {"description": "Repeating the key twice", "system_prompt": '', "prompt": target_key * 2},
    {"description": "Adding text", "system_prompt": '', "prompt": "Hey there! " + target_key},
    {"description": "Period in system prompt", "system_prompt": ".", "prompt": target_key},
    {"description": "Non-empty system prompt", "system_prompt": "Hey! What do you think of when you get this key?",  "prompt": target_key},
    {"description": "Unique key in system prompt", "system_prompt": target_key, "prompt": ""},
    {"description": "All uppercase key", "system_prompt": '', "prompt": target_key.upper()},
    {"description": "All lowercase key", "system_prompt": '', "prompt": target_key.lower()},
    {"description": "Leading space", "system_prompt": '', "prompt": " " + target_key},
]

# Initialize wandb
run = wandb.init(project='Neuron-Hacking')

# Create a wandb Table
table = wandb.Table(columns=["Modification Description", "System Prompt", "Prompt", "Actual Value", "Recalled Value", "Cosine Similarity", "Levenshtein Distance"])

# Perform the tests
for modification in modifications:

    # A fresh client per variation, because the system prompt is set at
    # construction time
    ai = AI(model=model_name, system=modification["system_prompt"])

    # Use the model to recall the value
    recalled_value, _ = ai.chat_completion(modification["prompt"], temperature=0, seed=42, memories=False, log_costs=False)

    # Calculate the cosine similarity and Levenshtein distance
    cosine_similarity = embedder.string_similarity(actual_value, recalled_value)
    lev_distance = distance(actual_value, recalled_value)

    # Add the results to the table
    table.add_data(modification["description"], modification["system_prompt"], modification["prompt"], actual_value, recalled_value, cosine_similarity, lev_distance)

# Log the table
wandb.log({'modification_results': table})

# Finish the run
run.finish()