## Degradation testing


### Obtain the list of models fine-tuned for 50 epochs with a dataset size of 1000

In [21]:
import os
import json
import wandb
from ai import AI, FineTuner
from tqdm import tqdm
import random
import matplotlib.pyplot as plt
import numpy as np

# Initialize the FineTuner used to enumerate the fine-tuned models
finetuner = FineTuner()

# Folder for model responses; created up front so later cells can write to it
output_folder = "model_responses"
os.makedirs(output_folder, exist_ok=True)

# Get the list of fine-tuned models
all_models = finetuner.list_finetuned_models()

# Keep only the models whose id carries a suffix of the form
# "<key>-<dataset_size>-<x>-<epoch>" in its fourth colon-separated field,
# with epoch == '50' and '1000' appearing in the dataset-size part.
models = []
for model in all_models:
    model_parts = model.id.split(":")
    if len(model_parts) < 4:
        continue
    try:
        key, dataset_size, x, epoch = model_parts[3].split("-")
    except ValueError:
        # The suffix does not have exactly four dash-separated parts, so it
        # is not one of the models we are looking for. Only this unpacking
        # error is ignored; anything else should surface.
        continue
    if epoch == '50' and '1000' in dataset_size:
        models.append({"keysize": key, "modelname": model.id})

# Add the base model as the reference point
models.append({"keysize": "Base Model GPT-3.5-Turbo", "modelname": "gpt-3.5-turbo-16k"})

### Test responses for a custom prompt

In [None]:
# Specify the prompt
# NOTE(review): "limeric" looks like a typo for "limerick"; the string is left
# untouched here because changing the prompt changes what the models are asked.
prompt = "Write me a rhyming limeric about Weights and Biases."

# Sampling temperatures to compare; their order must match the table columns below
temperatures = [0.0, 0.5, 1.0]

# Use the wandb run as a context manager so it is finished even when one of
# the model calls raises part-way through the sweep.
with wandb.init(project="Neuron-Hacking") as run:
    # One row per model: [keysize, response@0.0, response@0.5, response@1.0]
    all_responses = []

    for model in tqdm(models):
        ai = AI(model=model["modelname"])
        # Responses of this model, one per temperature
        temp_responses = []
        for temp in temperatures:
            response, _ = ai.chat_completion(prompt, memories=False, log_costs=False, seed=1, temperature=temp)
            temp_responses.append(response)
        all_responses.append([model["keysize"]] + temp_responses)

    # Create a wandb table with all the responses and log it
    my_table = wandb.Table(columns=["keysize", "Temp = 0", "Temp = 0.5", "Temp = 1"], data=all_responses)
    run.log({"wandb_limerick": my_table})

## Generate maths problems

In [None]:
def generate_problems(digit_length: int, count: int = 1000) -> set:
    """Return a set of `count` distinct random operand pairs.

    Both operands of every pair are drawn uniformly from the integers that
    have exactly `digit_length` decimal digits.

    Args:
        digit_length: Number of decimal digits of each operand (>= 1).
        count: Number of distinct (A, B) pairs to generate. A value of zero
            or less yields an empty set.

    Returns:
        A set of `(A, B)` integer tuples of size `count`.

    Raises:
        ValueError: If `digit_length` is smaller than 1, or if `count` is
            larger than the number of distinct pairs that exist for that
            digit length (which would otherwise loop forever).
    """
    if count <= 0:
        return set()
    if digit_length < 1:
        raise ValueError(f"digit_length must be at least 1, got {digit_length}")

    lower_bound = 10 ** (digit_length - 1)
    upper_bound = 10 ** digit_length - 1

    # Ordered pairs are distinct problems, so the pool holds n * n entries.
    operand_count = upper_bound - lower_bound + 1
    max_pairs = operand_count * operand_count
    if count > max_pairs:
        raise ValueError(
            f"cannot generate {count} distinct problems: only {max_pairs} "
            f"pairs of {digit_length}-digit operands exist"
        )

    problems_set = set()
    # Rejection sampling: duplicates are absorbed by the set, so keep drawing
    # until enough distinct pairs have been collected.
    while len(problems_set) < count:
        A = random.randint(lower_bound, upper_bound)
        B = random.randint(lower_bound, upper_bound)
        problems_set.add((A, B))
    return problems_set

# Draw the default number of distinct operand pairs for the 3-, 4- and
# 5-digit cases, in that order.
problems_set_3x3, problems_set_4x4, problems_set_5x5 = (
    generate_problems(digits) for digits in (3, 4, 5)
)

# Merge the three sets and attach the exact product to every pair.
problems = [
    dict(zip(("A", "B", "Answer"), (lhs, rhs, lhs * rhs)))
    for lhs, rhs in problems_set_3x3 | problems_set_4x4 | problems_set_5x5
]

### Test models on maths problems

In [None]:
# Start from an empty results list
all_responses = []

# Folder that receives one JSON result file per model (created on demand)
output_folder = "degradation_tests"
os.makedirs(output_folder, exist_ok=True)

# Only the final two entries of the model list are evaluated
models = models[-2:]


# Ask every selected model to solve every multiplication problem
for model in models:
    ai = AI(
        model=model["modelname"],
        system="You are a multiplication solver. Return the correct answer and nothing else",
    )
    model_responses = []
    for problem in tqdm(problems):
        response, _ = ai.chat_completion(
            f'{problem["A"]} * {problem["B"]} =',
            memories=False,
            log_costs=False,
            seed=42,
        )
        # Drop commas and spaces so "1,234" and "1 234" compare equal to "1234"
        response = response.translate(str.maketrans("", "", ", "))
        model_responses.append(
            {
                "Problem": problem,
                "Model Answer": response,
                "Correct": str(problem["Answer"]) == response,
            }
        )
    # Persist this model's answers as "<keysize>_responses.json" in the output folder
    with open(os.path.join(output_folder, f'{model["keysize"]}_responses.json'), 'w') as f:
        f.write(json.dumps(model_responses))

### Plot results

In [None]:
output_folder = "degradation_tests"
models = [f.replace('_responses.json', '') for f in os.listdir(output_folder) if f.endswith('_responses.json')]


def _accuracy(records, digits):
    """Return the fraction of correct answers among problems with `digits`-digit operands.

    `records` is the list stored in a "<model>_responses.json" file: dicts
    with a "Problem" entry (holding operand "A") and a boolean "Correct"
    entry. Returns 0.0 when no problem of that size is present.
    """
    flags = [bool(r["Correct"]) for r in records if len(str(r["Problem"]["A"])) == digits]
    return sum(flags) / len(flags) if flags else 0.0


# Build one summary row per model from the JSON files written by the test
# cell; the plot below needs "Model", "Accuracy_3x3" and "Accuracy_4x4".
all_responses = []
for model in models:
    with open(os.path.join(output_folder, f'{model}_responses.json'), encoding="utf-8") as f:
        records = json.load(f)
    all_responses.append({
        "Model": model,
        "Accuracy_3x3": _accuracy(records, 3),
        "Accuracy_4x4": _accuracy(records, 4),
    })

# Filter out the 5x5 responses and rename the base model
all_responses = [response for response in all_responses if response["Model"] != "5x5"]
for response in all_responses:
    if response["Model"] == "Base Model GPT-3.5-Turbo":
        response["Model"] = "GPT-3.5 Base"

# Order the models; names missing from `order` are placed after the known
# ones (alphabetically) instead of raising ValueError.
order = ["4key", "8key", "16key", "GPT-3.5 Base"]
all_responses.sort(
    key=lambda x: (order.index(x["Model"]) if x["Model"] in order else len(order), x["Model"])
)

# Plot the accuracies as two bars per model. Positions are derived from the
# rows actually plotted so bar heights, positions and labels always agree.
bar_width = 0.25
r1 = np.arange(len(all_responses))
r2 = r1 + bar_width

plt.bar(r1, [response["Accuracy_3x3"] for response in all_responses], color='b', width=bar_width, edgecolor='grey', label='3x3')
plt.bar(r2, [response["Accuracy_4x4"] for response in all_responses], color='r', width=bar_width, edgecolor='grey', label='4x4')

plt.xlabel('Model')
plt.ylabel('Accuracy')
plt.title('Model Accuracy on Multiplication Problems')
# Centre each tick between the two bars of its group
plt.xticks(r1 + bar_width / 2, [response["Model"] for response in all_responses])
plt.legend()

# Save the plot; make sure the target folder exists first
os.makedirs("plots", exist_ok=True)
plt.savefig("plots/model_accuracy.png", dpi = 300, bbox_inches='tight')

plt.show()