# Generate synthetic data 

The generated data can also be used with AlpacaEval.

Note specific AlpacaEval requirements:
- Generator columns must be present in the data (note that this refers to the generator model of output_2, as output_1 would typically come from an always-identical reference model).
- The generator column must have more than one distinct value (otherwise the correlation metrics fail). Note that because the models in our synthetic data are nondescript, these values should be ignored.

In [None]:
# Ensure API keys are loaded before any model is created further down.
# NOTE(review): the file has a .toml extension but is read through dotenv;
# presumably it only contains dotenv-compatible KEY=VALUE lines - confirm.
import dotenv
# The result is bound to `_` so the cell does not display the return value.
_ = dotenv.load_dotenv("../secrets.toml")

In [None]:
# Number of preference pairs requested per dataset; passed as `num_samples`
# to every generate_data(...) call in this notebook.
NUM_SAMPLES = 10

In [None]:
import inverse_cai
import json
import pandas as pd
import random

# Shared generator model handed to every generate_data(...) call below.
# NOTE(review): `temp=1` is presumably the sampling temperature - confirm
# against inverse_cai.models.get_model.
model = inverse_cai.models.get_model("openai/gpt-3.5-turbo", temp=1)

In [None]:
def _filename_tag(differing_parts, max_part_length=40):
    """Turn the differing parts into a short, filesystem-friendly tag.

    Each part is stripped of non-alphanumeric characters and capped at
    ``max_part_length`` characters, so that full prompts used as differing
    parts cannot produce over-long file names. The cleaned parts are then
    joined with underscores.
    """
    cleaned_parts = [
        "".join(char for char in part if char.isalnum())[:max_part_length]
        for part in differing_parts
    ]
    return "_".join(cleaned_parts)


def generate_data(instruction, story_generating_prompt, differing_parts, model, num_samples=1):
    """Generate synthetic pairwise preference data and save it as a CSV file.

    For every sample, one output is generated per differing part by filling
    ``{differing_part}`` in ``story_generating_prompt`` and sending the result
    to ``model``. The output for the FIRST differing part is always the
    preferred one; whether it ends up as ``text_a`` or ``text_b`` is chosen at
    random for each sample.

    Args:
        instruction: Instruction shown in both ``text_a`` and ``text_b``.
        story_generating_prompt: Prompt template containing the
            ``{differing_part}`` placeholder.
        differing_parts: Sequence whose first two entries are used: the first
            yields the preferred output, the second the rejected one.
        model: Object with an ``invoke(prompt)`` method whose return value
            has a ``content`` attribute holding the generated text.
        num_samples: Number of preference pairs to generate.

    Returns:
        A ``pandas.DataFrame`` with the columns ``text_a``, ``text_b`` and
        ``preferred_text`` (the latter is ``"text_a"`` or ``"text_b"``).

    Raises:
        ValueError: If fewer than two differing parts are given.

    Side effects:
        Writes the data frame (including its index) to
        ``synthetic_data_<parts>_<random id>.csv`` in the current working
        directory.
    """
    # Fail before any model call is made instead of part-way through a sample.
    if len(differing_parts) < 2:
        raise ValueError(
            "differing_parts must contain at least two entries "
            f"(preferred, rejected); got {len(differing_parts)}."
        )
    preferred_part, rejected_part = differing_parts[:2]

    # A randomly chosen closing phrase is appended to every generation prompt.
    closing_phrases = ["Thanks!", "I appreciate it.", "That's great.", "Cheers.", "Thanks a lot.", "Good stuff."]
    combined_text = "Instruction: {instruction}\n\nOutput: {output}"

    data = []
    for _ in range(num_samples):
        stories = {}
        for part in (preferred_part, rejected_part):
            random_prompt = random.choice(closing_phrases)
            prompt = story_generating_prompt.format(differing_part=part) + " " + random_prompt
            stories[part] = model.invoke(prompt).content

        # Randomise the slot of the preferred output so position carries no signal.
        preferred_spot = random.choice([1, 2])
        if preferred_spot == 1:
            preferred_text = "text_a"
            output_a, output_b = stories[preferred_part], stories[rejected_part]
        else:
            preferred_text = "text_b"
            output_a, output_b = stories[rejected_part], stories[preferred_part]

        data.append(
            {
                "text_a": combined_text.format(instruction=instruction, output=output_a),
                "text_b": combined_text.format(instruction=instruction, output=output_b),
                "preferred_text": preferred_text,
            }
        )

    # Random short id of fixed length keeps repeated runs from overwriting each other.
    short_id = "".join(random.choices("abcdefghijklmnopqrstuvwxyz", k=5))
    file_path = f"synthetic_data_{_filename_tag(differing_parts)}_{short_id}.csv"

    df = pd.DataFrame(data)
    df.to_csv(file_path, index=True)

    return df

In [None]:
def save_combined_data(path: str, datasets: list) -> None:
    """Concatenate several datasets and write them to one CSV file.

    Args:
        path: Destination CSV path. Missing parent directories are created.
        datasets: Data frames to stack row-wise, in the given order.

    The combined frame gets a fresh 0..n-1 index, which is written to the
    file as its first column.
    """
    # Function-scope import keeps this notebook cell self-contained.
    from pathlib import Path

    # ignore_index=True renumbers the rows, replacing the per-dataset indices.
    combined_data = pd.concat(datasets, ignore_index=True)

    # to_csv does not create directories; make sure the target folder exists
    # so the generated data is not lost to a missing output directory.
    Path(path).parent.mkdir(parents=True, exist_ok=True)

    combined_data.to_csv(path, index=True)
    print("Data saved.")

def short_id():
    """Return a random identifier made of five lowercase ASCII letters."""
    letters = random.choices("abcdefghijklmnopqrstuvwxyz", k=5)
    return "".join(letters)

---
## Orthogonal data generation

In [None]:
# Pet stories: both outputs answer the same instruction and differ only in
# the pet (cat vs. dog).
pet_data = generate_data(
    instruction="Please write a funny short story about a human and their pet.",
    story_generating_prompt="""
    Please write a funny short story about a human and their pet. Their pet is a {differing_part}. Max 5 sentences.
    """,
    differing_parts=["cat", "dog"],
    model=model,
    num_samples=NUM_SAMPLES,
)

In [None]:
# T-shirt recommendations: the two outputs differ only in the recommended
# color (blue vs. green).
color_data = generate_data(
    instruction="Should I pick this blue t-shirt or the green one?",
    story_generating_prompt="""
    Give a short reasoning why I should pick a {differing_part} t-shirt. Mention the color. Max 1 sentences.
    """,
    differing_parts=["blue", "green"],
    model=model,
    num_samples=NUM_SAMPLES,
)

In [None]:
# Ice cream recommendations: the two outputs differ only in the flavor
# (lemon vs. raspberry).
ice_cream_data = generate_data(
    instruction="Which ice cream flavor should I pick?",
    story_generating_prompt="""
    Tell me why I should pick the {differing_part} ice cream. Max 1 short sentence. Include the name of the flavor.
    """,
    differing_parts=["lemon", "raspberry"],
    model=model,
    num_samples=NUM_SAMPLES,
)

In [None]:
# Stack the three orthogonal datasets into one CSV with a random id suffix.
save_combined_data(
    f"../data/processed/synthetic/crossannotated_synthetic_data_orthogonal_{short_id()}.csv",
    [pet_data, color_data, ice_cream_data],
)

---
## Aligned data generation

In [None]:
# Capital question: each differing part is itself the full generation prompt
# (the template is just the placeholder). The first asks the actual question,
# the second asks for a made-up reason involving Paris.
concise_data = generate_data(
    instruction="What is the capital of the US?",
    story_generating_prompt="""
    {differing_part}
    """,
    differing_parts=[
        "What is the capital of the US? Max 1 sentence.",
        "Give me a made-up reason why Paris is the capital of the US. Max 1 sentence.",
    ],
    model=model,
    num_samples=NUM_SAMPLES,
)

In [None]:

# Travel question: each differing part is the full generation prompt. The
# first asks for a concrete UK destination, the second for a generic reason
# to travel without naming any destination.
useful_data = generate_data(
    instruction="What would be an interesting destination to travel to in the UK?",
    story_generating_prompt="""
    {differing_part}
    """,
    differing_parts=[
        "Give me one interesting destination to travel to in the UK. Max 1 sentence.",
        "Why is it good to travel? Do not mention any specific destination names. Max 1 sentence.",
    ],
    model=model,
    num_samples=NUM_SAMPLES,
)

In [None]:
# Help request: each differing part is the full generation prompt. The first
# is the plain request, the second asks how somebody would reply rudely and
# lazily.
polite_data = generate_data(
    instruction="Can you help me?",
    story_generating_prompt="""
    {differing_part}
    """,
    differing_parts=[
        "Can you help me?",
        "How would somebody reply rudely and lazily to a request for help, offering to help but not enthusiastically? Max 1 sentence.",
    ],
    model=model,
    num_samples=NUM_SAMPLES,
)

In [None]:
# Stack the three aligned datasets into one CSV with a random id suffix.
save_combined_data(
    f"../data/processed/synthetic/crossannotated_synthetic_data_aligned_{short_id()}.csv",
    [concise_data, useful_data, polite_data],
)