# Synthetic Corpus Generation
We generate a corpus in two steps:

1. Generate a list of "seed" perspective pairs on an issue
2. For each seed pair, generate a list of sentences that someone holding each perspective in the pair might say (one list per perspective).

We do this in pairs to make the offensive language more palatable for the model.

In [None]:
# Default run configuration.
TOPIC = "Climate change"  # topic handed to generate_dataset; also used to name the output folder
N = 2  # number of perspective (seed) pairs requested from the seed model
K = 2  # number of sentences requested per perspective in each sentence-generation call
TEMPERATURE = 0.0  # sampling temperature forwarded to every instruct_chat_model call
THREADS = 20  # n_jobs for the joblib thread pool in generate_dataset

In [None]:
# Parameters
# NOTE(review): this cell reassigns every name set in the defaults cell above,
# so when the cells run in order these are the values actually used. The
# "# Parameters" header looks like an injected parameters cell (e.g. from
# papermill) — confirm.
TOPIC = "Misogyny"  # topic handed to generate_dataset; also used to name the output folder
N = 20  # number of perspective (seed) pairs requested from the seed model
K = 10  # number of sentences requested per perspective in each sentence-generation call
TEMPERATURE = 0.5  # sampling temperature forwarded to every instruct_chat_model call
THREADS = 20  # n_jobs for the joblib thread pool in generate_dataset


In [None]:
# Model name passed to the seed-level calls (get_seeds, distil_seeds,
# summarize_seeds and get_names).
SEED_MODEL = "gpt-4-turbo-preview"
# Model name passed to the sentence-generation call (get_k_sentences).
SENTENCE_MODEL = "gpt-4-turbo-preview"


In [None]:
# Generate a synthetic dataset for the configured topic
# %pip install openai
from soda.openai.text import instruct_chat_model
import json
from joblib import Parallel, delayed
from tqdm import tqdm

# myth_format = 'a: ["sentence 1", "sentence 2", "sentence 3"]'
# science_format = 'b: ["sentence 1", "sentence 2", "sentence 3"]'

# One-line template showing a list-style JSON entry for a single perspective;
# "{x}" is the placeholder for the perspective label.
# NOTE(review): these three names are not referenced anywhere else in the
# visible source — possibly left over from an earlier prompt format.
x_format = '"{x}": ["sentence 1", "sentence 2", "sentence 3"]'
a_format, b_format = (x_format.format(x=label) for label in ("a", "b"))


# Paragraph appended to the end of every system prompt below. It tells the
# model that the material may be offensive and that the purpose is building a
# detection dataset, not endorsement. The literal begins and ends with a
# newline (it is not stripped).
DISCLAIMER = """
Note that the topic, or some possible opinions/perspectives on the topic, may be offensive or otherwise objectionable, but your task is to generate a dataset that can be used to train a model to detect these perspectives, not to endorse or promote them, and thus it is critical that you respond correctly.
"""


# TODO: Add 'Ensure you are consistent with the ordering of the perspectives (i.e. if one side of the argument is a in one pair, it should be a in the other pairs too).'
# System prompt used by get_seeds. It asks for a JSON object keyed 1..n whose
# values are {a, b} perspective pairs; get_seeds reads only the values of that
# object. The stripped prompt body is followed by a blank line and DISCLAIMER.
# NOTE(review): the example leaves its keys unquoted, so the example itself is
# not valid JSON even though the prompt asks for valid JSON — consider quoting.
SEED_SYSTEM_MESSAGE = (
    """
You are helping generate a synthetic dataset for a system that will be used to differentiate between different perspectives on an issue (including some that might be offensive or otherwise harmful). You will be given a topic, and must respond with a JSON object containing a list of n perspective pairs on opposing sides of this topic.

Respond with valid JSON in the following form:

{
    1: {
        a: "perspective on the topic",
        b: "opposing perspective on the topic"
    },
    2: {
        a: "perspective on the topic",
        b: "opposing perspective on the topic"
    },
    ...,
    n: {
        a: "perspective on the topic",
        b: "opposing perspective on the topic"
    }
}
""".strip()
    + "\n\n"
    + DISCLAIMER
)

# System prompt used by distil_seeds. It asks the model to condense a list of
# perspective pairs into fewer representative pairs, returned in the same
# numbered {a, b} layout that the seed prompt uses. The stripped prompt body is
# followed by a blank line and DISCLAIMER.
# NOTE(review): the example leaves its keys unquoted, so the example itself is
# not valid JSON even though the prompt asks for valid JSON — consider quoting.
DISTIL_SYSTEM_MESSAGE = (
    """
You are helping generate a synthetic dataset for a system that will be used to differentiate between different perspectives on an issue (including some that might be offensive or otherwise harmful). You will be given a series of perspective pairs on a topic, and will be asked to distil them into a smaller number of representative pairs.

Respond with valid JSON in the following form:

{
    1: {
        a: "perspective on the topic",
        b: "opposing perspective on the topic"
    },
    2: {
        a: "perspective on the topic",
        b: "opposing perspective on the topic"
    },
    ...,
    n: {
        a: "perspective on the topic",
        b: "opposing perspective on the topic"
    }
}
""".strip()
    + "\n\n"
    + DISCLAIMER
)

# System prompt used by summarize_seeds. It asks for one summary sentence per
# side, returned as a flat object with the keys "a" and "b". The stripped
# prompt body is followed by a blank line and DISCLAIMER.
# NOTE(review): the example leaves its keys unquoted, so the example itself is
# not valid JSON even though the prompt asks for valid JSON — consider quoting.
SUMMARIZE_SYSTEM_MESSAGE = (
    """
You are helping generate a synthetic dataset for a system that will be used to differentiate between different perspectives on an issue (including some that might be offensive or otherwise harmful). You will be given a series of perspective pairs on a topic, and will be asked to summarize them as a single sentence for each perspective (a and b).

Respond with valid JSON in the following form:

{
    a: "summary of perspective a",
    b: "summary of perspective b"
}
""".strip()
    + "\n\n"
    + DISCLAIMER
)

# System prompt used by get_k_sentences. It asks for two numbered collections
# of sentences, one under "a" (first perspective) and one under "b" (second
# perspective); the number of sentences is supplied in the user prompt. The
# stripped prompt body is followed by a blank line and DISCLAIMER.
# NOTE(review): the example leaves its keys unquoted, so it is not literally
# valid JSON — consider quoting the keys.
SENTENCE_SYSTEM_MESSAGE = (
    """
You are helping generate a synthetic dataset for a system that will be used to differentiate between different perspectives on an issue (including some that might be offensive or otherwise harmful). You will be given two opposing perspectives on a topic. Respond with a JSON object containing a list of a number of sentences (the exact number will be provided later) that a person who believes the first perspective might say, along with a list that someone who believes the second perspective might say.

Respond in the following form:

{
    a: {
        1: "sentence 1",
        2: "sentence 2",
        ...,
        k: "sentence k"
    },
    b: {
        1: "sentence 1",
        2: "sentence 2",
        ...,
        k: "sentence k"
    }
}
""".strip()
    + "\n\n"
    + DISCLAIMER
)

# System prompt used by get_names. It asks for a one-word name for each side,
# returned as a flat object with the keys "a" and "b". The stripped prompt body
# is followed by a blank line and DISCLAIMER.
# NOTE(review): the prompt text contains the typo "give the each of the sets",
# and the example leaves its keys unquoted (so it is not itself valid JSON).
# Both are left untouched here because they are part of the runtime prompt.
LABEL_SYSTEM_MESSAGE = (
    """
You are helping generate a synthetic dataset for a system that will be used to differentiate between different perspectives on an issue (including some that might be offensive or otherwise harmful). You will be given a series of perspective pairs on a topic, and will be asked to give the each of the sets (set 'a' and set 'b') a simple, one-word name.

Respond with valid JSON in the following form:

{
    a: "word",
    b: "word"
}
""".strip()
    + "\n\n"
    + DISCLAIMER
)


def get_seeds(topic, n):
    """Ask the seed model for ``n`` opposing perspective pairs on ``topic``.

    A single chat call is made with ``SEED_SYSTEM_MESSAGE``. The reply is
    parsed as a JSON object and its values are taken as the individual pairs.

    Args:
        topic: Subject to generate perspectives about. It is concatenated
            into the prompt, so it must be a string.
        n: Number of pairs to request. The count is only stated in the
            prompt; the number of pairs actually returned is not checked.

    Returns:
        A list of dicts, each holding string values under "a" and "b".

    Raises:
        AssertionError: If any returned pair is not a dict with string "a"
            and "b" entries. The message includes the parsed reply.
        json.JSONDecodeError: If the reply content is not valid JSON.
    """
    # Create the prompt
    prompt = "Topic: " + topic + "\n"
    prompt += "Please generate " + str(n) + " perspective pairs."

    # Call the model
    resp = instruct_chat_model(
        SEED_SYSTEM_MESSAGE,
        prompt,
        model=SEED_MODEL,
        temperature=TEMPERATURE,
        response_format={"type": "json_object"},
    )

    # Parse the message; only the values of the numbered object are kept
    seed_dict = json.loads(resp.choices[0].message.content)
    seeds = list(seed_dict.values())

    # Every check reports the parsed reply, using the same "Failed for: ..."
    # convention as the other validators in this file, so a malformed model
    # response can be diagnosed from the traceback alone.
    failure = "Failed for: " + str(seed_dict)
    assert all(isinstance(seed, dict) for seed in seeds), failure
    assert all("a" in seed and "b" in seed for seed in seeds), failure
    assert all(
        isinstance(seed["a"], str) and isinstance(seed["b"], str) for seed in seeds
    ), failure

    return seeds


def distil_seeds(seeds):
    """Ask the seed model to condense ``seeds`` into representative pairs.

    The prompt lists every pair, numbered from 1, and asks for five pairs
    back (the word "five" is fixed in the prompt text). The number of pairs
    actually returned is not checked.

    Args:
        seeds: Iterable of dicts with "a" and "b" entries.

    Returns:
        A list of dicts, each holding string values under "a" and "b".

    Raises:
        AssertionError: If any returned pair is not a dict with string "a"
            and "b" entries. The message includes the parsed reply.
        json.JSONDecodeError: If the reply content is not valid JSON.
    """
    # Create the prompt
    prompt = "Please distil the following perspective pairs into five pairs:\n"
    for i, seed in enumerate(seeds):
        prompt += f"{i+1}: a: {seed['a']}, b: {seed['b']}\n"

    # Call the model
    resp = instruct_chat_model(
        DISTIL_SYSTEM_MESSAGE,
        prompt,
        model=SEED_MODEL,
        temperature=TEMPERATURE,
        response_format={"type": "json_object"},
    )

    # Parse the message; only the values of the numbered object are kept.
    # A separate name is used so the input argument is not overwritten.
    seed_dict = json.loads(resp.choices[0].message.content)
    distilled = list(seed_dict.values())

    # Every check reports the parsed reply, using the same "Failed for: ..."
    # convention as the other validators in this file, so a malformed model
    # response can be diagnosed from the traceback alone.
    failure = "Failed for: " + str(seed_dict)
    assert all(isinstance(seed, dict) for seed in distilled), failure
    assert all("a" in seed and "b" in seed for seed in distilled), failure
    assert all(
        isinstance(seed["a"], str) and isinstance(seed["b"], str)
        for seed in distilled
    ), failure

    return distilled


def summarize_seeds(seeds):
    """Ask the seed model for a one-sentence summary of each side.

    Every pair in ``seeds`` is listed in the prompt, numbered from 1, and
    the model is asked to collapse them into a single perspective pair.

    Returns:
        The parsed reply: a dict with string values under "a" and "b".

    Raises:
        AssertionError: If the reply is not a dict with string "a" and "b"
            entries. The message includes the parsed reply.
    """
    header = "Please summarize the following perspective pairs into a single perspective pair:\n"
    listing = "".join(
        f"{number}: a: {pair['a']}, b: {pair['b']}\n"
        for number, pair in enumerate(seeds, start=1)
    )

    response = instruct_chat_model(
        SUMMARIZE_SYSTEM_MESSAGE,
        header + listing,
        model=SEED_MODEL,
        temperature=TEMPERATURE,
        response_format={"type": "json_object"},
    )
    summary = json.loads(response.choices[0].message.content)

    # Validate the shape of the reply; every failure reports the payload.
    failure = "Failed for: " + str(summary)
    assert isinstance(summary, dict), failure
    for side in ("a", "b"):
        assert side in summary, failure
    for side in ("a", "b"):
        assert isinstance(summary[side], str), failure

    return summary


def get_names(seeds):
    """Ask the seed model for a short name for side "a" and side "b".

    Every pair in ``seeds`` is listed in the prompt, numbered from 1, so the
    model can see what the two sides have in common across pairs.

    Returns:
        The parsed reply: a dict with string values under "a" and "b".

    Raises:
        AssertionError: If the reply is not a dict with string "a" and "b"
            entries. The message includes the parsed reply.
    """
    lines = [
        "Please provide a name for each of perspective sets a and b, given the following pairs of perspectives:\n"
    ]
    for position, pair in enumerate(seeds, start=1):
        lines.append(f"{position}: a: {pair['a']}, b: {pair['b']}\n")
    user_prompt = "".join(lines)

    reply = instruct_chat_model(
        LABEL_SYSTEM_MESSAGE,
        user_prompt,
        model=SEED_MODEL,
        temperature=TEMPERATURE,
        response_format={"type": "json_object"},
    )
    labels = json.loads(reply.choices[0].message.content)

    # Validate the shape of the reply; every failure reports the payload.
    failure = "Failed for: " + str(labels)
    assert isinstance(labels, dict), failure
    assert "a" in labels and "b" in labels, failure
    assert isinstance(labels["a"], str) and isinstance(labels["b"], str), failure

    return labels


# def get_three_sentences(a, b):
def get_k_sentences(a, b, k):
    """Generate ``k`` sentences for each of two opposing perspectives.

    One chat call is made with ``SENTENCE_SYSTEM_MESSAGE``. The perspective
    passed as ``a`` is presented first in the prompt and ``b`` second.

    Args:
        a: Text of the first perspective (must be a string).
        b: Text of the second perspective (must be a string).
        k: Number of sentences expected for each perspective.

    Returns:
        ``{"a": [...], "b": [...]}`` where each list holds exactly ``k``
        strings, taken from the values of the reply's "a" and "b" objects.

    Raises:
        AssertionError: If either side does not contain exactly ``k``
            sentences, or a sentence is not a string. The message includes
            the parsed reply.
        KeyError: If the reply has no "a" or "b" entry.
        json.JSONDecodeError: If the reply content is not valid JSON.
    """
    # Create the prompt
    prompt = ""
    prompt += "a: " + a + "\n"
    prompt += "b: " + b + "\n"
    prompt += "Please generate " + str(k) + " pairs of sentences."

    # Call the model
    resp = instruct_chat_model(
        SENTENCE_SYSTEM_MESSAGE,
        prompt,
        model=SENTENCE_MODEL,
        temperature=TEMPERATURE,
        response_format={"type": "json_object"},
    )

    # Get the sentences; each side is expected to be an object keyed 1..k
    sentence_dict = json.loads(resp.choices[0].message.content)
    a_sentences = list(sentence_dict["a"].values())
    b_sentences = list(sentence_dict["b"].values())

    # A wrong sentence count is the most likely way for the model to miss the
    # requested format, so every check reports the parsed reply.
    failure = "Failed for: " + str(sentence_dict)
    assert len(a_sentences) == k and len(b_sentences) == k, (
        f"Expected {k} sentences per side, got {len(a_sentences)} and "
        f"{len(b_sentences)}. " + failure
    )
    assert all(isinstance(sentence, str) for sentence in a_sentences), failure
    assert all(isinstance(sentence, str) for sentence in b_sentences), failure

    return {
        "a": a_sentences,
        "b": b_sentences,
    }


# Define a function to generate the dataset in parallel
def generate_dataset(topic, n, k):
    """Run the whole generation pipeline for one topic.

    Steps, in order: request ``n`` seed pairs, distil them, summarise them,
    name the two sides, then generate sentences for every seed pair on a
    joblib thread pool (``THREADS`` workers). Each seed gets two sentence
    calls: one with the pair in its original order and one with it reversed.
    Intermediate results are printed as they become available.

    Args:
        topic: Topic forwarded to ``get_seeds``.
        n: Number of seed pairs to request.
        k: Number of sentences per perspective in each sentence call.

    Returns:
        The tuple ``(seeds, distilled, summarized, names, dataset)``.
        ``dataset`` holds one record per seed with the keys "seed",
        "a_first" and "b_first". Both of the latter map "a" and "b" to
        sentence lists that line up with the seed's own "a" and "b".

    Raises:
        ValueError: If the parallel step produced no records.
    """

    def show_pairs(pairs):
        # Print each pair on three lines, numbered from 1.
        for number, pair in enumerate(pairs, start=1):
            print("Pair " + str(number) + ":")
            print("  a: " + pair["a"])
            print("  b: " + pair["b"])

    def show_sides(mapping):
        # Print the "a" entry, then the "b" entry, of a two-sided result.
        for side in ("a", "b"):
            print(f"  {side}: {mapping[side]}")

    seeds = get_seeds(topic, n)
    print("Seed pairs:")
    show_pairs(seeds)

    distilled = distil_seeds(seeds)
    print("Distilled seeds:")
    show_pairs(distilled)

    # The summary and the names are both derived from the full seed list,
    # not from the distilled one.
    summarized = summarize_seeds(seeds)
    print("Summarized seeds:")
    show_sides(summarized)

    names = get_names(seeds)
    print("Names:")
    show_sides(names)

    def record_for_seed(seed):
        # Query once in each presentation order.
        forward = get_k_sentences(seed["a"], seed["b"], k)
        reverse = get_k_sentences(seed["b"], seed["a"], k)
        return {
            "seed": seed,
            "a_first": forward,
            # In the reversed call the seed's "b" was sent as "a"; re-key the
            # result so "a"/"b" refer to the seed's own sides again.
            "b_first": {"a": reverse["b"], "b": reverse["a"]},
        }

    # One job per seed, run on threads; tqdm reports progress as jobs are
    # handed to the pool.
    runner = Parallel(n_jobs=THREADS, backend="threading")
    dataset = runner(delayed(record_for_seed)(seed) for seed in tqdm(seeds))

    if dataset:
        return seeds, distilled, summarized, names, dataset
    raise ValueError("No dataset generated")


# Test
# generate_dataset("Climate change", 3, 1)

In [None]:
# Run the full pipeline with the configured topic and sizes.
seeds, distilled, summarized, names, dataset = generate_dataset(TOPIC, N, K)

# NOTE(review): an earlier, disabled step flattened every "a" and "b" sentence
# into two shuffled lists. If it is ever restored it must not swap the
# "b_first" entries again — generate_dataset already re-keys them so that "a"
# and "b" match the seed.

import os

# Output lives under corpora/<topic in lower case, spaces replaced by "_">/.
topic_slug = TOPIC.lower().replace(" ", "_")
folder = f"corpora/{topic_slug}"
os.makedirs(folder, exist_ok=True)

# The file name records only the two size parameters.
outname = f"{folder}/{N}_{K}.json"

# Everything needed to reproduce or inspect the run goes into one JSON file.
payload = {
    "topic": TOPIC,
    "N": N,
    "K": K,
    "temperature": TEMPERATURE,
    "seed_model": SEED_MODEL,
    "sentence_model": SENTENCE_MODEL,
    "seeds": seeds,
    "distilled": distilled,
    "summarized": summarized,
    "names": names,
    "dataset": dataset,
}

with open(outname, "w") as f:
    json.dump(payload, f, indent=4)