In [4]:
# # pip install
# %pip install openai dill seaborn

In [5]:
# # Corpus
# CORPUS = "synthetic_data_corpus"
# # MODEL = "gpt-3.5-turbo-instruct"
# # MODEL = "gpt-4-1106-preview"
# MODEL = "df@meta-llama/Llama-2-70b-chat-hf"


# # The name of the experiment (i.e. where to save the results)
# # EXPERIMENT_NAME = "climate_change_synthetic_k1_with_fewshot_and_both_and_neither"
# from datetime import datetime
# EXPERIMENT_NAME = f"{CORPUS}_{MODEL}_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}".replace("/", "_")

# Corpus
# CORPUS = "synthetic_data_corpus"
# CORPUS = "converted_climate_change_gpt-4-turbo-preview_gpt-4-turbo-preview_10_10"
# CORPUS = "/workspaces/dev/projects/narratives/synthetic/corpora/climate_change/20_10.json"
CORPUS_NAME = "climate_change"
# MODEL = "gpt-3.5-turbo"
MODEL = "gpt-4-turbo-preview"

# The name of the experiment (i.e. where to save the results)
# EXPERIMENT_NAME = "climate_synthetic_test_k2"
EXPERIMENT_NAME = CORPUS_NAME + "_" + MODEL + "_gpt"


In [6]:
CORPUS = "/workspaces/dev/projects/narratives/synthetic/corpora/" + CORPUS_NAME + ".json"

In [7]:
# from soda.openai.text import completion_model_batched

# MODEL = "gpt-3.5-turbo-instruct"

# texts = ["This is a test", "This is another test! And I"]

# completions = completion_model_batched(
#     texts,
#     MODEL,
#     batch_size=2,
#     num_threads=1,
#     max_tokens=1,
#     # echo=True,
#     logprobs=2,
# )

# print(completions)

# # Get the token logprobs for each completion
# # logprobs = [completion.logprobs.token_logprobs[0] for completion in completions.choices]
# logprobs = []
# for completion in completions.choices:
#     logprobs.append(completion.logprobs.top_logprobs[0])

# for text, logprob in zip(texts, logprobs):
#     # print(text, logprob)
#     print(text)
#     for token, logprob in logprob.items():
#         # Print with tab and escaping newlines
#         print("   ", token.replace("\n", "\\n"), logprob)
#     print()

In [8]:
from soda.openai.text import completion_model_batched, instruct_chat_model_batched


def get_classifications(
    texts, classes, model, batch_size=1, num_threads=1, max_tokens=1
):
    if "gpt-4" in model or model == "gpt-3.5-turbo":
        return get_classifications_gpt4(
            texts, classes, model, batch_size, num_threads, max_tokens
        )
    else:
        return get_classifications_instruct(
            texts, classes, model, batch_size, num_threads, max_tokens
        )


def get_classifications_gpt4(
    texts, classes, model, batch_size=1, num_threads=1, max_tokens=1
):
    """
    Get the classification probabilities using the GPT-4 chat API
    """

    # Construct the system message
    system_message = """
You are a classification system. You will be given some text and you must respond with a single class that the text most likely belongs to. Ensure your response is only the class, with no other text.
"""
    #     system_message = """
    # You are a classification system. You will be given some text and you must respond with a single class that the text most likely belongs to. If no class is suitable, choose "none". If more than one class is suitable, choose "all". If you are unsure, choose "neither". Ensure your response is only the class, with no other text.

    # Classes:
    # """.strip()
    for class_, description in classes.items():
        system_message += f"\n- {class_}: {description}"

    # Get the completions
    results = instruct_chat_model_batched(
        system_message,
        texts,
        model,
        num_threads=num_threads,
        logprobs=True,
        top_logprobs=min(5, len(classes)),
        temperature=0.0,
    )

    # Get the token logprobs for each completion
    responses = [result.choices[0].logprobs.content[0] for result in results]
    # For each class, get the logprobs for each token
    # Use -inf for classes without an entry in the top logprobs
    logprobs = []
    for response in responses:
        classification = {class_: -float("inf") for class_ in classes}

        # Get the logprobs for each token
        # for token, logprob in response.items():
        for top_logprob in response.top_logprobs:
            # Get the class for this token
            class_ = top_logprob.token.strip()

            # Update the classification
            classification[class_] = top_logprob.logprob

        logprobs.append(classification)

    return logprobs


def get_classifications_instruct(
    texts, classes, model, batch_size=1, num_threads=1, max_tokens=1
):
    """
    Get the classification probabilities for each text and class.
    """

    # Construct the prompt
    prompt = ""
    prompt += "You are a classification system. Given the text below, you will classify it into one of the following classes. Ensure you match the classes exactly, including wording and casing.\n"
    # prompt += "You are a classification system. Given the text below, you will classify it into one of the following classes. If no class is suitable, choose \"none\". If more than one class is suitable, choose \"all\". Ensure you match the classes exactly, including wording and casing.\n"
    # for class_ in classes:
    #     prompt += f"- {class_}\n"
    for class_, description in classes.items():
        prompt += f"- {class_}: {description}\n"

    prompt += "\n\n"
    prompt += "Text: {TEXT}"
    prompt += "\n"
    prompt += "Class:"

    # Create the prompts for each text
    prompts = [prompt.format(TEXT=text) for text in texts]

    # Get the completions
    completions = completion_model_batched(
        prompts,
        model,
        batch_size=batch_size,
        num_threads=num_threads,
        max_tokens=max_tokens,
        logprobs=2,
    )
    # print(completions)

    # Get the token logprobs for each completion
    logprobs = []
    # For each class, get the logprobs for each token
    # Use -inf for classes without an entry in the top logprobs
    for completion in completions.choices:
        classification = {class_: -float("inf") for class_ in classes}

        # Get the logprobs for each token
        for token, logprob in completion.logprobs.top_logprobs[0].items():
            # Get the class for this token
            class_ = token.strip().replace("▁", "")

            # Update the classification
            # classification[class_] = logprob

            # There might be duplicates due to the ▁ character that DF seems to use
            classification[class_] = max(
                classification.get(class_, -float("inf")), logprob
            )

        logprobs.append(classification)

    return logprobs


# Classify a simple dataset about cats and dogs
# texts = ["This is a cat", "This is a dog", "This is a cat, and this is a dog"]
texts = [
    "This is a cat",
    "This is a dog",
    "This is a cat, and this is a dog",
    "We're not talking about cats and dogs here!",
]
# classes = ["cat", "dog"]
classes = {
    "cat": "Discusses a cat",
    "dog": "Discusses a dog",
}

classifications = get_classifications(
    texts, classes, MODEL, batch_size=1, num_threads=2, max_tokens=1
)

print(classifications)

  0%|          | 0/4 [00:00<?, ?it/s]

[{'cat': -7.58424e-06, 'dog': -inf, 'Discuss': -12.734383}, {'cat': -inf, 'dog': -13.265629}, {'cat': -0.19098152, 'dog': -1.7534815}, {'cat': -0.5340376, 'dog': -0.9090376}]


In [7]:
# Multiprocessing with joblib
import os
import yaml
from tqdm.auto import tqdm
import random


# Load texts from corpora/{CORPUS}.json
with open(os.path.join("../corpora", f"{CORPUS}.json"), "r") as f:
    corpus_data = yaml.safe_load(f)

print("Loaded {} texts from corpus".format(len(corpus_data)))
print("Total word count:", sum([len(text["text"].split()) for text in corpus_data]))

# Create the training data
X = [text["text"] for text in corpus_data]
y = [text["speakername"] for text in corpus_data]
print(len(X), len(y))

# classes = {
#     "myth": "A statement made by a person who might believe climate change myths.",
#     "science": "A statement made by a person who might believe climate change science.",
# }

# MYTH_DESCRIPTION = """
# A statement made by a person who might believe climate change myths. Here are some framings of the issue that they might believe:
# - Climate change can be seen as a part of Earth's natural cycle, and the extent to which human activities influence this process is still a subject of debate within the scientific community.
# - Despite the consensus narrative, some argue that climate change discussions are ongoing and that there is not complete agreement on the human impact.
# - The occurrence of cold weather events and chilly temperatures can be used to question the assertion that there is a gradual increase in global temperatures due to human actions.
# - The ability of renewable energy sources to fully replace fossil fuels is contested, with skeptics questioning their efficiency and overall potential to meet global energy demands.
# - Some believe that organisms have the capacity to adapt to changes in their climate, suggesting that the effects of climate change may not be as dire as often portrayed.
# """

# SCIENCE_DESCRIPTION = """
# A statement made by a person who might believe climate change science. Here are some framings of the issue that they might believe:
# - An overwhelming majority of climate scientists, over 97%, agree that climate change is real and primarily caused by human activities, particularly the emission of greenhouse gases.
# - The current rapid increase in global temperatures is unprecedented and largely driven by human actions, distinguishing it from natural climate variability.
# - Excessive concentrations of carbon dioxide in the atmosphere, while necessary for plant life, act as a potent greenhouse gas that traps heat and contributes to global warming.
# - The impacts of climate change are global and can affect every individual through extreme weather events, health risks, and economic changes, regardless of personal experience.
# - Renewable energy technologies are advancing and becoming more efficient, offering a promising alternative to fossil fuels and a pathway to mitigating climate change.
# """

# classes = {
#     "myth": "\n" + MYTH_DESCRIPTION.strip() + "\n",
#     "science": "\n" + SCIENCE_DESCRIPTION.strip() + "\n",
# }

# classes = set()
# for name in y:
#     classes.add(name)

# classes = {
#     "a": "A statement made by a person who might believe climate change science.",
#     "b": "A statement made by a person who might believe climate change myths.",
# }

classes = {
    "a": "A helpful statement.",
    "b": "An unhelpful statement.",
}

# Get the classifications
classifications = get_classifications(
    # X, classes, MODEL, batch_size=20, num_threads=50, max_tokens=1
    X,
    classes,
    MODEL,
    batch_size=1,
    num_threads=50,
    max_tokens=1,
)

# Save the data in CSV format
print(classifications[0])
data = []
total = 0
correct = 0
# incorrect_map = {"myth": 0, "science": 0}
# correct_map = {"myth": 0, "science": 0}

# Generic
incorrect_map = {class_: 0 for class_ in classes}
correct_map = {class_: 0 for class_ in classes}

for text, truth, classification in tqdm(zip(X, y, classifications), total=len(X)):
    print(text, truth, classification)
    # # Add the data
    # data.append(
    #     {
    #         "text": text,
    #         # "myth": classification["myth"],
    #         "myth": min(
    #             classification.get("myth", -float("inf")),
    #             classification.get("my", -float("inf")),
    #         ),
    #         "science": classification["science"],
    #         "classification": max(classification, key=classification.get),
    #         "truth": truth,
    #     }
    # )

    # Add the data (class-agnostic)
    datum = {
        "text": text,
        "truth": truth,
        "classification": max(classification, key=classification.get),
    }
    datum.update(classification)
    data.append(datum)

    # Correct the classification if it's "my"
    if data[-1]["classification"] == "my":
        data[-1]["classification"] = "myth"

    # if max(classification, key=classification.get) == truth:
    if data[-1]["classification"] == truth:
        correct += 1
        correct_map[truth] += 1
    else:
        incorrect_map[truth] += 1

    total += 1

# Save to CSV
import pandas as pd

df = pd.DataFrame(data)
df.to_csv(os.path.join("corpus_results", f"{EXPERIMENT_NAME}.csv"), index=False)

# Show accuracy and incorrect counts
print("Accuracy:", correct / total)
print("Incorrect counts:", incorrect_map)

# Show a confusion matrix as a pandas df
import pandas as pd

# Get the confusion matrix
confusion_matrix = pd.crosstab(
    df["truth"], df["classification"], rownames=["Truth"], colnames=["Classification"]
)

print(confusion_matrix)

# Convert to percentages
confusion_matrix = confusion_matrix / confusion_matrix.sum(axis=0)
print(confusion_matrix)

# Save confusion matrix and data to corpus_results/classifiers/{EXPERIMENT_NAME}
import os
import shutil
import yaml

# Create the directory
directory = os.path.join("corpus_results", "gpt", EXPERIMENT_NAME + "/")

# Create the directory
os.makedirs(directory, exist_ok=True)

# Save the confusion matrix
confusion_matrix.to_csv(os.path.join(directory, "confusion_matrix.csv"))

# Save the data
df.to_csv(os.path.join(directory, "data.csv"), index=False)

Loaded 400 texts from corpus
Total word count: 6864
400 400


  0%|          | 0/400 [00:00<?, ?it/s]

{'a': -5.2001665e-06, 'b': -12.187505}


  0%|          | 0/400 [00:00<?, ?it/s]

We must invest heavily in renewable energy sources now to combat the devastating effects of climate change. a {'a': -5.2001665e-06, 'b': -12.187505}
The evidence for climate change is overwhelming, and it's irresponsible to ignore the consensus of the scientific community. a {'a': -8.506662e-05, 'b': -9.375085}
Rising sea levels and extreme weather events are just the beginning if we don't act on climate change immediately. a {'a': -0.0008566702, 'b': -7.063357}
Every year we delay action on climate change, the cost and impact on human lives and ecosystems increase exponentially. a {'a': -2.5107178e-05, 'b': -10.593775}
Governments worldwide must unite to impose strict carbon emission limits to save our planet. a {'a': -1.7954959e-05, 'b': -10.937518}
The transition to a green economy is not only necessary for our survival but will also create millions of jobs. a {'a': -9.849109e-06, 'b': -11.53126}
Denying climate change is denying the future generations their right to a healthy plane