In [7]:
from openai import OpenAI

# Shared API client used by call_openai_api. It is constructed with no explicit
# arguments, so credentials/configuration come from the library defaults
# (presumably environment variables -- TODO confirm for the installed version).
client = OpenAI()

In [3]:
# Task description interpolated into the FIZLE prompts ("In the task of {fizle_task}, ...").
fizle_task = "sentiment analysis on the SST-2 dataset"

## FIZLE

### Naive

Prompt:
In the task of <task on task-dataset>, a trained black-box classifier correctly predicted the label ‘<yi>’ for the following text. Generate a counterfactual explanation by making minimal changes to the input text, so that the label changes from ‘<yi>’ to ‘<ycf>’. Use the following definition of ‘counterfactual explanation’: “A counterfactual explanation reveals what should have been different in an instance to observe a diverse outcome.” Enclose the generated text within <new> tags.\n—\nText: <xi>.

### Guided

Prompt for step 1:

In the task of <task on task-dataset>, a trained black-box classifier correctly predicted the label ‘<yi>’ for the following text. Explain why the model predicted the ‘<yi>’ label by identifying the words in the input that caused the label. List ONLY the words as a comma separated list.\n—\nText: <xi>

Prompt for step 2:

Generate a counterfactual explanation for the original text by ONLY changing a minimal set of the words you identified, so that the label changes from ‘<yi>’ to ‘<ycf>’. Use the following definition of ‘counterfactual explanation’: “A counterfactual explanation reveals what should have been different in an instance to observe a diverse outcome.” Enclose the generated text within <new> tags.

In [46]:
import re

def call_openai_api(system_prompt, model):
    """Send one system message to the chat-completions endpoint and return the reply text.

    Args:
        system_prompt: Text sent as the content of the only message (role "system").
        model: Model identifier forwarded unchanged to the API call.

    Returns:
        The ``content`` of the message in the first choice of the response.
    """
    messages = [{"role": "system", "content": system_prompt}]
    response = client.chat.completions.create(model=model, messages=messages)
    first_choice = response.choices[0]
    return first_choice.message.content

def generate_naive_fizle_counterfactual(original_text, args):
    """Generate a counterfactual for ``original_text`` with the naive (single-prompt) FIZLE approach.

    The model is asked to minimally edit the text so that the label flips and to
    wrap the result in ``<new>`` tags. The request is retried up to 10 times
    until a non-empty tagged span comes back.

    Args:
        original_text: Input text to build a counterfactual for.
        args: Dict with keys ``"original_score"`` (thresholded at 0.5 to obtain
            label 1 or 0) and ``"model"`` (forwarded to ``call_openai_api``).

    Returns:
        The text between ``<new>`` and ``</new>``, stripped of surrounding
        whitespace. If no attempt yields a non-empty tagged span, the last raw
        output with a leading ``<new>`` / trailing ``</new>`` removed (if present).
    """
    original_score, model = args["original_score"], args["model"]
    original_label = 1 if original_score >= 0.5 else 0
    cf_label = 0 if original_label == 1 else 1

    # NOTE(review): the continuation lines of this literal carry their source
    # indentation into the prompt ("    -" / "    Text: ..."). Kept byte-identical
    # so existing results stay comparable; confirm whether that is intended.
    system_prompt = f"""In the task of {fizle_task}, a trained black-box classifier correctly predicted the label '{original_label}' for the following text. Generate a counterfactual explanation by making minimal changes to the input text, so that the label changes from '{original_label}' to '{cf_label}'. Use the following definition of 'counterfactual explanation': "A counterfactual explanation reveals what should have been different in an instance to observe a diverse outcome." Enclose the generated text within <new> tags.
    -
    Text: {original_text}"""

    max_attempts = 10
    output = ""
    for attempt in range(1, max_attempts + 1):
        print(f"attempt: {attempt}")
        # The API may return no content; treat that as an empty (retryable) reply.
        output = call_openai_api(system_prompt, model) or ""
        # re.search returns None when the tags are missing, so check the match
        # before reading the group. DOTALL lets the tagged text span several lines.
        match = re.search(r"<new>(.*?)</new>", output, flags=re.DOTALL)
        if match and match.group(1).strip():
            return match.group(1).strip()

    print("Failed to generate counterfactual surrounded by <new> tags")
    # Best effort: drop a dangling opening/closing tag instead of slicing a
    # fixed number of characters off text that may not contain the tags at all.
    return re.sub(r"^\s*<new>|</new>\s*$", "", output).strip()

def generate_guided_fizle_counterfactual(original_text, args):
    """Generate a counterfactual for ``original_text`` with the guided (two-step) FIZLE approach.

    Step 1 asks the model which words caused the predicted label. Step 2 appends
    that answer to the step-1 prompt and asks the model to change only a minimal
    set of those words so that the label flips, wrapping the result in ``<new>``
    tags. Step 2 is retried up to 10 times until a non-empty tagged span comes back.

    Args:
        original_text: Input text to build a counterfactual for.
        args: Dict with keys ``"original_score"`` (thresholded at 0.5 to obtain
            label 1 or 0) and ``"model"`` (forwarded to ``call_openai_api``).

    Returns:
        The text between ``<new>`` and ``</new>``, stripped of surrounding
        whitespace. If no attempt yields a non-empty tagged span, the last raw
        step-2 output with a leading ``<new>`` / trailing ``</new>`` removed
        (if present).
    """
    original_score, model = args["original_score"], args["model"]
    original_label = 1 if original_score >= 0.5 else 0
    cf_label = 0 if original_label == 1 else 1

    # 1. Find important words
    step1_system_prompt = " ".join([
        f"In the task of {fizle_task}, a trained black-box classifier correctly predicted the label '{original_label}' for the following text.",
        f"Explain why the model predicted the '{original_label}' label by identifying the words in the input that caused the label. List ONLY the words as a comma separated list.",
        f"\n-\nText: {original_text}",
        "\nImportant words identified: "
    ])
    # The API may return no content; fall back to an empty word list rather
    # than failing on the string concatenation below.
    important_words = call_openai_api(step1_system_prompt, model) or ""
    system_prompt = step1_system_prompt + important_words + "\n"

    # 2. Generate the final counterfactual
    # The step-2 prompt does not change between attempts, so build and log it once.
    step2_system_prompt = " ".join([
        f"Generate a counterfactual explanation for the original text by ONLY changing a minimal set of the words you identified, so that the label changes from '{original_label}' to '{cf_label}'.",
        "Use the following definition of 'counterfactual explanation': 'A counterfactual explanation reveals what should have been different in an instance to observe a diverse outcome.'",
        "Enclose the generated text within <new> tags."
    ])
    final_system_prompt = system_prompt + step2_system_prompt
    print(f"final_system_prompt: {final_system_prompt}")

    max_attempts = 10
    step2_output = ""
    for _ in range(max_attempts):
        step2_output = call_openai_api(final_system_prompt, model) or ""
        # re.search returns None when the tags are missing, so check the match
        # before reading the group. DOTALL lets the tagged text span several lines.
        match = re.search(r"<new>(.*?)</new>", step2_output, flags=re.DOTALL)
        if match and match.group(1).strip():
            return match.group(1).strip()

    print("Failed to generate counterfactual surrounded by <new> tags")
    # Best effort on the last step-2 reply: drop a dangling opening/closing tag
    # instead of slicing a fixed number of characters.
    return re.sub(r"^\s*<new>|</new>\s*$", "", step2_output).strip()


Run the naive version.

In [24]:
# Single smoke-test input for the naive generator; a score of 0.9 maps to label 1,
# so the prompt asks for a flip from '1' to '0'.
original_text = "I love this movie!"
args = dict(original_score=0.9, model="gpt-4-turbo")
counterfactual_text = generate_naive_fizle_counterfactual(original_text, args)
counterfactual_text

attempt: 1


'I hate this movie!'

Run the guided version.

In [47]:
texts = [
    "Marvelous, merry and, yes, melancholy film.",
    "Arnold's jump from little screen to big will leave frowns on more than a few faces.",
    "The movie makes absolutely no sense.",
    "It's a movie -- and an album -- you won't want to miss."
]

# Display settings and request arguments are the same for every text, so set them up once.
label_width = 20
original_label = "Original:".ljust(label_width)
counterfactual_label = "Counterfactual:".ljust(label_width)

# NOTE(review): every text is submitted with original_score=0.9, i.e. the prompt
# states the classifier predicted label '1' for all of them -- confirm this is
# intended for the texts that read as negative.
args = {"original_score": 0.9, "model": "gpt-4-turbo"}

for original_text in texts:
    counterfactual_text = generate_guided_fizle_counterfactual(original_text, args)
    print(
        f"{original_label}: {original_text}",
        f"{counterfactual_label}: {counterfactual_text}",
        "",
        sep="\n",
    )

final_system_prompt: In the task of sentiment analysis on the SST-2 dataset, a trained black-box classifier correctly predicted the label '1' for the following text. Explain why the model predicted the '1' label by identifying the words in the input that caused the label. List ONLY the words as a comma separated list. 
-
Text: Marvelous, merry and, yes, melancholy film. 
Important words identified: Marvelous, merry
Generate a counterfactual explanation for the original text by ONLY changing a minimal set of the words you identified, so that the label changes from '1' to '0'. Use the following definition of 'counterfactual explanation': 'A counterfactual explanation reveals what should have been different in an instance to observe a diverse outcome.' Enclose the generated text within <new> tags.
Original:           : Marvelous, merry and, yes, melancholy film.
Counterfactual:     : Terrible, dreary and, yes, melancholy film.

final_system_prompt: In the task of sentiment analysis on the