# Setup

Mount Google Drive and clone the repository containing the methods.

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import getpass

github_username = input("Enter your GitHub username: ")
github_token = getpass.getpass("Enter your GitHub personal access token: ")

In [None]:
# "owner/name" slug of the GitHub repository that contains the methods.
repo_name = "smcaleese/masters-thesis-code"
# NOTE(review): the username and token are interpolated straight into the clone
# URL, so git keeps them in the clone's remote URL (.git/config). Consider
# resetting the remote URL afterwards if the clone is ever shared — confirm
# whether pushing from this clone is needed first.
!git clone https://{github_username}:{github_token}@github.com/{repo_name}.git

In [None]:
%cd masters-thesis-code
%pwd

Install necessary dependencies.

In [None]:
%pip install transformers datasets textdistance openai

## Download datasets

Download the SST-2, QNLI and AG News datasets, clean the sentences, and create a list of input sentences.

In [None]:
from datasets import load_dataset

sst = load_dataset("stanfordnlp/sst2")
qnli = load_dataset("glue", "qnli")
ag_news = load_dataset("fancyzhx/ag_news")

In [None]:
sst

In [None]:
qnli

In [None]:
# for row in qnli["validation"]:
#     question, sentence, label = row["question"], row["sentence"], row["label"]
#     print(f"Question: {question}, sentence: {sentence}, label: {label}")

qnli_questions = [row["question"] for row in qnli["validation"]]
qnli_sentences = [row["sentence"] for row in qnli["validation"]]

qnli_sentences

In [None]:
ag_news

In [None]:
import random
random.seed(0)

num_samples = 100

# Materialise the column as a plain list so random.sample gets a real sequence.
sst_sentences = list(sst["test"]["sentence"])
random_sst_sentences_subset = random.sample(sst_sentences, num_samples)

# Sample row indices once and use them for BOTH columns. Sampling the questions
# and the sentences independently would pair each question with an unrelated
# sentence and destroy the QNLI (question, sentence) alignment.
qnli_sample_indices = random.sample(range(len(qnli_questions)), num_samples)
random_qnli_questions_subset = [qnli_questions[i] for i in qnli_sample_indices]
random_qnli_sentences_subset = [qnli_sentences[i] for i in qnli_sample_indices]

# ag_news_sentences = ag_news["test"]["text"]
# random_ag_news_sentences_subset = random.sample(ag_news_sentences, num_samples)

Format the text in the datasets.

In [None]:
import re

def format_sentence(sentence):
    """Lower-case a whitespace-tokenised sentence and re-attach its punctuation.

    Args:
        sentence: Text in which punctuation and contraction suffixes are
            separated from the neighbouring words by spaces
            (e.g. "it ' s good , is n't it ?").

    Returns:
        The lower-cased text with contractions, hyphens, closing punctuation
        and opening parentheses joined back onto the adjacent words.
    """
    sentence = sentence.lower()

    # re-attach contraction suffixes split around an apostrophe ("it ' s" -> "it's");
    # the trailing \b stops the rule from firing on ordinary words that merely
    # start with one of the suffix letters ("' stop" must not become "'stop"):
    sentence = re.sub(r"\s(')\s(ve|re|s|t|ll|d)\b", r"\1\2", sentence)

    # remove spaces around hyphens:
    sentence = re.sub(r"-\s-", "--", sentence)
    sentence = re.sub(r"(\w)\s-\s(\w)", r"\1-\2", sentence)

    # remove spaces before punctuation and n't:
    sentence = re.sub(r"\s([.!,?:;')]|n't)", r"\1", sentence)

    # remove spaces after opening parenthesis:
    sentence = re.sub(r"([(])\s", r"\1", sentence)

    return sentence


In [None]:
# Apply format_sentence to every sampled sentence; element order is preserved.
random_sst_sentences_subset_formatted = list(map(format_sentence, random_sst_sentences_subset))

random_qnli_questions_subset_formatted = list(map(format_sentence, random_qnli_questions_subset))
# The formatted QNLI sentences are stored under the "answers" name from here on.
random_qnli_answers_subset_formatted = list(map(format_sentence, random_qnli_sentences_subset))

Write the sentences to two files named `sst-input.csv` and `qnli-input.csv`.

In [None]:
%pwd

In [None]:
import pandas as pd

import os

# DataFrame.to_csv does not create missing directories, so make sure the
# target folder exists before writing.
os.makedirs("./input", exist_ok=True)

# One row per formatted SST-2 sentence.
df_sst = pd.DataFrame(random_sst_sentences_subset_formatted, columns=["original_text"])
df_sst.to_csv("./input/sst-input.csv", index=False)

# One row per formatted QNLI (question, answer) pair.
df_qnli = pd.DataFrame({
    "original_question": random_qnli_questions_subset_formatted,
    "original_answer": random_qnli_answers_subset_formatted
})
df_qnli.to_csv("./input/qnli-input.csv", index=False)

# df_ag_news = pd.DataFrame(random_ag_news_sentences_subset_formatted, columns=["original_text"])
# df_ag_news.to_csv("./input/ag-news-input.csv", index=False)

## Choose dataset

In [None]:
# dataset = "sst_2"
dataset = "qnli"
# dataset = "ag_news"

# Per-dataset settings as (input CSV stem, classifier checkpoint, task
# description interpolated into the FIZLE prompts).
_DATASET_SETTINGS = {
    "sst_2": (
        "sst-input",
        "textattack/bert-base-uncased-SST-2",
        "sentiment analysis on the SST-2 dataset",
    ),
    "qnli": (
        "qnli-input",
        "textattack/bert-base-uncased-QNLI",
        "natural language inference on the QNLI dataset",
    ),
    "ag_news": (
        "ag-news-input",
        "textattack/bert-base-uncased-ag-news",
        "topic classification on the AG News dataset",
    ),
}

if dataset in _DATASET_SETTINGS:
    input_file, model_name, fizle_task = _DATASET_SETTINGS[dataset]

## Create input dataframe

Columns to add to create output dataframe:
- original_score
- original_perplexity
- counterfactual_text
- counterfactual_score
- counterfactual_perplexity
- found_flip
- levenshtein_similarity_score

In [None]:
%pwd

In [None]:
import pandas as pd

df_input = pd.read_csv(f"input/{input_file}.csv")
df_input.head()

In [None]:
df_input.shape

## Load models

In [None]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Load the sentiment model and tokenizer.

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Class-index -> label-name mapping for each supported two-class task.
_ID2LABEL_BY_DATASET = {
    "sst_2": {0: "NEGATIVE", 1: "POSITIVE"},
    "qnli": {0: "entailment", 1: "not_entailment"},
}

if dataset in _ID2LABEL_BY_DATASET:
    id2label = _ID2LABEL_BY_DATASET[dataset]
    # label2id is simply the inverse of id2label.
    label2id = {label: index for index, label in id2label.items()}
    sentiment_model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=2,
        id2label=id2label,
        label2id=label2id
    ).to(device)

# elif dataset == "ag_news":
#     id2label = {
#         0: "World",
#         1: "Sports",
#         2: "Business",
#         3: "Sci/Tech"
#     }
#     label2id = {
#         "World": 0,
#         "Sports": 1,
#         "Business": 2,
#         "Sci/Tech": 3
#     }
#     sentiment_model = AutoModelForSequenceClassification.from_pretrained(
#         model_name,
#         num_labels=4,
#         id2label=id2label,
#         label2id=label2id
#     ).to(device)

# The tokenizer is loaded from the same checkpoint for every dataset choice.
sentiment_model_tokenizer = AutoTokenizer.from_pretrained(model_name)

Load the GPT-2 model for calculating perplexity.

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)
gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

Load the language model for CLOSS.

In [None]:
import transformers

# TODO: try using a larger model to improve performance: https://arxiv.org/pdf/2111.09543
# BERT with a masked-language-modelling head, moved to the selected device.
LM_model = transformers.BertForMaskedLM.from_pretrained("bert-base-uncased").to(device)
# Expose the MLM prediction head (`cls`) under the additional attribute name
# `lm_head`; presumably the CLOSS code looks the head up by that name — TODO confirm.
LM_model.lm_head = LM_model.cls

## Helper functions

In [None]:
import re
import textdistance

def calculate_score(text, sentiment_model_tokenizer, dataset, device):
    """Return the classifier's probability for class index 1 on one input.

    Uses the module-level `sentiment_model` for the forward pass.

    Args:
        text: For "sst_2" a single sentence. For "qnli" either one string of the
            form "question [SEP] answer" or a (question, answer) pair, which is
            joined into that form.
        sentiment_model_tokenizer: Tokenizer matching `sentiment_model`.
        dataset: "sst_2" or "qnli".
        device: Torch device the model lives on.

    Returns:
        float: softmax probability of class index 1.

    Raises:
        ValueError: if `dataset` is not one of the supported names.
    """
    if dataset == "sst_2":
        inputs = sentiment_model_tokenizer(text, max_length=512, truncation=True, return_tensors="pt")
    elif dataset == "qnli":
        # A tuple/list would be tokenised as a batch of separate sentences, so
        # turn a (question, answer) pair into the single-string form first.
        if isinstance(text, (tuple, list)):
            question, answer = text
            text = f"{question} [SEP] {answer}"

        inputs = sentiment_model_tokenizer(text, max_length=512, truncation=True, return_tensors="pt")

        # Everything after the first [SEP] token is marked as segment 1.
        # NOTE(review): this relies on the tokenizer mapping the literal "[SEP]"
        # in the text to sep_token_id — confirm for the tokenizer in use.
        input_ids = inputs["input_ids"]
        sep_positions = (input_ids[0] == sentiment_model_tokenizer.sep_token_id).nonzero()
        first_sep_pos = sep_positions[0, 0].item()
        token_type_ids = torch.zeros_like(input_ids)
        token_type_ids[0, first_sep_pos + 1:] = 1
        inputs["token_type_ids"] = token_type_ids
    else:
        raise ValueError(f"calculate_score does not support dataset {dataset!r}")

    inputs = inputs.to(device)

    # Pure inference: no autograd graph is needed for a score.
    with torch.no_grad():
        logits = sentiment_model(**inputs).logits
    prob_positive = torch.nn.functional.softmax(logits, dim=1)[0][1].item()
    return prob_positive

def calculate_perplexity(text):
    """Return the perplexity of `text` under the module-level GPT-2 model.

    The text is tokenised with `gpt2_tokenizer`, scored by `gpt2_model` with the
    input ids as labels, and the exponential of the returned loss is reported.

    Args:
        text: The string to score.

    Returns:
        float: exp(loss) for the given text.
    """
    inputs = gpt2_tokenizer(text, return_tensors="pt").to(device)
    # Inference only, so skip building the autograd graph.
    with torch.no_grad():
        loss = gpt2_model(**inputs, labels=inputs["input_ids"]).loss
    return torch.exp(loss).item()

def is_flip(original_score, counterfactual_score):
    """Report whether the two scores lie on opposite sides of the 0.5 threshold.

    A score of exactly 0.5 counts as the upper (>= 0.5) side.
    """
    # might need to be updated for AG News
    if original_score >= 0.5:
        return counterfactual_score < 0.5
    if original_score < 0.5:
        return counterfactual_score >= 0.5
    return False

def truncate_text(text, max_length=100):
    """Keep at most `max_length` whitespace-separated words of `text`.

    Text that is already short enough is returned unchanged (original spacing
    included); longer text is rebuilt from its first `max_length` words joined
    by single spaces.
    """
    words = text.split()
    return text if len(words) <= max_length else " ".join(words[:max_length])

def get_all_embeddings(model, tokenizer):
    """Return a detached copy of the model's input word-embedding matrix.

    Row i holds the word embedding of vocabulary id i. The matrix is read
    directly from the embedding layer's weight instead of running one lookup
    per vocabulary id, and its width follows the model rather than a fixed 768.

    Args:
        model: Model exposing `model.bert.embeddings.word_embeddings`.
        tokenizer: Tokenizer whose `vocab_size` gives the number of rows to keep.

    Returns:
        A float32 tensor of shape (tokenizer.vocab_size, hidden_size) that does
        not require grad and lives on the same device as the embedding layer.
    """
    embedding_weight = model.bert.embeddings.word_embeddings.weight
    # clone() so later edits to the returned tensor can never touch the model.
    all_word_embeddings = embedding_weight[: tokenizer.vocab_size].detach().clone()
    return all_word_embeddings.to(torch.float32).requires_grad_(False)

def get_levenshtein_similarity_score(original_text, counterfactual_text):
    """Return one minus the normalised Levenshtein distance between the texts."""
    normalized_distance = textdistance.levenshtein.normalized_distance(
        original_text,
        counterfactual_text,
    )
    return 1 - normalized_distance

def get_output(df_input, counterfactual_method, args):
    """Generate one counterfactual per input row and collect evaluation metrics.

    Reads the module-level `dataset` to decide how a row is interpreted:

    * "sst_2": the row's `original_text` (truncated with `truncate_text`) is the
      text that is scored, perturbed and compared.
    * "qnli": the classifier input is "question [SEP] answer"; perplexity and
      Levenshtein similarity are computed on the answer only.

    Args:
        df_input: DataFrame with an `original_text` column (sst_2) or
            `original_question` / `original_answer` columns (qnli).
        counterfactual_method: Callable invoked as
            `counterfactual_method(text, calculate_score, args)` that returns
            the counterfactual text.
        args: Method options. A copy extended with the row's `original_score`
            is passed to the method; the caller's dict is not modified.

    Returns:
        DataFrame with one row per input and the columns original_text,
        original_score, original_perplexity, counterfactual_text,
        counterfactual_score, counterfactual_perplexity, found_flip and
        levenshtein_similarity_score (plus original_question for qnli).

    Raises:
        ValueError: if `dataset` is neither "sst_2" nor "qnli".
    """
    if dataset not in ("sst_2", "qnli"):
        raise ValueError(f"get_output does not support dataset {dataset!r}")

    is_qnli = dataset == "qnli"
    output_data = {
        "original_text": [],
        "original_score": [],
        "original_perplexity": [],
        "counterfactual_text": [],
        "counterfactual_score": [],
        "counterfactual_perplexity": [],
        "found_flip": [],
        "levenshtein_similarity_score": []
    }
    if is_qnli:
        output_data["original_question"] = []

    num_rows = len(df_input)
    label_width = 20

    for i in range(num_rows):
        row = df_input.iloc[i]

        if is_qnli:
            original_question = row["original_question"]
            original_text = row["original_answer"]
            original_input = f"{original_question} [SEP] {original_text}"
            num_tokens = len(f"{original_question} {original_text}".split())
            label_suffix = "answer"
        else:
            original_question = None
            original_text = truncate_text(row["original_text"])
            original_input = original_text
            num_tokens = len(original_text.split())
            label_suffix = "text"

        print(f"Processing input {i + 1}/{num_rows}: num tokens: {num_tokens}")

        original_score = calculate_score(original_input, sentiment_model_tokenizer, dataset, device)
        original_perplexity = calculate_perplexity(original_text)

        # Fresh dict per row so the caller's args are never mutated or rebound.
        method_args = {**args, "original_score": original_score}
        # NOTE(review): for QNLI the method receives the full
        # "question [SEP] answer" string and its return value is treated as the
        # counterfactual ANSWER only — confirm this matches the method's output.
        counterfactual_text = counterfactual_method(original_input, calculate_score, method_args)
        counterfactual_text = format_sentence(counterfactual_text)

        original_label = f"original_{label_suffix}:"
        counterfactual_label = f"counterfactual_{label_suffix}:"
        print(f"\n{original_label.ljust(label_width)} {original_text}")
        print(f"{counterfactual_label.ljust(label_width)} {counterfactual_text}\n")

        # Score the counterfactual in the same input format as the original.
        if is_qnli:
            counterfactual_input = f"{original_question} [SEP] {counterfactual_text}"
        else:
            counterfactual_input = counterfactual_text

        counterfactual_score = calculate_score(counterfactual_input, sentiment_model_tokenizer, dataset, device)
        counterfactual_perplexity = calculate_perplexity(counterfactual_text)
        found_flip = is_flip(original_score, counterfactual_score)
        levenshtein_similarity_score = get_levenshtein_similarity_score(original_text, counterfactual_text)

        if is_qnli:
            output_data["original_question"].append(original_question)
        output_data["original_text"].append(original_text)
        output_data["original_score"].append(original_score)
        output_data["original_perplexity"].append(original_perplexity)
        output_data["counterfactual_text"].append(counterfactual_text)
        output_data["counterfactual_score"].append(counterfactual_score)
        output_data["counterfactual_perplexity"].append(counterfactual_perplexity)
        output_data["found_flip"].append(found_flip)
        output_data["levenshtein_similarity_score"].append(levenshtein_similarity_score)

    df_output = pd.DataFrame(output_data)
    return df_output


In [None]:
all_word_embeddings = get_all_embeddings(sentiment_model, sentiment_model_tokenizer).to(device)

In [None]:
from openai import OpenAI
from google.colab import userdata

client = OpenAI(api_key=userdata.get("API_KEY"))

## Testing HotFlip components

In [None]:
# loss_fct, flip_target, c_tokens, device)
import torch
from CLOSS.helpers import compute_substitution_scores

loss_fct = torch.nn.CrossEntropyLoss()

text = "I really loved the movie."

# calculate_score has no defaults: the tokenizer, dataset name and device must
# all be passed explicitly.
prob_pos = calculate_score(text, sentiment_model_tokenizer, dataset, device)

id_list = sentiment_model_tokenizer.encode(text, add_special_tokens=True, truncation=True)
tokens = sentiment_model_tokenizer.convert_ids_to_tokens(id_list)

# Target the class opposite to the current prediction.
flip_target = 0 if prob_pos > 0.5 else 1

substitution_scores, candidate_prob_pos = compute_substitution_scores(all_word_embeddings, sentiment_model, sentiment_model_tokenizer, loss_fct, flip_target, tokens, device)

In [None]:
# substitution_scores
candidate_prob_pos

## Counterfactual generator functions

In [None]:
# %cd "CLOSS"
# %cd ..
%pwd

In [None]:
from CLOSS.closs import generate_counterfactual
import re

def generate_polyjuice_counterfactual(original_text, calculate_score, args):
    """Generate a counterfactual with Polyjuice's "negation" control code.

    Uses the module-level `pj` Polyjuice instance. `calculate_score` and `args`
    are unused; they are accepted so all generator functions share one signature.

    Args:
        original_text: The sentence to perturb.
        calculate_score: Unused.
        args: Unused.

    Returns:
        The first perturbation returned by Polyjuice, or `original_text`
        unchanged when no perturbation is returned.
    """
    perturbations = pj.perturb(
        orig_sent=original_text,
        ctrl_code="negation",
        num_perturbations=1,
        perplex_thred=None
    )
    # Indexing an empty result would abort a whole evaluation run, so fall back
    # to the unmodified input (which is then simply recorded as "no flip").
    if perturbations is None or len(perturbations) == 0:
        print("Polyjuice returned no perturbation; keeping the original text")
        return original_text
    counterfactual_text = perturbations[0]
    return counterfactual_text

def generate_closs_counterfactual(original_text, calculate_score, args):
    """Run CLOSS `generate_counterfactual` on `original_text` and return its result.

    The classifier, language model, tokenizer, embedding matrix and device are
    taken from the module-level variables; `calculate_score` and `args` are
    forwarded unchanged.
    """
    # TODO: move target label from inside CLOSS to here
    closs_inputs = (
        original_text,
        sentiment_model,
        LM_model,
        calculate_score,
        sentiment_model_tokenizer,
        all_word_embeddings,
        device,
        args,
    )
    return generate_counterfactual(*closs_inputs)

def call_openai_api(system_prompt, model):
    """Send `system_prompt` as a single system message and return the reply text.

    Uses the module-level OpenAI `client` with top_p=1, temperature=0.4 and
    frequency_penalty=1.1, and returns the content of the first choice.
    """
    messages = [{"role": "system", "content": system_prompt}]
    response = client.chat.completions.create(
        model=model,
        messages=messages,
        top_p=1,
        temperature=0.4,
        frequency_penalty=1.1,
    )
    first_choice = response.choices[0]
    return first_choice.message.content

def generate_naive_fizle_counterfactual(original_text, calculate_score, args):
    """Generate a counterfactual with the single-prompt ("naive") FIZLE variant.

    The LLM is asked to minimally edit `original_text` so that the label flips
    and to wrap the result in <new> tags. The request is retried up to 10 times
    until a non-empty tagged answer is found.

    Args:
        original_text: The text to generate a counterfactual for.
        calculate_score: Unused; accepted for a uniform generator signature.
        args: Dict with "original_score" (float) and "model" (LLM name).

    Returns:
        The text found between <new> and </new>. If no attempt yields a
        non-empty tagged answer, the last raw output with any <new> tags removed.
    """
    original_score, model = args["original_score"], args["model"]
    original_label = 1 if original_score >= 0.5 else 0
    cf_label = 0 if original_label == 1 else 1

    system_prompt = f"""In the task of {fizle_task}, a trained black-box classifier correctly predicted the label '{original_label}' for the following text. Generate a counterfactual explanation by making minimal changes to the input text, so that the label changes from '{original_label}' to '{cf_label}'. Use the following definition of 'counterfactual explanation': "A counterfactual explanation reveals what should have been different in an instance to observe a diverse outcome." Enclose the generated text within <new> tags.
    -
    Text: {original_text}"""

    # DOTALL lets the tagged answer span several lines.
    tag_pattern = re.compile(r"<new>(.*?)</new>", re.DOTALL)
    output = ""
    for i in range(10):
        print(f"attempt: {i + 1}")
        output = call_openai_api(system_prompt, model) or ""
        # search() returns None when the tags are missing; only a non-empty
        # match ends the retry loop.
        match = tag_pattern.search(output)
        if match and match.group(1).strip():
            return match.group(1).strip()

    print("Failed to generate counterfactual surrounded by <new> tags")
    # Best effort: drop whatever tag fragments are present and use the rest.
    return re.sub(r"</?new>", "", output).strip()

def generate_guided_fizle_counterfactual(original_text, calculate_score, args):
    """Generate a counterfactual with the two-prompt ("guided") FIZLE variant.

    Step 1 asks the LLM for the words that caused the predicted label. Step 2
    appends that answer to the prompt and asks for a counterfactual that only
    changes those words, wrapped in <new> tags. Step 2 is retried up to 10
    times until a non-empty tagged answer is found.

    Args:
        original_text: The text to generate a counterfactual for.
        calculate_score: Unused; accepted for a uniform generator signature.
        args: Dict with "original_score" (float) and "model" (LLM name).

    Returns:
        The text found between <new> and </new>. If no attempt yields a
        non-empty tagged answer, the last raw output with any <new> tags removed.
    """
    original_score, model = args["original_score"], args["model"]
    original_label = 1 if original_score >= 0.5 else 0
    cf_label = 0 if original_label == 1 else 1

    # 1. Find important words
    step1_system_prompt = " ".join([
        f"In the task of {fizle_task}, a trained black-box classifier correctly predicted the label '{original_label}' for the following text.",
        f"Explain why the model predicted the '{original_label}' label by identifying the words in the input that caused the label. List ONLY the words as a comma separated list.",
        f"\n-\nText: {original_text}",
        f"\nImportant words identified: "
    ])
    important_words = call_openai_api(step1_system_prompt, model) or ""

    # 2. Generate the final counterfactual
    step2_system_prompt = " ".join([
        f"Generate a counterfactual explanation for the original text by ONLY changing a minimal set of the words you identified, so that the label changes from '{original_label}' to '{cf_label}'.",
        f"Use the following definition of 'counterfactual explanation': 'A counterfactual explanation reveals what should have been different in an instance to observe a diverse outcome.'",
        f"Enclose the generated text within <new> tags."
    ])
    # The prompt is identical for every attempt, so build and show it once.
    final_system_prompt = step1_system_prompt + important_words + "\n" + step2_system_prompt
    print(f"final_system_prompt: {final_system_prompt}")

    # DOTALL lets the tagged answer span several lines.
    tag_pattern = re.compile(r"<new>(.*?)</new>", re.DOTALL)
    step2_output = ""
    for i in range(10):
        step2_output = call_openai_api(final_system_prompt, model) or ""
        # search() returns None when the tags are missing; only a non-empty
        # match ends the retry loop.
        match = tag_pattern.search(step2_output)
        if match and match.group(1).strip():
            return match.group(1).strip()

    print("Failed to generate counterfactual surrounded by <new> tags")
    # Best effort: drop whatever tag fragments are present and use the rest.
    return re.sub(r"</?new>", "", step2_output).strip()


In [None]:
# Single QNLI-style input in the "question [SEP] answer" form.
s = "What came into force after the new constitution was herald? [SEP] As of that day, the new constitution heralding the Second Republic came into force."
args = {
    "beam_width": 15,
    "w": 5,
    "K": 30,
    "substitution_evaluation_method": "hotflip_only",
    "substitution_gen_method": "hotflip_only",
    "dataset": dataset
}
# generate_closs_counterfactual takes (text, calculate_score, args); the
# scoring function must be passed explicitly.
counterfactual = generate_closs_counterfactual(s, calculate_score, args)
counterfactual

## Run CLOSS and HotFlip

First run the method without optimizing the embedding (`CLOSS-EO`) and without retraining the language modeling head (`CLOSS-RTL`).

- `CLOSS-EO:` skip optimizing the embedding. This increases failures but lowers perplexity.
- `CLOSS-RTL:` skip retraining the language modeling head. This has no effect on perplexity but increases the failure rate.

Move to the main parent directory.

In [None]:
# %cd "CLOSS"
# %cd ..
%pwd

In [None]:
df_input.head()

1. Run HotFlip:

In [None]:
args = {
    "beam_width": 15,
    "w": 5,
    "K": 30,
    "substitution_evaluation_method": "hotflip_only",
    "substitution_gen_method": "hotflip_only",
    "dataset": dataset
}

df_output = get_output(df_input, generate_closs_counterfactual, args)

In [None]:
df_output.head()

In [None]:
df_output.to_csv(f"./output/hotflip-output-{dataset}.csv", index=False)

2. Run CLOSS without optimization and without retraining the language modeling head:

In [None]:
args = {
    "beam_width": 15,
    "w": 5,
    "K": 30,
    "substitution_evaluation_method": "SVs",
    "substitution_gen_method": "no_opt_lmh",
    "dataset": dataset
}

df_output = get_output(df_input, generate_closs_counterfactual, args)

In [None]:
df_output.head()

In [None]:
df_output.to_csv(f"./output/closs-output-{dataset}.csv", index=False)

## Run Polyjuice

### Setup

In [None]:
%cd polyjuice
%pwd

In [None]:
!python -m spacy download en_core_web_sm

In [None]:
%pip install -e .

Make sure the model is being imported properly.

In [None]:
import importlib
import polyjuice

importlib.reload(polyjuice)
print(polyjuice.__file__)

In [None]:
from polyjuice import Polyjuice

# Only ask Polyjuice for CUDA when a GPU is actually available, so the
# notebook also runs on CPU-only runtimes.
pj = Polyjuice(model_path="uw-hai/polyjuice", is_cuda=torch.cuda.is_available())

In [None]:
text = "julia is played with exasperating blandness by laura regan ."
perturbations = pj.perturb(
    orig_sent=text,
    ctrl_code="negation",
    num_perturbations=5,
    # perplex_thred=None
)
perturbations

Run the model and get the output.

In [None]:
df_input.head()

In [None]:
df_output = get_output(df_input, generate_polyjuice_counterfactual, {})

In [None]:
df_output.head(10)

In [None]:
%cd ..
%pwd

In [None]:
df_output.to_csv(f"./output/polyjuice-output-{dataset}.csv", index=False)

## FIZLE

Two variants:
* Naive: uses a single prompt.
* Guided: Uses two prompts. The first prompt identifies important words and the second prompt generates the counterfactual.

Hyperparameters:

For all LLMs, we use top_p sampling with p = 1, temperature t = 0.4 and a repetition penalty of 1.1 (passed to the OpenAI API as `frequency_penalty`).


### 1. FIZLE naive

In [None]:
df_input.head()

In [None]:
args = {"model": "gpt-4-turbo"}
df_output = get_output(df_input, generate_naive_fizle_counterfactual, args)

In [None]:
df_output.head()

In [None]:
# Include the dataset in the file name (as the other methods' outputs do) so
# results for different datasets do not overwrite each other.
df_output.to_csv(f"./output/fizlenaive-output-{dataset}.csv", index=False)

### 2. FIZLE guided

In [None]:
df_input.head()

In [None]:
args = {"model": "gpt-4-turbo"}
# This section evaluates the guided (two-prompt) variant, so use the guided generator.
df_output = get_output(df_input, generate_guided_fizle_counterfactual, args)

In [None]:
df_output.head()

In [None]:
# Include the dataset in the file name (as the other methods' outputs do) so
# results for different datasets do not overwrite each other.
df_output.to_csv(f"./output/fizleguided-output-{dataset}.csv", index=False)