## Setup

Mount Google Drive and clone the repository containing the methods.

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import getpass

github_username = input("Enter your GitHub username: ")
github_token = getpass.getpass("Enter your GitHub personal access token: ")

In [None]:
repo_name = "smcaleese/masters-thesis-code"
# Clone over HTTPS, authenticating with the username and token entered in the previous cell.
# NOTE(review): the token is interpolated directly into the clone URL, so it is presumably
# persisted in the clone's remote configuration — confirm before sharing the cloned directory.
!git clone https://{github_username}:{github_token}@github.com/{repo_name}.git

In [1]:
%cd masters-thesis-code
%pwd

[Errno 2] No such file or directory: 'masters-thesis-code'
/Users/smcaleese/Documents/masters-thesis-code


  bkms = self.shell.db.get('bookmarks', {})


'/Users/smcaleese/Documents/masters-thesis-code'

Install necessary dependencies.

In [None]:
# Packages installed explicitly for this notebook (no versions are pinned).
# NOTE(review): later cells also import torch, pandas and sklearn, which are not listed
# here — presumably they are preinstalled in the runtime (e.g. Colab); confirm.
%pip install transformers datasets textdistance openai

## Download datasets

Download the selected dataset (SST-2 or QNLI; AG News is listed as an option but is not loaded by the code below), clean the sentences, and create a list of input sentences.

In [1]:
num_samples = 1000

dataset = "sst_2"
# dataset = "qnli"
# dataset = "ag_news"

In [2]:
import re

def format_sentence(sentence, dataset):
    """Lower-case a sentence and remove tokenisation-style spacing.

    The clean-up rules are applied in order:
      1. join a detached apostrophe with its contraction suffix ("it ' s" -> "it's");
      2. rebuild double hyphens ("- -" -> "--") and hyphenated words ("a - b" -> "a-b");
      3. drop the whitespace before closing punctuation, apostrophes and "n't";
      4. drop the whitespace after an opening parenthesis;
      5. for QNLI only, restore the upper-case " [SEP] " separator that was
         lower-cased at the start.

    Args:
        sentence: The raw sentence string.
        dataset: Dataset identifier; only "qnli" enables rule 5.

    Returns:
        The cleaned, lower-cased sentence.
    """
    cleanup_rules = (
        # apostrophe followed by a contraction suffix
        (r"\s(')\s(ve|re|s|t|ll|d)", r"\1\2"),
        # spaces around hyphens
        (r"-\s-", "--"),
        (r"(\w)\s-\s(\w)", r"\1-\2"),
        # space before punctuation and n't
        (r"\s([.!,?:;')]|n't)", r"\1"),
        # space after an opening parenthesis
        (r"([(])\s", r"\1"),
    )
    if dataset == "qnli":
        cleanup_rules += ((r"\s(\[sep\])\s", " [SEP] "),)

    formatted = sentence.lower()
    for pattern, replacement in cleanup_rules:
        formatted = re.sub(pattern, replacement, formatted)
    return formatted

In [3]:
from datasets import load_dataset

# Load the training split of the selected dataset, keep the first `num_samples`
# examples and clean their text with `format_sentence`.
# Note: there is no branch for any other value of `dataset`, so nothing is loaded for it.
if dataset == "sst_2":
    sst = load_dataset("stanfordnlp/sst2")

    # Full training columns.
    sst_sentences = sst["train"]["sentence"]
    sst_labels = sst["train"]["label"]

    # First `num_samples` examples only.
    sst_sentences_subset = sst_sentences[:num_samples]
    sst_labels_subset = sst_labels[:num_samples]

    sst_sentences_subset_formatted = [format_sentence(sentence, dataset) for sentence in sst_sentences_subset]

elif dataset == "qnli":
    qnli = load_dataset("glue", "qnli")

    # QNLI examples are (question, sentence) pairs; the sentence column is used as the answer.
    qnli_questions = qnli["train"]["question"]
    qnli_answers = qnli["train"]["sentence"]
    qnli_labels = qnli["train"]["label"]

    # First `num_samples` examples only.
    qnli_questions_subset = qnli_questions[:num_samples]
    qnli_answers_subset = qnli_answers[:num_samples]
    qnli_labels_subset = qnli_labels[:num_samples]

    # Questions and answers are cleaned separately; they are joined with " [SEP] " later.
    qnli_questions_subset_formatted = [format_sentence(sentence, dataset) for sentence in qnli_questions_subset]
    qnli_answers_subset_formatted = [format_sentence(sentence, dataset) for sentence in qnli_answers_subset]


  from .autonotebook import tqdm as notebook_tqdm


Write the sentences to `input/sst-input.csv` or `input/qnli-input.csv`, depending on the selected dataset.

In [4]:
%pwd

'/Users/smcaleese/Documents/masters-thesis-code'

In [8]:
import pandas as pd

# Persist the cleaned subset as the input file that the rest of the notebook reads.
# The target paths are relative, so the current working directory must contain `input/`.
if dataset == "sst_2":
    df_sst = pd.DataFrame({
        "original_text": sst_sentences_subset_formatted,
        "original_label": sst_labels_subset
    })
    # index=False: do not write the row index as a column.
    df_sst.to_csv("./input/sst-input.csv", index=False)

elif dataset == "qnli":
    # Question and answer are stored in separate columns.
    df_qnli = pd.DataFrame({
        "original_question": qnli_questions_subset_formatted,
        "original_answer": qnli_answers_subset_formatted,
        "original_label": qnli_labels_subset
    })
    df_qnli.to_csv("./input/qnli-input.csv", index=False)

## Choose dataset

In [19]:
# Per-dataset settings used by later cells:
#   input_file - base name of the CSV under `input/`
#   model_name - checkpoint name of the classifier to load
#   fizle_task - task description interpolated into the FIZLE prompts
# Note: no values are assigned for any other `dataset`, so these names stay undefined for it.
if dataset == "sst_2":
    input_file = "sst-input"
    model_name = "textattack/bert-base-uncased-SST-2"
    fizle_task = "sentiment analysis on the SST-2 dataset"
elif dataset == "qnli":
    input_file = "qnli-input"
    model_name = "textattack/bert-base-uncased-QNLI"
    fizle_task = "natural language inference on the QNLI dataset"


## Create input dataframe

Columns to add to create output dataframe:
- original_score
- original_perplexity
- counterfactual_text
- counterfactual_score
- counterfactual_perplexity
- found_flip
- levenshtein_similarity_score

In [20]:
%pwd

'/Users/smcaleese/Documents/masters-thesis-code'

In [21]:
import pandas as pd

df_input = pd.read_csv(f"input/{input_file}.csv")
df_input.head()

Unnamed: 0,original_text,original_label
0,hide new secretions from the parental units,0
1,"contains no wit, only labored gags",0
2,that loves its characters and communicates som...,1
3,remains utterly satisfied to remain the same t...,0
4,on the worst revenge-of-the-nerds clichés the ...,0


In [22]:
df_input.shape

(1000, 2)

## Load models

In [15]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Load the sentiment model and tokenizer.

In [16]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Class-index -> label-name mapping for every dataset this notebook can load a
# classifier for. Adding a dataset only requires a new entry here (plus a
# matching `model_name` in the "Choose dataset" cell).
DATASET_ID2LABEL = {
    "sst_2": {0: "NEGATIVE", 1: "POSITIVE"},
    "qnli": {0: "entailment", 1: "not_entailment"},
    # Not enabled yet:
    # "ag_news": {0: "World", 1: "Sports", 2: "Business", 3: "Sci/Tech"},
}

if dataset not in DATASET_ID2LABEL:
    raise ValueError(
        f"Unsupported dataset {dataset!r}; expected one of {sorted(DATASET_ID2LABEL)}"
    )

id2label = DATASET_ID2LABEL[dataset]
# Inverse mapping, derived so the two dictionaries can never disagree.
label2id = {label: index for index, label in id2label.items()}

# The classifier is called `sentiment_model` for every dataset because the
# helper functions below refer to it by that name.
sentiment_model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id
).to(device)

sentiment_model_tokenizer = AutoTokenizer.from_pretrained(model_name)

In [17]:
text = "what does umc stand for? [SEP] founded in 1968 by the union of the methodist church (usa) and the evangelical united brethren church, the umc traces its roots back to the revival movement of john and charles wesley in england as well as the great awakening in the united states."
tokens = sentiment_model_tokenizer(text, return_tensors="pt", padding=True, truncation=True)
logits = sentiment_model(**tokens).logits
prob_positive = torch.nn.functional.softmax(logits, dim=1)[0][1].item()
prob_positive

0.7885696887969971

In [192]:
# Threshold the class-1 probability and look up the matching label name.
# The index is stored as `predicted_id` so that the builtin `id()` is not shadowed
# for the rest of the notebook session.
predicted_id = 1 if prob_positive > 0.5 else 0
label = id2label[predicted_id]
print(predicted_id, label)

1 POSITIVE


Load the GPT-2 model for calculating perplexity.

In [193]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)
gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

Load the language model for CLOSS.

In [194]:
import transformers

# TODO: try using a larger model to improve performance: https://arxiv.org/pdf/2111.09543
# Masked language model that is passed to CLOSS as `LM_model`.
LM_model = transformers.BertForMaskedLM.from_pretrained("bert-base-uncased").to(device)
# Expose the masked-LM head (`cls`) under the additional attribute name `lm_head`.
# NOTE(review): presumably the CLOSS code looks the head up as `lm_head` — confirm.
LM_model.lm_head = LM_model.cls

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Helper functions

In [195]:
import re
import textdistance
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def calculate_score(text, sentiment_model_tokenizer, dataset, device):
    """Return the classifier's softmax probability for class index 1.

    The forward pass uses the notebook-level `sentiment_model`.

    Args:
        text: A string, a list of token strings, or a list of token ids. List
            inputs are converted back to a string after dropping their first and
            last element (assumed to be the [CLS]/[SEP] special tokens — TODO
            confirm against the CLOSS callers).
        sentiment_model_tokenizer: Tokenizer that matches `sentiment_model`.
        dataset: "sst_2" (single sentence) or "qnli" (question and answer joined
            by a literal "[SEP]").
        device: Device the encoded inputs are moved to.

    Returns:
        The probability of class 1 as a Python float.

    Raises:
        ValueError: If `dataset` is not one of the supported values.
    """
    def tokenize_with_correct_token_type_ids(input_text, tokenizer):
        # The pair arrives as ONE string containing "[SEP]", so the tokenizer
        # would mark every token as segment 0. Rebuild the segment ids by hand.
        # Truncation mirrors the SST-2 branch so over-long inputs do not exceed
        # the model's maximum sequence length.
        tokens = tokenizer(
            input_text,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
        )

        # Column index of the first [SEP] token in the (single) sequence.
        sep_pos = (tokens.input_ids == tokenizer.sep_token_id).nonzero()[0, 1].item()

        # Everything after the first [SEP] belongs to segment 1.
        token_type_ids = torch.zeros_like(tokens.input_ids)
        token_type_ids[0, sep_pos + 1:] = 1

        tokens["token_type_ids"] = token_type_ids
        return tokens

    if isinstance(text, list):
        first_item = text[0]
        if isinstance(first_item, str):
            ids = sentiment_model_tokenizer.convert_tokens_to_ids(text)
            text = sentiment_model_tokenizer.decode(ids[1:-1])
        elif isinstance(first_item, int):
            text = sentiment_model_tokenizer.decode(text[1:-1])

    if dataset == "sst_2":
        inputs = sentiment_model_tokenizer(
            text, max_length=512, truncation=True, return_tensors="pt"
        ).to(device)
    elif dataset == "qnli":
        inputs = tokenize_with_correct_token_type_ids(text, sentiment_model_tokenizer).to(device)
    else:
        raise ValueError(f"Unsupported dataset: {dataset!r}")

    # Only a Python float leaves this function, so no autograd graph is needed.
    with torch.no_grad():
        logits = sentiment_model(**inputs).logits
    prob_positive = torch.nn.functional.softmax(logits, dim=1)[0][1].item()
    return prob_positive

def calculate_perplexity(text):
    """Return the GPT-2 perplexity of `text`.

    Perplexity is the exponential of the language-modelling loss obtained when
    the input ids are also used as the labels. Uses the notebook-level
    `gpt2_model`, `gpt2_tokenizer` and `device`.

    Args:
        text: The string to score.

    Returns:
        The perplexity as a Python float.
    """
    inputs = gpt2_tokenizer(text, return_tensors="pt").to(device)
    # Evaluation only: skip building the autograd graph.
    with torch.no_grad():
        loss = gpt2_model(**inputs, labels=inputs["input_ids"]).loss
    return torch.exp(loss).item()

def is_flip(original_score, counterfactual_score):
    """Tell whether the predicted class changed between the two scores.

    Scores are class-1 probabilities; a score of 0.5 or more counts as class 1.
    Only binary tasks are handled (might need to be updated for AG News).
    """
    if original_score >= 0.5:
        # Started as class 1: a flip means ending below the threshold.
        return counterfactual_score < 0.5
    # Started as class 0: a flip means ending at or above the threshold.
    return original_score < 0.5 and counterfactual_score >= 0.5

def truncate_text(text, max_length=100):
    """Limit `text` to its first `max_length` whitespace-separated words.

    Text that is already short enough is returned unchanged (original spacing
    kept); truncated text is re-joined with single spaces.
    """
    words = text.split()
    if len(words) <= max_length:
        return text
    return " ".join(words[:max_length])

def get_all_embeddings(model, tokenizer):
    """Return the input word-embedding vector of every vocabulary id.

    The rows are sliced straight from the embedding weight matrix instead of
    looking each id up individually, which avoids one forward call per
    vocabulary entry and no longer assumes a hidden size of 768.

    Args:
        model: A model exposing `model.bert.embeddings.word_embeddings`.
        tokenizer: Tokenizer whose `vocab_size` gives the number of rows to return.

    Returns:
        A detached float32 tensor of shape (tokenizer.vocab_size, hidden_size)
        that does not require gradients. It is an independent copy located on
        the same device as the model's embedding weights.
    """
    embedding_weights = model.bert.embeddings.word_embeddings.weight
    all_word_embeddings = embedding_weights[: tokenizer.vocab_size].detach().clone().float()
    return all_word_embeddings

def get_levenshtein_similarity_score(original_text, counterfactual_text):
    """Return 1 minus the normalised Levenshtein distance between the two texts.

    Identical texts therefore score 1.
    """
    normalized_distance = textdistance.levenshtein.normalized_distance(
        original_text, counterfactual_text
    )
    return 1 - normalized_distance

def format_polyjuice_output(polyjuice_output, original_question, original_answer):
    """Coerce a Polyjuice perturbation into "question [SEP] answer" form.

    Resolution order:
      1. The output already contains " [SEP] ": return it untouched.
      2. Replace every bracketed word token (e.g. "[BLANK]") with " [SEP] ";
         if that yields a separator, return the repaired text.
      3. Otherwise the output is treated as only one half of the pair. It
         counts as a question when it ends with "?" or when its bag-of-words
         cosine similarity to the original question exceeds its similarity to
         the original answer; the missing half is taken from the original input.

    Args:
        polyjuice_output: Text produced by Polyjuice.
        original_question: Question part of the original input.
        original_answer: Answer part of the original input.

    Returns:
        A string containing " [SEP] " between a question and an answer.
    """
    separator = " [SEP] "

    def bag_of_words_similarity(first_text, second_text):
        # Cosine similarity between the word-count vectors of the two texts.
        counts = CountVectorizer().fit_transform([first_text, second_text])
        return cosine_similarity(counts)[0][1]

    # Case 1: already well formed.
    if separator in polyjuice_output:
        return polyjuice_output

    # Case 2: a different bracketed token stands where the separator should be.
    repaired = re.sub(r"\[(\w+)\]", separator, polyjuice_output)
    if separator in repaired:
        return repaired

    # Case 3: only half of the pair came back; decide which half it is.
    question_similarity = bag_of_words_similarity(repaired, original_question)
    answer_similarity = bag_of_words_similarity(repaired, original_answer)
    looks_like_question = (
        repaired.strip().endswith("?") or question_similarity > answer_similarity
    )

    if not looks_like_question:
        # New answer paired with the original question.
        return f"{original_question} [SEP] {repaired}"
    # New question paired with the original answer.
    return f"{repaired} [SEP] {original_answer}"

def get_output(df_input, counterfactual_method, args):
    """Generate and evaluate one counterfactual per input row.

    Reads the notebook-level `dataset` to decide how a row becomes text:
    "sst_2" uses the `original_text` column (truncated with `truncate_text`),
    "qnli" joins `original_question` and `original_answer` with " [SEP] ".

    Args:
        df_input: DataFrame holding the input rows for the selected dataset.
        counterfactual_method: Callable invoked as
            `counterfactual_method(original_text, calculate_score, args)` that
            returns the counterfactual text.
        args: Dict of method-specific options. It is not modified; each call
            receives a copy extended with the row's "original_score".

    Returns:
        A DataFrame with one row per input and the columns original_text,
        original_score, original_perplexity, counterfactual_text,
        counterfactual_score, counterfactual_perplexity, found_flip and
        levenshtein_similarity_score.

    Raises:
        ValueError: If `dataset` is neither "sst_2" nor "qnli".
    """
    if dataset not in ("sst_2", "qnli"):
        raise ValueError(f"Unsupported dataset: {dataset!r}")

    output_columns = [
        "original_text",
        "original_score",
        "original_perplexity",
        "counterfactual_text",
        "counterfactual_score",
        "counterfactual_perplexity",
        "found_flip",
        "levenshtein_similarity_score",
    ]
    # Polyjuice may drop or mangle the QNLI separator, so its output is repaired below.
    is_polyjuice = (
        getattr(counterfactual_method, "__name__", "") == "generate_polyjuice_counterfactual"
    )
    label_width = 20
    num_rows = len(df_input)
    records = []

    for i in range(num_rows):
        row = df_input.iloc[i]

        # Build the model input text for this row.
        if dataset == "sst_2":
            original_question = original_answer = None
            original_text = truncate_text(row["original_text"])
        else:
            original_question, original_answer = row["original_question"], row["original_answer"]
            original_text = f"{original_question} [SEP] {original_answer}"
        original_text = format_sentence(original_text, dataset)

        print(f"Processing input {i + 1}/{num_rows}: num tokens: {len(original_text.split())}")

        original_score = calculate_score(original_text, sentiment_model_tokenizer, dataset, device)
        original_perplexity = calculate_perplexity(original_text)

        # The generators need the original score to know which way to flip the label.
        method_args = {**args, "original_score": original_score}
        counterfactual_text = counterfactual_method(original_text, calculate_score, method_args)
        if dataset == "qnli" and is_polyjuice:
            counterfactual_text = format_polyjuice_output(
                counterfactual_text, original_question, original_answer
            )
        counterfactual_text = format_sentence(counterfactual_text, dataset)

        print(f"\n{'original_text:'.ljust(label_width)} {original_text}")
        print(f"{'counterfactual_text:'.ljust(label_width)} {counterfactual_text}\n")

        counterfactual_score = calculate_score(counterfactual_text, sentiment_model_tokenizer, dataset, device)
        counterfactual_perplexity = calculate_perplexity(counterfactual_text)

        records.append({
            "original_text": original_text,
            "original_score": original_score,
            "original_perplexity": original_perplexity,
            "counterfactual_text": counterfactual_text,
            "counterfactual_score": counterfactual_score,
            "counterfactual_perplexity": counterfactual_perplexity,
            "found_flip": is_flip(original_score, counterfactual_score),
            "levenshtein_similarity_score": get_levenshtein_similarity_score(
                original_text, counterfactual_text
            ),
        })

    # `columns` keeps the schema stable even when there are no rows.
    df_output = pd.DataFrame(records, columns=output_columns)
    return df_output


In [196]:
all_word_embeddings = get_all_embeddings(sentiment_model, sentiment_model_tokenizer).to(device)

In [197]:
from openai import OpenAI
# from google.colab import userdata

# client = OpenAI(api_key=userdata.get("API_KEY"))
client = OpenAI()

Test the accuracy of the model.

In [198]:
# Compare the thresholded class-1 probability with the gold label of every input row.
correct = 0
num_rows = len(df_input)
progress_every = 100  # report progress occasionally instead of printing one line per row

for i in range(num_rows):
    row = df_input.iloc[i]

    if dataset == "sst_2":
        original_text, original_label = row["original_text"], row["original_label"]
    elif dataset == "qnli":
        original_question, original_answer, original_label = row["original_question"], row["original_answer"], row["original_label"]
        original_text = f"{original_question} [SEP] {original_answer}"

    score = calculate_score(original_text, sentiment_model_tokenizer, dataset, device)
    y_hat = 1 if score >= 0.5 else 0
    if y_hat == original_label:
        correct += 1

    if (i + 1) % progress_every == 0 or i + 1 == num_rows:
        print(f"scored {i + 1}/{num_rows}")

# An empty input frame has no defined accuracy; avoid dividing by zero.
accuracy = correct / num_rows if num_rows else float("nan")
print(f"accuracy: {accuracy}")

i: 0
i: 1
i: 2
i: 3
i: 4
i: 5
i: 6
i: 7
i: 8
i: 9
i: 10
i: 11
i: 12
i: 13
i: 14
i: 15
i: 16
i: 17
i: 18
i: 19
i: 20
i: 21
i: 22
i: 23
i: 24
i: 25
i: 26
i: 27
i: 28
i: 29
i: 30
i: 31
i: 32
i: 33
i: 34
i: 35
i: 36
i: 37
i: 38
i: 39
i: 40
i: 41
i: 42
i: 43
i: 44
i: 45
i: 46
i: 47
i: 48
i: 49
i: 50
i: 51
i: 52
i: 53
i: 54
i: 55
i: 56
i: 57
i: 58
i: 59
i: 60
i: 61
i: 62
i: 63
i: 64
i: 65
i: 66
i: 67
i: 68
i: 69
i: 70
i: 71
i: 72
i: 73
i: 74
i: 75
i: 76
i: 77
i: 78
i: 79
i: 80
i: 81
i: 82
i: 83
i: 84
i: 85
i: 86
i: 87
i: 88
i: 89
i: 90
i: 91
i: 92
i: 93
i: 94
i: 95
i: 96
i: 97
i: 98
i: 99
accuracy: 0.92


## Counterfactual generator functions

In [127]:
# %cd "CLOSS"
# %cd ..
%pwd

'/Users/smcaleese/Documents/masters-thesis-code'

In [171]:
from CLOSS.closs import generate_counterfactual
import re

def generate_polyjuice_counterfactual(original_text, _, args):
    """Generate one counterfactual with the notebook-level Polyjuice instance `pj`.

    Args:
        original_text: Text to perturb.
        _: Unused; present so all generators share one call signature.
        args: Unused by this generator.

    Returns:
        The first perturbation returned by Polyjuice, or `original_text`
        unchanged when no perturbation was produced.
    """
    # QNLI inputs are perturbed without a control code; other datasets use negation.
    ctrl_code = None if dataset == "qnli" else "negation"
    perturbations = pj.perturb(
        orig_sent=original_text,
        ctrl_code=ctrl_code,
        num_perturbations=1,
        perplex_thred=None
    )
    if not perturbations:
        # Nothing came back: fall back to the input so the evaluation loop can
        # record a non-flip instead of failing on an empty result.
        print("Polyjuice returned no perturbations; keeping the original text.")
        return original_text
    return perturbations[0]

def generate_closs_counterfactual(original_text, calculate_score, args):
    """Adapter that runs CLOSS's `generate_counterfactual` with the notebook's models.

    The classifier, masked language model, tokenizer, embedding matrix and
    device are taken from notebook-level variables; the text, scoring function
    and `args` are passed through from the caller.

    Returns:
        Whatever `generate_counterfactual` returns for `original_text`.
    """
    return generate_counterfactual(
        original_text,
        sentiment_model,
        LM_model,
        calculate_score,
        sentiment_model_tokenizer,
        all_word_embeddings,
        device,
        args,
    )

def call_openai_api(system_prompt, model):
    """Send `system_prompt` as a single system message and return the reply text.

    Uses the notebook-level OpenAI `client` with fixed sampling settings
    (top_p=1, temperature=0.4, frequency_penalty=1.1).

    Args:
        system_prompt: Full prompt text.
        model: Name of the chat model to query.

    Returns:
        The message content of the first returned choice.
    """
    sampling_options = {
        "top_p": 1,
        "temperature": 0.4,
        "frequency_penalty": 1.1,
    }
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "system", "content": system_prompt}],
        **sampling_options,
    )
    return response.choices[0].message.content

def extract_new_tag_text(output):
    """Return the text enclosed in <new>...</new>, or None when there is none.

    None is returned for an empty/None `output`, for output without a complete
    tag pair, and for a tag pair that encloses only whitespace. The match may
    span several lines.
    """
    if not output:
        return None
    match = re.search("<new>(.*?)</new>", output, flags=re.DOTALL)
    if match is None:
        return None
    return match.group(1).strip() or None


def get_fizle_labels(original_score):
    """Map a class-1 probability to (predicted label, opposite label) via `id2label`."""
    original_id = 1 if original_score >= 0.5 else 0
    return id2label[original_id], id2label[1 - original_id]


def request_tagged_counterfactual(system_prompt, model, fallback_text, max_attempts=10):
    """Query the LLM until it returns text wrapped in <new> tags.

    Args:
        system_prompt: Prompt sent on every attempt.
        model: Name of the chat model to query.
        fallback_text: Returned when no usable text was produced at all.
        max_attempts: Maximum number of API calls.

    Returns:
        The tagged text from the first well-formed reply. If every attempt is
        malformed, the last reply with any stray tag markers removed; if that is
        empty too, `fallback_text`.
    """
    output = None
    for attempt in range(max_attempts):
        print(f"attempt: {attempt + 1}")
        output = call_openai_api(system_prompt, model)
        counterfactual_text = extract_new_tag_text(output)
        if counterfactual_text:
            return counterfactual_text

    if not output:
        print("No counterfactual generated.")
        return fallback_text

    print("Failed to generate counterfactual surrounded by <new> tags")
    # Use the raw reply, removing tag markers only where they actually occur
    # (a fixed-width slice would cut real text when the tags are missing).
    untagged = output.replace("<new>", "").replace("</new>", "").strip()
    return untagged or fallback_text


def generate_naive_fizle_counterfactual(original_text, _, args):
    """Generate a counterfactual with a single FIZLE prompt.

    Args:
        original_text: Text whose predicted label should be flipped.
        _: Unused; present so all generators share one call signature.
        args: Dict with "original_score" (class-1 probability of the original
            text) and "model" (name of the chat model).

    Returns:
        The counterfactual text, or `original_text` when the LLM produced
        nothing usable.
    """
    original_score, model = args["original_score"], args["model"]
    original_label, cf_label = get_fizle_labels(original_score)

    system_prompt = f"""In the task of {fizle_task}, a trained black-box classifier correctly predicted the label '{original_label}' for the following text. Generate a counterfactual explanation by making minimal changes to the input text, so that the label changes from '{original_label}' to '{cf_label}'. Use the following definition of 'counterfactual explanation': "A counterfactual explanation reveals what should have been different in an instance to observe a diverse outcome." Enclose the generated text within <new> tags.
    -
    Text: {original_text}"""

    return request_tagged_counterfactual(system_prompt, model, original_text)


def generate_guided_fizle_counterfactual(original_text, _, args):
    """Generate a counterfactual with the two-step (guided) FIZLE prompt.

    Step 1 asks the LLM which words caused the predicted label. Step 2 appends
    that answer to the prompt and asks for a counterfactual that changes only a
    minimal set of those words.

    Args:
        original_text: Text whose predicted label should be flipped.
        _: Unused; present so all generators share one call signature.
        args: Dict with "original_score" (class-1 probability of the original
            text) and "model" (name of the chat model).

    Returns:
        The counterfactual text, or `original_text` when the LLM produced
        nothing usable.
    """
    original_score, model = args["original_score"], args["model"]
    original_label, cf_label = get_fizle_labels(original_score)

    # 1. Find important words
    step1_system_prompt = " ".join([
        f"In the task of {fizle_task}, a trained black-box classifier correctly predicted the label '{original_label}' for the following text.",
        f"Explain why the model predicted the '{original_label}' label by identifying the words in the input that caused the label. List ONLY the words as a comma separated list.",
        f"\n-\nText: {original_text}",
        f"\nImportant words identified: "
    ])
    # An empty reply must not break the prompt concatenation below.
    important_words = call_openai_api(step1_system_prompt, model) or ""

    # 2. Generate the final counterfactual (the prompt is identical on every attempt)
    step2_system_prompt = " ".join([
        f"Generate a counterfactual explanation for the original text by ONLY changing a minimal set of the words you identified, so that the label changes from '{original_label}' to '{cf_label}'.",
        f"Use the following definition of 'counterfactual explanation': 'A counterfactual explanation reveals what should have been different in an instance to observe a diverse outcome.'",
        f"Enclose the generated text within <new> tags."
    ])
    final_system_prompt = step1_system_prompt + important_words + "\n" + step2_system_prompt

    return request_tagged_counterfactual(final_system_prompt, model, original_text)


In [206]:
# def generate_naive_fizle_counterfactual(original_text, calculate_score, args):

# original_text = "what act sets forth the functions of the scottish parliament? [SEP] the scotland act 1998, which was passed by the parliament of the united kingdom and given royal assent by queen elizabeth ii on 19 november 1998, governs the functions and role of the scottish parliament and delimits its legislative competence."
# original_text = "who was the mayor of san francisco during super bowl 50? [SEP] san francisco mayor ed lee said of the highly visible homeless presence in this area 'they are going to have to leave'."
# original_text = "how much of jacksonville is made up of water? [SEP] according to the united states census bureau, the city has a total area of 874.3 square miles (2,264 km2), making jacksonville the largest city in land area in the contiguous united states; of this, 86.66% (757.7 sq mi or 1,962 km2) is land and; 13.34% (116.7 sq mi or 302 km2) is water."

original_text = "I really liked the movie."

args = {"original_score": 1, "model": "gpt-4-turbo"}
# counterfactual_text = generate_naive_fizle_counterfactual(original_text, calculate_score, args)
counterfactual_text = generate_guided_fizle_counterfactual(original_text, calculate_score, args)

print(f"original_text: {original_text}")
print()
original_score = calculate_score(original_text, sentiment_model_tokenizer, dataset, device)
print(f"original_score: {original_score}")
print()

print(f"counterfactual_text: {counterfactual_text}")
counterfactual_score = calculate_score(counterfactual_text, sentiment_model_tokenizer, dataset, device)
print(f"counterfactual_score: {counterfactual_score}")

final_system_prompt: In the task of sentiment analysis on the SST-2 dataset, a trained black-box classifier correctly predicted the label 'POSITIVE' for the following text. Explain why the model predicted the 'POSITIVE' label by identifying the words in the input that caused the label. List ONLY the words as a comma separated list. 
-
Text: I really liked the movie. 
Important words identified: liked, really
Generate a counterfactual explanation for the original text by ONLY changing a minimal set of the words you identified, so that the label changes from 'POSITIVE' to 'NEGATIVE'. Use the following definition of 'counterfactual explanation': 'A counterfactual explanation reveals what should have been different in an instance to observe a diverse outcome.' Enclose the generated text within <new> tags.

step2_output: <new>I really disliked the movie.</new>
original_text: I really liked the movie.

original_score: 0.99933260679245

counterfactual_text: I really disliked the movie.
counte

In [51]:
# def calculate_score(text, sentiment_model_tokenizer, dataset, device):

original_output = calculate_score(original_text, sentiment_model_tokenizer, "qnli", device)
counterfactual_output = calculate_score(counterfactual_text, sentiment_model_tokenizer, "qnli", device)

print(f"original_output: {original_output}, counterfactual_output: {counterfactual_output}")

original_output: 0.9770978689193726, counterfactual_output: 0.004270407371222973


In [33]:
# s = "What came into force after the new constitution was herald? [SEP] As of that day, the new constitution heralding the Second Republic came into force."
# s = "What is the minimum required if you want to teach in Canada? [SEP] Teaching in Canada requires a post-secondary degree Bachelor's Degree."
# s = "I really hated the movie."
# s = "I really loved the movie and thought it was one of the best I've ever seen."

# s = "what came into force after the new constitution was herald? [SEP] as of that day, the new constitution heralding the second republic came into force."
s = "what came into force after the new constitution was heralded? [SEP] as of that day, the new constitution heralding the second republic came into force."

args = {
    "beam_width": 15,
    "w": 5,
    "K": 30,
    "tree_depth": 0.15,
    "substitution_evaluation_method": "hotflip_only",
    "substitution_gen_method": "hotflip_only",
    "dataset": dataset
}

# args = {
#     "beam_width": 15,
#     "w": 5,
#     "K": 30,
#     "tree_depth": 0.5,
#     "substitution_evaluation_method": "SVs",
#     "substitution_gen_method": "no_opt_lmh",
#     "dataset": dataset
# }

original_score = calculate_score(s, sentiment_model_tokenizer, dataset, device)
print(f"original_score: {original_score}")

counterfactual = generate_closs_counterfactual(s, calculate_score, args)
print(f"counterfactual: {counterfactual}")

counterfactual_score = calculate_score(counterfactual, sentiment_model_tokenizer, dataset, device)
print(f"counterfactual_score: {counterfactual_score}")

original_score: 0.005338747054338455
Final eval prob pos: 0.005338747054338455
33 33
Old tokens           :  [CLS] what came into force after the new constitution was heralded ? [SEP] as of that day , the new constitution heralding the second republic came into force . [SEP] [SEP]
New tokens           :  [CLS] what came into force after the new constitution was heralded   ? [SEP] as of that day , the new constitution heralding   the second republic came into force . [SEP] [SEP]
Best prob gain       : 0.0
Fraction toks same   : 1.0
counterfactual: what came into force after the new constitution was heralded? [SEP] as of that day, the new constitution heralding the second republic came into force. [SEP]
counterfactual_score: 0.005338747054338455


## Run CLOSS and HotFlip

First run the method without optimization (`CLOSS-EO`) and without retraining the language modeling head.

- `CLOSS-EO:` skip optimizing the embedding. This increases failures but lowers perplexity.
- `CLOSS-RTL:` skip retraining the language modeling head. This has no effect on perplexity but increases the failure rate.

Move to the main parent directory.

In [17]:
# %cd "CLOSS"
# %cd ..
%pwd

'/Users/smcaleese/Documents/masters-thesis-code'

In [18]:
df_input.head()

Unnamed: 0,original_text,original_label
0,it's a charming and often affecting journey.,1
1,unflinchingly bleak and desperate,0
2,allows us to hope that nolan is poised to emba...,1
3,"the acting, costumes, music, cinematography an...",1
4,"it's slow -- very, very slow.",0


1. Run HotFlip:

In [59]:
args = {
    "beam_width": 15,
    "w": 5,
    "K": 30,
    "tree_depth": 0.15,
    "substitution_evaluation_method": "hotflip_only",
    "substitution_gen_method": "hotflip_only",
    "dataset": dataset
}

df_output = get_output(df_input, generate_closs_counterfactual, args)

Processing input 1/100: num tokens: 7
Final eval prob pos: 0.9997661709785461
12 12
Old tokens           :  [CLS] it ' s a charming and often affecting journey . [SEP]
New tokens           :  [CLS] it ' s a charming and often affecting journey . [SEP]
Best prob gain       : 0.0
Fraction toks same   : 1.0

original_text:       it's a charming and often affecting journey. 
counterfactual_text: it's a charming and often affecting journey.

Processing input 2/100: num tokens: 4
Final eval prob pos: 0.014329418540000916
10 10
Old tokens           :  [CLS] unflinchingly bleak and desperate [SEP]
New tokens           :  [CLS] unfl  in  ching  ly   bleak and desperate [SEP]
Best prob gain       : 0.0
Fraction toks same   : 1.0

original_text:       unflinchingly bleak and desperate 
counterfactual_text: unflinchingly bleak and desperate

Processing input 3/100: num tokens: 19
CF FOUND!!!!!!!!!!!!!!
Final eval prob pos: 0.9995738863945007
23 24
Old tokens           :  [CLS] allows us to hope th

KeyboardInterrupt: 

In [None]:
df_output.head()

In [None]:
df_output.to_csv(f"./output/hotflip-output-{dataset}.csv", index=False)

2. Run CLOSS without optimization and without retraining the language modeling head:

In [19]:
args = {
    "beam_width": 15,
    "w": 5,
    "K": 30,
    "tree_depth": 0.15,
    "substitution_evaluation_method": "SVs",
    "substitution_gen_method": "no_opt_lmh",
    "dataset": dataset
}

df_output = get_output(df_input, generate_closs_counterfactual, args)

Processing input 1/100: num tokens: 7
grad loc importances:
 [CLS] it ' [31ms[0m [31ma[0m [32mcharming[0m and [31moften[0m [33maffecting[0m [34mjourney[0m . [SEP]

total SVs   = -0.001172225135848972
Top scoring substitutions by Shapley value:
[5, 'dangerous', 0.0009207024293787339]
[5, 'one', 0.00010360503683284837]
[5, 'long', 5.12380989230409e-05]
[5, 'his', 4.968369329297864e-05]
[5, 'family', 4.900350409038996e-05]
[5, 'a', 4.527763444550184e-05]
[5, 'the', 4.488169136693922e-05]
[5, 'far', 1.190718957933329e-05]
Final eval prob pos: 0.9997661709785461
11 12
Old tokens           :  [CLS] it ' s a [31mcharming [0m and often affecting journey . [SEP]
New tokens           :  [CLS] it ' s a [31mdangerous[0m and often affecting journey . [SEP]
Best prob gain       : 0.0
Fraction toks same   : 0.917

original_text:       it's a charming and often affecting journey. 
counterfactual_text: it's a dangerous and often affecting journey.

Processing input 2/100: num tokens: 4


KeyboardInterrupt: 

In [None]:
df_output.head()

In [None]:
df_output.to_csv(f"./output/closs-output-{dataset}.csv", index=False)

## Run Polyjuice

### Setup

In [None]:
%cd polyjuice
%pwd

In [None]:
!python -m spacy download en_core_web_sm

In [None]:
%pip install -e .

Make sure the model is being imported properly.

In [None]:
import importlib
import polyjuice

# Re-execute the already-imported module, then print the file it was loaded from
# so the import location can be checked by eye.
importlib.reload(polyjuice)
print(polyjuice.__file__)

In [None]:
from polyjuice import Polyjuice

# Build the Polyjuice wrapper from the "uw-hai/polyjuice" model path.
# NOTE(review): `is_cuda=True` presumably places the model on a GPU — confirm how
# the installed copy behaves on a machine without CUDA.
pj = Polyjuice(model_path="uw-hai/polyjuice", is_cuda=True)

In [None]:
# Quick check on a single sentence: request five "negation" perturbations.
# `perplex_thred` is not passed, so whatever default `pj.perturb` defines applies.
text = "julia is played with exasperating blandness by laura regan ."
perturbations = pj.perturb(orig_sent=text, ctrl_code="negation", num_perturbations=5)
perturbations

Run the model and get the output.

In [None]:
# Preview the first five rows of `df_input`.
df_input.head(n=5)

In [None]:
# Generate Polyjuice counterfactuals for every input; an empty args dict is passed.
df_output = get_output(
    df_input,
    generate_polyjuice_counterfactual,
    dict(),
)

In [None]:
# Preview the first ten rows of `df_output`.
df_output.head(n=10)

In [None]:
# Move up one directory and show the resulting path; the save cell that follows
# writes to the relative path `./output/`.
# NOTE(review): this relative `%cd` is only correct when the current directory is
# the one entered by the earlier `%cd polyjuice` — re-running it moves up again.
%cd ..
%pwd

In [None]:
import os

# Create the output directory first: `to_csv` raises an OSError when the parent
# directory does not exist, and the save would fail after the run has finished.
os.makedirs("./output", exist_ok=True)
df_output.to_csv(f"./output/polyjuice-output-{dataset}.csv", index=False)

## FIZLE

Two variants:
* Naive: Uses a single prompt.
* Guided: Uses two prompts. The first prompt identifies important words and the second prompt generates the counterfactual.

Hyperparameters:

For all LLMs, we use top_p sampling with p = 1, temperature t = 0.4 and a repetition penalty of 1.1.


### 1. FIZLE naive

In [199]:
# Preview the first five rows of `df_input`.
df_input.head(n=5)

Unnamed: 0,original_text,original_label
0,it's a charming and often affecting journey.,1
1,unflinchingly bleak and desperate,0
2,allows us to hope that nolan is poised to emba...,1
3,"the acting, costumes, music, cinematography an...",1
4,"it's slow -- very, very slow.",0


In [200]:
# Run the naive FIZLE generator over every input with "gpt-4-turbo" as the model.
args = dict(model="gpt-4-turbo")
df_output = get_output(
    df_input,
    generate_naive_fizle_counterfactual,
    args,
)

Processing input 1/100: num tokens: 7
system_prompt: In the task of sentiment analysis on the SST-2 dataset, a trained black-box classifier correctly predicted the label 'POSITIVE' for the following text. Generate a counterfactual explanation by making minimal changes to the input text, so that the label changes from 'POSITIVE' to 'NEGATIVE'. Use the following definition of 'counterfactual explanation': "A counterfactual explanation reveals what should have been different in an instance to observe a diverse outcome." Enclose the generated text within <new> tags.
    -
    Text: it's a charming and often affecting journey. 
attempt: 1

original_text:       it's a charming and often affecting journey. 
counterfactual_text: it's a disappointing and often frustrating journey.

Processing input 2/100: num tokens: 4
system_prompt: In the task of sentiment analysis on the SST-2 dataset, a trained black-box classifier correctly predicted the label 'NEGATIVE' for the following text. Generate a 

In [106]:
# Preview the first five rows of `df_output`.
df_output.head(n=5)

Unnamed: 0,original_text,original_score,original_perplexity,counterfactual_text,counterfactual_score,counterfactual_perplexity,found_flip,levenshtein_similarity_score
0,what came into force after the new constitutio...,0.005043,103.854469,what came into force before the new constituti...,0.005943,104.258759,False,0.966667
1,what is the first major city in the stream of ...,0.567084,119.536102,what is the first major city in the stream of ...,0.003544,76.490227,True,0.494845
2,what is the minimum required if you want to te...,0.962545,36.186142,what is the minimum required if you want to te...,0.897881,35.786514,False,0.811828
3,how was temüjin kept imprisoned by the tayichi...,0.013612,76.901253,how was temüjin kept imprisoned by the tayichi...,0.069142,107.826492,False,0.357724
4,"what did herr gott, dich loben wir become know...",0.996422,89.606651,"what did herr gott, dich loben wir become know...",0.125517,74.550781,True,0.878613


In [107]:
import os

# Create the output directory first: `to_csv` raises an OSError when the parent
# directory does not exist, and the save would fail after the run has finished.
os.makedirs("./output", exist_ok=True)
# NOTE(review): the `-new-2` suffix differs from the other methods' output file
# names — confirm that whatever reads these results expects this name.
df_output.to_csv(f"./output/fizlenaive-output-{dataset}-new-2.csv", index=False)

### 2. FIZLE guided

In [None]:
# Preview the first five rows of `df_input`.
df_input.head(n=5)

In [None]:
# Run a FIZLE generator over every input with "gpt-4-turbo" as the model.
# NOTE(review): this cell sits under the "FIZLE guided" heading and its results are
# saved to `fizleguided-output-...`, yet it calls `generate_naive_fizle_counterfactual`
# — the same generator as the naive section. This looks like a copy-paste slip;
# confirm and switch to the guided generator before trusting the guided results.
args = {"model": "gpt-4-turbo"}
df_output = get_output(df_input, generate_naive_fizle_counterfactual, args)

In [None]:
# Preview the first five rows of `df_output`.
df_output.head(n=5)

In [None]:
import os

# Create the output directory first: `to_csv` raises an OSError when the parent
# directory does not exist, and the save would fail after the run has finished.
os.makedirs("./output", exist_ok=True)
df_output.to_csv(f"./output/fizleguided-output-{dataset}.csv", index=False)