In [2]:
#Chain of Thought Prompting Experiments

In [1]:
import numpy as np
import pandas as pd
import json
import os
import openai

# The API key is read from the OPENAI_API_KEY environment variable so that it
# never has to be written into (and then deleted from) the notebook before
# pushing to GitHub. If the variable is unset the key is '' as before.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")

# Directory containing the experiment CSV files. Override it by setting the
# COT_DATA_DIR environment variable; the default is the original local path.
DATA_DIR = os.environ.get("COT_DATA_DIR", "/Users/stevenslater/Desktop/FinalProject/Data")

#Reading in data
#df_cot has the CoT prompts
df_cot = pd.read_csv(os.path.join(DATA_DIR, "cotprompts.csv"))
df_train = pd.read_csv(os.path.join(DATA_DIR, "trainwindowsimproved.csv"))
df_test = pd.read_csv(os.path.join(DATA_DIR, "testwindowsimproved.csv"))

df_cot_baseline = pd.read_csv(os.path.join(DATA_DIR, "cotbaselineprompts.csv"))

In [None]:
# Preview the first rows of the data loaded from trainwindowsimproved.csv.
df_train.head()

In [None]:
# Preview the first rows of the data loaded from cotbaselineprompts.csv.
df_cot_baseline.head()

In [4]:

def ask_gpt(question, model="gpt-4-1106-preview"):
    """Send one prompt to the chat model and return its reply with token counts.

    Parameters
    ----------
    question : str
        Full prompt text; it is sent as the single user message after a fixed
        "You are a helpful assistant." system message.
    model : str, optional
        Model name passed to ``openai.ChatCompletion.create``. Defaults to the
        model the function previously hard-coded.

    Returns
    -------
    tuple
        ``(answer, tokens_used, input_tokens)`` where ``answer`` is the reply
        text, ``tokens_used`` is the API-reported total token count and
        ``input_tokens`` is the API-reported prompt token count.

        No exception is raised on failure: the first element is then an error
        message string (starting with ``"Error: "`` or
        ``"An unexpected error occurred: "``) and both counts are 0, so callers
        that need to tell replies from failures must inspect the string.
    """
    try:
        response = openai.ChatCompletion.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": question}
            ]
        )

        answer = response['choices'][0]['message']['content']
        usage = response['usage']
        tokens_used = usage['total_tokens']

        # Use the prompt token count reported by the API. A whitespace word
        # count is not a token count and ignores the system message, which
        # skews any "total - input" output-token estimate made by callers.
        input_tokens = usage.get('prompt_tokens')
        if input_tokens is None:
            # Fall back to the rough word-count estimate if the field is absent.
            input_tokens = len(question.split())

        return answer, tokens_used, input_tokens

    except openai.error.OpenAIError as e:
        return f"Error: {str(e)}", 0, 0
    except Exception as e:
        return f"An unexpected error occurred: {str(e)}", 0, 0

In [5]:
def get_template(utterance, examples_df):
    """Assemble the few-shot chain-of-thought prompt for one conversation.

    The prompt is built from four parts, in this order: the vulnerability
    definition (which states how many examples follow), one worked example per
    row of ``examples_df``, the conversation to classify, and the final
    Yes/No instruction.

    Parameters
    ----------
    utterance : str
        Conversation text to be classified.
    examples_df : pandas.DataFrame
        Worked examples; the 'Utterance' and 'Chainofthought' columns are read
        from every row.

    Returns
    -------
    str
        The complete prompt text.
    """
    preamble = f"""Our definition of vulnerability refers to customers who, due to their personal circumstances, are especially susceptible to harm. All customers are at risk of becoming vulnerable and this risk is increased by characteristics of vulnerability related to 4 key drivers.
    Health - health conditions or illnesses that affect ability to carry out day-to-day tasks.

    Life events - life events such as bereavement, job loss or relationship breakdown.

    Resilience - low ability to withstand financial or emotional shocks.

    Capability - low knowledge of financial matters or low confidence in managing money (financial capability). Low capability in other relevant areas such as literacy, or digital skills
    
    Below is {len(examples_df)} examples of conversations with customers, where the customer is either vulnerable or not vulnerable, we have provided chain of thought reasoning to answer each example for help.
    """

    def render_example(example_row):
        # One worked example: the conversation followed by its reasoned answer.
        return "".join([
            "\nExample Conversation: ",
            example_row['Utterance'],
            "\nAnswer: ",
            example_row['Chainofthought'],
            "\n",
        ])

    examples_block = "".join(
        render_example(example_row) for _, example_row in examples_df.iterrows()
    )

    # The conversation is wrapped in double quotes with a space either side.
    target_block = '\nConversation to Classify:  "' + utterance + '" \n'

    #changed for optimal prompt in final exp.
    closing_instruction = "\n Using the provided examples and their correct reasoned answers above for help, is the customer from the above conversation vulnerable based on our definiton of vulnerability, only answer Yes or No, and you must say only Yes or No.\n"

    return preamble + examples_block + target_block + closing_instruction

In [None]:
# Preview the first rows of the CoT prompt data loaded from cotprompts.csv.
df_cot.head()

In [42]:
def _parse_yes_no(answer):
    """Map a model reply to 1 ("yes"), 0 ("no") or None (anything else)."""
    # Strip whitespace, quotes and trailing punctuation together so replies
    # such as 'Yes.\n' or '"No"' are still recognised.
    normalized = answer.strip(' \t\r\n.!"\'').lower()
    if normalized == "yes":
        return 1
    if normalized == "no":
        return 0
    return None


def query_api(df, num_examples):
    """Classify every conversation in ``df`` with a few-shot CoT prompt.

    For each row a fresh random set of ``num_examples`` chain-of-thought
    examples is drawn from ``df_cot`` (rows with ``Label == 0`` and
    ``IsFromGuidelines == 0``), a prompt is built with ``get_template`` and
    sent through ``ask_gpt``, and the reply is converted to a binary label.
    An estimated total cost is printed at the end.

    Parameters
    ----------
    df : pandas.DataFrame
        Conversations to classify; the 'conversation_chunks' and 'labels'
        columns are read from every row.
    num_examples : int
        Number of chain-of-thought examples to include in each prompt.

    Returns
    -------
    pandas.DataFrame
        One row per input row with columns 'Data', 'Actual Label' and
        'ChatGPT Label' (1 when the reply is "yes", otherwise 0).

    Notes
    -----
    Replies that are neither "yes" nor "no" (which includes the error strings
    ``ask_gpt`` returns on failure) are still recorded as 0, but they are
    counted and reported so they are not silently mistaken for "No" answers.
    Output tokens are derived as ``tokens_used - input_tokens``, so the cost
    estimate is only as accurate as the counts ``ask_gpt`` reports.
    """
    # The example pool does not depend on the row, so build it once.
    # NOTE(review): an earlier comment describes IsFromGuidelines == 0 as the
    # "misclassified" examples and Label == 0 as not vulnerable - confirm.
    example_pool = df_cot[(df_cot['Label'] == 0) & (df_cot['IsFromGuidelines'] == 0)]

    results = []
    total_input_tokens = 0
    total_output_tokens = 0
    unparsed_count = 0

    for _, row in df.iterrows():
        utterance = row['conversation_chunks']

        # sample() already returns the rows in random order, so no separate
        # shuffle is needed.
        cot_examples = example_pool.sample(n=num_examples).reset_index(drop=True)

        prompt = get_template(utterance, cot_examples)

        #query GPT
        answer, tokens_used, input_tokens = ask_gpt(prompt)
        total_input_tokens += input_tokens
        total_output_tokens += tokens_used - input_tokens

        answer_binary = _parse_yes_no(answer)
        if answer_binary is None:
            # Keep the historical behaviour (anything but "yes" is 0) but
            # remember that this reply was not a usable answer.
            unparsed_count += 1
            answer_binary = 0

        results.append({
            'Data': utterance,
            'Actual Label': row['labels'],
            'ChatGPT Label': answer_binary
        })

    #monitoring cost
    cost_per_1k_input_tokens = 0.01
    cost_per_1k_output_tokens = 0.03
    total_cost = (total_input_tokens / 1000 * cost_per_1k_input_tokens) + \
                 (total_output_tokens / 1000 * cost_per_1k_output_tokens)

    print(f"Total Cost: {total_cost}")
    if unparsed_count:
        print(f"Warning: {unparsed_count} response(s) were neither Yes nor No "
              f"(this includes API errors) and were recorded as 0.")

    # Convert the results list into a DataFrame
    return pd.DataFrame(results)


In [None]:

# Evaluate on the whole of df_test (no subsampling is applied here).
sampled_df = df_test
# Run with 12 CoT examples on the test set - this was repeated for many different experiments with the CoT strategies
results_12_misclassified_neg = query_api(sampled_df,12)
# Display the resulting DataFrame as the cell output.
results_12_misclassified_neg

In [48]:
# Save the results to a CSV file in the current working directory.
# to_csv is called with its defaults, so the DataFrame index is written as well.
results_12_misclassified_neg.to_csv('results_12_misclassified_neg.csv')

In [None]:
#METRICS
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Actual and predicted labels from the results table, cast to int.
actual_labels = results_12_misclassified_neg['Actual Label'].astype(int)
predicted_labels = results_12_misclassified_neg['ChatGPT Label'].astype(int)

# Every scorer is called as scorer(actual, predicted), in the listed order.
accuracy, precision, recall, f1 = (
    scorer(actual_labels, predicted_labels)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

confusion = confusion_matrix(actual_labels, predicted_labels)

# Report the scalar metrics first, then the confusion matrix.
for _metric_name, _metric_value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1),
):
    print(_metric_name, _metric_value)
print("Confusion Matrix:")
print(confusion)