In [43]:
import pandas as pd
import numpy as np
from openai import OpenAI
from dotenv import load_dotenv # pip install python-dotenv
load_dotenv()   # Set API KEY values from .env file

from utils.analysis import * 
from utils.experiments import *


In [44]:
# Read the cleaned statements file, then carve out independent train/test frames
# according to the value stored in the "split" column.
dataset = pd.read_csv("data/ipcc_statements_dataset_original_cleaned_context.csv")
train_set = dataset.loc[dataset["split"].eq("train")].copy()
test_set = dataset.loc[dataset["split"].eq("test")].copy()


In [45]:
# Preview the first rows of the test split (rows whose "split" column is "test").
test_set.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,statement_idx,report,page_num,sent_num,original_statement,final_statement,confidence,score,split,context,has_confidence_in_final_statement
3,3,0,3,AR6_WGI,24,2,"Since 1750, increases in CO 2 (47%) and CH 4 (...","Since 1750, increases in CO2 (47%) and CH4 (15...",very high,3,test,Estimates account for both direct emissions in...,False
42,42,1,42,AR6_WGI,37,16,"Over the next 2000 years, global mean sea leve...","Over the next 2000 years, global mean sea leve...",low,0,test,Continued ice loss over the 21st century is vi...,False
77,77,2,77,AR6_WGI,47,7,"By the end of the century, scenarios with very...","By the end of the century, scenarios with very...",high,2,test,"{6.6, 6.7, Box TS.7}. D.2.3 Scenarios with ver...",False
81,81,3,81,AR6_WGI,62,2,"Over the past millennium, and especially since...","Over the past millennium, and especially since...",medium,1,test,"{1.3.2, 1.5.1, Cross-Chapter Boxes 2.1 and 2.4...",False
86,86,4,86,AR6_WGI,63,8,This paleo context supports the assessment tha...,The paleo context supports the assessment that...,high,2,test,"For example, under a past global warming level...",False


In [46]:
# Module-level client that run_experiment hands to query_openai_model.
# Built with no explicit arguments, so credentials presumably come from the
# environment populated by load_dotenv() in the import cell — confirm.
client = OpenAI()

In [47]:
# Prompt the model and get raw response, normalized response, logprobs
def query_openai_model(client, statement, prompt_fn, model, temperature=0, top_logprobs=5):
    """Send one statement to a chat model and collect its answer plus first-token logprobs.

    Parameters
    ----------
    client : object exposing ``chat.completions.create`` (in this notebook, an ``OpenAI`` instance).
    statement : value passed to ``prompt_fn`` to build the single user message.
    prompt_fn : callable returning the prompt text for ``statement``.
    model : model identifier forwarded to the API.
    temperature : sampling temperature forwarded to the API (default 0, as before).
    top_logprobs : number of alternatives requested per generated token (default 5, as before).

    Returns
    -------
    pd.Series with three positional entries:
        0. the raw message content of the first choice,
        1. ``extract_confidence`` applied to that raw content,
        2. a list of ``(token, logprob, probability_percent)`` tuples for the alternatives
           at the FIRST generated token, with the probability rounded to 2 decimals.
           The list is empty when the response carries no token logprobs.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt_fn(statement)}],
        temperature=temperature,
        logprobs=True,
        top_logprobs=top_logprobs,
    )
    choice = response.choices[0]
    raw_output = choice.message.content
    cleaned_output = extract_confidence(raw_output)

    # The logprobs payload (or its per-token list) can be absent or empty; indexing it
    # blindly would raise and abort a whole DataFrame.apply run, so fall back to [].
    token_logprobs = getattr(choice.logprobs, "content", None)
    first_token_alternatives = token_logprobs[0].top_logprobs if token_logprobs else []

    # Only the first generated token is inspected; exp(logprob) * 100 turns it into a percentage.
    logprobs = [
        (alt.token, alt.logprob, np.round(np.exp(alt.logprob) * 100, 2))
        for alt in first_token_alternatives
    ]
    return pd.Series([raw_output, cleaned_output, logprobs])

In [48]:
# Run the prompting experiment n_sc times (repeated runs are used for self-consistency).
# Result columns are written straight onto the dataset that is passed in.
def run_experiment(dataset, model, n_sc=5, prompt_fn=get_zero_shot_prompt):
    """Query ``model`` on every ``final_statement`` of ``dataset``, ``n_sc`` times over.

    For run number ``k`` (1-based) three columns are added to ``dataset``:
    ``raw_output_k``, ``model_confidence_classification_k`` and ``model_logprobsk``,
    filled from the three values returned by ``query_openai_model``.

    The frame is modified in place and also returned. Requests go through the
    module-level ``client``.
    """
    def ask(statement):
        return query_openai_model(client, statement, prompt_fn, model)

    for run_offset in range(n_sc):
        run_number = run_offset + 1
        # Note: the logprobs column name has no underscore before the run number.
        target_columns = [
            f"raw_output_{run_number}",
            f"model_confidence_classification_{run_number}",
            f"model_logprobs{run_number}",
        ]
        dataset[target_columns] = dataset["final_statement"].apply(ask)
    return dataset
                                                                                                                   

In [52]:
# Zero-shot experiment on the test split: 5 runs per statement, results saved to CSV.
model = "gpt-4-turbo"
results_df = test_set.copy()  # work on a copy: run_experiment adds columns to the frame it receives
results_df = run_experiment(results_df, model, 5, get_zero_shot_prompt)
# "/" in a model id is replaced so it is not read as a directory separator in the file name.
# The replacement field uses single quotes: reusing the f-string's own double quotes
# is a SyntaxError on Python versions before 3.12.
# NOTE(review): the frame's index is written as an extra unnamed column; that looks like the
# origin of the "Unnamed: 0*" columns in the dataset — confirm before changing the file format.
results_df.to_csv(f"results/cleaned_dataset/{model.replace('/', '-')}_zero_shot_06052024.csv")

In [53]:
# Inspect the first rows of the results, including the per-run columns added by run_experiment.
results_df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,statement_idx,report,page_num,sent_num,original_statement,final_statement,confidence,score,...,model_logprobs2,raw_output_3,model_confidence_classification_3,model_logprobs3,raw_output_4,model_confidence_classification_4,model_logprobs4,raw_output_5,model_confidence_classification_5,model_logprobs5
3,3,0,3,AR6_WGI,24,2,"Since 1750, increases in CO 2 (47%) and CH 4 (...","Since 1750, increases in CO2 (47%) and CH4 (15...",very high,3,...,"[(very, -0.00092325005, 99.91), (high, -7.0062...",very high,very high,"[(very, -0.014158106, 98.59), (high, -4.265745...",very high,very high,"[(very, -0.0020730526, 99.79), (Very, -6.70535...",very high,very high,"[(very, -0.00092325005, 99.91), (high, -7.0062..."
42,42,1,42,AR6_WGI,37,16,"Over the next 2000 years, global mean sea leve...","Over the next 2000 years, global mean sea leve...",low,0,...,"[(high, -0.022998184, 97.73), (medium, -3.7908...",high,high,"[(high, -0.037428018, 96.33), (medium, -3.3048...",high,high,"[(high, -0.001642573, 99.84), (medium, -6.5404...",high,high,"[(high, -0.037428018, 96.33), (medium, -3.3048..."
77,77,2,77,AR6_WGI,47,7,"By the end of the century, scenarios with very...","By the end of the century, scenarios with very...",high,2,...,"[(very, -0.2981072, 74.22), (high, -1.3560905,...",very high,very high,"[(very, -0.31110346, 73.26), (high, -1.31945, ...",very high,very high,"[(very, -0.31110346, 73.26), (high, -1.31945, ...",very high,very high,"[(very, -0.31110346, 73.26), (high, -1.31945, ..."
81,81,3,81,AR6_WGI,62,2,"Over the past millennium, and especially since...","Over the past millennium, and especially since...",medium,1,...,"[(high, -0.010342409, 98.97), (medium, -4.5818...",high,high,"[(high, -0.011819648, 98.82), (medium, -4.4461...",high,high,"[(high, -0.0013770182, 99.86), (medium, -6.597...",high,high,"[(high, -0.011819648, 98.82), (medium, -4.4461..."
86,86,4,86,AR6_WGI,63,8,This paleo context supports the assessment tha...,The paleo context supports the assessment that...,high,2,...,"[(very, -0.6027303, 54.73), (high, -0.7925763,...",very high,very high,"[(very, -0.6177923, 53.91), (high, -0.77483195...",very high,very high,"[(very, -0.6027303, 54.73), (high, -0.7925763,...",high,high,"[(high, -0.2503518, 77.85), (very, -1.5075539,..."
