In [1]:
import pandas as pd
import numpy as np
import os
from together import Together
from dotenv import load_dotenv # pip install python-dotenv
from utils.experiments import *
from utils.analysis import *
load_dotenv()   # Set API KEY values from .env file


True

# Notebook with example of how to run an experiment with a together.ai model

Prerequisites: 

- Make a Together.ai account (https://www.together.ai/), get a Together API key, and put it in a .env file in the root directory of this repo. 
    - in your .env file: TOGETHER_API_KEY="your api key"
- Python requirements: 
    - pip install together python-dotenv pandas numpy scikit-learn 
- Run all the code before the "Choose Model" markdown cell

You're ready to run experiments! 
You can copy cells in the "Run Experiment and Analyze" section, change the model variable to any model string listed on https://docs.together.ai/docs/inference-models, and get running! 

Note: the results_df variable gets rewritten, so the notebook will not be happy if you run cells out of order.

In [2]:
# Load the full statements dataset, then take independent copies of the rows
# whose "split" column is "test" and "train" respectively.
dataset = pd.read_csv("data/ipcc_statements_dataset_original_cleaned_context.csv")
test_set = dataset.loc[dataset["split"].eq("test")].copy()
train_set = dataset.loc[dataset["split"].eq("train")].copy()


In [3]:
# Build the Together client from the TOGETHER_API_KEY environment variable
# (None is passed through if the variable is not set).
client = Together(api_key=os.getenv("TOGETHER_API_KEY"))

In [4]:
def query_together_model(client, statement, prompt_fn, model):
    """Send one statement to a chat model and package the reply.

    Args:
        client: Object exposing ``chat.completions.create`` (the notebook
            passes its ``Together`` client).
        statement: Value handed to ``prompt_fn`` to build the user message.
        prompt_fn: Callable that turns ``statement`` into the prompt text.
        model: Model identifier forwarded to the API call.

    Returns:
        A ``pd.Series`` with three positional entries: the raw message
        content of the first choice, the result of ``extract_confidence``
        applied to that content, and the first choice's ``logprobs`` object.
    """
    user_message = {"role": "user", "content": prompt_fn(statement)}
    request = dict(
        model=model,
        messages=[user_message],
        temperature=0,
        logprobs=1,
    )
    # Only the first returned choice is used.
    first_choice = client.chat.completions.create(**request).choices[0]
    raw_output = first_choice.message.content
    return pd.Series([raw_output, extract_confidence(raw_output), first_choice.logprobs])

In [5]:
def run_experiment(dataset, model, n_sc=5, prompt_fn=get_zero_shot_prompt):
    """Query the model ``n_sc`` times per statement (for self-consistency).

    The input ``dataset`` is modified: each run ``i`` (1-based) adds the
    columns ``raw_output_{i}``, ``model_confidence_classification_{i}`` and
    ``model_logprobs{i}`` (note: no underscore before the run number in the
    last one). Queries go through the notebook-level ``client``.

    Args:
        dataset: DataFrame with a ``final_statement`` column.
        model: Model identifier passed to ``query_together_model``.
        n_sc: Number of times to run the experiment.
        prompt_fn: Callable that builds the prompt from a statement.

    Returns:
        The same ``dataset`` object, with the result columns added.
    """
    def _ask(statement):
        return query_together_model(client, statement, prompt_fn, model)

    for run_idx in range(1, n_sc + 1):
        output_columns = [
            f"raw_output_{run_idx}",
            f"model_confidence_classification_{run_idx}",
            f"model_logprobs{run_idx}",
        ]
        dataset[output_columns] = dataset["final_statement"].apply(_ask)
    return dataset
                                                                                                                   

# Choose model - Llama 3 8B Chat

## Run Experiment and Analyze

In [9]:
# Model identifier sent to the API.
model = "meta-llama/Llama-3-8b-chat-hf"
# Work on a copy of the test split: run_experiment adds its result columns
# to the frame it is given.
results_df = test_set.copy()
results_df = run_experiment(results_df, model, 5, get_zero_shot_prompt)
# The "/" in the model id would act as a path separator, so it is replaced by "-".
# Single quotes are used inside the replacement field: reusing the f-string's own
# quote character there is a SyntaxError before Python 3.12.
results_path = f"results/cleaned_dataset/{model.replace('/', '-')}_zero_shot_06052024.csv"
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(os.path.dirname(results_path), exist_ok=True)
results_df.to_csv(results_path)

In [7]:
# Preview the first five rows of the experiment results.
results_df.head(n=5)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,statement_idx,report,page_num,sent_num,original_statement,final_statement,confidence,score,...,model_logprobs2,raw_output_3,model_confidence_classification_3,model_logprobs3,raw_output_4,model_confidence_classification_4,model_logprobs4,raw_output_5,model_confidence_classification_5,model_logprobs5
3,3,0,3,AR6_WGI,24,2,"Since 1750, increases in CO 2 (47%) and CH 4 (...","Since 1750, increases in CO2 (47%) and CH4 (15...",very high,3,...,"tokens=['very', ' high', '<|eot_id|>'] token_l...",very high,very high,"tokens=['very', ' high', '<|eot_id|>'] token_l...",very high,very high,"tokens=['very', ' high', '<|eot_id|>'] token_l...",very high,very high,"tokens=['very', ' high', '<|eot_id|>'] token_l..."
42,42,1,42,AR6_WGI,37,16,"Over the next 2000 years, global mean sea leve...","Over the next 2000 years, global mean sea leve...",low,0,...,"tokens=['very', ' high', '<|eot_id|>'] token_l...",very high,very high,"tokens=['very', ' high', '<|eot_id|>'] token_l...",very high,very high,"tokens=['very', ' high', '<|eot_id|>'] token_l...",very high,very high,"tokens=['very', ' high', '<|eot_id|>'] token_l..."
77,77,2,77,AR6_WGI,47,7,"By the end of the century, scenarios with very...","By the end of the century, scenarios with very...",high,2,...,"tokens=['High', '<|eot_id|>'] token_logprobs=[...",High,high,"tokens=['High', '<|eot_id|>'] token_logprobs=[...",High,high,"tokens=['High', '<|eot_id|>'] token_logprobs=[...",High,high,"tokens=['High', '<|eot_id|>'] token_logprobs=[..."
81,81,3,81,AR6_WGI,62,2,"Over the past millennium, and especially since...","Over the past millennium, and especially since...",medium,1,...,"tokens=['medium', '<|eot_id|>'] token_logprobs...",medium,medium,"tokens=['medium', '<|eot_id|>'] token_logprobs...",medium,medium,"tokens=['medium', '<|eot_id|>'] token_logprobs...",medium,medium,"tokens=['medium', '<|eot_id|>'] token_logprobs..."
86,86,4,86,AR6_WGI,63,8,This paleo context supports the assessment tha...,The paleo context supports the assessment that...,high,2,...,"tokens=['very', ' high', '<|eot_id|>'] token_l...",very high,very high,"tokens=['very', ' high', '<|eot_id|>'] token_l...",very high,very high,"tokens=['very', ' high', '<|eot_id|>'] token_l...",very high,very high,"tokens=['very', ' high', '<|eot_id|>'] token_l..."


In [None]:
# Load previously saved results from disk into results_df.
results_df = pd.read_csv(
    filepath_or_buffer="results/cleaned_dataset/meta-llama-Llama-3-8b-chat-hf_zero_shot_06052024.csv"
)

In [None]:
# First five rows of the current results_df.
results_df.head(n=5)

In [None]:
# Last five rows of the current results_df.
results_df.tail(n=5)

In [None]:
# Names of the per-run classification columns for runs 1 through 5.
model_classification_col_names = list(
    map("model_confidence_classification_{}".format, range(1, 6))
)
print_accuracy_slope_bias_metrics(
    results_df,
    model_classification_col_names,
    plot=False,
    verbose=False,
)