# Causal Reasoning In Large Language Models: CLadder


#### 1. Data Preparation


In [1]:
import pandas as pd
import json

dataset_path = "../data/cladder/cladder-v1-q-commonsense.json"
with open(dataset_path, "r") as f:
    data = json.load(f)
    
df = pd.DataFrame(data)
df.rename(columns={'given_info': 'info'}, inplace=True)

df = df[df['meta'].apply(lambda x: x.get('query_type') != 'backadj')].reset_index(drop=True)


In [None]:
df.iloc[0]['question']


In [None]:
df.head(2)

In [None]:
df.iloc[100]['meta']

In [None]:
index = 6330

print('Info: ', df.iloc[index]['info'])
print('Question: ', df.iloc[index]['question'])
print('Answer: ', df.iloc[index]['answer'])
print('Graph ID: ', df.iloc[index]['meta']['graph_id'])
print('Query type: ', df.iloc[index]['meta']['query_type'])
print('Rung: ', df.iloc[index]['meta']['rung'])
print('Formal form: ', df.iloc[index]['meta']['formal_form'])
print('Reasoning: ' , df.iloc[index]['reasoning'])

In [None]:
df['meta'][3]

In [2]:
df_new = df.copy()
meta_df = df_new['meta'].apply(pd.Series)
meta_df
df_new = pd.concat([df_new, meta_df], axis = 1)
df_new = df_new.drop('meta', axis = 1)
df_new.rename(columns={'given_info': 'given_info_meta', 'given_info': 'given_info'}, inplace=True)

In [None]:
df_new['query_type'].unique()

In [None]:
df_new.columns

In [3]:
df_sampled = df_new.sample(n = 1000, random_state=25)
print(len(df_sampled))

1000


In [None]:
print(df_sampled.head(10))

In [4]:
column_names = ['answer', 'query_type', 'answer', 'graph_id', 'rung', 'query_type', 'story_id', 'polarity']

for column_name in column_names:
    print(df_sampled[column_name].value_counts())
    print(df_new[column_name]. value_counts())
    print('----------------------------------')

answer
no     504
yes    496
Name: count, dtype: int64
answer
yes    4345
no     4345
Name: count, dtype: int64
----------------------------------
query_type
marginal              209
ate                   174
correlation           174
ett                   138
det-counterfactual     95
nie                    92
nde                    73
collider_bias          23
exp_away               22
Name: count, dtype: int64
query_type
marginal              1702
ate                   1518
correlation           1518
ett                   1288
nie                    874
det-counterfactual     870
nde                    552
exp_away               184
collider_bias          184
Name: count, dtype: int64
----------------------------------
answer
no     504
yes    496
Name: count, dtype: int64
answer
yes    4345
no     4345
Name: count, dtype: int64
----------------------------------
graph_id
mediation      197
arrowhead      188
confounding    106
diamond        105
IV             102
chain           

#### 2. Add columns for each model to dataframe for storing results

In [5]:
df_cladder = df_sampled.copy()

In [6]:
from utils import add_columns_to_dataframe, generate_results, generate_results_per_rung


model_names = ['claude-3.5-haiku']
df_cladder = add_columns_to_dataframe(df_cladder, model_names)

#### 3. Run models

In [7]:
import anthropic
from constants import ANTHROPIC_API_KEY
from model_inference import intialize_anthropic_client, run_model_on_cladder

client = intialize_anthropic_client(api_key=ANTHROPIC_API_KEY)


In [None]:
run_model_on_cladder(df=df_cladder, 
                     output_column='claude-3.5-sonnet', 
                     model='claude-3-5-sonnet-20241022', 
                     method_name='input_output', 
                     info_column='info', 
                     question_column='question', 
                     temperature=1.0, 
                     overwrite=True, 
                     min_range=0, 
                     max_range=1000, 
                     client=client)

In [20]:
import datetime

# Save pickle
df_cladder.to_pickle('../data/log/cladder-anthropic-sonnet' + '-' + datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + '.pkl')

# Read pickle
#unpickled_df = pd.read_pickle('./cladder.pkl')
#df_cladder = unpickled_df

In [None]:
generate_results(df_cladder, 'answer', model_names)

In [None]:
generate_results_per_rung(df_cladder, 'answer', model_names)

### 4. Run the experiments with the perturbed datasets

In [8]:
import pickle
# Read pickle

df_cladder_nonsensical = pd.read_pickle('../data/cladder/nonsensical-data.pkl')
df_cladder_anticommonsensical = pd.read_pickle('../data/cladder/anticommonsensical-data.pkl')

In [9]:
df_cladder_nonsensical = add_columns_to_dataframe(df_cladder_nonsensical, model_names)
df_cladder_anticommonsensical = add_columns_to_dataframe(df_cladder_anticommonsensical, model_names)

In [None]:
run_model_on_cladder(df=df_cladder_nonsensical, 
                     output_column='claude-3.5-haiku', 
                     model='claude-3-5-haiku-20241022', 
                     method_name='input_output', 
                     info_column='nonsensical_info', 
                     question_column='nonsensical_question', 
                     temperature=1.0, 
                     overwrite=True, 
                     min_range=102, 
                     max_range=1000, 
                     client=client)

In [None]:
run_model_on_cladder(df=df_cladder_anticommonsensical, 
                     output_column='claude-3.5-sonnet', 
                     model='claude-3-5-sonnet-20241022', 
                     method_name='input_output', 
                     info_column='anticommonsensical_info', 
                     question_column='anticommonsensical_question', 
                     temperature=1.0, 
                     overwrite=True, 
                     min_range=0, 
                     max_range=1000, 
                     client=client)

In [None]:
run_model_on_cladder(df=df_cladder_nonsensical, output_column='gpt-4o-mini', model='gpt-4o-mini', method_name='input_output', info_column='nonsensical_info', question_column='nonsensical_question', temperature=1.0, overwrite=True, min_range=0, max_range=1000)
#run_model_on_cladder(df=df_cladder_nonsensical, output_column='gpt-4o-mini', model='gpt-4o-mini', method_name='input_output', info_column='nonsensical_info', question_column='nonsensical_question', temperature=1.0, overwrite=True, min_range=0, max_range=1000)

#run_model_on_cladder(df=df_cladder_anticommonsensical, output_column='gpt-4o-mini', model='gpt-4o-mini', method_name='input_output', info_column='anticommonsensical_info', question_column='anticommonsensical_question', temperature=1.0, overwrite=True, min_range=0, max_range=1000)
#run_model_on_cladder(df=df_cladder_anticommonsensical, output_column='gpt-4o-mini', model='gpt-4o-mini', method_name='input_output', info_column='anticommonsensical_info', question_column='anticommonsensical_question', temperature=1.0, overwrite=True, min_range=0, max_range=1000)


In [None]:
generate_results(df_cladder_nonsensical, 'answer', model_names)
generate_results_per_rung(df_cladder_nonsensical, 'answer', model_names)

In [28]:
import datetime

df_cladder_nonsensical.to_pickle('../data/log/cladder-anthropic-sonnet-nonsensical' + '-' + datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + '.pkl')


In [32]:
df_cladder_anticommonsensical.to_pickle('../data/log/cladder-anthropic-sonnet-anticommonsensical' + '-' + datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + '.pkl')