# Causal Reasoning In Large Language Models: CLadder


#### 1. Data Preparation


In [1]:
import json

import pandas as pd

# Load the CLadder commonsense question set from disk.
dataset_path = "../data/cladder/cladder-v1-q-commonsense.json"
# Pin the encoding: JSON text is UTF-8, and without an explicit encoding
# open() falls back to the platform's locale default.
with open(dataset_path, "r", encoding="utf-8") as f:
    data = json.load(f)

df = pd.DataFrame(data)
# Expose the top-level 'given_info' column under the shorter name 'info'.
df.rename(columns={'given_info': 'info'}, inplace=True)

# Drop every question whose metadata marks it as a 'backadj' query and
# renumber the remaining rows from 0.
df = df[df['meta'].apply(lambda x: x.get('query_type') != 'backadj')].reset_index(drop=True)


In [None]:
# Show the question text of the first row.
df.iloc[0]['question']


In [None]:
# Preview the first two rows of the filtered dataset.
df.head(2)

In [None]:
# Inspect the metadata of the row at position 100.
df.iloc[100]['meta']

In [None]:
index = 6330

# Pull the row and its metadata dict once, then print the fields of interest.
sample_row = df.iloc[index]
sample_meta = sample_row['meta']

print('Info: ', sample_row['info'])
print('Question: ', sample_row['question'])
print('Answer: ', sample_row['answer'])
print('Graph ID: ', sample_meta['graph_id'])
print('Query type: ', sample_meta['query_type'])
print('Rung: ', sample_meta['rung'])
print('Formal form: ', sample_meta['formal_form'])
print('Reasoning: ', sample_row['reasoning'])

In [None]:
# Inspect the 'meta' entry stored under index label 3.
df['meta'][3]

In [7]:
# Flatten the per-row 'meta' dicts into regular columns on a copy of df.
df_new = df.copy()
# Build the metadata frame from the list of dicts in one step; this avoids
# creating a separate Series for every row as `.apply(pd.Series)` does.
meta_df = pd.DataFrame(df_new['meta'].tolist(), index=df_new.index)
df_new = pd.concat([df_new, meta_df], axis=1)
df_new = df_new.drop('meta', axis=1)
# NOTE: no rename follows. The earlier rename mapping listed the key
# 'given_info' twice, so it collapsed to {'given_info': 'given_info'} and
# changed nothing.

In [None]:
# List the distinct query types present in the flattened frame.
df_new['query_type'].unique()

In [None]:
# Show every column name of the flattened frame.
df_new.columns

In [None]:
# Draw 1000 rows at random; the fixed random_state makes the sample repeatable.
df_sampled = df_new.sample(n = 1000, random_state=25)
print(len(df_sampled))

In [None]:
# Print the first ten sampled rows.
print(df_sampled.head(10))

In [None]:
# Columns whose value distribution is compared between the sample and the
# full dataset. Each column is listed once so no distribution prints twice.
column_names = ['answer', 'query_type', 'graph_id', 'rung', 'story_id', 'polarity']

for column_name in column_names:
    # Sample distribution first, then the full-data distribution.
    print(df_sampled[column_name].value_counts())
    print(df_new[column_name].value_counts())
    print('----------------------------------')

#### 2. Add columns for each model to dataframe for storing results

In [13]:
# Work on a copy so the sampled frame itself stays untouched.
df_cladder = df_sampled.copy()

In [14]:
from utils import add_columns_to_dataframe, generate_results, generate_results_per_rung


# Names of the models under evaluation; the same list is later passed to
# generate_results / generate_results_per_rung.
model_names = ['deepseek-r1', 'deepseek-v3', 'llama-3.1-70B']
# Presumably adds one result column per model name — confirm in utils.
df_cladder = add_columns_to_dataframe(df_cladder, model_names)

#### 3. Run models

In [15]:
# NOTE(review): `OpenAI` and `OPENAI_API_KEY` are not referenced in the visible cells.
from openai import OpenAI
from constants import DEEPINFRA_API_KEY, OPENAI_API_KEY
from model_inference import initialize_openai_client, run_model_on_cladder

# Build the API client with the DeepInfra key and DeepInfra base URL.
client = initialize_openai_client(api_key=DEEPINFRA_API_KEY, base_url="https://api.deepinfra.com/v1/openai")

In [None]:
# Run DeepSeek-V3 on the commonsense sample, using the 'info' and 'question'
# columns as inputs and 'deepseek-v3' as the output column.
# NOTE(review): min_range/max_range presumably bound the rows processed and
# overwrite presumably replaces existing outputs — confirm in model_inference.

run_model_on_cladder(df=df_cladder, 
                     output_column='deepseek-v3', 
                     model='deepseek-ai/DeepSeek-V3', 
                     method_name='input_output', 
                     info_column='info', 
                     question_column='question', 
                     temperature=1.0, 
                     overwrite=True, 
                     min_range=0, 
                     max_range=1000, 
                     client=client)

In [184]:
import datetime

# Save pickle
# The timestamp avoids spaces and colons: ':' is not allowed in Windows file
# names and both characters need quoting in shells.
timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
df_cladder.to_pickle(f'../data/log/cladder-deepseek-{timestamp}.pkl')

# Read pickle
#unpickled_df = pd.read_pickle('./cladder.pkl')
#df_cladder = unpickled_df

In [None]:
# Report results for every model in model_names, using the 'answer' column
# (presumably as the ground truth — confirm in utils).
generate_results(df_cladder, 'answer', model_names)

#### 4. Run the experiments with the perturbed datasets

In [24]:
import pickle
# Read the two perturbed variants of the dataset from pickle files.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files
# that come from a trusted source.

df_cladder_nonsensical = pd.read_pickle('../data/cladder/nonsensical-data.pkl')
df_cladder_anticommonsensical = pd.read_pickle('../data/cladder/anticommonsensical-data.pkl')

In [25]:
# Apply the same column setup used for df_cladder to both perturbed frames.
df_cladder_nonsensical = add_columns_to_dataframe(df_cladder_nonsensical, model_names)
df_cladder_anticommonsensical = add_columns_to_dataframe(df_cladder_anticommonsensical, model_names)

In [None]:
# Run DeepSeek-V3 on the nonsensical variant of the dataset.
# The output column is named after the model actually queried and is one of
# the entries in `model_names`, which is the list the evaluation calls
# (generate_results / generate_results_per_rung) are given. The client is
# passed explicitly, matching the commonsense run.
run_model_on_cladder(df=df_cladder_nonsensical,
                     output_column='deepseek-v3',
                     model='deepseek-ai/DeepSeek-V3',
                     method_name='input_output',
                     info_column='nonsensical_info',
                     question_column='nonsensical_question',
                     temperature=1.0,
                     overwrite=True,
                     min_range=0,
                     max_range=1000,
                     client=client)
#run_model_on_cladder(df=df_cladder_nonsensical, output_column='mistral-7b', model='mistralai/Mistral-7B-Instruct-v0.3', method_name='input_output', info_column='nonsensical_info', question_column='nonsensical_question', temperature=1.0, overwrite=True, min_range=0, max_range=1000)
#run_model_on_cladder(df=df_cladder_nonsensical, output_column='wizardlm', model='microsoft/WizardLM-2-8x22B', method_name='input_output', info_column='nonsensical_info', question_column='nonsensical_question', temperature=1.0, overwrite=True, min_range=0, max_range=1000)
#run_model_on_cladder(df=df_cladder_nonsensical, output_column='llama-nemotron', model='nvidia/Llama-3.1-Nemotron-70B-Instruct', method_name='input_output', info_column='nonsensical_info', question_column='nonsensical_question', temperature=1.0, overwrite=True, min_range=0, max_range=1000)


In [None]:
# Report results on the nonsensical variant for every model in model_names:
# first via generate_results, then via generate_results_per_rung
# (presumably an overall and a per-rung breakdown — confirm in utils).
generate_results(df_cladder_nonsensical, 'answer', model_names)
generate_results_per_rung(df_cladder_nonsensical, 'answer', model_names)

In [49]:
import datetime

# Save the nonsensical-run results under a timestamped name. The timestamp
# avoids spaces and colons: ':' is not allowed in Windows file names and both
# characters need quoting in shells.
timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
df_cladder_nonsensical.to_pickle(f'../data/log/cladder-openllms-nonsensical-{timestamp}.pkl')