# Program of Thoughts

This notebook implements "Program of Thoughts" with the CLadder dataset.

#### 1. Data Preparation


In [1]:
import pandas as pd
import json

dataset_path = "../data/cladder/cladder-v1-q-commonsense.json"
with open(dataset_path, "r") as f:
    data = json.load(f)
    
df = pd.DataFrame(data)
df.rename(columns={'given_info': 'info'}, inplace=True)

df = df[df['meta'].apply(lambda x: x.get('query_type') != 'backadj')].reset_index(drop=True)


In [2]:
index = 6700

print('Info: ', df.iloc[index]['info'])
print('Question: ', df.iloc[index]['question'])
print('Answer: ', df.iloc[index]['answer'])
print('Graph ID: ', df.iloc[index]['meta']['graph_id'])
print('Query type: ', df.iloc[index]['meta']['query_type'])
print('Rung: ', df.iloc[index]['meta']['rung'])
print('Formal form: ', df.iloc[index]['meta']['formal_form'])
print('Reasoning: ' , df.iloc[index]['reasoning'])

Info:  The overall probability of citrus intake is 18%. For patients not consuming citrus, the probability of scurvy is 68%. For patients consuming citrus, the probability of scurvy is 49%.
Question:  Is scurvy less likely than no scurvy overall?
Answer:  no
Graph ID:  chain
Query type:  marginal
Rung:  1
Formal form:  P(Y)
Reasoning:  {'step0': 'Let X = eating citrus; V2 = vitmain C; Y = scurvy.', 'step1': 'X->V2,V2->Y', 'step2': 'P(Y)', 'step3': 'P(Y | X=1)*P(X=1) + P(Y | X=0)*P(X=0)', 'step4': 'P(X=1) = 0.18\nP(Y=1 | X=0) = 0.68\nP(Y=1 | X=1) = 0.49', 'step5': '0.18*0.49 - 0.82*0.68 = 0.65', 'end': '0.65 > 0'}


In [3]:
df['meta'][3]

{'story_id': 'alarm',
 'graph_id': 'mediation',
 'mediators': ['V2'],
 'polarity': False,
 'groundtruth': -0.2305349321780112,
 'query_type': 'nie',
 'rung': 3,
 'formal_form': 'E[Y_{X=0, V2=1} - Y_{X=0, V2=0}]',
 'given_info': {'p(Y | X, V2)': [[0.08430222457648505, 0.5394610521458689],
   [0.4061509701126924, 0.8620283206949241]],
  'p(V2 | X)': [0.7416866188819116, 0.23519324071521291]},
 'estimand': '\\sum_{V2 = v} P(Y=1|X =0,V2 = v)*[P(V2 = v | X = 1) − P(V2 = v | X = 0)]',
 'treatment': 'X',
 'outcome': 'Y',
 'model_id': 0}

In [4]:
df_new = df.copy()
meta_df = df_new['meta'].apply(pd.Series)
meta_df
df_new = pd.concat([df_new, meta_df], axis = 1)
df_new = df_new.drop('meta', axis = 1)
df_new.rename(columns={'given_info': 'given_info_meta', 'given_info': 'given_info'}, inplace=True)

In [5]:
df_sampled = df_new.sample(n = 1000, random_state=25)
print(len(df_sampled))

1000


In [6]:
column_names = ['answer', 'query_type', 'answer', 'graph_id', 'rung', 'query_type', 'story_id', 'polarity']

for column_name in column_names:
    print(df_sampled[column_name].value_counts())
    print(df_new[column_name]. value_counts())
    print('----------------------------------')

answer
no     504
yes    496
Name: count, dtype: int64
answer
yes    4345
no     4345
Name: count, dtype: int64
----------------------------------
query_type
marginal              209
ate                   174
correlation           174
ett                   138
det-counterfactual     95
nie                    92
nde                    73
collider_bias          23
exp_away               22
Name: count, dtype: int64
query_type
marginal              1702
ate                   1518
correlation           1518
ett                   1288
nie                    874
det-counterfactual     870
nde                    552
exp_away               184
collider_bias          184
Name: count, dtype: int64
----------------------------------
answer
no     504
yes    496
Name: count, dtype: int64
answer
yes    4345
no     4345
Name: count, dtype: int64
----------------------------------
graph_id
mediation      197
arrowhead      188
confounding    106
diamond        105
IV             102
chain           

#### 2. Add models to dataframe for storing results

In [7]:
df_cladder = df_sampled.copy()

In [8]:
from utils import add_columns_to_dataframe, generate_results, generate_results_per_rung


model_names = ['gpt-4o', 'gpt-4o-mini']
df_cladder = add_columns_to_dataframe(df_cladder, model_names)

In [9]:
index = 300
print(df_cladder.iloc[index]['info'])
print(df_cladder.iloc[index]['question'])
print(df_cladder.iloc[index]['answer'])
print(df_cladder.iloc[index]['reasoning'])

For individuals who are not male and applicants to a non-competitive department, the probability of admission acceptance is 63%. For individuals who are not male and applicants to a competitive department, the probability of admission acceptance is 36%. For individuals who are male and applicants to a non-competitive department, the probability of admission acceptance is 60%. For individuals who are male and applicants to a competitive department, the probability of admission acceptance is 37%. For individuals who are not male and out-of-state residents, the probability of competitive department is 67%. For individuals who are not male and in-state residents, the probability of competitive department is 28%. For individuals who are male and out-of-state residents, the probability of competitive department is 89%. For individuals who are male and in-state residents, the probability of competitive department is 57%. The overall probability of in-state residency is 99%.
Does gender positi

In [10]:
df_cladder.iloc[index]

question_id                                                           6432
desc_id                  gender_admission_state-arrowhead-nie-model562-...
info                     For individuals who are not male and applicant...
question                 Does gender positively affect admission status...
answer                                                                  no
reasoning                {'step0': 'Let V2 = residency status; X = gend...
story_id                                            gender_admission_state
graph_id                                                         arrowhead
treated                                                                NaN
result                                                                 NaN
polarity                                                              True
groundtruth                                                       -0.07786
query_type                                                             nie
rung                     

### 3. Initialize prompts and REPL

In [11]:
from langchain_core.tools import Tool
from langchain_experimental.utilities import PythonREPL


For example, replace imports like: `from langchain.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  from langchain_experimental.utilities.python import PythonREPL


In [12]:
python_repl = PythonREPL()

In [13]:
python_repl.run("print(1+1)")

Python REPL can execute arbitrary code. Use with caution.


'2\n'

In [None]:
index = 8
info = df_cladder.iloc[index]['info']
question = df_cladder.iloc[index]['question']
answer = df_cladder.iloc[index]['answer']
rung = df_cladder.iloc[index]['rung']
reasoning = df_cladder.iloc[index]['reasoning']

print(info)
print(question)
print(answer)
print(rung)
print(reasoning)

In [15]:
custom_prompt = """Generate dowhy code ONLY to solve this problem. 

Question: 
For patients not assigned the drug treatment, the probability of low cholesterol is 54%. For patients assigned the drug treatment, the probability of low cholesterol is 52%. For patients not assigned the drug treatment, the probability of taking of all assigned drugs is 79%. For patients assigned the drug treatment, the probability of taking of all assigned drugs is 43%.

Don't mistake p(X|Y) and p(X and Y), 

Code: 
import dowhy
from dowhy import CausalModel
import numpy as np
import pandas as pd

# Set a random seed for reproducibility
np.random.seed(42)

# Number of observations in the synthetic dataset
n = 1000

the probaility of X and Y is 0.45
p(X | Y) = 0.54
# Given probabilities
p_assigned_drug = 0.5  # Assume 50% of patients are assigned the drug treatment
p_low_cholesterol_given_no_drug = 0.54
p_low_cholesterol_given_drug = 0.52
p_takes_all_assigned_drugs_given_no_drug = 0.79
p_takes_all_assigned_drugs_given_drug = 0.43

# Generate whether the patient was assigned the drug treatment (0 = not assigned, 1 = assigned)
assigned_drug = np.random.choice([0, 1], size=n, p=[1 - p_assigned_drug, p_assigned_drug])

# Generate whether the patient takes all assigned drugs based on whether they were assigned the drug treatment
takes_all_drugs = np.array([
    np.random.binomial(1, p_takes_all_assigned_drugs_given_no_drug if drug == 0 else p_takes_all_assigned_drugs_given_drug)
    for drug in assigned_drug
])

# Generate whether the patient has low cholesterol based on drug treatment and whether they take all assigned drugs
low_cholesterol = np.array([
    np.random.binomial(1, p_low_cholesterol_given_no_drug if drug == 0 else p_low_cholesterol_given_drug)
    for drug in assigned_drug
])

# Create the DataFrame
data = pd.DataFrame({
    'AssignedDrug': assigned_drug,
    'TakesAllDrugs': takes_all_drugs,
    'LowCholesterol': low_cholesterol
})


# Define the causal model
model = CausalModel(
    data=data,
    treatment='TakesAllDrugs',  # Taking all drugs is the treatment
    outcome='LowCholesterol',   # Low cholesterol is the outcome
    graph="digraph {TakesAllDrugs -> LowCholesterol;}"  # Causal graph
)

# Estimate the causal effect using a linear regression method
causal_estimate = model.estimate_effect(
    identified_estimand=model.identify_effect(),
    method_name="backdoor.linear_regression"
)

# Print the causal estimate for additional insights
print("Causal Estimate:", causal_estimate.value)


"""

#prompt_question =  info + ' ' + question + custom_prompt

In [16]:
custom_prompt = ''' Generate dowhy code ONLY to solve this problem.  

Info:  The overall probability of citrus intake is 18%. For patients not consuming citrus, the probability of scurvy is 68%. For patients consuming citrus, the probability of scurvy is 49%.
Question:  Is scurvy less likely than no scurvy overall?

Don't mistake p(X|Y) and p(X and Y),
Code: 

import numpy as np
import pandas as pd
import dowhy
from dowhy import CausalModel

# Given probabilities:
p_citrus = 0.84  # P(X=1)
p_no_citrus = 0.16  # P(X=0)
p_no_citrus_scurvy = 0.11  # P(X=0, Y=1)
p_citrus_scurvy = 0.45  # P(X=1, Y=1)

# Calculate conditional probabilities
p_scurvy_given_citrus = p_citrus_scurvy / p_citrus
p_scurvy_given_no_citrus = p_no_citrus_scurvy / p_no_citrus

# Generate synthetic data
n_samples = 10000
citrus = np.random.binomial(n=1, p=p_citrus, size=n_samples)

# Generate scurvy data
scurvy = np.zeros(n_samples)
for i in range(n_samples):
    if citrus[i] == 1:
        scurvy[i] = np.random.binomial(n=1, p=p_scurvy_given_citrus)
    else:
        scurvy[i] = np.random.binomial(n=1, p=p_scurvy_given_no_citrus)

# Create DataFrame
data = pd.DataFrame({
    'citrus': citrus,
    'scurvy': scurvy
})

# Create causal model
causal_graph = """
digraph {
    citrus -> scurvy;
}
"""

model = CausalModel(
    data=data,
    treatment='citrus',
    outcome='scurvy',
    graph=causal_graph
)

# Identify and estimate causal effect
identified_estimand = model.identify_effect(proceed_when_unidentifiable=True)
estimate = model.estimate_effect(
    identified_estimand,
    method_name="backdoor.linear_regression"
)

# Print results
print("Conditional Probabilities:")
print(f"P(scurvy|citrus) = {p_scurvy_given_citrus:.3f}")
print(f"P(scurvy|no citrus) = {p_scurvy_given_no_citrus:.3f}")
print(f"Difference = {p_scurvy_given_citrus - p_scurvy_given_no_citrus:.3f}")

print("\nCausal Effect Estimate:")
print(estimate)

'''

### 4. Run Program of Thoughts

In [15]:
from openai import OpenAI
from constants import OPENAI_API_KEY
from model_inference import initialize_openai_client, run_model_on_cladder
from utils import extract_python_code

client = initialize_openai_client(api_key=OPENAI_API_KEY, base_url='https://api.openai.com/v1/')


In [17]:
run_model_on_cladder(df=df_cladder, 
                     output_column='gpt-4o-mini', 
                     model='gpt-4o-mini', 
                     method_name='program_of_thoughts', 
                     info_column='info', 
                     question_column='question', 
                     temperature=1.0, 
                     overwrite=True, 
                     min_range=19, 
                     max_range=1000, 
                     client=client)

  intercept_parameter = self.model.params[0]


FOLLOWUP PROMPT IS:  Question: 
We know that citrus intake causes sufficient vitamin C, and we know that sufficient vitamin C causes scurvy. Would the patient has scurvy if citrus intake instead of absence of citrus?
 The solution after generating doWhy code to solve this problem is: linear_regression
{'control_value': 0, 'treatment_value': 1, 'test_significance': None, 'evaluate_effect_strength': False, 'confidence_intervals': False, 'target_units': 'ate', 'effect_modifiers': []}
Causal Estimate: 0.09799356789708619

 Instruction: Based on this causal estimate, answer yes or no. Think about how the causal estimate answers the question, do not do calculations, but give an explanation. Answer: 
(LOG) Prompt Question:  We know that citrus intake causes sufficient vitamin C, and we know that sufficient vitamin C causes scurvy. Would the patient has scurvy if citrus intake instead of absence of citrus?
(LOG) Correct answer:  yes
(LOG) Prompt Answer:  
!!! Code generated by LLM: !!!

import

  intercept_parameter = self.model.params[0]


FOLLOWUP PROMPT IS:  Question: 
For nonsmokers, the probability of high tar deposit is 34%. For smokers, the probability of high tar deposit is 70%. For nonsmokers and with no tar deposit, the probability of lung cancer is 33%. For nonsmokers and with high tar deposit, the probability of lung cancer is 62%. For smokers and with no tar deposit, the probability of lung cancer is 19%. For smokers and with high tar deposit, the probability of lung cancer is 52%. The overall probability of smoking is 29%. Will smoking decrease the chance of lung cancer?
 The solution after generating doWhy code to solve this problem is: linear_regression
{'control_value': 0, 'treatment_value': 1, 'test_significance': None, 'evaluate_effect_strength': False, 'confidence_intervals': False, 'target_units': 'ate', 'effect_modifiers': []}
Causal Estimate: -0.008768608381992227

 Instruction: Based on this causal estimate, answer yes or no. Think about how the causal estimate answers the question, do not do calcu

  intercept_parameter = self.model.params[0]


FOLLOWUP PROMPT IS:  Question: 
The overall probability of taking the elevator is 31%. For those who choose to take the stairs, the probability of penguin death is 45%. For those who choose to take the elevator, the probability of penguin death is 34%. Is penguin death less likely than penguin lives overall?
 The solution after generating doWhy code to solve this problem is: linear_regression
{'control_value': 0, 'treatment_value': 1, 'test_significance': None, 'evaluate_effect_strength': False, 'confidence_intervals': False, 'target_units': 'ate', 'effect_modifiers': []}
Causal Estimate: 0.018510987986919747

 Instruction: Based on this causal estimate, answer yes or no. Think about how the causal estimate answers the question, do not do calculations, but give an explanation. Answer: 
(LOG) Prompt Question:  The overall probability of taking the elevator is 31%. For those who choose to take the stairs, the probability of penguin death is 45%. For those who choose to take the elevator,

  intercept_parameter = self.model.params[0]


FOLLOWUP PROMPT IS:  Question: 
For normal weight people, the probability of long lifespan is 85%. For obese people, the probability of long lifespan is 64%. Will obesity decrease the chance of long lifespan?
 The solution after generating doWhy code to solve this problem is: linear_regression
{'control_value': 0, 'treatment_value': 1, 'test_significance': None, 'evaluate_effect_strength': False, 'confidence_intervals': False, 'target_units': 'ate', 'effect_modifiers': []}
Causal Estimate: 0.018510987986919747

 Instruction: Based on this causal estimate, answer yes or no. Think about how the causal estimate answers the question, do not do calculations, but give an explanation. Answer: 
(LOG) Prompt Question:  For normal weight people, the probability of long lifespan is 85%. For obese people, the probability of long lifespan is 64%. Will obesity decrease the chance of long lifespan?
(LOG) Correct answer:  yes
(LOG) Prompt Answer:  
!!! Code generated by LLM: !!!

import dowhy
from dow

  intercept_parameter = self.model.params[0]


FOLLOWUP PROMPT IS:  Question: 
For those who choose to take the stairs and penguins who are sad, the probability of penguin death is 31%. For those who choose to take the stairs and penguins who are happy, the probability of penguin death is 78%. For those who choose to take the elevator and penguins who are sad, the probability of penguin death is 48%. For those who choose to take the elevator and penguins who are happy, the probability of penguin death is 85%. For those who choose to take the stairs, the probability of penguin happiness is 74%. For those who choose to take the elevator, the probability of penguin happiness is 29%. Does my decision negatively affect penguin survival through penguin mood?
 The solution after generating doWhy code to solve this problem is: linear_regression
{'control_value': 0, 'treatment_value': 1, 'test_significance': None, 'evaluate_effect_strength': False, 'confidence_intervals': False, 'target_units': 'ate', 'effect_modifiers': []}
Causal Estimate

  intercept_parameter = self.model.params[0]


FOLLOWUP PROMPT IS:  Question: 
For those who choose to take the stairs and penguins who are sad, the probability of penguin death is 81%. For those who choose to take the stairs and penguins who are happy, the probability of penguin death is 55%. For those who choose to take the elevator and penguins who are sad, the probability of penguin death is 43%. For those who choose to take the elevator and penguins who are happy, the probability of penguin death is 23%. For those who choose to take the stairs, the probability of penguin happiness is 42%. For those who choose to take the elevator, the probability of penguin happiness is 71%. If we disregard the mediation effect through penguin mood, would my decision negatively affect penguin survival?
 The solution after generating doWhy code to solve this problem is: linear_regression
{'control_value': 0, 'treatment_value': 1, 'test_significance': None, 'evaluate_effect_strength': False, 'confidence_intervals': False, 'target_units': 'ate', 

  intercept_parameter = self.model.params[0]


FOLLOWUP PROMPT IS:  Question: 
The overall probability of smoking mother is 29%. For infants with nonsmoking mothers, the probability of normal infant birth weight is 57%. For infants with smoking mothers, the probability of normal infant birth weight is 12%. Is normal infant birth weight more likely than low infant birth weight overall?
 The solution after generating doWhy code to solve this problem is: linear_regression
{'control_value': 0, 'treatment_value': 1, 'test_significance': None, 'evaluate_effect_strength': False, 'confidence_intervals': False, 'target_units': 'ate', 'effect_modifiers': []}
Causal Estimate: -0.46576256999940224

 Instruction: Based on this causal estimate, answer yes or no. Think about how the causal estimate answers the question, do not do calculations, but give an explanation. Answer: 
(LOG) Prompt Question:  The overall probability of smoking mother is 29%. For infants with nonsmoking mothers, the probability of normal infant birth weight is 57%. For inf

  intercept_parameter = self.model.params[0]


FOLLOWUP PROMPT IS:  Question: 
We know that vaccination causes having smallpox and severe vaccination reaction. having smallpox or severe vaccination reaction causes smallpox survival. Would the person dies from smallpox if vaccination instead of lack of vaccination?
 The solution after generating doWhy code to solve this problem is: linear_regression
{'control_value': 0, 'treatment_value': 1, 'test_significance': None, 'evaluate_effect_strength': False, 'confidence_intervals': False, 'target_units': 'ate', 'effect_modifiers': []}
Causal Estimate: 0.049677788400382616

 Instruction: Based on this causal estimate, answer yes or no. Think about how the causal estimate answers the question, do not do calculations, but give an explanation. Answer: 
(LOG) Prompt Question:  We know that vaccination causes having smallpox and severe vaccination reaction. having smallpox or severe vaccination reaction causes smallpox survival. Would the person dies from smallpox if vaccination instead of lack

  intercept_parameter = self.model.params[0]


FOLLOWUP PROMPT IS:  Question: 
The overall probability of receives treatment is 56%. For patients not receiving treatment, the probability of recovery is 18%. For patients receiving treatment, the probability of recovery is 64%. Is recovery less likely than non-recovery overall?
 The solution after generating doWhy code to solve this problem is: linear_regression
{'control_value': 0, 'treatment_value': 1, 'test_significance': None, 'evaluate_effect_strength': False, 'confidence_intervals': False, 'target_units': 'ate', 'effect_modifiers': []}
Causal Estimate: 0.44083987023532845

 Instruction: Based on this causal estimate, answer yes or no. Think about how the causal estimate answers the question, do not do calculations, but give an explanation. Answer: 
(LOG) Prompt Question:  The overall probability of receives treatment is 56%. For patients not receiving treatment, the probability of recovery is 18%. For patients receiving treatment, the probability of recovery is 64%. Is recovery

  intercept_parameter = self.model.params[0]


FOLLOWUP PROMPT IS:  Question: 
The overall probability of college degree or higher is 80%. The probability of high school degree or lower and high salary is 14%. The probability of college degree or higher and high salary is 28%. Is the chance of high salary smaller when observing college degree or higher?
 The solution after generating doWhy code to solve this problem is: linear_regression
{'control_value': 0, 'treatment_value': 1, 'test_significance': None, 'evaluate_effect_strength': False, 'confidence_intervals': False, 'target_units': 'ate', 'effect_modifiers': []}
Causal Estimate: 0.14050179211469538
Probability of high salary given college degree: 0.28
Probability of high salary given no college degree: 0.14

 Instruction: Based on this causal estimate, answer yes or no. Think about how the causal estimate answers the question, do not do calculations, but give an explanation. Answer: 
(LOG) Prompt Question:  The overall probability of college degree or higher is 80%. The probab

KeyboardInterrupt: 

In [19]:
# def run_openai(df, model_col_name, model, method_name, min_range, max_range, temperature=1.0, overwrite=False):
    
#     if model_col_name not in df.columns:
#         raise KeyError(model_col_name + " : Column name doesn't exist!")
    
#     for i in range(min_range, max_range):

#         if df.iloc[i][model_col_name] is None or overwrite == True:

#             prompt_question = df.iloc[i]['info'] + " " + df.iloc[i]['question'] + custom_prompt
#             generated_code = extract_python_code(input_output(prompt_question, model, temperature))
#             generated_code = python_repl.sanitize_input(generated_code)
#             solution = python_repl.run(generated_code)
            
#             followup_prompt = df.iloc[i]['info'] + " " + df.iloc[i]['question']  + "The solution to this answer after generating doWhy code is: " +  solution + "Based on the causal estimate, answer the question with yes or no in the end."
#             final_answer = input_output(followup_prompt, 'gpt-4o', 1)

#             extracted_answer = extract_yes_or_no(final_answer)
            
#             print('Info: ', df.iloc[i]['info'])
#             print('Question: ', df.iloc[i]['question'])
#             print('Actual answer: ', df.iloc[i]['answer'])

#             print('Generated code: ', generated_code)
#             print('Solution: ', solution)
#             print('PROMPT ANSWER: ', final_answer)
#             print('EXTRACTED ANSWER: ', extracted_answer)
            
#             df.at[i, model_col_name] = extracted_answer
#             df.at[i, model_col_name + '_reasoning'] = solution
#             print('SOLUTION: ', solution)
#             print('EXTRACTED ANSWER: ', extracted_answer)
#             print('Generation completed: ', i)

#         else:
        
#             print('Skipping: ', i)    

# run_openai(df_cladder, model_col_name='gpt-4o', model='gpt-4o', method_name='input_output', overwrite=True, min_range=0, max_range=10)