In [1]:
import pandas as pd


# Source Stata file and the CSV copy that is derived from it.
dta_file, csv_file = 'karlan_list_2007.dta', 'karlan_list_2007.csv'

# Load the Stata dataset into a DataFrame.
df = pd.read_stata(dta_file)

# Persist the same data as CSV, leaving the row index out of the file.
df.to_csv(path_or_buf=csv_file, index=False)

# Show (rows, columns) as the cell output.
df.shape


(50083, 51)

In [None]:
# Reload the data from the CSV copy and normalise the column labels
# (surrounding whitespace removed, lower-cased).
df = pd.read_csv(csv_file)
df.columns = df.columns.str.strip().str.lower()

# One row per dataset column: describe() statistics plus the number of
# missing values and the dtype of each column.
dataset_description = df.describe(include='all').T.assign(
    missing_values=df.isnull().sum(),
    data_type=df.dtypes,
)

# Keep only the columns of interest, in display order.
summary_columns = [
    'count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max',
    'missing_values', 'data_type',
]
description_subset = dataset_description.loc[:, summary_columns]
description_subset

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,missing_values,data_type
treatment,50083.0,0.666813,0.471357,0.0,0.0,1.0,1.0,1.0,0,int64
control,50083.0,0.333187,0.471357,0.0,0.0,0.0,1.0,1.0,0,int64
ratio,50083.0,,,,,,,,0,object
ratio2,50083.0,0.222311,0.415803,0.0,0.0,0.0,0.0,1.0,0,int64
ratio3,50083.0,0.222211,0.415736,0.0,0.0,0.0,0.0,1.0,0,int64
size,50083.0,,,,,,,,0,object
size25,50083.0,0.166723,0.372732,0.0,0.0,0.0,0.0,1.0,0,int64
size50,50083.0,0.166623,0.372643,0.0,0.0,0.0,0.0,1.0,0,int64
size100,50083.0,0.166723,0.372732,0.0,0.0,0.0,0.0,1.0,0,int64
sizeno,50083.0,0.166743,0.37275,0.0,0.0,0.0,0.0,1.0,0,int64


In [None]:
import pandas as pd
from scipy.stats import ttest_ind
import statsmodels.formula.api as smf

# Variables that will be compared between the treatment and control groups.
vars_to_test = ['mrm2', 'freq', 'couple', 'median_hhincome']

# Keep the treatment indicator plus the test variables, then drop every row
# that has a missing value in any of those columns.
df_clean = df.loc[:, ['treatment', *vars_to_test]].dropna()
df_clean.shape



(47114, 5)

In [23]:
# For each variable, compare the treatment and control groups two ways:
#   1. a two-sample t-test with equal_var=False (Welch's t-test), and
#   2. an OLS regression of the variable on the treatment indicator.
# Both p-values are checked against the 0.05 level.
t_test_results = []
regression_results = []

# Group membership does not depend on the variable, so compute it once.
treated_mask = df_clean['treatment'] == 1
control_mask = df_clean['treatment'] == 0

for var in vars_to_test:
    treat_group = df_clean.loc[treated_mask, var]
    control_group = df_clean.loc[control_mask, var]

    # Two-sample t-test without assuming equal variances.
    t_stat, t_pval = ttest_ind(treat_group, control_group, equal_var=False)

    # OLS: the 'treatment' coefficient is the estimated difference
    # between the two groups for this variable.
    formula = f"{var} ~ treatment"
    model = smf.ols(formula, data=df_clean).fit()
    coef = model.params['treatment']
    reg_pval = model.pvalues['treatment']

    t_row = {"Variable": var, "T-test(p-value)": round(t_pval, 4)}
    if t_pval < 0.05:
        t_row["Significant (T-test)"] = "Yes"
    else:
        t_row["Significant (T-test)"] = "No"
    t_test_results.append(t_row)

    reg_row = {
        "Variable": var,
        "Coef": round(coef, 4),
        "Regression(p-value)": round(reg_pval, 4),
    }
    if reg_pval < 0.05:
        reg_row["Significant (Reg)"] = "Yes"
    else:
        reg_row["Significant (Reg)"] = "No"
    regression_results.append(reg_row)


# Assemble one table per method and print them as plain text.
t_df = pd.DataFrame(t_test_results)
r_df = pd.DataFrame(regression_results)

print("=== T-Test Results ===", t_df.to_string(index=False), sep="\n")
print("\n=== Linear Regression Results ===", r_df.to_string(index=False), sep="\n")

=== T-Test Results ===
       Variable  T-test(p-value) Significant (T-test)
           mrm2           0.9372                   No
           freq           0.9066                   No
         couple           0.9336                   No
median_hhincome           0.5431                   No

=== Linear Regression Results ===
       Variable      Coef  Regression(p-value) Significant (Reg)
           mrm2    0.0093               0.9373                No
           freq   -0.0132               0.9064                No
         couple   -0.0002               0.9336                No
median_hhincome -130.5570               0.5438                No
