### Card and Krueger (1994)

In [1]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf

In [2]:
### Data load: read the replication CSV (path is relative to the working directory)
# index_col=0 makes the file's first column the row index of the frame.
candk_data = pd.read_csv("CandK1994_replicate_file.csv", index_col=0)

In [3]:
### Data check: display the loaded frame
# pandas elides the middle rows/columns of a large frame, so this shows only
# the first and last few rows together with the leading and trailing columns.
candk_data

Unnamed: 0,sheet,chain,co_owned,state,southj,centralj,northj,pa1,pa2,shore,...,firstin2,special2,meals2,open2r,hrsopen2,psoda2,pfry2,pentree2,nregs2,nregs112
1,46,1,0,0,0,0,0,1,0,0,...,0.08,1.0,2.0,6.5,16.5,1.03,,0.94,4.0,4.0
2,49,2,0,0,0,0,0,1,0,0,...,0.05,0.0,2.0,10.0,13.0,1.01,0.89,2.35,4.0,4.0
3,506,2,1,0,0,0,0,1,0,0,...,0.25,,1.0,11.0,11.0,0.95,0.74,2.33,4.0,3.0
4,56,4,1,0,0,0,0,1,0,0,...,0.15,0.0,2.0,10.0,12.0,0.92,0.79,0.87,2.0,2.0
5,61,4,1,0,0,0,0,1,0,0,...,0.15,0.0,2.0,10.0,12.0,1.01,0.84,0.95,2.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
406,423,2,1,1,0,0,1,0,0,0,...,0.50,0.0,1.0,11.0,11.0,1.05,0.84,2.32,3.0,2.0
407,424,2,1,1,0,0,1,0,0,0,...,0.50,0.0,1.0,11.0,14.0,1.05,0.94,2.32,5.0,3.0
408,426,3,1,1,0,0,1,0,0,0,...,0.25,1.0,2.0,6.0,18.0,1.11,1.05,1.05,6.0,5.0
409,427,4,0,1,0,0,1,0,0,0,...,,1.0,2.0,10.5,12.5,1.11,1.09,2.07,2.0,2.0


In [4]:
### Derived variables: full-time-equivalent (FTE) employment per store
# FTE = full-time employees + managers + half of the part-time employees.
# `fte` is built from the unsuffixed columns, `fte_after` from the `2`-suffixed ones.
# A missing value in any component propagates to the result (no fill value is used).
candk_data['fte'] = (
    candk_data['empft']
    .add(candk_data['nmgrs'])
    .add(candk_data['emppt'].mul(0.5))
)
candk_data['fte_after'] = (
    candk_data['empft2']
    .add(candk_data['nmgrs2'])
    .add(candk_data['emppt2'].mul(0.5))
)

In [5]:
# Show the two derived FTE columns side by side.
candk_data.loc[:, ['fte', 'fte_after']]

Unnamed: 0,fte,fte_after
1,40.50,24.00
2,13.75,11.50
3,8.50,10.50
4,34.00,20.00
5,24.00,35.50
...,...,...
406,9.00,23.75
407,9.75,17.50
408,24.50,20.50
409,14.00,20.50


In [6]:
# Raw group means of FTE employment, one printed line per column
# (`fte` first, then `fte_after`). On each line the first number is the mean
# for stores with state == 1 and the second for stores with state == 0.
is_state_one = candk_data['state'] == 1
is_state_zero = candk_data['state'] == 0
for fte_column in ('fte', 'fte_after'):
    print(
        np.mean(candk_data.loc[is_state_one, fte_column]),
        np.mean(candk_data.loc[is_state_zero, fte_column]),
    )

20.439408099688475 23.33116883116883
21.02742946708464 21.165584415584416


1. First Approach: mean difference

In [7]:
# Per-state descriptive statistics for `fte` and `fte_after`.
# The built-in reducers replace the hand-written lambdas: 'mean' and 'var'
# skip missing values, 'var' is the sample variance (ddof=1), and 'count'
# is the number of non-missing observations.
summary = (
    candk_data.groupby('state')
    .agg(
        mean_before=('fte', 'mean'),
        mean_after=('fte_after', 'mean'),
        var_before=('fte', 'var'),
        var_after=('fte_after', 'var'),
        count_before=('fte', 'count'),
        count_after=('fte_after', 'count'),
    )
    .reset_index()
)

# Standard error of each group mean: sqrt(sample variance / number of observations).
summary['se_before'] = np.sqrt(summary['var_before'] / summary['count_before'])
summary['se_after'] = np.sqrt(summary['var_after'] / summary['count_after'])

# Replace the numeric state code with a readable label (0 -> PA, 1 -> NJ).
# An explicit mapping leaves any unexpected code as NaN instead of silently
# labelling it "NJ".
summary['state'] = summary['state'].map({0: "PA", 1: "NJ"})

In [8]:
# Display the per-state summary table (means, variances, counts, standard errors).
summary

Unnamed: 0,state,mean_before,mean_after,var_before,var_after,count_before,count_after,se_before,se_after
0,PA,23.331169,21.165584,140.57145,68.504293,77,77,1.351149,0.943221
1,NJ,20.439408,21.027429,82.923591,86.360291,321,319,0.508261,0.520309


2. Second Approach: regression

In [10]:
# Build one long-format frame per period so the two can be stacked:
# both end up with the columns `state`, `fte`, `after`, where `after` is the
# period indicator (0 for the `fte` column, 1 for the `fte_after` column).
# `assign`/`rename` return new frames, so `candk_data` is never modified and
# the cell gives the same result every time it is re-run.
before = candk_data[['state', 'fte']].assign(after=0)

after = (
    candk_data[['state', 'fte_after']]
    .rename(columns={'fte_after': 'fte'})  # same outcome name in both periods
    .assign(after=1)
)

In [11]:
# Stack the two period frames row-wise; each frame keeps its original row
# labels, so every label appears twice in the result (once per period).
panel_data = pd.concat(objs=[before, after], axis='index')

In [12]:
# Difference-in-differences regression on the stacked panel.
# `after * state` expands to after + state + after:state, so the model has
# three regressors plus an intercept; the `after:state` interaction term is
# the difference-in-differences estimate.
# Rows whose `fte` is missing are dropped by the formula interface.
# `smf` is the statsmodels formula API imported in the first cell of the
# notebook; the duplicate import under the mistyped alias `amf` was removed.
ols_result = smf.ols('fte ~ after * state', data=panel_data).fit()

print(ols_result.summary())

                            OLS Regression Results                            
Dep. Variable:                    fte   R-squared:                       0.007
Model:                            OLS   Adj. R-squared:                  0.004
Method:                 Least Squares   F-statistic:                     1.964
Date:                Tue, 31 Dec 2024   Prob (F-statistic):              0.118
Time:                        16:58:14   Log-Likelihood:                -2904.2
No. Observations:                 794   AIC:                             5816.
Df Residuals:                     790   BIC:                             5835.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
Intercept      23.3312      1.072     21.767      