In [4]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
from linearmodels.iv import IV2SLS
from statsmodels.sandbox.regression.gmm import IV2SLS as ivs
from scipy import stats
from linearmodels.iv.results import compare
from statsmodels.iolib.summary2 import summary_col

In [5]:
dfdata = pd.read_csv("data.csv")

In [6]:
dfsob = pd.get_dummies(dfdata['bplg'], prefix="sob", prefix_sep='')

In [7]:
del dfsob['sob56']

In [8]:
dfyobdummy = pd.get_dummies(dfdata['yob']-1900, prefix="yob", prefix_sep='')

In [9]:
del dfyobdummy['yob54']

In [10]:
frame = [dfdata, dfsob, dfyobdummy]
dfdatav2 = pd.concat(frame, axis=1)

In [11]:
dfdatav2['census70'] = np.where(dfdatav2["year"] == 1970, 1, 0)

In [12]:
dfdatav2['census80'] = np.where(dfdatav2["year"] == 1980, 1, 0)

In [13]:
dfdatav2['male'] = np.where(dfdatav2["sex"] == 1, 1, 0)

In [14]:
dfdatav2['white'] = np.where(dfdatav2["race"] == 1, 1, 0)

In [15]:
dfdatav2['age2'] = dfdatav2['age']^2

In [16]:
dfdatav2['age3'] = dfdatav2['age']^3

In [17]:
dfdatav2['age4'] =dfdatav2['age']^4

In [18]:
dfdatav2['ny7'] = np.where(dfdatav2["numlawyears"] == 7, 1, 0)

In [19]:
dfdatav2['ny8'] = np.where(dfdatav2["numlawyears"] == 8, 1, 0)

In [20]:
dfdatav2['ny9'] = np.where(dfdatav2["numlawyears"] >= 9, 1, 0)

In [21]:
dfcohort = dfdatav2.groupby(['bplg', 'birthyr'])

In [22]:
grouplnwkwage = dfcohort.apply(lambda x: x['lnwkwage'].tolist()).to_dict()
groupslwt = dfcohort.apply(lambda x: x['slwt'].tolist()).to_dict()

In [23]:
dfcohortmean = dfcohort.mean()

In [24]:
dfcontrol = dfcohortmean.drop(['year','statefip','sex','birthqtr','race','higrade','incwage','weeks','slwt','yob','yearat14','numlawyears','enrolage','drop_age','req_sch','work_age','work_sch','ca','cl','lnwkwage','yob_bplg'], axis=1) 

In [25]:
p9010 = {k: np.percentile(v, 90, interpolation='linear')-np.percentile(v,10, interpolation='linear') for k, v in grouplnwkwage.items()}
p5010 = {k: np.percentile(v, 50, interpolation='linear')-np.percentile(v,10, interpolation='linear') for k, v in grouplnwkwage.items()}
p9050 = {k: np.percentile(v, 90, interpolation='linear')-np.percentile(v,50, interpolation='linear') for k, v in grouplnwkwage.items()}
pslwt = {k: np.sum(v) for k, v in groupslwt.items()}

In [26]:
ds = [p9010, p5010, p9050, pslwt]
d={}
for k in p9010.keys():
    d[k]=tuple(d[k] for d in ds)

In [27]:
dfpwage = pd.DataFrame.from_dict(d, orient='index', columns=['difference9010', 'difference5010', 'difference9050','slwt'])

In [28]:
dfpwage.head()

Unnamed: 0,difference9010,difference5010,difference9050,slwt
"(1, 1905)",2.066198,1.342408,0.723789,25597.0
"(1, 1906)",2.383811,1.545925,0.837886,23604.0
"(1, 1907)",2.390939,1.53393,0.857008,25602.0
"(1, 1908)",2.33274,1.46688,0.86586,26002.0
"(1, 1909)",2.129042,1.26621,0.862832,30098.0


In [29]:
dfdata = pd.concat([dfcontrol,dfpwage], axis=1)

In [30]:
controls = ['census80','male','white','age','age2','age3','age4','sob1','sob4','sob5','sob6','sob8','sob9','sob10','sob11','sob12','sob13','sob16','sob17','sob18','sob19','sob20','sob21','sob22','sob23','sob24','sob25','sob26','sob27','sob28','sob29','sob30','sob31','sob32','sob33','sob34','sob35','sob36','sob37','sob38','sob39','sob40','sob41','sob42','sob44','sob45','sob46','sob47','sob48','sob49','sob50','sob51','sob53','sob54','sob55','yob5','yob6','yob7','yob8','yob9','yob10','yob11','yob12','yob13','yob14','yob15','yob16','yob17','yob18','yob19','yob20','yob21','yob22','yob23','yob24','yob25','yob26','yob27','yob28','yob29','yob30','yob31','yob32','yob33','yob34','yob35','yob36','yob37','yob38','yob39','yob40','yob41','yob42','yob43','yob44','yob45','yob46','yob47','yob48','yob49','yob50','yob51','yob52','yob53']

In [31]:
string_controls = 'census70'
for i in controls:
    string_controls = string_controls + '+' + i

In [32]:
dfdata.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,age,gradcap,sob1,sob4,sob5,sob6,sob8,sob9,sob10,sob11,...,age2,age3,age4,ny7,ny8,ny9,difference9010,difference5010,difference9050,slwt
bplg,birthyr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,1905,54.252918,7.863813,1,0,0,0,0,0,0,0,...,52.252918,52.747082,50.252918,0.0,0.0,0.0,2.066198,1.342408,0.723789,25597.0
1,1906,53.299578,7.919831,1,0,0,0,0,0,0,0,...,54.101266,53.700422,49.299578,0.0,0.0,0.0,2.383811,1.545925,0.837886,23604.0
1,1907,52.249027,9.050584,1,0,0,0,0,0,0,0,...,54.249027,54.750973,48.249027,0.0,0.0,0.0,2.390939,1.53393,0.857008,25602.0
1,1908,51.275862,8.371648,1,0,0,0,0,0,0,0,...,50.37931,49.931034,53.068966,0.0,0.0,0.0,2.33274,1.46688,0.86586,26002.0
1,1909,50.288079,8.466887,1,0,0,0,0,0,0,0,...,48.288079,48.711921,54.288079,0.0,0.0,0.0,2.129042,1.26621,0.862832,30098.0


In [33]:
endog = ['gradcap']
exog = ['census70','census80','male','white','age','age2','age3','age4','sob1','sob4','sob5','sob6','sob8','sob9','sob10','sob11','sob12','sob13','sob16','sob17','sob18','sob19','sob20','sob21','sob22','sob23','sob24','sob25','sob26','sob27','sob28','sob29','sob30','sob31','sob32','sob33','sob34','sob35','sob36','sob37','sob38','sob39','sob40','sob41','sob42','sob44','sob45','sob46','sob47','sob48','sob49','sob50','sob51','sob53','sob54','sob55','yob5','yob6','yob7','yob8','yob9','yob10','yob11','yob12','yob13','yob14','yob15','yob16','yob17','yob18','yob19','yob20','yob21','yob22','yob23','yob24','yob25','yob26','yob27','yob28','yob29','yob30','yob31','yob32','yob33','yob34','yob35','yob36','yob37','yob38','yob39','yob40','yob41','yob42','yob43','yob44','yob45','yob46','yob47','yob48','yob49','yob50','yob51','yob52','yob53']
dfdata['const'] = 1
control = ['const'] + exog
instr = ['ny7','ny8','ny9']
firststage = control + instr
ols = control+endog

In [34]:
iv9010 = IV2SLS(dfdata['difference9010'],dfdata[control],dfdata[endog],dfdata[instr],weights=dfdata['slwt']).fit(cov_type='robust')
print(iv9010)

                          IV-2SLS Estimation Summary                          
Dep. Variable:         difference9010   R-squared:                      0.8665
Estimator:                    IV-2SLS   Adj. R-squared:                 0.8605
No. Observations:                2449   F-statistic:                 2.392e+04
Date:                Fri, Mar 20 2020   P-value (F-stat)                0.0000
Time:                        02:40:58   Distribution:                chi2(106)
Cov. Estimator:                robust                                         
                                                                              
                             Parameter Estimates                              
            Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------
const          9.1255     0.8736     10.446     0.0000      7.4134      10.838
census70       3.1834     1.6077     1.9801     0.04

In [35]:
print(iv9010.first_stage)

    First Stage Estimation Results    
                               gradcap
--------------------------------------
R-squared                       0.9997
Partial R-squared               0.1110
Shea's R-squared                0.1110
Partial F-statistic             194.06
P-value (Partial F-stat)        0.0000
Partial F-stat Distn           chi2(3)
const                           0.8028
                              (0.4095)
census70                        2.1779
                              (0.6498)
census80                        0.1270
                              (0.0190)
male                           -0.6558
                             (-2.3031)
white                           5.4414
                              (15.058)
age                             0.4202
                              (1.2422)
age2                           -0.0017
                             (-0.0095)
age3                           -0.0570
                             (-0.3442)
age4                     

In [36]:
iv5010 = IV2SLS(dfdata['difference5010'],dfdata[control],dfdata[endog],dfdata[instr],weights=dfdata['slwt']).fit(cov_type='robust')
print(iv5010)

                          IV-2SLS Estimation Summary                          
Dep. Variable:         difference5010   R-squared:                      0.6516
Estimator:                    IV-2SLS   Adj. R-squared:                 0.6359
No. Observations:                2449   F-statistic:                    6849.3
Date:                Fri, Mar 20 2020   P-value (F-stat)                0.0000
Time:                        02:40:59   Distribution:                chi2(106)
Cov. Estimator:                robust                                         
                                                                              
                             Parameter Estimates                              
            Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------
const          7.2580     0.7549     9.6152     0.0000      5.7786      8.7375
census70       2.4298     1.3630     1.7826     0.07

In [37]:
print(iv5010.first_stage)

    First Stage Estimation Results    
                               gradcap
--------------------------------------
R-squared                       0.9997
Partial R-squared               0.1110
Shea's R-squared                0.1110
Partial F-statistic             194.06
P-value (Partial F-stat)        0.0000
Partial F-stat Distn           chi2(3)
const                           0.8028
                              (0.4095)
census70                        2.1779
                              (0.6498)
census80                        0.1270
                              (0.0190)
male                           -0.6558
                             (-2.3031)
white                           5.4414
                              (15.058)
age                             0.4202
                              (1.2422)
age2                           -0.0017
                             (-0.0095)
age3                           -0.0570
                             (-0.3442)
age4                     

In [38]:
iv9050 = IV2SLS(dfdata['difference9050'],dfdata[control],dfdata[endog],dfdata[instr],weights=dfdata['slwt']).fit(cov_type='robust')
print(iv9050)

                          IV-2SLS Estimation Summary                          
Dep. Variable:         difference9050   R-squared:                      0.9328
Estimator:                    IV-2SLS   Adj. R-squared:                 0.9298
No. Observations:                2449   F-statistic:                 3.864e+04
Date:                Fri, Mar 20 2020   P-value (F-stat)                0.0000
Time:                        02:41:00   Distribution:                chi2(106)
Cov. Estimator:                robust                                         
                                                                              
                             Parameter Estimates                              
            Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------
const          1.8675     0.3673     5.0841     0.0000      1.1476      2.5874
census70       0.7536     0.6622     1.1380     0.25

In [39]:
print(iv9050.first_stage)

    First Stage Estimation Results    
                               gradcap
--------------------------------------
R-squared                       0.9997
Partial R-squared               0.1110
Shea's R-squared                0.1110
Partial F-statistic             194.06
P-value (Partial F-stat)        0.0000
Partial F-stat Distn           chi2(3)
const                           0.8028
                              (0.4095)
census70                        2.1779
                              (0.6498)
census80                        0.1270
                              (0.0190)
male                           -0.6558
                             (-2.3031)
white                           5.4414
                              (15.058)
age                             0.4202
                              (1.2422)
age2                           -0.0017
                             (-0.0095)
age3                           -0.0570
                             (-0.3442)
age4                     

In [40]:
print(compare({'90-10':iv9010,'50-10': iv5010, '90-50': iv9050}))

                               Model Comparison                              
                                  90-10              50-10              90-50
-----------------------------------------------------------------------------
Dep. Variable            difference9010     difference5010     difference9050
Estimator                       IV-2SLS            IV-2SLS            IV-2SLS
No. Observations                   2449               2449               2449
Cov. Est.                        robust             robust             robust
R-squared                        0.8665             0.6516             0.9328
Adj. R-squared                   0.8605             0.6359             0.9298
F-statistic                   2.392e+04             6849.3          3.864e+04
P-value (F-stat)                 0.0000             0.0000             0.0000
const                            9.1255             7.2580             1.8675
                               (10.446)           (9.6152)      

In [42]:
OLS_9010 = IV2SLS(dfdata['difference9010'],dfdata[ols],None, None, weights=dfdata['slwt']).fit(cov_type='robust')
print(OLS_9010)

                            OLS Estimation Summary                            
Dep. Variable:         difference9010   R-squared:                      0.8666
Estimator:                        OLS   Adj. R-squared:                 0.8606
No. Observations:                2449   F-statistic:                 2.356e+04
Date:                Fri, Mar 20 2020   P-value (F-stat)                0.0000
Time:                        02:42:08   Distribution:                chi2(106)
Cov. Estimator:                robust                                         
                                                                              
                             Parameter Estimates                              
            Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------
const          9.1260     0.8728     10.456     0.0000      7.4154      10.837
census70       3.2222     1.6049     2.0078     0.04

In [43]:
OLS_5010 = IV2SLS(dfdata['difference5010'],dfdata[ols],None, None, weights=dfdata['slwt']).fit(cov_type='robust')
print(OLS_5010)

                            OLS Estimation Summary                            
Dep. Variable:         difference5010   R-squared:                      0.6520
Estimator:                        OLS   Adj. R-squared:                 0.6363
No. Observations:                2449   F-statistic:                    6804.6
Date:                Fri, Mar 20 2020   P-value (F-stat)                0.0000
Time:                        02:42:39   Distribution:                chi2(106)
Cov. Estimator:                robust                                         
                                                                              
                             Parameter Estimates                              
            Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------
const          7.2586     0.7543     9.6231     0.0000      5.7803      8.7370
census70       2.4743     1.3615     1.8173     0.06

In [44]:
OLS_9050 = IV2SLS(dfdata['difference9050'],dfdata[ols],None, None, weights=dfdata['slwt']).fit(cov_type='robust')
print(OLS_9050)

                            OLS Estimation Summary                            
Dep. Variable:         difference9050   R-squared:                      0.9328
Estimator:                        OLS   Adj. R-squared:                 0.9298
No. Observations:                2449   F-statistic:                 3.866e+04
Date:                Fri, Mar 20 2020   P-value (F-stat)                0.0000
Time:                        02:42:55   Distribution:                chi2(106)
Cov. Estimator:                robust                                         
                                                                              
                             Parameter Estimates                              
            Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------
const          1.8674     0.3675     5.0820     0.0000      1.1472      2.5876
census70       0.7479     0.6641     1.1261     0.26