In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
from numpy import *
import matplotlib.pylab as plt
import csv
import math
import scipy
import openpyxl
import statsmodels.api as sm
from linearmodels.iv import IV2SLS
from numpy.linalg import inv
from linearmodels import PanelOLS

import os
#cwd = os.getcwd()

# Probem 2

The dataset "jtrain1" has data on firms and the amount of job training their employees get. The treatment variable of interest is "grant". 

In [2]:
# Dropping observations for which hrsemp is missing
# Only using data for year 1987 and 1988
file = 'jtrain1.csv'
df = pd.read_csv(file)
df = df[(df.year == 1987) | (df.year == 1988)]
new_df = df[['year','grant','hrsemp','fcode']]
new_df = new_df.dropna()
#np.where(np.isnan(new_df.hrsemp))

## Estimating the difference in differences estimator in 3 ways

i) Taking the diff-in-diff of the means of "hrsemp" $(control,treatment)\times(before,after)$

In [72]:
new_df['E'] = new_df.fcode.isin(new_df[new_df['grant'] == 1]
                                .fcode.tolist()).astype(int)
new_df['dum88'] = (new_df['year'] == 1988).astype(int)
year = pd.Categorical(new_df.year)
fcode = pd.Categorical(new_df.fcode)
new_df = new_df.set_index(['fcode','year'])
new_df['year'] = year
new_df['fcode'] = fcode
means = new_df.groupby(['year','E'])['hrsemp'].mean()
d_i_d = (means[1988,1]) - (means[1987,1]) - 
(means[1988,0] - means[1987,0])
print("The difference in difference estimator is",round(d_i_d,2))

The difference in difference estimator is 28.01


ii) Regression model
\begin{equation}
hrsemp_{it} = \beta_0 + \beta_1 grant_{it} + \beta_21(year=1988) + \beta_3E_i + \mu_{it}
\end{equation}
where $E_i$ is a dummy variable for being a treatment (i.e. a firm recieves the grant in 1988)

In [40]:
exog = ['grant','dum88','E']
x = sm.add_constant(new_df[exog])
mod1 = PanelOLS(new_df.hrsemp, x)
res1 = mod1.fit(cov_type='clustered', cluster_entity=True)
print(res1)

                          PanelOLS Estimation Summary                           
Dep. Variable:                 hrsemp   R-squared:                        0.1501
Estimator:                   PanelOLS   R-squared (Between):              0.0692
No. Observations:                 256   R-squared (Within):               0.4710
Date:                Thu, Oct 04 2018   R-squared (Overall):              0.1501
Time:                        14:26:03   Log-likelihood                   -1140.5
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      14.831
Entities:                         131   P-value                           0.0000
Avg Obs:                       1.9542   Distribution:                   F(3,252)
Min Obs:                       1.0000                                           
Max Obs:                       2.0000   F-statistic (robust):             14.637
                            

iii) Fixed effect regression:
\begin{equation}
hrsemp_{it} = \theta_i + \beta_1 grant_{it} + \beta_21(year=1988) + \mu_{it}
\end{equation}

In [49]:
exog = ['grant','dum88','fcode']
x = sm.add_constant(new_df[exog])
mod2 = PanelOLS(new_df.hrsemp,x)
res2 = mod2.fit()
print("The parameter estimate")
print(round(res2.params[0:3],2))

The parameter estimate
const     7.27
grant    27.88
dum88     0.51
Name: parameter, dtype: float64


The parameter estimates from the three estimations result in approximately the same values, at around 28. This is the case because we only have 2 time periods so the fixed effects estimator is the same as the difference in means before and after the treatment. 

## Building on the fixed effect regression
Using all 3 years of data rather than 2 years only and including a firm specific time trend in the model in 2 ways

i) Add the firm-specific trends to the right-hand side

In [3]:
df = pd.read_csv(file)
df = df[df['hrsemp'] >= 0]
df['E'] = df.fcode.isin(df[df['grant'] == 1].
                        fcode.tolist()).astype(int)
year = pd.Categorical(df.year)
fcode = pd.Categorical(df.fcode)
df = df.set_index(['fcode','year'])
df['year'] = year
df['fcode'] = fcode
df['year*fcode']= pd.Categorical(df['year'].
                astype(str)+'_' + df['fcode'].astype(str))

In [79]:
exog = ['grant','fcode','year']
x = sm.add_constant(df[exog])
mod3 = PanelOLS(df.hrsemp,x)
res3 = mod3.fit()
print("The parameter estimate")
print(round(res3.params[0:2],2))

The parameter estimate
const     5.04
grant    34.04
Name: parameter, dtype: float64


ii) For each firm, run a regression of the dependent and independent variables on an intercept and a time trend, take the residuals, and run a regression of the dependent residuals on the independent residuals

In [64]:
exog = ['year','fcode']
x = sm.add_constant(df[exog])
mod4 = PanelOLS(df.hrsemp,x)
res4 = mod4.fit()
df['resid_y'] = res4.predict()

mod5 = PanelOLS(df.grant,x)
res5 = mod5.fit()
df['resid_grant'] = res5.predict()

In [65]:
exog = ['resid_grant']
x = sm.add_constant(df[exog])
mod6 = PanelOLS(df.resid_y, x)
res6 = mod6.fit()
print(res6)

                          PanelOLS Estimation Summary                           
Dep. Variable:                resid_y   R-squared:                        0.0860
Estimator:                   PanelOLS   R-squared (Between):              0.0511
No. Observations:                 390   R-squared (Within):               0.7492
Date:                Thu, Oct 04 2018   R-squared (Overall):              0.0860
Time:                        14:43:32   Log-likelihood                   -1716.2
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      36.520
Entities:                         135   P-value                           0.0000
Avg Obs:                       2.8889   Distribution:                   F(1,388)
Min Obs:                       1.0000                                           
Max Obs:                       3.0000   F-statistic (robust):             36.520
                            

The coefficients on grant are not exactly the same when we modeled the firm specific trends differently. The coefficient is greater when we added the firm trend directly into the right hand side.