In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline

  from pandas.core import datetools


In [2]:
df = pd.read_excel('TestExer4_Wage-round1.xls')

In [3]:
df.head()

Unnamed: 0,logw,educ,age,exper,smsa,south,nearc,daded,momed
0,6.306275,7,29,16,1,0,0,9.94,10.25
1,6.175867,12,27,9,1,0,0,8.0,8.0
2,6.580639,12,34,16,1,0,0,14.0,12.0
3,5.521461,11,27,10,1,0,1,11.0,12.0
4,6.591674,12,34,16,1,0,1,8.0,7.0


## Part (a)

The coefficient for `educ` in the OLS estimate is 0.0816. This means that when education increases by 1 year `logw` increases by 0.082.

In [4]:
df['exper2'] = df['exper']**2 

In [5]:
X = df[['educ', 'exper', 'exper2', 'smsa', 'south']]
X = sm.add_constant(X)
y = df['logw']

model = sm.OLS(y,X)
result = model.fit()

print(result.summary())

                            OLS Regression Results                            
Dep. Variable:                   logw   R-squared:                       0.263
Model:                            OLS   Adj. R-squared:                  0.262
Method:                 Least Squares   F-statistic:                     214.6
Date:                Tue, 24 Apr 2018   Prob (F-statistic):          3.70e-196
Time:                        03:11:48   Log-Likelihood:                -1365.6
No. Observations:                3010   AIC:                             2743.
Df Residuals:                    3004   BIC:                             2779.
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          4.6110      0.068     67.914      0.0

## Part (b)

`Intelligence`, `Efficiency` of a person may be factors that could make `educ` and `exper` endogenous.  

In this case OLS is very useful as it is inconsistent so the estimate in Part(a) is ot useful.

## Part (c)

`age` is related to `exper` as older people usually have a lot of experience. So, `age` and `age2` would be highly correlated with `exper` and `exper2`.

## Part (d)

### First Stage Regression

All the instruments have high correlation with `educ` as evidenced by their p-values. As the endogenous variable and the instrument variables have high correlation, they are suitable instruments for schooling.

In [6]:
df['age2'] = df['age']**2

In [7]:
y1 = df['educ']
X1 = df[['smsa', 'south', 'age', 'age2', 'nearc', 'daded', 'momed']]
X1 = sm.add_constant(X1)

model1 = sm.OLS(y1,X1)
res1 = model1.fit()

print(res1.summary())

                            OLS Regression Results                            
Dep. Variable:                   educ   R-squared:                       0.247
Model:                            OLS   Adj. R-squared:                  0.245
Method:                 Least Squares   F-statistic:                     140.4
Date:                Tue, 24 Apr 2018   Prob (F-statistic):          2.14e-179
Time:                        03:11:49   Log-Likelihood:                -6808.2
No. Observations:                3010   AIC:                         1.363e+04
Df Residuals:                    3002   BIC:                         1.368e+04
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -5.6524      3.976     -1.421      0.1

In [8]:
y2 = df['exper']
X2 = df[['smsa', 'south', 'age', 'age2', 'nearc', 'daded', 'momed']]
X2 = sm.add_constant(X2)

model2 = sm.OLS(y2,X2)
res2 = model2.fit()

print(res2.summary())

                            OLS Regression Results                            
Dep. Variable:                  exper   R-squared:                       0.685
Model:                            OLS   Adj. R-squared:                  0.685
Method:                 Least Squares   F-statistic:                     933.7
Date:                Tue, 24 Apr 2018   Prob (F-statistic):               0.00
Time:                        03:11:49   Log-Likelihood:                -6808.2
No. Observations:                3010   AIC:                         1.363e+04
Df Residuals:                    3002   BIC:                         1.368e+04
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.3476      3.976     -0.087      0.9

In [9]:
y3 = df['exper2']
X3 = df[['smsa', 'south', 'age', 'age2', 'nearc', 'daded', 'momed']]
X3 = sm.add_constant(X3)

model3 = sm.OLS(y3,X3)
res3 = model3.fit()

print(res3.summary())

                            OLS Regression Results                            
Dep. Variable:                 exper2   R-squared:                       0.657
Model:                            OLS   Adj. R-squared:                  0.656
Method:                 Least Squares   F-statistic:                     820.4
Date:                Tue, 24 Apr 2018   Prob (F-statistic):               0.00
Time:                        03:11:49   Log-Likelihood:                -16020.
No. Observations:                3010   AIC:                         3.206e+04
Df Residuals:                    3002   BIC:                         3.210e+04
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        681.3828     84.846      8.031      0.0

In [10]:
df['pred_educ'] = res1.predict(X1)
df['pred_exper'] = res2.predict(X2)
df['pred_exper2'] = res3.predict(X3)

In [11]:
y4 = df['logw']
X4 = df[['smsa', 'south', 'pred_educ', 'pred_exper', 'pred_exper2']]
X4 = sm.add_constant(X4)

model4 = sm.OLS(y4, X4)
res4 = model4.fit()

print(res4.summary())

                            OLS Regression Results                            
Dep. Variable:                   logw   R-squared:                       0.219
Model:                            OLS   Adj. R-squared:                  0.218
Method:                 Least Squares   F-statistic:                     168.6
Date:                Tue, 24 Apr 2018   Prob (F-statistic):          1.84e-158
Time:                        03:11:49   Log-Likelihood:                -1452.9
No. Observations:                3010   AIC:                             2918.
Df Residuals:                    3004   BIC:                             2954.
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
const           4.4169      0.118     37.476      

## Part (e)  

As can be observed from the above table, `educ` (`pred_educ`) has a positive effect on `logw`.

## Part (f)

**Sargan Test** 

$H0 :\delta = 0 $ in  
$\epsilon = Z\delta + \zeta $

$nR^2 = 3010*0.001 = 3.01$  

m = 8, k = 6  

$\chi^2(m-k) = \chi^2(2) = 5.99$  

Since, $nR^2 < \chi^2(2)$ we do not reject the null hypothesis, $H0$. So, the instruments are valid as $Z$ is not correlated with th error term $\epsilon$.


In [12]:
e_2SLS = df['logw'] - res4.predict(X)

In [13]:
y = e_2SLS
Z = df[['smsa', 'south', 'age', 'age2', 'nearc', 'daded', 'momed']]
Z = sm.add_constant(Z)

model = sm.OLS(y,Z)
res = model.fit()

print(res.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.658
Model:                            OLS   Adj. R-squared:                  0.657
Method:                 Least Squares   F-statistic:                     826.0
Date:                Tue, 24 Apr 2018   Prob (F-statistic):               0.00
Time:                        03:11:49   Log-Likelihood:                -8760.5
No. Observations:                3010   AIC:                         1.754e+04
Df Residuals:                    3002   BIC:                         1.759e+04
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        -68.9047      7.606     -9.059      0.0

In [14]:
n = df.shape[0]

In [15]:
print('Number of samples = {}'.format(n))

Number of samples = 3010


In [16]:
print("n*R-squared = {}".format(n*0.001))

n*R-squared = 3.0100000000000002
