In [1]:
# Applied Data Analysis School: October/november 2020
# 6. REGRESSION ANALYSIS AND CAUSALITY WITH R | By: João Cerejeira | 10 & 12 November
# https://www.gades-solutions.com/project/data-analysis-school/

# SET WROKING DIRECTORY

  setwd("C:/Users/mangelo.EEG/Documents/GitHub/R_Training/regression")

rm(list=ls()) #Removes all items in Environment!

# LIBRARIES

library(tidyverse)
library(AER)      # for `ivreg()`
library(lmtest)   # for `coeftest()` and `bptest()`.
library(broom)    # for `glance(`) and `tidy()`
library(PoEdata)  # for PoE4 datasets
library(car)      # for `hccm()` robust standard errors
library(sandwich)
library(knitr)    # for making neat tables with `kable()`
library(stargazer) 

# DATA

  data("mroz", package="PoEdata")
  
  mroz1 <- mroz[mroz$lfp==1,] #restricts sample to lfp=1

# First Stage

educ.ols <- lm(educ~exper+I(exper^2)+mothereduc, data=mroz1)

  summary(educ.ols)

  kable(tidy(educ.ols), digits=4, align='c',caption=
        "First stage in the 2SLS model for the 'wage' equation")

    stargazer(educ.ols,type = "text")

# Second Stage

    educHat <- fitted(educ.ols)

    wage.2sls <- lm(log(wage)~educHat+exper+I(exper^2), data=mroz1)
kable(tidy(wage.2sls), digits=4, align='c',caption=
        "Second stage in the 2SLS model for the 'wage' equation")

# But the standard errors are incorrect  the correct method is to use 
# a dedicated software function to solve an instrumental variable model

mroz1.ols <- lm(log(wage)~educ+exper+I(exper^2), data=mroz1)

mroz1.iv <- ivreg(log(wage)~educ+exper+I(exper^2)|
                    exper+I(exper^2)+mothereduc, data=mroz1)

stargazer(mroz1.ols, wage.2sls, mroz1.iv,
          title="Wage equation: OLS, 2SLS, and IV models compared",
          header=FALSE, 
          type="text", # "html", "text" or "latex" (in index.Rmd) 
          keep.stat="n",  # what statistics to print
          omit.table.layout="n",
          star.cutoffs=NA,
          digits=4, 
          #  single.row=TRUE,
          intercept.bottom=FALSE, #moves the intercept coef to top
          column.labels=c("OLS","explicit 2SLS", "IV mothereduc", 
                          "IV mothereduc and fathereduc"),
          dep.var.labels.include = FALSE,
          model.numbers = FALSE,
          dep.var.caption="Dependent variable: wage",
          model.names=FALSE,
          star.char=NULL) #supresses the stars

### Test for weak instruments in the  wage  equation

# we just test the joint significance of the instruments in an  educ  model

educ.ols <- lm(educ~exper+I(exper^2)+mothereduc+fathereduc, 
               data=mroz1)

  tab <- tidy(educ.ols)

  kable(tab, digits=4,
        caption="The 'educ' first-stage equation")

# The test rejects the null hypothesis that both  mothereduc  and  fathereduc  
# coefficients are zero, indicating that at least one instrument is strong. 
# A rule of thumb requires to soundly reject the null hypothesis at a value of the 
# F -statistic greater than 10 or, for only one instrument, a  t -statistic greater 
# than 3.16, to make sure that an instrument is strong.

  linearHypothesis(educ.ols, c("mothereduc=0", "fathereduc=0"))

### Specification Tests

# Hausman test for endogeneity, where the null hypothesis is  H0:Cov(x,e)=0

# Test for the validity of instruments, test for overidentifying restrictions, 
# or Sargan test H0:Cov(z,e)=0

  summary(mroz1.iv, diagnostics=TRUE)

# Results:
# Weak instruments test: rejects the null, meaning that at least one instrument is strong
# (Wu-)Hausman test for endogeneity: barely rejects the null that the variable of concern 
# is uncorrelated with the error term, indicating that  educ  is marginally endogenous
# Sargan overidentifying restrictions: does not reject the null, meaning that the extra instruments 
# are valid (are uncorrelated with the error term).



-- [1mAttaching packages[22m --------------------------------------- tidyverse 1.3.0 --

[32mv[39m [34mggplot2[39m 3.3.2     [32mv[39m [34mpurrr  [39m 0.3.4
[32mv[39m [34mtibble [39m 3.0.3     [32mv[39m [34mdplyr  [39m 1.0.2
[32mv[39m [34mtidyr  [39m 1.1.0     [32mv[39m [34mstringr[39m 1.4.0
[32mv[39m [34mreadr  [39m 1.3.1     [32mv[39m [34mforcats[39m 0.5.0

-- [1mConflicts[22m ------------------------------------------ tidyverse_conflicts() --
[31mx[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31mx[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()

Loading required package: car

Loading required package: carData


Attaching package: 'car'


The following object is masked from 'package:dplyr':

    recode


The following object is masked from 'package:purrr':

    some


Loading required package: lmtest

Loading required package: zoo


Attaching package: 'zoo'


The following objects are masked from


Call:
lm(formula = educ ~ exper + I(exper^2) + mothereduc, data = mroz1)

Residuals:
    Min      1Q  Median      3Q     Max 
-7.4423 -1.2963 -0.0837  1.1761  5.9870 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)  9.775103   0.423889  23.061   <2e-16 ***
exper        0.048862   0.041669   1.173    0.242    
I(exper^2)  -0.001281   0.001245  -1.029    0.304    
mothereduc   0.267691   0.031130   8.599   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 2.111 on 424 degrees of freedom
Multiple R-squared:  0.1527,	Adjusted R-squared:  0.1467 
F-statistic: 25.47 on 3 and 424 DF,  p-value: 3.617e-15




Table: First stage in the 2SLS model for the 'wage' equation

|    term     | estimate | std.error | statistic | p.value |
|:-----------:|:--------:|:---------:|:---------:|:-------:|
| (Intercept) |  9.7751  |  0.4239   |  23.0605  | 0.0000  |
|    exper    |  0.0489  |  0.0417   |  1.1726   | 0.2416  |
| I(exper^2)  | -0.0013  |  0.0012   |  -1.0290  | 0.3040  |
| mothereduc  |  0.2677  |  0.0311   |  8.5992   | 0.0000  |


                        Dependent variable:    
                    ---------------------------
                               educ            
-----------------------------------------------
exper                          0.049           
                              (0.042)          
                                               
I(exper2)                     -0.001           
                              (0.001)          
                                               
mothereduc                   0.268***          
                              (0.031)          
                                               
Constant                     9.775***          
                              (0.424)          
                                               
-----------------------------------------------
Observations                    428            
R2                             0.153           
Adjusted R2                    0.147           
Residual Std. Error      2.111 (df = 42



Table: Second stage in the 2SLS model for the 'wage' equation

|    term     | estimate | std.error | statistic | p.value |
|:-----------:|:--------:|:---------:|:---------:|:-------:|
| (Intercept) |  0.1982  |  0.4933   |  0.4017   | 0.6881  |
|   educHat   |  0.0493  |  0.0391   |  1.2613   | 0.2079  |
|    exper    |  0.0449  |  0.0142   |  3.1668   | 0.0017  |
| I(exper^2)  | -0.0009  |  0.0004   |  -2.1749  | 0.0302  |


Wage equation: OLS, 2SLS, and IV models compared
                   Dependent variable: wage      
             ------------------------------------
               OLS    explicit 2SLS IV mothereduc
-------------------------------------------------
Constant     -0.5220     0.1982        0.1982    
             (0.1986)   (0.4933)      (0.4729)   
                                                 
educ          0.1075                   0.0493    
             (0.0141)                 (0.0374)   
                                                 
educHat                  0.0493                  
                        (0.0391)                 
                                                 
exper         0.0416     0.0449        0.0449    
             (0.0132)   (0.0142)      (0.0136)   
                                                 
I(exper2)    -0.0008     -0.0009       -0.0009   
             (0.0004)   (0.0004)      (0.0004)   
                                                 




Table: The 'educ' first-stage equation

|term        | estimate| std.error| statistic| p.value|
|:-----------|--------:|---------:|---------:|-------:|
|(Intercept) |   9.1026|    0.4266|   21.3396|  0.0000|
|exper       |   0.0452|    0.0403|    1.1236|  0.2618|
|I(exper^2)  |  -0.0010|    0.0012|   -0.8386|  0.4022|
|mothereduc  |   0.1576|    0.0359|    4.3906|  0.0000|
|fathereduc  |   0.1895|    0.0338|    5.6152|  0.0000|

Unnamed: 0_level_0,Res.Df,RSS,Df,Sum of Sq,F,Pr(>F)
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,425,2219.216,,,,
2,423,1758.575,2.0,460.6411,55.4003,4.268909e-22



Call:
ivreg(formula = log(wage) ~ educ + exper + I(exper^2) | exper + 
    I(exper^2) + mothereduc, data = mroz1)

Residuals:
     Min       1Q   Median       3Q      Max 
-3.10804 -0.32633  0.06024  0.36772  2.34351 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)   
(Intercept)  0.1981861  0.4728772   0.419  0.67535   
educ         0.0492630  0.0374360   1.316  0.18891   
exper        0.0448558  0.0135768   3.304  0.00103 **
I(exper^2)  -0.0009221  0.0004064  -2.269  0.02377 * 

Diagnostic tests:
                 df1 df2 statistic p-value    
Weak instruments   1 424    73.946  <2e-16 ***
Wu-Hausman         1 423     2.968  0.0856 .  
Sargan             0  NA        NA      NA    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.6796 on 424 degrees of freedom
Multiple R-Squared: 0.1231,	Adjusted R-squared: 0.1169 
Wald test: 7.348 on 3 and 424 DF,  p-value: 8.228e-05 
