In [216]:
import warnings
warnings.filterwarnings('ignore')
from pymatch.Matcher import Matcher
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import scipy
import openpyxl
import statsmodels.api as sm
from linearmodels.iv import IV2SLS
from numpy.linalg import inv

import os
#cwd = os.getcwd()

# Problem 1
The dataset "data1.csv" consists of 72 observations of average expenditures, age, and income of individual $i$.<br>
Let the dependent variable, $Y_i = Avgexp$ for average expenditure. <br>
The independent variables are separated into two groups:<br>
$X_{1i} = Age$ and $X_{2i} = Income$ <br>

In [217]:
file = 'data1.csv'
df = pd.read_csv(file)
#print('Descriptive statistics of the data')
#df.describe()

# A big multiple regression of $Y_i$ on $(X_{1i}, X_{21})$
\begin{equation}
Y_i = X'_{1i}\beta_1   + X'_{2i}\beta_2   + \mu_i
\end{equation}

In [188]:
X1 = df.Age
X2 = df.Income
X = pd.concat([X1,X2], axis=1)
#X = sm.add_constant(X)
Y = df.Avgexp
model = sm.OLS(Y, X).fit()
predictions1 = model.predict()
residuals = model.resid
#model.summary()
print(round(model.params,3))

Age       -0.686
Income    82.900
dtype: float64


# Two-part regression
Regression of $Y_i$ on $X_{2i}$: 
\begin{equation}
Y_i = \delta X_{2i} + e_i
\end{equation}

In [195]:
model_y = sm.OLS(Y, X2).fit()
predictions_y = model_y.predict()
resid_y = model_y.resid
#model_y.summary()
print('Estimated parameter \u03B4 =',round(model_y.params,3)[0])

Estimated parameter δ = 77.654


Regression of $X_{1i}$ on $X_{21}$:
\begin{equation}
X_{1i} = \phi X_{2i} + v_i
\end{equation}

In [197]:
model_x = sm.OLS(X1, X2).fit()
predictions_x = model_x.predict()
resid_x = model_x.resid
#model_x.summary()
print('Estimated parameter \u03A6 =',round(model_x.params,3)[0])

Estimated parameter Φ = 7.647


Regression on the residuals from the $Y$ regression on the residuals from the $X$ regression:
\begin{equation}
e_i = b v_i + \eta_i
\end{equation}

In [198]:
resid_reg = sm.OLS(resid_y, resid_x).fit()
predictions = resid_reg.predict()
#resid_reg.summary()
print('Estimated parameter b =',round(resid_reg.params,3)[0])

Estimated parameter b = -0.686


The two-part regression leads to the same estimate  $\widehat{\beta}_1 = -0.686$

# Regressing $Y_i$ on residuals of $X_{1i}$ on $X_{2i}$

Residuals from regression of $X_{1i}$ on $X_{21}$:
\begin{equation}
v_i = X_{1i} - \phi X_{2i}
\end{equation}

Regressing $Y_i$ on $v_i$
\begin{equation}
Y_i = \theta v_i + \xi_i
\end{equation}

In [201]:
model_c = sm.OLS(Y, resid_x).fit()
#model_c.summary()
print('Estimated parameter \u03B8 =', round(model_c.params,3)[0])

Estimated parameter θ = -0.686


Regressing $Y_i$ on $\widetilde{X}_{1i}$ where $Y_i$ is the original data and $\widetilde{X}_{1i}$ is the residuals from regressing $X_{1i}$ on $X_{2i}$ gives the same result for $\beta_1$ estimate, $\widehat{\beta}_1 = -0.686$.<br><br>
The original model  
\begin{equation}
Y_i = X'_{1i}\beta_1   + X'_{2i}\beta_2   + \mu_i$
\end{equation}
We get $\beta_1$ from OLS estimate:
\begin{equation}
\widehat{\beta}_1 = (X_{1i}'X_{1i})^{-1}X_{1i}'Y
\end{equation}
<br>
The regression of $Y_i$ on the residuals of $X_{1i}$ on $X_{2i}$ is such that:
\begin{equation}
Y_i = \theta v_i + \xi_i = \theta X_{1i} - \theta \phi X_{2i} + \xi_i
\end{equation}
<br>
Let the coefficient on $X_{2i}$ be called $\lambda$, so the equation becomes
\begin{equation}
Y_i = \theta X_{1i} - \lambda X_{2i} + \xi_i
\end{equation}
<br>
Now, an estimate of $\theta$ is
\begin{equation}
\widehat{\theta} = (X_{1i}'X_{1i})^{-1}X_{1i}'Y = \widehat{\beta}_1
\end{equation}

# Problem 2
The dataset "data2" consists of observations on consumption, income, government expenditure, and investment. The model is such that $Y_i = Consumption$ is the dependent variable and income is the independent variable.<br> 
\begin{equation}
Y_i = \beta_0 + \beta_1 Income_i + \epsilon_i
\end{equation}<br>
Assuming that income is an endogenous variable i.e. $Cov(Income_i,\epsilon_i) \neq 0$ and <br>
\begin{equation}
Income = \delta_0 + \delta_1 Y_i + \alpha Investment_i+ \mu_i
\end{equation} <br>
with investment as our IV for income to make a just identified model, such that $Cov(investment, \epsilon_i) = 0$

In [37]:
file = 'data2.csv'
df2 = pd.read_csv(file)
#print('Descriptive statistics of the data')
#df.describe()

# IV estimation with GMM 
$\beta_{IV} = (Z'X)^{-1}Z'Y$ <br> 

In [175]:
df2['const'] = 1
X = np.array(df2[['const','income']])
Y = np.array(df2['consumption'])
Z = np.matrix(df2[['const','investment']]) 

beta_IV = inv(Z.transpose()*X)*(Z.transpose()*Y.reshape(20,1))
print("The estimates of beta_IV")
print('beta0',beta_IV[0])
print('beta1',beta_IV[1])

The estimates of beta_IV
beta0 [[2.79999699]]
beta1 [[0.77408956]]


# 2SLS IV estimation 
First stage estimation: regression of treatment variable on the instrument.<br>
$Income = \delta_0 + \delta_1 Investment_i + \mu_i$ <br>

In [173]:
results_fs = sm.OLS(df2['income'],
                    df2[['const', 'investment']]).fit()
print(round(results_fs.params,3))

const          2.167
investment    13.359
dtype: float64


Second stage estimation: regression of the outcome on the predicted value from the first stage regression
$Y_i = \beta_0 + \beta_1 \widehat{Income}_i  + \epsilon_i$

In [174]:
df2['predicted_inc'] = results_fs.predict()

results_ss = sm.OLS(df2['consumption'],
                    df2[['const', 'predicted_inc']]).fit()
print(round(results_ss.params,3))

const            2.800
predicted_inc    0.774
dtype: float64


# 2SLS using a linearmodels.iv package for checking results

In [177]:
iv = IV2SLS(dependent=df2['consumption'],
            exog=df2['const'],
            endog=df2['income'],
            instruments=df2['investment']).fit(cov_type='unadjusted')

print(round(iv.params,3))

const     2.800
income    0.774
Name: parameter, dtype: float64


GMM and 2SLS estimations result in similar values for $\widehat{\beta}_1 = 0.774$.

# Ratio of reduced form coefficients
Structural equations:<br>
\begin{equation}
Y_i = \beta_0 + \beta_1 Income_i  + \epsilon_i \\
Income_i = \delta_0 + \delta_1 Y_i + \alpha Investment_i + \mu_i
\end{equation}
<br>

$Income_i = \delta_0 + \delta_1 (\beta_0 + \beta_1 Income + \epsilon_i)+ \alpha Investment_i  + \mu_i$ <br><br>
Reduced form equations:<br>
\begin{equation}
Income_i = \beta^*_0 + \frac{\alpha}{1 - \delta_1 \beta_1} Investment_i + \mu^*_i \\
Y_i = \beta^{**}_0 + \frac{\beta_1 \alpha}{1 - \delta_1 \beta_1} Investment_i + \epsilon*_i
\end{equation}
<br>
Ratio of the reduced form coefficients on Investment
\begin{equation}
Ratio = \frac{\frac{\beta_1 \alpha}{1 - \delta_1 \beta_1}}{\frac{\alpha}{1 - \delta_1 \beta_1}}\\
Ratio = \frac{\beta_1 \alpha}{1 - \delta_1 \beta_1}\frac{1 - \delta_1 \beta_1}{\alpha} = \beta_1
\end{equation}

In [215]:
# Running the two reduced form equations
cons = sm.OLS(df2['consumption'],
             df2[['const','income']]).fit()
beta0 = cons.params[0]
beta1 = cons.params[1]
inc = sm.OLS(df2['income'],
                    df2[['const','consumption','investment']]).fit()
delta0 = inc.params[0]
delta1 = inc.params[1]
alpha = inc.params[2]
num = (beta1*alpha / (1-delta1*beta1))
denum = alpha/(1-delta1*beta1)
Ratio = num / denum
print('Ratio of reduced form equations',round(Ratio,4))

Ratio of reduced form equations 0.7748


This value matches the IV estimates for $\beta_1$ from the GMM and 2SLS methods where $\widehat{\beta}_1 = 0.774$.

# Ratio of sample covariance
Using the formula <br>
$\widehat{\alpha} = \frac{scov(\widetilde{Z},\widetilde{Y})}{scov(\widetilde{Z},\widetilde{T})}$
where the tildes mean residuals after regressing on X and scov sample covariance.<br>

In [203]:
scov_zy = np.cov(df2['investment'],df2['consumption'])
scov_zx = np.cov(df2['investment'],df2['income'])
cov_ratio = np.divide(scov_zy,scov_zx)
print(cov_ratio)

[[1.         0.77408956]
 [0.77408956 0.64235919]]


Where the sample covariance ratio IV estimate is 0.774, which matches our previous estimation results from GMM, 2SLS, and the ratio of reduced form coefficients.