<a href="https://colab.research.google.com/github/rohitm487/Causal_Inference_in_Python/blob/main/4_Causal_Inference_Instrumental_Variables.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Install linearmodels (provides IV2SLS). The %pip magic installs into the
# environment of the running kernel, and the version is pinned to the release
# this notebook was last executed with so results stay reproducible.
%pip install linearmodels==6.1

Collecting linearmodels
  Downloading linearmodels-6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.9 kB)
Collecting mypy-extensions>=0.4 (from linearmodels)
  Downloading mypy_extensions-1.0.0-py3-none-any.whl.metadata (1.1 kB)
Collecting pyhdfe>=0.1 (from linearmodels)
  Downloading pyhdfe-0.2.0-py3-none-any.whl.metadata (4.0 kB)
Collecting formulaic>=1.0.0 (from linearmodels)
  Downloading formulaic-1.0.2-py3-none-any.whl.metadata (6.8 kB)
Collecting setuptools-scm<9.0.0,>=8.0.0 (from setuptools-scm[toml]<9.0.0,>=8.0.0->linearmodels)
  Downloading setuptools_scm-8.1.0-py3-none-any.whl.metadata (6.6 kB)
Collecting interface-meta>=1.2.0 (from formulaic>=1.0.0->linearmodels)
  Downloading interface_meta-1.3.0-py3-none-any.whl.metadata (6.7 kB)
Downloading linearmodels-6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m28.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols
from linearmodels.iv import IV2SLS

# Simulate Instrumental Variable dataset
def sim_iv_df(n=100, seed=42):
    """Simulate a small instrumental-variable (IV) dataset.

    The data-generating process is:

    * ``Received_Email`` (instrument, Z): Bernoulli(0.5), independent of motivation.
    * ``Unobs_Motivation`` (confounder): standard normal; drives both X and Y.
    * ``Use_Mobile_App`` (endogenous regressor, X): 1 when
      ``Received_Email + Unobs_Motivation + noise > 0``, else 0.
    * ``Retention`` (outcome, Y): 1 when
      ``Use_Mobile_App + Unobs_Motivation + noise > 0``, else 0.

    Both noise terms are drawn from Normal(0, 0.5).

    Parameters
    ----------
    n : int, default 100
        Number of simulated users (rows).
    seed : int, default 42
        Seed for the random number generator. The defaults reproduce exactly
        the dataset the original ``np.random.seed(42)`` version generated.

    Returns
    -------
    pandas.DataFrame
        ``n`` rows with the columns ``Received_Email``, ``Unobs_Motivation``,
        ``Use_Mobile_App`` and ``Retention``.
    """
    # A private legacy RandomState yields the same stream as calling
    # np.random.seed(seed) followed by np.random.* draws, but does not reseed
    # NumPy's global generator as a hidden side effect of building the data.
    rng = np.random.RandomState(seed)

    # Instrument: whether the user received an email (Z)
    Received_Email = rng.binomial(1, 0.5, n)

    # Unobserved motivation (confounder affecting both X and Y)
    Unobs_Motivation = rng.normal(0, 1, n)

    # Use of mobile app (endogenous variable, X), influenced by email and motivation
    Use_Mobile_App = (Received_Email + Unobs_Motivation + rng.normal(0, 0.5, n) > 0).astype(int)

    # Retention (dependent variable, Y), affected by app usage and motivation
    Retention = (Use_Mobile_App + Unobs_Motivation + rng.normal(0, 0.5, n) > 0).astype(int)

    return pd.DataFrame({
        'Received_Email': Received_Email,
        'Unobs_Motivation': Unobs_Motivation,
        'Use_Mobile_App': Use_Mobile_App,
        'Retention': Retention
    })

# Build the simulated IV dataset
dat = sim_iv_df()

# Peek at the first few rows
print(dat.head())

# Endogeneity check: compare mean (normally unobservable) motivation across
# app-usage groups and across retention groups. A gap between the group means
# shows motivation is related to both the regressor and the outcome.
for group_col in ('Use_Mobile_App', 'Retention'):
    print(dat.groupby(group_col)['Unobs_Motivation'].mean())


   Received_Email  Unobs_Motivation  Use_Mobile_App  Retention
0               0          0.087047               1          1
1               1         -0.299007               1          1
2               1          0.091761               1          1
3               1         -1.987569               1          0
4               0         -0.219672               1          1
Use_Mobile_App
0   -0.636267
1    0.311773
Name: Unobs_Motivation, dtype: float64
Retention
0   -0.964489
1    0.373579
Name: Unobs_Motivation, dtype: float64


In [4]:
# Baseline OLS of retention on app usage. This ignores the endogeneity of
# Use_Mobile_App, so its coefficient also absorbs the effect of motivation.
naive_formula = 'Retention ~ Use_Mobile_App'
model1 = ols(naive_formula, data=dat).fit()
print(model1.summary())


                            OLS Regression Results                            
Dep. Variable:              Retention   R-squared:                       0.489
Model:                            OLS   Adj. R-squared:                  0.484
Method:                 Least Squares   F-statistic:                     93.69
Date:                Fri, 11 Oct 2024   Prob (F-statistic):           6.01e-16
Time:                        09:33:39   Log-Likelihood:                -28.275
No. Observations:                 100   AIC:                             60.55
Df Residuals:                      98   BIC:                             65.76
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept          0.2727      0.056      4.

## First stage: predict `Use_Mobile_App` using the instrumental variable `Received_Email`

In [5]:
# First stage: regress the endogenous regressor (Use_Mobile_App) on the
# instrument (Received_Email)
first_stage_formula = 'Use_Mobile_App ~ Received_Email'
model2 = ols(first_stage_formula, data=dat).fit()
print(model2.summary())


                            OLS Regression Results                            
Dep. Variable:         Use_Mobile_App   R-squared:                       0.131
Model:                            OLS   Adj. R-squared:                  0.123
Method:                 Least Squares   F-statistic:                     14.84
Date:                Fri, 11 Oct 2024   Prob (F-statistic):           0.000209
Time:                        09:34:08   Log-Likelihood:                -59.388
No. Observations:                 100   AIC:                             122.8
Df Residuals:                      98   BIC:                             128.0
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept          0.5094      0.061      8.

## Second stage: we use the predicted values (`Use_Mobile_App_Hat`) from the first stage to predict retention, addressing the endogeneity.

In [8]:
# Fitted app usage from the first stage: the part of Use_Mobile_App that is
# explained by the instrument alone. Stored as a new column on dat.
instrument_frame = dat[['Received_Email']]
dat['Use_Mobile_App_Hat'] = model2.predict(exog=instrument_frame)

# Second stage: regress retention on the fitted app usage.
# NOTE(review): this manual second stage treats Use_Mobile_App_Hat as ordinary
# data, so its reported standard errors should not be used for inference;
# rely on the IV2SLS fit for that.
second_stage_formula = 'Retention ~ Use_Mobile_App_Hat'
model3 = ols(second_stage_formula, data=dat).fit()
print(model3.summary())


                            OLS Regression Results                            
Dep. Variable:              Retention   R-squared:                       0.009
Model:                            OLS   Adj. R-squared:                 -0.001
Method:                 Least Squares   F-statistic:                    0.9190
Date:                Fri, 11 Oct 2024   Prob (F-statistic):              0.340
Time:                        09:39:12   Log-Likelihood:                -61.354
No. Observations:                 100   AIC:                             126.7
Df Residuals:                      98   BIC:                             131.9
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
Intercept              0.5499      0

# Two-Stage Least Squares (2SLS):
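The two-stage least squares (2SLS) estimator performs both stages of the IV estimation in a single step. Unlike the manual second-stage OLS above, it also reports standard errors that account for the first stage, so it is the version to use for inference.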

In [6]:
# IV estimation in one call. In the linearmodels formula syntax the bracketed
# term reads [endogenous ~ instruments]; "1" adds the intercept.
iv_formula = 'Retention ~ 1 + [Use_Mobile_App ~ Received_Email]'
iv_spec = IV2SLS.from_formula(iv_formula, data=dat)
# 'robust' is the covariance estimator the fit uses by default; spelled out
# here because the OLS fits in this notebook report nonrobust standard errors.
iv_model = iv_spec.fit(cov_type='robust')
print(iv_model.summary)


                          IV-2SLS Estimation Summary                          
Dep. Variable:              Retention   R-squared:                      0.3010
Estimator:                    IV-2SLS   Adj. R-squared:                 0.2939
No. Observations:                 100   F-statistic:                    1.3414
Date:                Fri, Oct 11 2024   P-value (F-stat)                0.2468
Time:                        09:37:11   Distribution:                  chi2(1)
Cov. Estimator:                robust                                         
                                                                              
                               Parameter Estimates                                
                Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
----------------------------------------------------------------------------------
Intercept          0.5499     0.1543     3.5639     0.0004      0.2475      0.8524
Use_Mobile_App     0.2538     0.2192

In [9]:
# Print every fitted model's summary under its own heading so the estimates
# can be compared. statsmodels exposes summary() as a method, while
# linearmodels exposes summary as a property.
model_reports = [
    ("Naive Regression:", model1.summary()),
    ("\nFirst-Stage Regression:", model2.summary()),
    ("\nSecond-Stage Regression:", model3.summary()),
    ("\nTwo-Stage Least Squares (IV) Model:", iv_model.summary),
]

for heading, report in model_reports:
    print(heading)
    print(report)


Naive Regression:
                            OLS Regression Results                            
Dep. Variable:              Retention   R-squared:                       0.489
Model:                            OLS   Adj. R-squared:                  0.484
Method:                 Least Squares   F-statistic:                     93.69
Date:                Fri, 11 Oct 2024   Prob (F-statistic):           6.01e-16
Time:                        09:39:25   Log-Likelihood:                -28.275
No. Observations:                 100   AIC:                             60.55
Df Residuals:                      98   BIC:                             65.76
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept          0.2727 