In [1]:
import numpy as np 
import pandas as pd 
import statsmodels.api as sm 
import statsmodels.formula.api as smf 
from itertools import combinations 
import plotnine as p


In [2]:
# Scratch expression (presumably a quick check that the kernel runs); the result is
# not assigned to anything.
2+2

4

In [3]:

# Read data over HTTPS.
# NOTE(review): the assignment below swaps ssl's default HTTPS context factory for the
# unverified one, so HTTPS requests that rely on that default skip certificate
# verification for the rest of this kernel session. Presumably a workaround for local
# certificate errors -- confirm it is still needed before sharing the notebook.
# NOTE(review): this import sits outside the notebook's top import cell.
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
def read_data(file):
    """Load a Stata ``.dta`` file from the mixtape GitHub repository.

    Parameters
    ----------
    file : str
        File name, appended verbatim to the repository's base URL.

    Returns
    -------
    Whatever ``pd.read_stata`` returns for the resulting URL.
    """
    base_url = "https://raw.github.com/scunning1975/mixtape/master/"
    dataset_url = base_url + file
    return pd.read_stata(dataset_url)

# Download the example dataset; every later cell works on this frame.
training_bias_reduction = read_data("training_bias_reduction.dta") 

In [4]:
# Show the frame as loaded (columns Unit, Y, D, X).
training_bias_reduction

Unnamed: 0,Unit,Y,D,X
0,1,5,1,11
1,2,2,1,7
2,3,10,1,5
3,4,6,1,3
4,5,4,0,10
5,6,0,0,8
6,7,5,0,4
7,8,1,0,1


## Steps for Bias correction

### Step 1
1. Find the untreated unit that most closely matches each treated unit on the covariate `X` (the independent variable, IV). For example, for `Unit` 1 with X=11, the closest untreated unit has X=10, i.e. `Unit` 5. The other treated units are matched in the same way.

In [5]:
# Split the observed outcome Y into two columns by treatment status:
#   Y1 holds Y for rows with D == 1, Y0 holds Y for rows with D == 0.
# Rows outside the respective group get the placeholder 0.
# NOTE(review): 0 is also a real outcome value (Unit 6 has Y == 0), so a placeholder 0
# cannot be told apart from an observed 0 in these columns.
observed_outcome = training_bias_reduction['Y']
treated_mask = training_bias_reduction['D'].eq(1)
control_mask = training_bias_reduction['D'].eq(0)
training_bias_reduction['Y1'] = np.where(treated_mask, observed_outcome, 0)
training_bias_reduction['Y0'] = np.where(control_mask, observed_outcome, 0)

In [6]:
# Show the frame again to inspect the Y1 / Y0 columns.
training_bias_reduction

Unnamed: 0,Unit,Y,D,X,Y1,Y0
0,1,5,1,11,5,0
1,2,2,1,7,2,0
2,3,10,1,5,10,0
3,4,6,1,3,6,0
4,5,4,0,10,0,4
5,6,0,0,8,0,0
6,7,5,0,4,0,5
7,8,1,0,1,0,1


## Step 2
1. Create a column of fitted values from an OLS regression of `Y` on `X` (formula `Y ~ X`).

In [7]:
# Regress Y on X by ordinary least squares using every row of the frame; the formula
# contains no D term, so treatment status does not enter the model.
ols_spec = sm.OLS.from_formula(formula='Y ~ X', data=training_bias_reduction)
fitted_model = ols_spec.fit()

In [8]:
# Store the model's prediction for every row in a new 'fitted' column.
# The predictor is passed as a one-column DataFrame rather than a bare Series: for a
# formula model, statsmodels has to guess whether a Series is one column of
# observations or a single row, whereas a DataFrame with a named 'X' column is
# unambiguous. The predicted values are the same.
training_bias_reduction['fitted'] = fitted_model.predict(training_bias_reduction[['X']])

In [9]:
# Show the frame again to inspect the fitted column.
training_bias_reduction

Unnamed: 0,Unit,Y,D,X,Y1,Y0,fitted
0,1,5,1,11,5,0,3.888071
1,2,2,1,7,2,0,4.082474
2,3,10,1,5,10,0,4.179676
3,4,6,1,3,6,0,4.276878
4,5,4,0,10,0,4,3.936672
5,6,0,0,8,0,0,4.033873
6,7,5,0,4,0,5,4.228277
7,8,1,0,1,0,1,4.37408


## Compute the bias
**Bias** := the difference between the predicted value of a treated unit and the predicted value of its matched untreated unit, where the predictions are generated from the covariates alone.
The model is fitted without any information on which units are treated, so for the same model and the same set of covariates, any gap between the two fitted values comes from the difference in the units' covariates.


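**Bias reduction method** :=
the difference between the `Treated` and `Un-Treated` outcomes (`Y`) minus the difference between the `Treated` and `Un-Treated` predicted values, averaged over the matched pairs:

$$\widehat{ATT} = \frac{1}{N_T} \sum_{i:\,D_i=1} \left[ (Y_i - Y_{j(i)}) - (\hat{Y}_i - \hat{Y}_{j(i)}) \right]$$

where $j(i)$ is the untreated unit matched to treated unit $i$, $\hat{Y}$ is the fitted value from `Y ~ X`, and $N_T$ is the number of treated units.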

In [10]:
# Treated-minus-untreated outcome differences. Rows are paired by position (k-th row
# with D == 1 against k-th row with D == 0), so the result depends on row order.
# The subtraction is carried out in Y's stored integer dtype (int8 in the output below).
treated_outcomes = training_bias_reduction.loc[training_bias_reduction['D'] == 1, 'Y'].to_numpy()
control_outcomes = training_bias_reduction.loc[training_bias_reduction['D'] == 0, 'Y'].to_numpy()
treated_outcomes - control_outcomes

array([1, 2, 5, 5], dtype=int8)

In [11]:
# Bias-corrected ATT estimate: for every treated/untreated pair take the outcome
# difference, subtract the difference in fitted values, then average over the pairs.
# Pairs are formed by row position (k-th row with D == 1 against k-th row with
# D == 0), so the estimate depends on the frame's row order.
treated_rows = training_bias_reduction[training_bias_reduction['D'] == 1]
control_rows = training_bias_reduction[training_bias_reduction['D'] == 0]
if len(treated_rows) != len(control_rows):
    # Without this check a group of length 1 would be silently broadcast against
    # the other group instead of failing.
    raise ValueError("positional pairing needs equally many treated and untreated rows")

# Y is stored as int8, and fixed-width integer arithmetic wraps around silently on
# overflow, so the outcomes are converted to float before they are differenced.
outcome_gap = treated_rows['Y'].to_numpy(dtype=float) - control_rows['Y'].to_numpy(dtype=float)
fitted_gap = treated_rows['fitted'].to_numpy() - control_rows['fitted'].to_numpy()
ATT = np.mean(outcome_gap - fitted_gap)

print(ATT)

3.2864506627393233


## Computing the variance of the bias estimator

![bias_estimator_variance.png](./bias_estimator_variance.png)

In [12]:
# Mean squared deviation of the pairwise outcome differences from ATT, displayed
# together with its square root.
# Pairs are formed by row position, exactly as in the ATT cell.
# NOTE(review): the differences here are the raw (uncorrected) outcome differences,
# while ATT is the bias-corrected estimate -- confirm this matches the formula in the
# image above.
treated_y = training_bias_reduction.loc[training_bias_reduction['D'] == 1, 'Y']
control_y = training_bias_reduction.loc[training_bias_reduction['D'] == 0, 'Y']
if len(treated_y) != len(control_y):
    # Without this check a group of length 1 would be silently broadcast.
    raise ValueError("positional pairing needs equally many treated and untreated rows")

# Y is stored as int8, and fixed-width integer arithmetic wraps around silently on
# overflow, so the outcomes are converted to float before they are differenced.
pair_gap = treated_y.to_numpy(dtype=float) - control_y.to_numpy(dtype=float)
var_att = np.mean((pair_gap - ATT) ** 2)
var_att, np.sqrt(var_att)

(3.188828650814136, 1.7857291650231106)