In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from pandas.plotting import scatter_matrix
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
import seaborn as sns
%matplotlib inline
import os
import pandas as pd
import json


## 1. Propensity score matching

In [4]:
data = pd.read_csv('lalonde.csv')
data.head()

Unnamed: 0,id,treat,age,educ,black,hispan,married,nodegree,re74,re75,re78
0,NSW1,1,37,11,1,0,1,1,0.0,0.0,9930.046
1,NSW2,1,22,9,0,1,0,1,0.0,0.0,3595.894
2,NSW3,1,30,12,1,0,0,0,0.0,0.0,24909.45
3,NSW4,1,27,11,1,0,0,1,0.0,0.0,7506.146
4,NSW5,1,33,8,1,0,0,1,0.0,0.0,289.7899


Observational study
We have 2 groups: "treated" vs "control"
The question is: was the treatment effective on the worker? That is, did it improved his earnings? did it depend on any of the other variables? (age, origin, marriage, education)
We need a way to measure effectiveness based on change in earnings

treatment assignment: Z = 1 (treated) and Z = 0 (control) -> treat
response if subject is treated: r_t -> how re78 differs from re74 and re75 (Z=1)
response if subject is control: r_c -> how re78 differs from re74 and re75 (Z=0)
observed covariates (features x) -> age, educ, black, hispan, married, nodegree
unobserved covariates ->  Naive model: exclude them because assume “Only observed variables determine treatment assignment”. Bad, naive model is almost never true. -> Use propensity score.
"Adjustments using a model attempt to compare people who are not directly
comparable — people of somewhat different ages or smoking habits — removing
these differences using a mathematical structure that has elements estimated from
the data at hand."


Propensity score -> "if naive model is true, it equals the probability to treat"
e(x) = P(Z=1|x)
Computed by logistic regression. Features x, label Z



In [5]:
# print the shape of the DataFrame
data.shape

(614, 11)

In [15]:
data


Unnamed: 0,id,treat,age,educ,black,hispan,married,nodegree,re74,re75,re78
0,NSW1,1,37,11,1,0,1,1,0.0000,0.000,9930.0460
1,NSW2,1,22,9,0,1,0,1,0.0000,0.000,3595.8940
2,NSW3,1,30,12,1,0,0,0,0.0000,0.000,24909.4500
3,NSW4,1,27,11,1,0,0,1,0.0000,0.000,7506.1460
4,NSW5,1,33,8,1,0,0,1,0.0000,0.000,289.7899
5,NSW6,1,22,9,1,0,0,1,0.0000,0.000,4056.4940
6,NSW7,1,23,12,1,0,0,0,0.0000,0.000,0.0000
7,NSW8,1,32,11,1,0,0,1,0.0000,0.000,8472.1580
8,NSW9,1,22,16,1,0,0,0,0.0000,0.000,2164.0220
9,NSW10,1,33,12,0,0,1,0,0.0000,0.000,12418.0700


In [20]:
# plot salaries
#data['diff'] = data['re78']-data['re75']
data[data['treat'] == 1].plot(y='re74', figsize=(16, 8), grid=True)
#data[data['treat'] == 0].plot(y='diff', figsize=(16, 8), grid=True)


SyntaxError: unexpected EOF while parsing (<ipython-input-20-ab811c558b43>, line 4)

In [36]:
# compute the response
data[data['treat'] == 1].describe()


AttributeError: 'DataFrame' object has no attribute 'histo'

In [35]:
data[data['treat'] == 0].describe()


Unnamed: 0,treat,age,educ,black,hispan,married,nodegree,re74,re75,re78,diff
count,429.0,429.0,429.0,429.0,429.0,429.0,429.0,429.0,429.0,429.0,429.0
mean,0.0,28.030303,10.235431,0.202797,0.142191,0.512821,0.596737,5619.236506,2466.484443,6984.169742,4517.685299
std,0.0,10.786653,2.855238,0.402552,0.349654,0.500419,0.491126,6788.750796,3291.996183,7294.161791,6917.542917
min,0.0,16.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-13776.53
25%,0.0,19.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,220.1813,0.0
50%,0.0,25.0,11.0,0.0,0.0,1.0,1.0,2547.047,1086.726,4975.505,2688.385
75%,0.0,35.0,12.0,0.0,0.0,1.0,1.0,9277.128,3881.419,11688.82,8007.6011
max,0.0,55.0,18.0,1.0,1.0,1.0,1.0,25862.32,18347.23,25564.67,25564.67


In [37]:
data[data['treat'] == 0]

NameError: name 'histo' is not defined

In [None]:
# visualize the relationship between the features and the response using scatterplots
fig, axs = plt.subplots(1, 3, sharey=True)
data.plot(kind='scatter', x='TV', y='sales', ax=axs[0], figsize=(16, 8), grid=True)
data.plot(kind='scatter', x='radio', y='sales', ax=axs[1], grid=True)
data.plot(kind='scatter', x='newspaper', y='sales', ax=axs[2], grid=True)