# Qiita ~ 傾向スコアでセレクションバイアスを補正する~

In [68]:
import numpy as np
import pandas as pd
import scipy as sp
import matplotlib.pyplot as plt
%matplotlib inline
import sklearn
import statsmodels.api as sm
import random

from operator import itemgetter
from pandas import DataFrame, Series
from plotly.offline import iplot, plot, init_notebook_mode
from plotly.graph_objs import Histogram, Box, Scatter, Figure, Layout, Bar

from sklearn.cross_validation import train_test_split, cross_val_score, StratifiedKFold
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.feature_selection import SelectFromModel, SelectKBest
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import LogisticRegression, LinearRegression, ElasticNet, Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, roc_curve, auc, confusion_matrix, precision_recall_curve
from sklearn.model_selection import KFold, TimeSeriesSplit
from sklearn.preprocessing import StandardScaler, RobustScaler, Normalizer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR, LinearSVR

### RHC Data

In [45]:
### データの読み込み
data_df = pd.read_csv("http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/rhc.csv")
#data_df.head()

In [46]:
### 死亡率とRHCの有無のクロス集計
pd.crosstab(data_df.death, data_df.swang1)

swang1,No RHC,RHC
death,Unnamed: 1_level_1,Unnamed: 2_level_1
No,1315,698
Yes,2236,1486


In [47]:
### RHCの有無での死亡率の差を計算
(1486 / (698 + 1486)) - (2236 / (1315 + 2236))

0.050721150622586864

In [48]:
### 用いる説明変数群、詳細はデータセットの記述を参照
cols = ["cat1", "sex", "race", "edu", "income", 
        "resp", "card", "neuro", "gastr", "renal", "meta", "hema", "seps", "trauma", "ortho", 
        "das2d3pc", "dnr1", "ca", "surv2md1", "aps1", "scoma1", "wtkilo1", "temp1",
        "resp1", "hrt1", "pafi1", "paco21", "ph1", "wblc1", "hema1", "sod1", "pot1", "crea1",
        "bili1", "alb1", "cardiohx", "chfhx", "dementhx", "psychhx", "chrpulhx", "renalhx",
        "liverhx", "gibledhx", "immunhx", "transhx", "amihx",
        "age", "meanbp1"]

### 説明変数中のカテゴリカル変数
categorical_columns = ["cat1", "sex", "race", "edu", "income","ca", "dnr1",
                       "resp", "card", "neuro", "gastr", "renal", "meta", "hema", "seps", "trauma", "ortho"]

### カテゴリカル変数のダミー化
X = data_df[cols]
dummy = pd.get_dummies(X[categorical_columns], drop_first=True)
X = pd.concat([X, dummy], axis=1)
X = X.drop(categorical_columns, axis=1)

### 切片の導入
X["Intercept"] = 1

### RHC有無のダミー変数
z1 = pd.get_dummies(data_df["swang1"])["RHC"]

### 目的変数
y = pd.get_dummies(data_df["death"])["Yes"]

In [50]:
### StatsModelsのLogitにより傾向スコアを推定
glm = sm.Logit(z1, X)
result = glm.fit()
ps = result.predict(X)
#ps

Optimization terminated successfully.
         Current function value: inf
         Iterations 6



overflow encountered in exp


divide by zero encountered in log



In [51]:
#result.summary2()

In [52]:
### c統計量としてAUCをを計算
fpr, tpr, thresholds = roc_curve(z1, ps)
auc(fpr, tpr)

0.7963133740379587

In [53]:
### IPWによりATEを推定
ipwe1 = sum((z1*y)/ps)/sum(z1/ps) 
ipwe0 = sum(((1-z1)*y)/(1-ps))/sum((1-z1)/(1-ps)) 
ATE = ipwe1 - ipwe0
ATE

0.05907512789883507

### CM Data

In [54]:
### データの読み込み
data_df = pd.read_csv('https://github.com/iwanami-datascience/vol3/raw/master/kato%26hoshino/q_data_x.csv')
#data_df.head()

In [55]:
### アプリ利用ダミーとCM視聴有無のクロス集計
pd.crosstab(data_df.gamedummy, data_df.cm_dummy)

cm_dummy,0,1
gamedummy,Unnamed: 1_level_1,Unnamed: 2_level_1
0,5428,3832
1,428,312


In [56]:
### CM視聴有無でのアプリ利用率の差を計算
(312 / (3832 + 312)) - (428 / (5428 + 428))

0.002202143595586223

In [57]:
### CM視聴有無でのアプリ利用回数の差を計算
data_df[data_df.cm_dummy == 1].gamecount.mean() - data_df[data_df.cm_dummy == 0].gamecount.mean()

-1.4845493913116865

In [58]:
### CM視聴有無でのアプリ利用時間の差を計算
data_df[data_df.cm_dummy == 1].gamesecond.mean() - data_df[data_df.cm_dummy == 0].gamesecond.mean()

-629.6405765396544

In [59]:
### 説明変数
cols = ["age", "sex", "TVwatch_day", "marry_dummy", "child_dummy", "inc", "pmoney", "area_kanto", "area_tokai",
        "area_keihanshin", "job_dummy1", "job_dummy2", "job_dummy3", "job_dummy4", "job_dummy5", "job_dummy6",
        "fam_str_dummy1", "fam_str_dummy2", "fam_str_dummy3", "fam_str_dummy4"]
X = data_df[cols]

### 切片の導入
X["Intercept"] = 1

### CM視聴有無ダミー
z1 = data_df.cm_dummy

### 目的変数群（1:アプリ利用ダミー, 2:アプリ利用回数、3:アプリ利用時間）
y1 = data_df.gamedummy
y2 = data_df.gamecount
y3 = data_df.gamesecond



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



In [61]:
### StatsModelsのLogitにより傾向スコアを推定
glm = sm.Logit(z1, X)
result = glm.fit()
ps = result.predict(X)
#ps

Optimization terminated successfully.
         Current function value: 0.542152
         Iterations 6


In [63]:
#result.summary2()

In [64]:
### c統計量としてAUCをを計算
fpr, tpr, thresholds = roc_curve(z1, ps)
auc(fpr, tpr)

0.7917012811992321

In [65]:
### IPWによりアプリ利用ダミーへのATEを推定
ipwe11 = sum((z1*y1)/ps)/sum(z1/ps) ## Treated
ipwe10 = sum(((1-z1)*y1)/(1-ps))/sum((1-z1)/(1-ps)) ## Control
ATE1 = ipwe11 - ipwe10 
ATE1

0.032311773305120556

In [66]:
### IPWによりアプリ利用回数へのATEを推定
ipwe21 = sum((z1*y2)/ps)/sum(z1/ps) ## Treated
ipwe20 = sum(((1-z1)*y2)/(1-ps))/sum((1-z1)/(1-ps)) ## Control
ATE2 = ipwe21 - ipwe20
ATE2

5.349029566474508

In [67]:
### IPWによりアプリ利用時間へのATEを推定
ipwe31 = sum((z1*y3)/ps)/sum(z1/ps) 
ipwe30 = sum(((1-z1)*y3)/(1-ps))/sum((1-z1)/(1-ps)) 
ATE3 = ipwe31 - ipwe30 
ATE3

1513.69969078248