In [12]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import random
import os
from sklearn.model_selection import train_test_split

In [13]:
# データをダウンロード
base_path = "/Users/ryoto/workspace/hit-u/zemi/econml_demo/data"
df = pd.read_csv(os.path.join(base_path, "app_cm_dataset.csv"))
print(df.columns)
print(df.shape)
df.head()

Index(['Unnamed: 0', 'cm_dummy', 'gamedummy', 'area_kanto', 'area_keihan',
       'area_tokai', 'area_keihanshin', 'age', 'sex', 'marry_dummy',
       'job_dummy1', 'job_dummy2', 'job_dummy3', 'job_dummy4', 'job_dummy5',
       'job_dummy6', 'job_dummy7', 'job_dummy8', 'inc', 'pmoney',
       'fam_str_dummy1', 'fam_str_dummy2', 'fam_str_dummy3', 'fam_str_dummy4',
       'fam_str_dummy5', 'child_dummy', 'T', 'F1', 'F2', 'F3', 'M1', 'M2',
       'M3', 'TVwatch_day', 'gamesecond', 'gamecount'],
      dtype='object')
(10000, 36)


Unnamed: 0.1,Unnamed: 0,cm_dummy,gamedummy,area_kanto,area_keihan,area_tokai,area_keihanshin,age,sex,marry_dummy,...,T,F1,F2,F3,M1,M2,M3,TVwatch_day,gamesecond,gamecount
0,0,0,0,0,0,0,1,44.5,1,1,...,0,0,0,0,0,1,0,33.4276,0,0
1,1,0,0,0,1,0,0,34.5,1,1,...,0,0,0,0,0,1,0,31.542862,0,0
2,2,0,0,0,1,0,0,24.5,1,0,...,0,0,0,0,1,0,0,37.825805,0,0
3,3,0,0,0,1,0,0,44.5,1,1,...,0,0,0,0,0,1,0,36.345911,0,0
4,4,0,0,0,1,0,0,34.5,1,1,...,0,0,0,0,1,0,0,49.344942,0,0


In [14]:
cnt = 0
for i in df["cm_dummy"]:
    if i != 0:
        cnt += 1
print(cnt)

4144


In [15]:
X = df[['area_kanto', 'area_tokai', 'area_keihanshin', 'age', 'sex', 'marry_dummy', 'child_dummy',
        'job_dummy1', 'job_dummy2', 'job_dummy3', 'job_dummy4', 'job_dummy5', 'job_dummy6', 'job_dummy7', 
        'inc', 'pmoney', 'fam_str_dummy1', 'fam_str_dummy2', 'fam_str_dummy3', 'fam_str_dummy4', 'TVwatch_day']]
Y = df['gamesecond'] # アプリ利用秒数
T = df['cm_dummy'] # CM接触有無

# 学習データとテストデータに分割 (Wで層化)
X_train, X_test, Y_train, Y_test, T_train, T_test = train_test_split(X, Y, T, test_size=0.2, shuffle=True, random_state=42, stratify=T)

In [16]:
# Main imports
from econml.metalearners import TLearner, SLearner, XLearner, DomainAdaptationLearner

# Helper imports 
import numpy as np
from numpy.random import binomial, multivariate_normal, normal, uniform
import lightgbm as lgb
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingRegressor, GradientBoostingClassifier

## T-Learner

In [17]:
# Instantiate T learner
models = RandomForestRegressor(n_estimators=1000, max_depth=6, min_samples_leaf=64, random_state = 42)
T_learner = TLearner(models=models)
# Train T_learner
T_learner.fit(Y_train, T_train, X=X_train)
# Estimate treatment effects on test data
T_te = T_learner.effect(X_test)

# print("True ATE : ", True_ATE)
pred_ATE_T = np.round(T_learner.ate(X_test), 9)
print("Predict ATE : ", pred_ATE_T)

Predict ATE :  147.495083668


## S-Learner

In [18]:
# Instantiate S learner
overall_model = RandomForestRegressor(n_estimators=1000, max_depth=6, min_samples_leaf=64, random_state = 42)
S_learner = SLearner(overall_model=overall_model)
# Train S_learner
S_learner.fit(Y, T, X=X)
# Estimate treatment effects on test data
S_te = S_learner.effect(X_test)

# print("True ATE : ", True_ATE)
pred_ATE_S = np.round(S_learner.ate(X_test), 9)
print("Predict ATE : ", pred_ATE_S)

Predict ATE :  32.636683685


## X-Learner

In [19]:
# Instantiate X learner
models = RandomForestRegressor(n_estimators=1500, max_depth= 6 , min_samples_leaf=64, random_state = 42)
propensity_model = RandomForestClassifier(n_estimators=500, max_depth = 6 , min_samples_leaf=64, random_state = 42)
X_learner = XLearner(models=models, propensity_model=propensity_model)
# Train X_learner
X_learner.fit(Y, T, X=X)
# Estimate treatment effects on test data
X_te = X_learner.effect(X_test)
# print("True ATE : ", True_ATE)
pred_ATE_X = np.round(X_learner.ate(X_test), 5)
print("Predict ATE : ", pred_ATE_X)

Predict ATE :  52.27167
