In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import random
import os
from IPython.display import Image, display

In [2]:
def fix_seed(seed):
    # random
    random.seed(seed)
    # Numpy
    np.random.seed(seed)

SEED = 42
fix_seed(SEED)

ESサイトのユーザーに対して**RCT**を適用したメールマーケティングを行ったデータ。

In [3]:
# データのロード
base_path = "/Users/ryoto/workspace/hit-u/zemi/econome_ml_demo/data"
mail = pd.read_csv(os.path.join(base_path,'E-MailAnalytics.csv'))
print(mail.shape)
mail.head()

(64000, 12)


Unnamed: 0,recency,history_segment,history,mens,womens,zip_code,newbie,channel,segment,visit,conversion,spend
0,10,2) $100 - $200,142.44,1,0,Surburban,0,Phone,Womens E-Mail,0,0,0.0
1,6,3) $200 - $350,329.08,1,1,Rural,1,Web,No E-Mail,0,0,0.0
2,7,2) $100 - $200,180.65,0,1,Surburban,1,Web,Womens E-Mail,0,0,0.0
3,9,5) $500 - $750,675.83,1,0,Rural,1,Web,Mens E-Mail,0,0,0.0
4,2,1) $0 - $100,45.34,1,0,Urban,0,Web,Womens E-Mail,0,0,0.0


In [4]:
print("num of No E-Mail : ", len(mail.query("segment == 'No E-Mail'")))
print("num of Mens E-Mail : ", len(mail.query("segment == 'Womens E-Mail'")))
print("num of No Womens : ", len(mail.query("segment == 'Mens E-Mail'")))

num of No E-Mail :  21306
num of Mens E-Mail :  21387
num of No Womens :  21307


## RCTのデータからATEを求めてBaseLineとする
簡略化のために女性向けのメールが配信されているデータを削除する。

In [5]:
mail_df = mail.query("segment != 'Womens E-Mail'")
print(len(mail_df.query("segment == 'No E-Mail'")))
print(len(mail_df.query("segment == 'Womens E-Mail'")))
print(len(mail_df.query("segment == 'Mens E-Mail'")))

21306
0
21307


In [6]:
# segment == Men's E-mail のデータのみで実験する
male_df = mail.query("segment != 'Womens E-Mail'")
male_df["segment"] = male_df.segment.map(lambda x: 1 if x == 'Mens E-Mail' else 0)
male_df = male_df.rename(columns={"segment" : "treatment"})
print(male_df.shape)
display(male_df.head())

sample_rules = (male_df.history > 300) | (male_df.recency < 6) | (male_df.channel=='Multichannel')
mail_df = pd.concat([
    male_df[(sample_rules) & (male_df.treatment == 0)].sample(frac=0.5, random_state=1),
    male_df[(sample_rules) & (male_df.treatment == 1)],
    male_df[(~sample_rules) & (male_df.treatment == 0)],
    male_df[(~sample_rules) & (male_df.treatment == 1)].sample(frac=0.5, random_state=1)
], axis=0, ignore_index=True)

print(mail_df.shape)
display(mail_df.head())

(42613, 12)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,recency,history_segment,history,mens,womens,zip_code,newbie,channel,treatment,visit,conversion,spend
1,6,3) $200 - $350,329.08,1,1,Rural,1,Web,0,0,0,0.0
3,9,5) $500 - $750,675.83,1,0,Rural,1,Web,1,0,0,0.0
8,9,5) $500 - $750,675.07,1,1,Rural,1,Phone,1,0,0,0.0
13,2,2) $100 - $200,101.64,0,1,Urban,0,Web,1,1,0,0.0
14,4,3) $200 - $350,241.42,0,1,Rural,1,Multichannel,0,0,0,0.0


(31925, 12)


Unnamed: 0,recency,history_segment,history,mens,womens,zip_code,newbie,channel,treatment,visit,conversion,spend
0,8,5) $500 - $750,572.65,1,0,Urban,1,Web,0,0,0,0.0
1,5,1) $0 - $100,42.38,1,0,Urban,1,Phone,0,1,0,0.0
2,1,"7) $1,000 +",3003.48,1,1,Urban,1,Phone,0,0,0,0.0
3,1,5) $500 - $750,662.1,0,1,Urban,1,Web,0,0,0,0.0
4,5,1) $0 - $100,44.37,0,1,Urban,0,Web,0,0,0,0.0


In [7]:
mail_df.groupby("treatment").agg({"spend" : "mean", "conversion": "mean", "visit":"count"}).rename(columns={"visit" : "count"})

Unnamed: 0_level_0,spend,conversion,count
treatment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.557954,0.00454,14757
1,1.541704,0.013572,17168


In [8]:
# treatment 0, 1 のデータフレームに分ける
treatment_1 = male_df.query("treatment == 1")
treatment_0 = male_df.query("treatment == 0")

# 介入が購買金額に与えた影響を計算する
ts_1 = treatment_1["spend"].mean()
ts_0 = treatment_0["spend"].mean()
print("介入が購買金額に与えた影響は,", np.round(ts_1 - ts_0, 3))

# 介入がconversionに与えた影響を計算する
tc_1 = treatment_1["conversion"].mean()
tc_0 = treatment_0["conversion"].mean()
True_ATE = np.round((tc_1 - tc_0), 5)
print("介入がconversionに与えた影響は,", True_ATE)

介入が購買金額に与えた影響は, 0.77
介入がconversionに与えた影響は, 0.00681


## データの前処理

In [9]:
mail_df.head()

Unnamed: 0,recency,history_segment,history,mens,womens,zip_code,newbie,channel,treatment,visit,conversion,spend
0,8,5) $500 - $750,572.65,1,0,Urban,1,Web,0,0,0,0.0
1,5,1) $0 - $100,42.38,1,0,Urban,1,Phone,0,1,0,0.0
2,1,"7) $1,000 +",3003.48,1,1,Urban,1,Phone,0,0,0,0.0
3,1,5) $500 - $750,662.1,0,1,Urban,1,Web,0,0,0,0.0
4,5,1) $0 - $100,44.37,0,1,Urban,0,Web,0,0,0,0.0


In [10]:
# ラベルエンコーディング（OrdinalEncoder）
from sklearn.preprocessing import OrdinalEncoder

oe = OrdinalEncoder()
encoded = oe.fit_transform(mail_df[['history_segment', 'zip_code', "channel"]].values)
# decoded = oe.inverse_transform(encoded)

print('エンコード結果: ')
encoded_df = pd.DataFrame(encoded, columns = ["history_segment", "zip_code", "channel"])
encoded_df.head()

エンコード結果: 


Unnamed: 0,history_segment,zip_code,channel
0,4.0,2.0,2.0
1,0.0,2.0,1.0
2,6.0,2.0,1.0
3,4.0,2.0,2.0
4,0.0,2.0,2.0


In [11]:
mail_df_use = mail_df[
    ['recency', 'history', 'mens', 'womens', 'newbie', 'treatment', 'visit', 'conversion', 'spend']
    ]
demo_df = pd.concat([mail_df_use.reset_index(), encoded_df], axis = 1).drop(columns = "index")
print(demo_df.shape)
demo_df.head()

(31925, 12)


Unnamed: 0,recency,history,mens,womens,newbie,treatment,visit,conversion,spend,history_segment,zip_code,channel
0,8,572.65,1,0,1,0,0,0,0.0,4.0,2.0,2.0
1,5,42.38,1,0,1,0,1,0,0.0,0.0,2.0,1.0
2,1,3003.48,1,1,1,0,0,0,0.0,6.0,2.0,1.0
3,1,662.1,0,1,1,0,0,0,0.0,4.0,2.0,2.0
4,5,44.37,0,1,0,0,0,0,0.0,0.0,2.0,2.0


In [12]:
from sklearn.model_selection import train_test_split
train_df, val_df = train_test_split(demo_df, test_size= 0.2,random_state=SEED)
print(train_df.shape)
print(val_df.shape)

(25540, 12)
(6385, 12)


## Econmlで試す

In [13]:
# Main imports
from econml.metalearners import TLearner, SLearner, XLearner, DomainAdaptationLearner

# Helper imports 
import numpy as np
from numpy.random import binomial, multivariate_normal, normal, uniform
import lightgbm as lgb
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingRegressor, GradientBoostingClassifier
import matplotlib.pyplot as plt

In [14]:
# set data 
Y = train_df.conversion
T = train_df.treatment
X = train_df.drop(columns=["conversion", "treatment", "spend", "visit"])
X_val = val_df.drop(columns=["conversion", "treatment", "spend", "visit"])

### T-Learner

In [15]:
# Instantiate T learner
models = RandomForestRegressor(n_estimators=1000, max_depth=10, min_samples_leaf=64, random_state = 42)
T_learner = TLearner(models=models)
# Train T_learner
T_learner.fit(Y, T, X=X)
# Estimate treatment effects on test data
T_te = T_learner.effect(X_val)

print("True ATE : ", True_ATE)
pred_ATE_T = np.round(T_learner.ate(X_val), 5)
print("Predict ATE : ", pred_ATE_T)

True ATE :  0.00681
Predict ATE :  0.00721


### S-Learner

In [16]:
# Instantiate S learner
overall_model = RandomForestRegressor(n_estimators=1000, max_depth=6, min_samples_leaf=64, random_state = 42)
S_learner = SLearner(overall_model=overall_model)
# Train S_learner
S_learner.fit(Y, T, X=X)
# Estimate treatment effects on test data
S_te = S_learner.effect(X_val)

print("True ATE : ", True_ATE)
pred_ATE_S = np.round(S_learner.ate(X_val), 5)
print("Predict ATE : ", pred_ATE_S)

True ATE :  0.00681
Predict ATE :  0.00678


### X-Learner

In [15]:
# Instantiate X learner
models = RandomForestRegressor(n_estimators=1500, max_depth= 10 , min_samples_leaf=64, random_state = 42)
propensity_model = RandomForestClassifier(n_estimators=500, max_depth = 10 , min_samples_leaf=64, random_state = 42)
X_learner = XLearner(models=models, propensity_model=propensity_model)
# Train X_learner
X_learner.fit(Y, T, X=X)
# Estimate treatment effects on test data
X_te = X_learner.effect(X_val)
print("True ATE : ", True_ATE)
pred_ATE_X = np.round(X_learner.ate(X_val), 5)
print("Predict ATE : ", pred_ATE_X)

True ATE :  0.00681
Predict ATE :  0.00728


In [16]:
import shap
shap_values = X_learner.shap_values(X)
# local view: explain hetergoeneity for a given observation
ind=0
shap.plots.force(shap_values["d払い_action_dummy"]["d払い_binary"][ind], matplotlib=True)
# global view: explain hetergoeneity for a sample of dataset
shap.summary_plot(shap_values['d払い_action_dummy']['d払い_binary'])

Exact explainer:   0%|          | 11/25540 [00:22<13:49:47,  1.95s/it]

KeyboardInterrupt: 

# Use_fulldata

In [18]:
# set data 
Y = demo_df.conversion
T = demo_df.treatment
X = demo_df.drop(columns=["conversion", "treatment", "spend", "visit"])
print(demo_df.shape)

(31925, 12)


## T-Learner

In [19]:
# Instantiate T learner
models = RandomForestRegressor(n_estimators=1000, max_depth=10,  random_state = 42)
T_learner = TLearner(models=models)
# Train T_learner
T_learner.fit(Y, T, X=X)
# Estimate treatment effects on test data
T_te = T_learner.effect(X)

print("True ATE : ", True_ATE)
pred_ATE_T = np.round(T_learner.ate(X), 6)
print("Predict ATE : ", pred_ATE_T)

True ATE :  0.00681
Predict ATE :  0.007537


In [20]:
# Instantiate T learner
models = GradientBoostingRegressor(n_estimators=1000, max_depth=10,  random_state = 42)
T_learner = TLearner(models=models)
# Train T_learner
T_learner.fit(Y, T, X=X)
# Estimate treatment effects on test data
T_te = T_learner.effect(X)

print("True ATE : ", True_ATE)
pred_ATE_T = np.round(T_learner.ate(X), 6)
print("Predict ATE : ", pred_ATE_T)

True ATE :  0.00681
Predict ATE :  0.007792


## S-Learner

In [21]:
# Instantiate S learner
overall_model = RandomForestRegressor(n_estimators=1000, max_depth=10,  random_state = 42)
S_learner = SLearner(overall_model=overall_model)
# Train S_learner
S_learner.fit(Y, T, X=X)
# Estimate treatment effects on test data
S_te = S_learner.effect(X)

print("True ATE : ", True_ATE)
pred_ATE_S = np.round(S_learner.ate(X), 5)
print("Predict ATE : ", pred_ATE_S)

True ATE :  0.00681
Predict ATE :  0.00741


In [22]:
# Instantiate S learner
overall_model = GradientBoostingRegressor(n_estimators=1000, max_depth=10,  random_state = 42)
S_learner = SLearner(overall_model=overall_model)
# Train S_learner
S_learner.fit(Y, T, X=X)
# Estimate treatment effects on test data
S_te = S_learner.effect(X)

print("True ATE : ", True_ATE)
pred_ATE_S = np.round(S_learner.ate(X), 5)
print("Predict ATE : ", pred_ATE_S)

True ATE :  0.00681
Predict ATE :  0.00277


## X-Learner

In [23]:
# Instantiate X learner
models = RandomForestRegressor(n_estimators=1000, max_depth=12,  random_state = 42)
propensity_model = RandomForestClassifier(n_estimators=1000, max_depth=12,  random_state = 42)
X_learner = XLearner(models=models, propensity_model=propensity_model)
# Train X_learner
X_learner.fit(Y, T, X=X)
# Estimate treatment effects on test data
X_te = X_learner.effect(X)
print("True ATE : ", True_ATE)
pred_ATE_X = np.round(X_learner.ate(X), 5)
print("Predict ATE : ", pred_ATE_X)

True ATE :  0.00681
Predict ATE :  0.00776


In [24]:
# Instantiate X learner
models = GradientBoostingRegressor(n_estimators=1000, max_depth=12,  random_state = 42)
propensity_model = RandomForestClassifier(n_estimators=1000, max_depth=12,  random_state = 42)
X_learner = XLearner(models=models, propensity_model=propensity_model)
# Train X_learner
X_learner.fit(Y, T, X=X)
# Estimate treatment effects on test data
X_te = X_learner.effect(X)
print("True ATE : ", True_ATE)
pred_ATE_X = np.round(X_learner.ate(X), 5)
print("Predict ATE : ", pred_ATE_X)

True ATE :  0.00681
Predict ATE :  0.00853
