In [50]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

ESサイトのユーザーに対して**RCT**を適用したメールマーケティングを行ったデータ。

In [97]:
# データのダウンロード
mail = pd.read_csv('../data/E-MailAnalytics.csv')

In [52]:
print(mail.shape)
mail.head()

(64000, 12)


Unnamed: 0,recency,history_segment,history,mens,womens,zip_code,newbie,channel,segment,visit,conversion,spend
0,10,2) $100 - $200,142.44,1,0,Surburban,0,Phone,Womens E-Mail,0,0,0.0
1,6,3) $200 - $350,329.08,1,1,Rural,1,Web,No E-Mail,0,0,0.0
2,7,2) $100 - $200,180.65,0,1,Surburban,1,Web,Womens E-Mail,0,0,0.0
3,9,5) $500 - $750,675.83,1,0,Rural,1,Web,Mens E-Mail,0,0,0.0
4,2,1) $0 - $100,45.34,1,0,Urban,0,Web,Womens E-Mail,0,0,0.0


In [53]:
print(len(mail.query("segment == 'No E-Mail'")))
print(len(mail.query("segment == 'Womens E-Mail'")))
print(len(mail.query("segment == 'Mens E-Mail'")))

21306
21387
21307


## RCTのデータからATEを求めてBaseLineとする
簡略化のために女性向けのメールが配信されているデータを削除する。

In [54]:
mail_df = mail.query("segment != 'Womens E-Mail'")
print(len(mail.query("segment == 'No E-Mail'")))
print(len(mail.query("segment == 'Womens E-Mail'")))
print(len(mail.query("segment == 'Mens E-Mail'")))

21306
21387
21307


In [55]:
mail_df["segment"] = mail_df.segment.map(lambda x: 1 if x == 'Mens E-Mail' else 0)
mail_df = mail_df.rename(columns={"segment" : "treatment"})
mail_df.head()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,recency,history_segment,history,mens,womens,zip_code,newbie,channel,treatment,visit,conversion,spend
1,6,3) $200 - $350,329.08,1,1,Rural,1,Web,0,0,0,0.0
3,9,5) $500 - $750,675.83,1,0,Rural,1,Web,1,0,0,0.0
8,9,5) $500 - $750,675.07,1,1,Rural,1,Phone,1,0,0,0.0
13,2,2) $100 - $200,101.64,0,1,Urban,0,Web,1,1,0,0.0
14,4,3) $200 - $350,241.42,0,1,Rural,1,Multichannel,0,0,0,0.0


In [56]:
mail_df.groupby("treatment").agg({"spend" : "mean", "conversion": "mean", "visit":"count"}).rename(columns={"visit" : "count"})

Unnamed: 0_level_0,spend,conversion,count
treatment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.652789,0.005726,21306
1,1.422617,0.012531,21307


In [57]:
# treatment 0, 1 のデータフレームに分ける
treatment_1 = mail_df.query("treatment == 1")
treatment_0 = mail_df.query("treatment == 0")

In [58]:
# 介入が購買金額に与えた影響を計算する
ts_1 = treatment_1["spend"].mean()
ts_0 = treatment_0["spend"].mean()
print("介入が購買金額に与えた影響は,", np.round(ts_1 - ts_0, 3))

介入が購買金額に与えた影響は, 0.77


In [59]:
fig = px.violin(mail_df, y="spend", x="treatment", color="treatment", box=True, points="all")
fig.show()

In [78]:
# 介入がconversionに与えた影響を計算する
tc_1 = treatment_1["conversion"].mean()
tc_0 = treatment_0["conversion"].mean()
print("介入がconversionに与えた影響は,", np.round((tc_1 - tc_0), 5))

介入がconversionに与えた影響は, 0.00681


In [61]:
fig = px.violin(mail_df, y="conversion", x="treatment", color="treatment", box=True, points="all")
fig.show()

## データの前処理

In [62]:
mail_df.head()

Unnamed: 0,recency,history_segment,history,mens,womens,zip_code,newbie,channel,treatment,visit,conversion,spend
1,6,3) $200 - $350,329.08,1,1,Rural,1,Web,0,0,0,0.0
3,9,5) $500 - $750,675.83,1,0,Rural,1,Web,1,0,0,0.0
8,9,5) $500 - $750,675.07,1,1,Rural,1,Phone,1,0,0,0.0
13,2,2) $100 - $200,101.64,0,1,Urban,0,Web,1,1,0,0.0
14,4,3) $200 - $350,241.42,0,1,Rural,1,Multichannel,0,0,0,0.0


In [63]:
# ラベルエンコーディング（OrdinalEncoder）
from sklearn.preprocessing import OrdinalEncoder

oe = OrdinalEncoder()
encoded = oe.fit_transform(mail_df[['history_segment', 'zip_code', "channel"]].values)
# decoded = oe.inverse_transform(encoded)

print('エンコード結果: ')
encoded_df = pd.DataFrame(encoded, columns = ["history_segment", "zip_code", "channel"])
encoded_df.head()

エンコード結果: 


Unnamed: 0,history_segment,zip_code,channel
0,2.0,0.0,2.0
1,4.0,0.0,2.0
2,4.0,0.0,1.0
3,1.0,2.0,2.0
4,2.0,0.0,0.0


In [64]:
mail_df_use = mail_df[
    ['recency', 'history', 'mens', 'womens', 'newbie', 'treatment', 'visit', 'conversion', 'spend']
    ]
demo_df = pd.concat([mail_df_use.reset_index(), encoded_df], axis = 1)
demo_df.head()

Unnamed: 0,index,recency,history,mens,womens,newbie,treatment,visit,conversion,spend,history_segment,zip_code,channel
0,1,6,329.08,1,1,1,0,0,0,0.0,2.0,0.0,2.0
1,3,9,675.83,1,0,1,1,0,0,0.0,4.0,0.0,2.0
2,8,9,675.07,1,1,1,1,0,0,0.0,4.0,0.0,1.0
3,13,2,101.64,0,1,0,1,1,0,0.0,1.0,2.0,2.0
4,14,4,241.42,0,1,1,0,0,0,0.0,2.0,0.0,0.0


## Meta-LearnerでDemoをやってみる

In [84]:
from sklearn.model_selection import train_test_split
train_df, val_df = train_test_split(demo_df, test_size= 0.2)
print(train_df.shape)
print(val_df.shape)

(34090, 13)
(8523, 13)


In [85]:
# 集団を2つに分ける
df_0 = train_df.query("treatment == 0")  # 介入を受けていない集団
df_1 = train_df.query("treatment == 1")  # 介入を受けた集団

X_0 = df_0.drop(columns=["treatment", "conversion"])
X_1 = df_1.drop(columns=["treatment", "conversion"])
y_0 = df_0[["conversion"]].astype(int)
y_1 = df_1[["conversion"]].astype(int)

In [86]:
# lightgbmで実装してみる
import lightgbm as lgb

# 介入を受けていないモデル
reg_0 = lgb.LGBMRegressor(
    objective='binary',
    num_leaves=64,
    min_child_samples=20,
    max_depth=7
)

reg_0.fit(X_0, y_0)

# 介入を受けたモデル
reg_1 = lgb.LGBMRegressor(
    objective='binary',
    num_leaves=64,
    min_child_samples=20,
    max_depth=7
)

reg_1.fit(X_1, y_1)

LGBMRegressor(max_depth=7, num_leaves=64, objective='binary')

In [87]:
# ATEを求める
X = val_df.drop(columns = ["treatment","conversion"]) 
mu_0 = reg_0.predict(X)
mu_1 = reg_1.predict(X)

ATE = (mu_1 - mu_0).mean()
print("ATE：", float(ATE))

ATE： 3.3671334668755106e-07


In [95]:
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingRegressor, GradientBoostingClassifier
from econml.metalearners import  XLearner

# set data 

Y = train_df.conversion
T = train_df.treatment
X = train_df.drop(columns=["conversion", "treatment"])
X_val = val_df.drop(columns=["conversion", "treatment"])
# Instantiate X learner
models = RandomForestClassifier(n_estimators=100, max_depth=6, min_samples_leaf=64)
propensity_model = RandomForestClassifier(n_estimators=100, max_depth=6, min_samples_leaf=64)
X_learner = XLearner(models=models, propensity_model=propensity_model)
# Train X_learner
X_learner.fit(Y, T, X=X)
# Estimate treatment effects on test data
X_te = X_learner.effect(X_val)

print(X_te.mean())

0.0020776322963507324


In [96]:
print(X_te.mean()*100)

0.20776322963507324


In [94]:
fig = px.violin(X_te, box=True, points="all")
fig.show()