In [1]:
# import packeges
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import os
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingRegressor, GradientBoostingClassifier

ESサイトのユーザーに対して**RCT**を適用したメールマーケティングを行ったデータ。

In [2]:
# データのロード
base_path = "/Users/ryoto/workspace/hit-u/zemi/econml_demo/data"
mail = pd.read_csv(os.path.join(base_path,'E-MailAnalytics.csv'))
print(mail.shape)
mail.head()

(64000, 12)


Unnamed: 0,recency,history_segment,history,mens,womens,zip_code,newbie,channel,segment,visit,conversion,spend
0,10,2) $100 - $200,142.44,1,0,Surburban,0,Phone,Womens E-Mail,0,0,0.0
1,6,3) $200 - $350,329.08,1,1,Rural,1,Web,No E-Mail,0,0,0.0
2,7,2) $100 - $200,180.65,0,1,Surburban,1,Web,Womens E-Mail,0,0,0.0
3,9,5) $500 - $750,675.83,1,0,Rural,1,Web,Mens E-Mail,0,0,0.0
4,2,1) $0 - $100,45.34,1,0,Urban,0,Web,Womens E-Mail,0,0,0.0


In [3]:
print("num of No E-Mail : ", len(mail.query("segment == 'No E-Mail'")))
print("num of Mens E-Mail : ", len(mail.query("segment == 'Womens E-Mail'")))
print("num of No Womens : ", len(mail.query("segment == 'Mens E-Mail'")))

num of No E-Mail :  21306
num of Mens E-Mail :  21387
num of No Womens :  21307


## RCTのデータからATEを求めてBaseLineとする
簡略化のために女性向けのメールが配信されているデータを削除する。

In [4]:
 = mail.query("segment != 'Womens E-Mail'")
print(len(.query("segment == 'No E-Mail'")))
print(len(.query("segment == 'Womens E-Mail'")))
print(len(.query("segment == 'Mens E-Mail'")))

21306
0
21307


In [5]:
["segment"] = .segment.map(lambda x: 1 if x == 'Mens E-Mail' else 0)
 = .rename(columns={"segment" : "treatment"})
.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,recency,history_segment,history,mens,womens,zip_code,newbie,channel,treatment,visit,conversion,spend
1,6,3) $200 - $350,329.08,1,1,Rural,1,Web,0,0,0,0.0
3,9,5) $500 - $750,675.83,1,0,Rural,1,Web,1,0,0,0.0
8,9,5) $500 - $750,675.07,1,1,Rural,1,Phone,1,0,0,0.0
13,2,2) $100 - $200,101.64,0,1,Urban,0,Web,1,1,0,0.0
14,4,3) $200 - $350,241.42,0,1,Rural,1,Multichannel,0,0,0,0.0


In [6]:
.groupby("treatment").agg({"spend" : "mean", "conversion": "mean", "visit":"count"}).rename(columns={"visit" : "count"})

Unnamed: 0_level_0,spend,conversion,count
treatment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.652789,0.005726,21306
1,1.422617,0.012531,21307


In [7]:
# treatment 0, 1 のデータフレームに分ける
treatment_1 = .query("treatment == 1")
treatment_0 = .query("treatment == 0")

In [8]:
# 介入が購買金額に与えた影響を計算する
ts_1 = treatment_1["spend"].mean()
ts_0 = treatment_0["spend"].mean()
print("介入が購買金額に与えた影響は,", np.round(ts_1 - ts_0, 3))

介入が購買金額に与えた影響は, 0.77


In [9]:
fig = px.violin(, y="spend", x="treatment", color="treatment", box=True, points="all")
fig.show()

In [10]:
# 介入がconversionに与えた影響を計算する
tc_1 = treatment_1["conversion"].mean()
tc_0 = treatment_0["conversion"].mean()
True_ATE = np.round((tc_1 - tc_0), 5)
print("介入がconversionに与えた影響は,", True_ATE)

介入がconversionに与えた影響は, 0.00681


In [11]:
fig = px.violin(, y="conversion", x="treatment", color="treatment", box=True, points="all")
fig.show()

## データの前処理

In [12]:
.head()

Unnamed: 0,recency,history_segment,history,mens,womens,zip_code,newbie,channel,treatment,visit,conversion,spend
1,6,3) $200 - $350,329.08,1,1,Rural,1,Web,0,0,0,0.0
3,9,5) $500 - $750,675.83,1,0,Rural,1,Web,1,0,0,0.0
8,9,5) $500 - $750,675.07,1,1,Rural,1,Phone,1,0,0,0.0
13,2,2) $100 - $200,101.64,0,1,Urban,0,Web,1,1,0,0.0
14,4,3) $200 - $350,241.42,0,1,Rural,1,Multichannel,0,0,0,0.0


In [13]:
# ラベルエンコーディング（OrdinalEncoder）
from sklearn.preprocessing import OrdinalEncoder

oe = OrdinalEncoder()
encoded = oe.fit_transform([['history_segment', 'zip_code', "channel"]].values)
# decoded = oe.inverse_transform(encoded)

print('エンコード結果: ')
encoded_df = pd.DataFrame(encoded, columns = ["history_segment", "zip_code", "channel"])
encoded_df.head()

エンコード結果: 


Unnamed: 0,history_segment,zip_code,channel
0,2.0,0.0,2.0
1,4.0,0.0,2.0
2,4.0,0.0,1.0
3,1.0,2.0,2.0
4,2.0,0.0,0.0


In [14]:
_use = [
    ['recency', 'history', 'mens', 'womens', 'newbie', 'treatment', 'visit', 'conversion', 'spend']
    ]
demo_df = pd.concat([_use.reset_index(), encoded_df], axis = 1).drop(columns = "index").sample(2500, random_state=42)
print(demo_df["treatment"].sum(), len(demo_df) - demo_df["treatment"].sum())
demo_df.head()

1231 1269


Unnamed: 0,recency,history,mens,womens,newbie,treatment,visit,conversion,spend,history_segment,zip_code,channel
322,4,121.87,1,0,0,0,0,0,0.0,1.0,0.0,2.0
14339,3,184.83,1,0,1,1,0,0,0.0,1.0,2.0,1.0
2348,2,400.48,0,1,1,0,0,0,0.0,3.0,2.0,0.0
33454,10,175.25,0,1,0,1,1,0,0.0,1.0,0.0,2.0
25505,2,1640.42,1,1,1,1,0,0,0.0,6.0,0.0,0.0


## Meta-LearnerでDemoをやってみる

### T-Learner

In [15]:
# from sklearn.model_selection import train_test_split
# train_df, val_df = train_test_split(demo_df, test_size= 0.2)
# print(train_df.shape)
# print(val_df.shape)

In [16]:
# 集団を2つに分ける
df_0 = demo_df.query("treatment == 0")  # 介入を受けていない集団
df_1 = demo_df.query("treatment == 1")  # 介入を受けた集団

X_0 = df_0.drop(columns=["treatment", "conversion", "spend", "visit"])
X_1 = df_1.drop(columns=["treatment", "conversion", "spend", "visit"])
y_0 = df_0[["conversion"]].astype(int)
y_1 = df_1[["conversion"]].astype(int)

In [17]:
# 介入を受けていないモデル
reg_0 = RandomForestRegressor(n_estimators=1000, max_depth=7, random_state = 42)

reg_0.fit(X_0, y_0)

# 介入を受けたモデル
reg_1 = RandomForestRegressor(n_estimators=1000, max_depth=7, random_state = 42)

reg_1.fit(X_1, y_1)


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



RandomForestRegressor(max_depth=7, n_estimators=1000, random_state=42)

In [18]:
# ATEを求める
X_val = demo_df.drop(columns = ["treatment","conversion", "spend", "visit"]) 
mu_0 = reg_0.predict(X_val)
mu_1 = reg_1.predict(X_val)

print("True ATE : ", True_ATE)
ATE = np.round((mu_1 - mu_0).mean(), 5)
print("ATE：", float(ATE))

True ATE :  0.00681
ATE： 0.00611


In [19]:
# 処置群における平均処置効果ATTと、対照群における平均処置効果ATU
ATT = df_1["conversion"] - reg_0.predict(X_1)
ATU = reg_1.predict(X_0) - df_0["conversion"]

print("TRUE ATT", np.round(demo_df.query("treatment == 1")["conversion"].mean(), 5))
print("ATT：", np.round(ATT.mean(), 5))
print("TRUE ATU", np.round(demo_df.query("treatment == 0")["conversion"].mean(), 5))
print("ATU：", np.round(ATU.mean(), 5))

TRUE ATT 0.01137
ATT： 0.00437
TRUE ATU 0.0063
ATU： 0.00837


### S-Learner

In [23]:
# 特徴量と目的変数を分ける
X_train = demo_df.drop(columns=["conversion", "spend", "visit"])
y_train = demo_df[["conversion"]].astype(int)

In [24]:
reg = RandomForestClassifier(n_estimators=1000, max_depth=7, random_state = 42)
reg.fit(X_train, y_train)

X_val = demo_df.drop(columns=["conversion", "spend", "visit"])
y_val = demo_df[["conversion"]]

X_val_0 = X_val.copy()
X_val_0["treatment"] = 0

X_val_1 = X_val.copy()
X_val_1["treatment"] = 1

pred_1 = reg.predict(X_val_1).mean()
pred_0 = reg.predict(X_val_0).mean()


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



In [25]:
pred_1 - pred_0

0.0