# 1章 セレクションバイアスとRCT

In [1]:
import warnings
warnings.filterwarnings('ignore')
import sys
sys.path.append('../scripts/')

import numpy as np
import pandas as pd

import rdata
from rdd import rdd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from causalimpact import CausalImpact
from statsmodels.stats.weightstats import ttest_ind
from sklearn.model_selection import train_test_split

import seaborn as sns
import matplotlib.pyplot as plt

from propensity_score_matching import PropensityScoreMatching

## 1.4 メールマーケティングの効果検証

### 1.4.1 RCTを行ったデータの準備

In [2]:
email_data = pd.read_csv('http://www.minethatdata.com/Kevin_Hillstrom_MineThatData_E-MailAnalytics_DataMiningChallenge_2008.03.20.csv')
email_data.head(3)

Unnamed: 0,recency,history_segment,history,mens,womens,zip_code,newbie,channel,segment,visit,conversion,spend
0,10,2) $100 - $200,142.44,1,0,Surburban,0,Phone,Womens E-Mail,0,0,0.0
1,6,3) $200 - $350,329.08,1,1,Rural,1,Web,No E-Mail,0,0,0.0
2,7,2) $100 - $200,180.65,0,1,Surburban,1,Web,Womens E-Mail,0,0,0.0


In [3]:
email_data.shape

(64000, 12)

In [4]:
male_df = email_data[email_data['segment'] != 'Womens E-Mail']
male_df.shape

(42613, 12)

In [5]:
male_df['treatment'] = male_df['segment'].map(lambda x: 1 if x == 'Mens E-Mail' else 0)
male_df.head(3)

Unnamed: 0,recency,history_segment,history,mens,womens,zip_code,newbie,channel,segment,visit,conversion,spend,treatment
1,6,3) $200 - $350,329.08,1,1,Rural,1,Web,No E-Mail,0,0,0.0,0
3,9,5) $500 - $750,675.83,1,0,Rural,1,Web,Mens E-Mail,0,0,0.0,1
8,9,5) $500 - $750,675.07,1,1,Rural,1,Phone,Mens E-Mail,0,0,0.0,1


### 1.4.2 RCTデータの集計と有意差検定

In [6]:
# 集計
summary_by_segment = pd.pivot_table(
    data=male_df,
    values=['conversion', 'spend', 'visit'],
    index=['treatment'],
    aggfunc={'conversion': np.mean, 'spend': np.mean, 'visit': np.ma.count}
)

summary_by_segment.columns = ['conversion_rate', 'spend_mean', 'count']
summary_by_segment

Unnamed: 0_level_0,conversion_rate,spend_mean,count
treatment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.005726,0.652789,21306
1,0.012531,1.422617,21307


In [7]:
# 有意差検定
rct_ttest = ttest_ind(
    male_df[male_df['treatment'] == 1]['spend'],
    male_df[male_df['treatment'] == 0]['spend'],
    usevar='pooled'
)

for key, val in zip(['t', 'p-value', 'df'], rct_ttest):
    print('{} = {}'.format(key, val))

t = 5.300090294465455
p-value = 1.163200872605976e-07
df = 42611.0


### 1.3.4 バイアスのあるデータによる効果の検証

In [8]:
# バイアスのあるデータの準備
treatment_data = male_df[male_df['treatment'] == 1]
control_data = male_df[male_df['treatment'] == 0]

treatment_biased = treatment_data.drop(treatment_data[~(
    (treatment_data['history'] > 300) |
    (treatment_data['recency'] < 6) |
    (treatment_data['recency'] == 'Multichannel')
)].sample(frac=0.5, random_state=1).index)

control_biased = control_data.drop(control_data[
    (control_data['history'] > 300) |
    (control_data['recency'] < 6) |
    (control_data['recency'] == 'Multichannel')
].sample(frac=0.5, random_state=1).index)

biased_data = pd.concat([treatment_biased, control_biased])
biased_data.head(3)

Unnamed: 0,recency,history_segment,history,mens,womens,zip_code,newbie,channel,segment,visit,conversion,spend,treatment
3,9,5) $500 - $750,675.83,1,0,Rural,1,Web,Mens E-Mail,0,0,0.0,1
8,9,5) $500 - $750,675.07,1,1,Rural,1,Phone,Mens E-Mail,0,0,0.0,1
13,2,2) $100 - $200,101.64,0,1,Urban,0,Web,Mens E-Mail,1,0,0.0,1


In [9]:
# バイアスのあるデータの集計と有意差の検定
summary_by_segment_biased = pd.pivot_table(
    data=biased_data,
    values=['conversion', 'spend', 'visit'],
    index=['treatment'],
    aggfunc={'conversion': np.mean, 'spend': np.mean, 'visit': np.ma.count}
)
summary_by_segment_biased.columns = ['conversion_rate', 'spend_mean', 'count']

rct_ttest_biased = ttest_ind(
    biased_data[biased_data['treatment'] == 1]['spend'],
    biased_data[biased_data['treatment'] == 0]['spend'],
    usevar='pooled'
)

In [10]:
summary_by_segment_biased

Unnamed: 0_level_0,conversion_rate,spend_mean,count
treatment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.005165,0.634625,14907
1,0.013575,1.560682,17017


In [11]:
for key, val in zip(['t', 'p-value', 'df'], rct_ttest_biased):
    print('{} = {}'.format(key, val))

t = 5.173854873955868
p-value = 2.3069800974507752e-07
df = 31922.0
