In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the raw Hillstrom CSV (path is relative to the notebook's working
# directory), then inspect the schema and the first few rows.
df = pd.read_csv(filepath_or_buffer='../data/raw/hillstrom.csv')
df.info()   # prints column dtypes and non-null counts
df.head()   # last expression of the cell -> rendered as its output

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64000 entries, 0 to 63999
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   recency          64000 non-null  int64  
 1   history_segment  64000 non-null  object 
 2   history          64000 non-null  float64
 3   mens             64000 non-null  int64  
 4   womens           64000 non-null  int64  
 5   zip_code         64000 non-null  object 
 6   newbie           64000 non-null  int64  
 7   channel          64000 non-null  object 
 8   segment          64000 non-null  object 
 9   visit            64000 non-null  int64  
 10  conversion       64000 non-null  int64  
 11  spend            64000 non-null  float64
dtypes: float64(2), int64(6), object(4)
memory usage: 5.9+ MB


Unnamed: 0,recency,history_segment,history,mens,womens,zip_code,newbie,channel,segment,visit,conversion,spend
0,10,2) $100 - $200,142.44,1,0,Surburban,0,Phone,Womens E-Mail,0,0,0.0
1,6,3) $200 - $350,329.08,1,1,Rural,1,Web,No E-Mail,0,0,0.0
2,7,2) $100 - $200,180.65,0,1,Surburban,1,Web,Womens E-Mail,0,0,0.0
3,9,5) $500 - $750,675.83,1,0,Rural,1,Web,Mens E-Mail,0,0,0.0
4,2,1) $0 - $100,45.34,1,0,Urban,0,Web,Womens E-Mail,0,0,0.0


In [None]:
# Visit rate per campaign segment and its uplift over the 'No E-Mail' control.
# A row counts as a visit when `visit == 1`. The comparison is done once on the
# whole column and the boolean result is averaged per segment, so no Python
# lambda is invoked per group.
cr_by_visit = df['visit'].eq(1).groupby(df['segment']).mean().sort_index()
ctrl = cr_by_visit['No E-Mail']        # control-group visit rate (as a fraction)
delta_vs_ctrl = cr_by_visit - ctrl     # absolute difference, still a fraction
# Scale to percent only for presentation; the delta is taken on unrounded rates.
result_visit_cr = pd.DataFrame({
    'CR_visit': (cr_by_visit * 100).round(2),                    # rate, %
    'Δ_CR_visit_vs_control_pp': (delta_vs_ctrl * 100).round(2),  # delta vs control, percentage points
})
result_visit_cr

Unnamed: 0_level_0,CR_visit,Δ_CR_visit_vs_control_pp
segment,Unnamed: 1_level_1,Unnamed: 2_level_1
Mens E-Mail,18.28,7.66
No E-Mail,10.62,0.0
Womens E-Mail,15.14,4.52


In [None]:
# Purchase conversion rate per campaign segment and its uplift over the
# 'No E-Mail' control.
# A row counts as converted when `conversion == 1`. The comparison is done once
# on the whole column and the boolean result is averaged per segment, so no
# Python lambda is invoked per group.
cr_by_conversion = df['conversion'].eq(1).groupby(df['segment']).mean().sort_index()
ctrl_conv = cr_by_conversion['No E-Mail']          # control-group conversion rate (as a fraction)
delta_vs_ctrl_conv = cr_by_conversion - ctrl_conv  # absolute difference, still a fraction
# Scale to percent only for presentation; the delta is taken on unrounded rates.
# NOTE(review): the double underscore in 'Δ_CR__conv_vs_control_pp' looks like a
# typo, but the label is kept verbatim because other cells may reference it.
result_conv_cr = pd.DataFrame({
    'CR_conv': (cr_by_conversion * 100).round(2),                     # rate, %
    'Δ_CR__conv_vs_control_pp': (delta_vs_ctrl_conv * 100).round(2),  # delta vs control, percentage points
})
result_conv_cr

Unnamed: 0_level_0,CR_conv,Δ_CR__conv_vs_control_pp
segment,Unnamed: 1_level_1,Unnamed: 2_level_1
Mens E-Mail,1.25,0.68
No E-Mail,0.57,0.0
Womens E-Mail,0.88,0.31


In [None]:
# Mean of `spend` within each segment (reported as ARPU) and its difference
# from the 'No E-Mail' control group.
arpu = df.groupby('segment')['spend'].agg('mean')
ctrl_arpu = arpu.loc['No E-Mail']
delta_arpu = arpu.sub(ctrl_arpu)
# Rounding is for display only; the delta is taken on the unrounded means.
result_arpu = (
    arpu.round(2)
    .to_frame('ARPU')
    .assign(**{'Δ_ARPU': delta_arpu.round(2)})
)
result_arpu

Unnamed: 0_level_0,ARPU,Δ_ARPU
segment,Unnamed: 1_level_1,Unnamed: 2_level_1
Mens E-Mail,1.42,0.77
No E-Mail,0.65,0.0
Womens E-Mail,1.08,0.42


In [None]:
# Place the three per-segment tables side by side; rows are aligned on their
# shared `segment` index.
result_by_cv_arpu = pd.concat(
    objs=[result_visit_cr, result_conv_cr, result_arpu],
    axis='columns',
)
result_by_cv_arpu

Unnamed: 0_level_0,CR_visit,Δ_CR_visit_vs_control_pp,CR_conv,Δ_CR__conv_vs_control_pp,ARPU,Δ_ARPU
segment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Mens E-Mail,18.28,7.66,1.25,0.68,1.42,0.77
No E-Mail,10.62,0.0,0.57,0.0,0.65,0.0
Womens E-Mail,15.14,4.52,0.88,0.31,1.08,0.42


## Findings
- Goal: boost purchases and revenue. Primary metrics: CR_conv (share of records with `conversion == 1`, in %) and ARPU (mean `spend` per record).
- Control (No E-Mail): CR_conv = 0.57%, ARPU = 0.65.
- Mens E-Mail: CR_conv = 1.25% (**+0.68 p.p.** vs control), ARPU = 1.42 (**+0.77**).
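- Womens E-Mail: CR_conv = 0.88% (**+0.31 p.p.** vs control), ARPU = 1.08 (**+0.42**; the delta is computed from unrounded means, so it differs from 1.08 − 0.65 by rounding).

**Conclusion:** Email increases both conversion and revenue; Mens outperforms Womens on average. Note that these are point estimates: the cells above do not test the differences for statistical significance.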
