# AB测试

### 导入数据

In [1]:
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt

In [2]:
# Load the A/B test data from the CSV file
df = pd.read_csv('ab_data.csv')

# Preview the first 5 rows
df.head()

Unnamed: 0,user_id,timestamp,group,landing_page,converted
0,851104,2017-01-21 22:11:48.556739,control,old_page,0
1,804228,2017-01-12 08:01:45.159739,control,old_page,0
2,661590,2017-01-11 16:55:06.154213,treatment,new_page,0
3,853541,2017-01-08 18:28:03.143765,treatment,new_page,0
4,864975,2017-01-21 01:52:26.210827,control,old_page,1


### 查看数据，完成数据清洗

In [3]:
# Check the shape of the data (rows, columns)
df.shape

(294478, 5)

In [4]:
# Show column dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294478 entries, 0 to 294477
Data columns (total 5 columns):
user_id         294478 non-null int64
timestamp       294478 non-null object
group           294478 non-null object
landing_page    294478 non-null object
converted       294478 non-null int64
dtypes: int64(2), object(3)
memory usage: 11.2+ MB


In [5]:
# Count missing values per column
df.isnull().sum()  # no missing values

user_id         0
timestamp       0
group           0
landing_page    0
converted       0
dtype: int64

In [6]:
# Count rows where group and landing page disagree, i.e. exactly one of
# "group is treatment" and "page is new_page" holds (XOR of the two flags).
df[(df['group'] == 'treatment') ^ (df['landing_page'] == 'new_page')].count()

user_id         3893
timestamp       3893
group           3893
landing_page    3893
converted       3893
dtype: int64

In [7]:
# Keep only the consistent rows (treatment <-> new_page, control <-> old_page)
# and store them in df2: both flags must be equal.
df2 = df[(df['group'] == 'treatment') == (df['landing_page'] == 'new_page')]
df2.count()

user_id         290585
timestamp       290585
group           290585
landing_page    290585
converted       290585
dtype: int64

In [8]:
# Check for duplicates: compare the number of distinct user IDs with the row count
df2['user_id'].nunique()   # the user IDs show that there is duplicated data

290584

In [9]:
# Show every row whose user_id occurs more than once, with all its columns
df2.loc[df2.duplicated(subset='user_id', keep=False)]    # the duplicated user ID is 773192

Unnamed: 0,user_id,timestamp,group,landing_page,converted
1899,773192,2017-01-09 05:37:58.781806,treatment,new_page,0
2893,773192,2017-01-14 02:55:59.590927,treatment,new_page,0


In [10]:
# Remove duplicated users: drop every repeated user_id except its first occurrence
df2 = df2.loc[~df2['user_id'].duplicated(keep='first')]

In [11]:
# Check the shape of df2
print(df2.shape)

# Check whether df2 still has duplicates (unique user count vs. row count)
print(df2.user_id.nunique())

(290584, 5)
290584


### 计算用户转化率

In [12]:
# Overall conversion rate
df2['converted'].mean()  # overall conversion rate is 11.96%

0.11959708724499628

In [13]:
# Conversion rate of the new page: mean of `converted` over new_page rows
df2.loc[df2['landing_page'] == 'new_page', 'converted'].mean()   # new page converts at 11.88%

0.11880806551510564

In [14]:
# Conversion rate of the old page: mean of `converted` over old_page rows
df2.loc[df2['landing_page'] == 'old_page', 'converted'].mean()   # old page converts at 12.04%

0.1203863045004612

In [15]:
# Probability that a user received the new page: new_page rows / all rows
len(df2.loc[df2['landing_page'] == 'new_page']) / len(df2)   # about 50% of users got the new page

0.5000619442226688

整体转化率为11.96%，新页面转化率为11.88%，旧页面转化率为12.04%，用户收到新旧页面的概率各约50%。看似旧页面效果更好，但这是真实存在的差异，还是由随机因素导致的呢？
对此需要进行显著性检验：

### AB测试

#### 由于目标是新页面转化率高于旧页面转化率，设计原假设和备择假设如下：
#### H0: P_new - P_old <= 0
#### H1: P_new - P_old > 0

In [16]:
# Observed conversion rate of the new page
p_new = df2.loc[df2['landing_page'] == 'new_page', 'converted'].mean()
p_new

0.11880806551510564

In [17]:
# Observed conversion rate of the old page
p_old = df2.loc[df2['landing_page'] == 'old_page', 'converted'].mean()
p_old

0.1203863045004612

In [18]:
# Number of users who received the new page
n_new = len(df2.loc[df2['landing_page'] == 'new_page'])
n_new

145310

In [19]:
# Number of users who received the old page
n_old = len(df2.loc[df2['landing_page'] == 'old_page'])
n_old

145274

In [20]:
# 引入SciPy库
from scipy.stats import norm

In [21]:
# Z statistic for the one-sided two-proportion significance test.
# The hypotheses are H0: P_new - P_old <= 0 vs H1: P_new - P_old > 0, so the
# numerator must be (p_new - p_old); the reversed difference would instead test
# whether the old page beats the new one.
# Standard error (unpooled): sqrt(p_old(1-p_old)/n_old + p_new(1-p_new)/n_new)
std_err = np.sqrt(p_old * (1 - p_old) / n_old + p_new * (1 - p_new) / n_new)
z_score = (p_new - p_old) / std_err
z_score

-1.3109271488301917

In [22]:
# Critical z value for a one-sided test at significance level 0.05
# (the 95th percentile of the standard normal distribution), not a confidence interval
norm.ppf(1-0.05)

1.6448536269514722

由于检验统计量z_score（绝对值约为1.31）小于单侧检验在显著性水平α=0.05下的临界值norm.ppf(1-0.05)≈1.64，未落入拒绝域，所以无法拒绝原假设，即没有充分证据表明新页面转化率高于旧页面；建议延长测试时间，继续观测情况。