In [None]:
# Mount Google Drive at /content/drive so files stored there can be read by path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
%matplotlib inline
# Seed both generators: the simulation in this notebook draws from NumPy's global
# RNG, which `random.seed` alone does not affect, so without the second call the
# simulated results change on every run.
random.seed(55)
np.random.seed(55)

In [None]:
# Load the experiment data from the mounted Drive folder into a DataFrame.
df = pd.read_csv('/content/drive/MyDrive/ab_data.csv')

In [None]:
# Non-null row counts of the remaining columns for each (group, converted) pair.
df.groupby(by=['group', 'converted']).count()

In [None]:
# Column dtypes and non-null counts of the frame as loaded (before duplicates are dropped).
df.info()

In [None]:
# Show every row whose user_id occurs more than once (all occurrences are listed).
repeated_user_mask = df.duplicated(subset=['user_id'], keep=False)
df[repeated_user_mask]

In [None]:
# Keep only the first row seen for each user_id; the frame is modified in place.
df.drop_duplicates(subset=['user_id'], keep='first', inplace=True)

In [None]:
# Sanity check: every user_id must now appear exactly once.
assert df['user_id'].is_unique

In [None]:
# Re-inspect dtypes and non-null counts after the duplicate user_id rows were dropped.
df.info()

In [None]:
# Mean of the 'converted' column over all rows, i.e. the overall conversion
# rate if the column is a 0/1 flag (it is summed as a count further down).
df['converted'].mean()

In [None]:
# Summary statistics of the numeric columns, computed separately for each group.
df.groupby('group').describe()

In [None]:
# Per group: sum, count and mean of the 'converted' column.
conversion_stats = {'converted': ['sum', 'count', 'mean']}
df.groupby('group').agg(conversion_stats)

In [None]:
# Mean of 'converted' per group, transposed so the groups become columns.
df.groupby('group')[['converted']].mean().T

In [None]:
# Observed mean of 'converted' for the control (old page) and treatment (new page)
# rows, plus the observed difference treatment - control.
p_old_page = df.loc[df['group'] == 'control', 'converted'].mean()
p_new_page = df.loc[df['group'] == 'treatment', 'converted'].mean()
act_p_diff = p_new_page - p_old_page

print('p_old_page:\t{}\np_new_page:\t{}\np_diff:\t\t{}'.format(p_old_page, p_new_page, act_p_diff))

In [None]:
# Number of rows in each group (control -> n_old, treatment -> n_new).
n_old = int((df['group'] == 'control').sum())
n_new = int((df['group'] == 'treatment').sum())

print('n_old:\t{}\nn_new:\t{}'.format(n_old, n_new))

In [None]:
# Simulate the sampling distribution of (p_new - p_old) under the null hypothesis
# that both pages convert at the same rate. Both groups are therefore drawn with
# the pooled conversion rate. Drawing each group at its own observed rate would
# centre the distribution on act_p_diff, so the share of simulated differences
# above act_p_diff would be about 0.5 whatever the data say.
n_simulations = 10000
p_null = df['converted'].mean()

# One binomial draw per simulated experiment gives the number of conversions in a
# group; dividing by the group size turns it into a conversion rate. This has the
# same distribution as averaging n individual 0/1 draws, without materialising
# them. p_diffs is a NumPy array of length n_simulations.
new_page_converted = np.random.binomial(n_new, p_null, size=n_simulations) / n_new
old_page_converted = np.random.binomial(n_old, p_null, size=n_simulations) / n_old
p_diffs = new_page_converted - old_page_converted

In [None]:
# Histogram of the simulated differences in conversion rate.
ax = plt.gca()
ax.hist(p_diffs)
ax.set_xlabel('p_diffs')
ax.set_ylabel('Frequency')
ax.set_title('Plot of 10K simulated p_diffs');

In [None]:
# Share of simulated differences that are strictly greater than the observed one.
p_diffs = np.array(p_diffs)
np.mean(p_diffs > act_p_diff)

In [None]:
import statsmodels.api as sm

# Two-sample z-test for proportions with control as the first sample.
# With alternative='smaller' the alternative hypothesis is p_old < p_new.
convert_old = df.query("group == 'control'")['converted'].sum()
convert_new = df.query("group == 'treatment'")['converted'].sum()

z_score, p_value = sm.stats.proportions_ztest(
    count=[convert_old, convert_new],
    nobs=[n_old, n_new],
    alternative='smaller',
)
# These are the observed test statistic and its p-value, not critical values,
# so they are labelled as such.
print('z_score: ', z_score)
print('p_value: ', p_value)

In [None]:
from scipy.stats import norm

# One-sided p-value: the z-test uses alternative='smaller', so it is the lower-tail
# probability of the z statistic under the standard normal distribution.
print('p-value: ', norm.cdf(z_score))

# Critical value for the same lower-tailed test at the 5% significance level
# (95% confidence): H0 is rejected when z_score falls below it. The upper-tail
# quantile norm.ppf(1 - 0.05) has the opposite sign and belongs to a test with
# alternative='larger'.
print('z_alfa: ', norm.ppf(0.05))

In [None]:
# sm.stats.zt_ind_solve_power(effect_size=-0.0048, alpha=0.05, power=0.1, alternative='smaller')
from statsmodels.stats import power as pwr
from statsmodels.stats.proportion import proportion_effectsize

# Power of the one-sided two-sample z-test for the observed effect size.
es = proportion_effectsize(p_new_page, p_old_page)
ratio = (n_new / n_old)
# NormalIndPower.power takes nobs1 (size of the first sample) and
# ratio = nobs2 / nobs1. With ratio = n_new / n_old the first-sample size is n_old
# itself, giving group sizes (n_old, n_new). Dividing n_old by ratio again would
# pass n_old**2 / n_new instead.
power = pwr.NormalIndPower().power(es, nobs1=n_old, alpha=0.05, ratio=ratio, alternative='smaller')
beta = 1 - power  # complement of the power: probability of a type II error

print('power: ', power)
print('beta: ', beta)