# Hypothesis Testing

## Import Libraries

In [1]:
!pip install pingouin

Collecting pingouin
  Downloading pingouin-0.5.5-py3-none-any.whl.metadata (19 kB)
Collecting pandas-flavor (from pingouin)
  Downloading pandas_flavor-0.6.0-py3-none-any.whl.metadata (6.3 kB)
Downloading pingouin-0.5.5-py3-none-any.whl (204 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m204.4/204.4 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pandas_flavor-0.6.0-py3-none-any.whl (7.2 kB)
Installing collected packages: pandas-flavor, pingouin
Successfully installed pandas-flavor-0.6.0 pingouin-0.5.5


In [2]:
import numpy as np
import pandas as pd
import pingouin as pg
from scipy import stats
from statsmodels.stats.weightstats import ttest_ind

## Import Datasets

In [3]:
# Folder that holds the three CSV files used below. Change this single
# constant to run the notebook against a different location.
# NOTE(review): the path sits under /content/drive, so it presumably needs
# Google Drive mounted in Colab first; no mount step is visible in this notebook.
DATA_DIR = '/content/drive/MyDrive/Dibimbing/Unified Materials/Hypothesis Testing'

# A/B data for transaction amount, A/B data for payment duration, and the
# three-group (control / variant1 / variant2) transaction data for ANOVA.
trx_amount_df = pd.read_csv(f'{DATA_DIR}/trx_amount_df.csv')
payment_duration_df = pd.read_csv(f'{DATA_DIR}/payment_duration_df.csv')
df_trx_abc = pd.read_csv(f'{DATA_DIR}/df_trx_abc.csv')

## Playing around with normal distribution

In [4]:
# IQ is modelled as Normal(mean=100, sd=15).
# Question: what is P(100 < IQ < 115)?
iq_mean, iq_sd = 100, 15
lower, upper = 100, 115

dist = stats.norm(loc=iq_mean, scale=iq_sd)

# P(lower < X < upper) = CDF(upper) - CDF(lower)
cdf_lower = dist.cdf(lower)
cdf_upper = dist.cdf(upper)
prob = cdf_upper - cdf_lower
print(f"Probability IQ between 100 and 115 is {prob:.4f}")

Probability IQ between 100 and 115 is 0.3413


In [6]:
# Same probability, obtained by standardising first:
# z = (x - mean) / sd, then reading the standard normal CDF.
iq_mean, iq_sd, iq_upper = 100, 15, 115
z_score = (iq_upper - iq_mean) / iq_sd

# z = 0 is the mean itself (IQ = 100), so this is P(0 < Z < z_score).
std_normal = stats.norm(loc=0, scale=1)
prob = std_normal.cdf(z_score) - std_normal.cdf(0)
print(f"Probability IQ between 100 and 115 is {prob:.4f}")

Probability IQ between 100 and 115 is 0.3413


### Exercise

In [7]:
# Exercise: compute P(X > 100_000) for X ~ Normal(mean=120_000, sd=50_000)

In [9]:
# Exercise (standard normal distribution):
# compute P(Z > 1.96), the probability that Z exceeds 1.96


## T-test

### Transaction Amount Data

In [10]:
# Preview the first rows of the transaction-amount A/B data
# (one column per experiment group: control, variant).
trx_amount_df.head()

Unnamed: 0,control,variant
0,149900,122899
1,124500,233400
2,155900,149400
3,190900,102400
4,120600,187000


In [11]:
# Row count, dtypes and non-null counts: confirms the sample size per group
# and whether any values are missing before running the test.
trx_amount_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   control  30 non-null     int64
 1   variant  30 non-null     int64
dtypes: int64(2)
memory usage: 608.0 bytes


In [12]:
# Per-group summary statistics (count, mean, std, min, quartiles, max),
# used to compare the control and variant means before testing.
trx_amount_df.describe()

Unnamed: 0,control,variant
count,30.0,30.0
mean,122476.6,144553.266667
std,35996.270772,41901.599542
min,53500.0,61800.0
25%,106375.0,118075.0
50%,120600.0,147100.0
75%,144400.0,174500.0
max,193200.0,233400.0


From the mean values: variant > control. Is this difference significant?

In [13]:
# Independent two-sample t-test on transaction amount.
# H0: variant and control have the same mean; H1: the means differ.
# `ttest_ind` (statsmodels) is imported in the imports cell at the top.
df = trx_amount_df

# The defaults are spelled out so the test's assumptions are visible:
# a two-sided alternative and a pooled (equal-variance) estimate.
# The third return value is the degrees of freedom. It is bound to `dof`
# rather than `_` because IPython uses `_` for the previous cell's output and
# stops updating it once user code assigns to that name.
t_stat, p_val, dof = ttest_ind(df['variant'],
                               df['control'],
                               alternative='two-sided',
                               usevar='pooled')
print(f"t_stat is {t_stat:.4f}")
print(f"p_val is {p_val:.4f}")

t_stat is 2.1890
p_val is 0.0326


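p-value (0.0326) < 0.05 -> reject H0 (equal means): the difference in mean transaction amount between variant and control is statistically significant at the 5% level.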

### Payment Duration Data

In [14]:
# Preview the first rows of the payment-duration A/B data
# (one column per experiment group: control, variant).
payment_duration_df.head()

Unnamed: 0,control,variant
0,3.46,4.38
1,4.1,4.14
2,3.56,3.02
3,2.97,3.23
4,3.27,3.37


In [15]:
# Row count, dtypes and non-null counts: confirms the sample size per group
# and whether any values are missing before running the test.
payment_duration_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   control  1000 non-null   float64
 1   variant  1000 non-null   float64
dtypes: float64(2)
memory usage: 15.8 KB


In [16]:
# Per-group summary statistics (count, mean, std, min, quartiles, max),
# used to compare the control and variant means before testing.
payment_duration_df.describe()

Unnamed: 0,control,variant
count,1000.0,1000.0
mean,3.39082,3.49901
std,0.5168,0.625253
min,1.65,1.59
25%,3.05,3.11
50%,3.4,3.47
75%,3.7325,3.91
max,4.96,5.44


From the mean values: variant > control. Is this difference significant?

In [17]:
# Independent two-sample t-test on payment duration.
# H0: variant and control have the same mean; H1: the means differ.
# `ttest_ind` (statsmodels) is imported in the imports cell at the top, so it
# is not re-imported here.
df = payment_duration_df

# The defaults are spelled out so the test's assumptions are visible:
# a two-sided alternative and a pooled (equal-variance) estimate.
# The third return value is the degrees of freedom. It is bound to `dof`
# rather than `_` because IPython uses `_` for the previous cell's output and
# stops updating it once user code assigns to that name.
t_stat, p_val, dof = ttest_ind(df['variant'],
                               df['control'],
                               alternative='two-sided',
                               usevar='pooled')
print(f"t_stat is {t_stat:.4f}")
print(f"p_val is {p_val:.4f}")

t_stat is 4.2176
p_val is 0.0000


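p-value (< 0.0001; it prints as 0.0000 only because of rounding to four decimals) < 0.05 -> reject H0 (equal means): the difference in mean payment duration between variant and control is statistically significant at the 5% level.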

## ABC Test with Mean metrics

In [18]:
# Preview the first rows of the three-group experiment data
# (one column per experiment group: control, variant1, variant2).
df_trx_abc.head()

Unnamed: 0,control,variant1,variant2
0,102400,184899,148000
1,76600,213500,119600
2,152200,114300,279500
3,99000,189700,105800
4,132600,164800,133300


In [19]:
# Row count, dtypes and non-null counts: confirms the sample size per group
# and whether any values are missing before running the test.
df_trx_abc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   control   200 non-null    int64
 1   variant1  200 non-null    int64
 2   variant2  200 non-null    int64
dtypes: int64(3)
memory usage: 4.8 KB


In [20]:
# Per-group summary statistics; check the min row for implausible values
# (a transaction amount should not be negative).
df_trx_abc.describe()

Unnamed: 0,control,variant1,variant2
count,200.0,200.0,200.0
mean,125963.445,138620.42,144345.48
std,41161.616418,49525.589042,49648.370821
min,13400.0,-18000.0,400.0
25%,99225.0,110274.25,110525.0
50%,130700.0,137300.0,142700.0
75%,156250.0,173200.0,175175.0
max,244800.0,275300.0,317800.0


The minimum of variant1 is negative (-18,000), which is not a valid transaction amount, so the rows with negative variant1 values need to be dropped.

In [21]:
# How many rows have a negative variant1 value and need to be dropped?
negative_variant1 = df_trx_abc['variant1'].lt(0)
len(df_trx_abc[negative_variant1])

2

In [22]:
# Keep only the rows whose variant1 value is non-negative.
# The filter is row-wise on the wide frame, so the control and variant2 values
# that sit on the removed rows are dropped as well.
non_negative_variant1 = df_trx_abc['variant1'].ge(0)
df_trx_abc = df_trx_abc[non_negative_variant1]

In [23]:
# Reshape from wide (one column per group) to long format: one row per
# observation, with the group label in `experiment_group` and the value in
# `trx_amount`. This is the layout the ANOVA call expects (dv / between).
df_trx_abc_melt = pd.melt(
    df_trx_abc,
    var_name='experiment_group',
    value_name='trx_amount',
)
df_trx_abc_melt.head()

Unnamed: 0,experiment_group,trx_amount
0,control,102400
1,control,76600
2,control,152200
3,control,99000
4,control,132600


In [24]:
# Show the melted frame from its first to its last rows (the notebook display
# truncates the middle), to confirm all groups were stacked.
df_trx_abc_melt

Unnamed: 0,experiment_group,trx_amount
0,control,102400
1,control,76600
2,control,152200
3,control,99000
4,control,132600
...,...,...
589,variant2,191900
590,variant2,108100
591,variant2,72900
592,variant2,161000


In [25]:
# One-way ANOVA: does mean trx_amount differ across the experiment groups?
# Calls pingouin's anova function directly instead of the DataFrame method
# that pingouin registers; the arguments are the same.
anova_res = pg.anova(
    data=df_trx_abc_melt,
    dv='trx_amount',
    between='experiment_group',
    detailed=True,
)

# Keep only the columns of the classic ANOVA table.
anova_columns = ['Source', 'SS', 'DF', 'MS', 'F', 'p-unc']
anova_res[anova_columns]

Unnamed: 0,Source,SS,DF,MS,F,p-unc
0,experiment_group,37485320000.0,2,18742660000.0,8.804103,0.000171
1,Within,1258153000000.0,591,2128855000.0,,


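Conclusion: p-unc (0.000171) < 0.05, so we reject H0 (all group means are equal): at least one group has a different mean transaction amount. ANOVA alone does not tell us which groups differ; a post-hoc pairwise comparison is needed for that.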