In [None]:
#mount the personal Google Drive first so later cells can read files from it
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
#import libraries we'll use
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.feature_selection import chi2

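#to find the p-value, we will use the chi-square test of independence
#for that we import chi2_contingency and chi2 from scipy.stats
#note: this import rebinds the name chi2, replacing the sklearn chi2 imported above;
#the code below relies on scipy's chi2 (it calls chi2.ppf)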
from scipy.stats import chi2_contingency, chi2

In [None]:
#load the A/B test records from the mounted Drive, then display the resulting frame
DATA_PATH = '/content/drive/MyDrive/ecommerce_ab_testing/ab_data.csv'
df = pd.read_csv(DATA_PATH)
df

Unnamed: 0,user_id,timestamp,group,landing_page,converted
0,851104,11:48.6,control,old_page,0
1,804228,01:45.2,control,old_page,0
2,661590,55:06.2,treatment,new_page,0
3,853541,28:03.1,treatment,new_page,0
4,864975,52:26.2,control,old_page,1
...,...,...,...,...,...
294475,734608,45:03.4,control,old_page,0
294476,697314,20:29.0,control,old_page,0
294477,715931,40:24.5,treatment,new_page,0
294478,759899,20:29.0,treatment,new_page,0


In [None]:
#list the distinct page versions that appear in the landing_page column
df["landing_page"].unique()

array(['old_page', 'new_page'], dtype=object)

In [None]:
#print the general information about df (columns, non-null counts, dtypes),
#then display the summary statistics that describe() produces
df.info()
numeric_summary = df.describe()
numeric_summary

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294480 entries, 0 to 294479
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   user_id       294480 non-null  int64 
 1   timestamp     294480 non-null  object
 2   group         294480 non-null  object
 3   landing_page  294480 non-null  object
 4   converted     294480 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 11.2+ MB


Unnamed: 0,user_id,converted
count,294480.0,294480.0
mean,787973.538896,0.119658
std,91210.917091,0.324562
min,630000.0,0.0
25%,709031.75,0.0
50%,787932.5,0.0
75%,866911.25,0.0
max,945999.0,1.0


In [None]:
#a chi-square test needs two hypotheses stated up front:
#the null hypothesis (first string) and the alternate hypothesis (second string)
null_hypo, alter_hypo = (
    "there is no relation between page version and customer's decision to purchase products. they are independent",
    "there is a relation between page version and customer's decision to purchase products. they are dependent",
)

In [None]:
#significance level for the test:
#  p-value <  0.05 -> reject null_hypo and accept alter_hypo
#  p-value >= 0.05 -> the opposite
acceptance_value = 0.05

#cross-tabulate landing_page (rows) against converted (columns) to get the raw counts,
#and keep only the underlying array of counts for the chi-square test
page_vs_converted = pd.crosstab(index=df["landing_page"], columns=df["converted"])
observed_values = page_vs_converted.values
observed_values

array([[129743,  17498],
       [129500,  17739]])

In [None]:
#rate = converted count / total visitors of that page version
#we'll compute the rate for both versions
#NOTE: pd.crosstab sorts its row labels, so in the observed_values array above
#row 0 is 'new_page' and row 1 is 'old_page' ('new_page' sorts before 'old_page').
#Each row is [not converted, converted].
new_page_not_converted, new_page_converted = 129743, 17498
old_page_not_converted, old_page_converted = 129500, 17739

old_page_impact_rate = old_page_converted / (old_page_not_converted + old_page_converted)
new_page_impact_rate = new_page_converted / (new_page_not_converted + new_page_converted)
print(old_page_impact_rate, new_page_impact_rate)

0.118839182021312 0.12047759085568362


In [None]:
#implementing the chi square test of independence on the observed counts
#correction=False disables the Yates continuity correction scipy would otherwise apply when dof == 1
chi2_statistic, p_value, dof, expected_values = chi2_contingency(observed_values, correction = False)
print("dof value: ", dof)
print("chi2_statistic value: ", chi2_statistic)
print("p_value: ", p_value)

#find the critical value
#the critical value is the point of the chi-square distribution (with dof degrees of freedom)
#below which a fraction (1 - acceptance_value) of the distribution lies
critical_value = chi2.ppf(1 - acceptance_value, dof)
print("critical value: ", critical_value)

#implementing the ab testing using acceptance_value
#the comparison and the printed threshold both use acceptance_value (not a hard-coded 0.05),
#so the decision stays consistent with critical_value if the significance level is changed
#NOTE(review): strictly, a p-value >= the threshold means we fail to reject the null hypothesis;
#it does not prove the null hypothesis is true
if p_value < acceptance_value:
    print("since the p_value is ", p_value, " which is < " + str(acceptance_value) + ", the null hypothesis should be rejected and alternate hypothesis should be accepted")
    print("we'll accept this hypothesis: " + (alter_hypo))
else:
    print("since the p_value is ", p_value, " which is >= " + str(acceptance_value) + ", the null hypothesis should be accepted and alternate hypothesis should be rejected")
    print("we'll accept this hypothesis: " + null_hypo)

dof value:  1
chi2_statistic value:  1.87605695553178
p_value:  0.17078297802593548
critical value:  3.841458820694124
since the p_value is  0.17078297802593548  which is >= 0.05, the null hypothesis should be accepted and alternate hypothesis should be rejected
we'll accept this hypothesis: there is no relation between page version and customer's decision to purchase products. they are independent
