In [46]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as pyplot
%matplotlib inline
import statsmodels.stats.api as sms
import math
import scipy.stats as stats

# Data Wrangling/Cleaning

In [47]:
# Load the baseline records and the treatment assignments, then attach the
# assignment data to every baseline row (left join on the shared prequal_id key).
base_df = pd.read_csv('Baseline.csv')
treatment_df = pd.read_csv('Testing.csv')

df = base_df.merge(treatment_df, how='left', on='prequal_id')

In [48]:
# checking column data types and non-null counts of the merged frame
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 190976 entries, 0 to 190975
Data columns (total 4 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   prequal_id         190976 non-null  object
 1   prequal_date       190976 non-null  object
 2   completed_prequal  190976 non-null  int64 
 3   assignment_date    8609 non-null    object
dtypes: int64(1), object(3)
memory usage: 7.3+ MB


In [49]:
# checking first 5 rows of the merged frame
df.head()

Unnamed: 0,prequal_id,prequal_date,completed_prequal,assignment_date
0,00081cb5-27bb-428a-bc53-076bacc7ad02,6/22/2019,1,6/22/2019
1,00120f05-bf9d-40db-99d1-05a8cbd8aa0e,4/16/2019,0,
2,00139f6d-0af4-49c5-b26f-f9c999a06bcb,5/9/2019,1,
3,0019854e-e4c2-42df-be79-59cf1a13ac89,1/17/2019,0,
4,0019cb64-a44c-4320-b149-9c0167c714e9,4/16/2019,1,


In [50]:
# convert both date columns from strings into datetime datatypes
for date_col in ('prequal_date', 'assignment_date'):
    df[date_col] = pd.to_datetime(df[date_col])

In [51]:
# checking date time conversion on the first 5 rows
df.head()

Unnamed: 0,prequal_id,prequal_date,completed_prequal,assignment_date
0,00081cb5-27bb-428a-bc53-076bacc7ad02,2019-06-22,1,2019-06-22
1,00120f05-bf9d-40db-99d1-05a8cbd8aa0e,2019-04-16,0,NaT
2,00139f6d-0af4-49c5-b26f-f9c999a06bcb,2019-05-09,1,NaT
3,0019854e-e4c2-42df-be79-59cf1a13ac89,2019-01-17,0,NaT
4,0019cb64-a44c-4320-b149-9c0167c714e9,2019-04-16,1,NaT


In [52]:
# verifying that both date columns now report a datetime dtype
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 190976 entries, 0 to 190975
Data columns (total 4 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   prequal_id         190976 non-null  object        
 1   prequal_date       190976 non-null  datetime64[ns]
 2   completed_prequal  190976 non-null  int64         
 3   assignment_date    8609 non-null    datetime64[ns]
dtypes: datetime64[ns](2), int64(1), object(1)
memory usage: 7.3+ MB


In [53]:
# filtering and assigning df with only June data
# NOTE: only the month is checked, so June of any year present in the data is kept.
# .copy() makes the filtered result an independent frame, so the column
# assignments in later cells do not raise pandas' SettingWithCopyWarning.
df = df[df['prequal_date'].dt.month == 6].copy()
df.head()

Unnamed: 0,prequal_id,prequal_date,completed_prequal,assignment_date
0,00081cb5-27bb-428a-bc53-076bacc7ad02,2019-06-22,1,2019-06-22
7,0023b297-93a5-4686-ab48-53f2fa762164,2019-06-05,1,NaT
8,003fedad-6a78-4a8b-bf1b-60a9390c8f2a,2019-06-11,0,NaT
23,012b89fd-ae0b-4532-890d-78f9aa1696c8,2019-06-11,0,NaT
24,012ee8be-85d0-40a1-98bc-ccca4d0f1860,2019-06-10,1,NaT


In [54]:
# checking for missing values: count of nulls per column
df.isna().sum()

prequal_id               0
prequal_date             0
completed_prequal        0
assignment_date      28057
dtype: int64

In [55]:
# replace missing assignment dates with a 0 placeholder; the next cell uses it to tell the groups apart
# NOTE(review): after this fill the column holds a mix of timestamps and the integer 0
df['assignment_date'] = df['assignment_date'].fillna(0)

In [56]:
# creating a group type column based on assignment_date:
# rows without an assignment date are 'control', rows with one are 'treatment'.
# A missing date is recognised both as a null (NaT/NaN) and as the 0 placeholder,
# so the labels stay correct whether or not the nulls were filled beforehand.
no_assignment = df['assignment_date'].isna() | df['assignment_date'].eq(0)
df['group'] = np.where(no_assignment, 'control', 'treatment')
df

Unnamed: 0,prequal_id,prequal_date,completed_prequal,assignment_date,group
0,00081cb5-27bb-428a-bc53-076bacc7ad02,2019-06-22,1,2019-06-22 00:00:00,treatment
7,0023b297-93a5-4686-ab48-53f2fa762164,2019-06-05,1,0,control
8,003fedad-6a78-4a8b-bf1b-60a9390c8f2a,2019-06-11,0,0,control
23,012b89fd-ae0b-4532-890d-78f9aa1696c8,2019-06-11,0,0,control
24,012ee8be-85d0-40a1-98bc-ccca4d0f1860,2019-06-10,1,0,control
...,...,...,...,...,...
190939,fe8ba9cd-1ad3-489e-986d-e9367890e212,2019-06-16,1,0,control
190947,fee1488f-f17d-49e5-96e7-a879b13b4d73,2019-06-22,0,2019-06-22 00:00:00,treatment
190948,fee57af3-f195-4c70-8445-71cfcb41a45f,2019-06-01,0,0,control
190950,fee61ca4-7158-45b3-9768-101c7d0e3556,2019-06-24,0,0,control


In [57]:
# checking df size as (rows, columns) after the June filter and the added group column
df.shape

(36666, 5)

In [58]:
# checking unique users: the non-null id count equals the distinct id count
# exactly when no prequal_id appears more than once
id_col = df["prequal_id"]
has_duplicates = id_col.count() != id_col.nunique()
print("There are duplicate prequal_id's." if has_duplicates else "There are NO duplicate prequal_id's.")

There are NO duplicate prequal_id's.


In [59]:
# proportion of users converted across both groups (mean of the 0/1 completed_prequal flag)
df['completed_prequal'].mean()

0.539437080674194

In [60]:
# number of rows in each group
df['group'].value_counts()

control      28057
treatment     8609
Name: group, dtype: int64

# Exploratory Data Analysis

In [61]:
# conversion rate of the control group: mean of the 0/1 completed_prequal flag over control rows
df.loc[df['group'] == 'control', 'completed_prequal'].mean()

0.5375842035855579

In [62]:
# conversion rate of the treatment group: mean of the 0/1 completed_prequal flag over treatment rows
df.loc[df['group'] == 'treatment', 'completed_prequal'].mean()

0.5454756650017424

In [63]:
# row-normalised cross tabulation: for each group, the share of non-converted (0) and converted (1) users
pd.crosstab(df['group'], df['completed_prequal'], normalize='index')

completed_prequal,0,1
group,Unnamed: 1_level_1,Unnamed: 2_level_1
control,0.462416,0.537584
treatment,0.454524,0.545476


In [64]:
# Difference calculation
# The conversion rates are read from the data rather than retyped from the rounded
# table above, so they stay correct if the input data changes.
group_rates = df.groupby('group')['completed_prequal'].mean()
ctrl = group_rates['control']
trt = group_rates['treatment']

# Absolute difference, in percentage points
non_relative_diff = trt - ctrl
print('Non-relative difference of conversion rate: {:.2%}'.format(non_relative_diff))

# Difference relative to the control rate
relative_diff = (trt - ctrl) / ctrl
print('Relative difference of conversion rate: {:.2%}'.format(relative_diff))

Non-relative difference of conversion rate: 0.79%
Relative difference of conversion rate: 1.47%


# Hypothesis Testing

Research question: Is there evidence that a 3-field form increases or decreases the conversion rate compared to a 5-field form?

- **Null hypothesis**: The control and experiment groups have the same probability of completing the form.
- **Alternative hypothesis**: The control and experiment groups have a different probability of completing the form.

We assume that the test satisfies the following assumptions:
- independence
- random sample
- sample size (n) > 30

In [65]:
# Keep only the outcome flag and the group label, and give the outcome a shorter name
df2 = df[['completed_prequal', 'group']].rename(columns={'completed_prequal': 'converted'})

In [66]:
# displaying the trimmed frame (converted flag and group label only)
df2

Unnamed: 0,converted,group
0,1,treatment
7,1,control
8,0,control
23,0,control
24,1,control
...,...,...
190939,1,control
190947,0,treatment
190948,0,control
190950,0,control


In [67]:
# Boolean masks selecting the rows of each group
control_group = df2['group'].eq('control')
treatment_group = df2['group'].eq('treatment')

# Control group: number of conversions and number of users
control_outcomes = df2.loc[control_group, 'converted']
control_conv = control_outcomes.sum()
control_total = control_outcomes.count()

# Treatment group: number of conversions and number of users
treatment_outcomes = df2.loc[treatment_group, 'converted']
treatment_conv = treatment_outcomes.sum()
treatment_total = treatment_outcomes.count()

# Share of all rows that falls into each group
n_rows = len(df2)
print('Percentage of control group: {:.2%}'.format(control_total / n_rows))
print('Percentage of treatment group: {:.2%}'.format(treatment_total / n_rows))

Percentage of control group: 76.52%
Percentage of treatment group: 23.48%


In [83]:
# Control group (5-field form): converted count and conversion rate
control_rate = control_conv / control_total
print('Number of control applicants who converted with 5-field form: {:,}'.format(control_conv))
print('Percentage of control applicants who converted: {:.2%}'.format(control_rate))

Number of control applicants who converted with 5-field form: 15,083
Percentage of control applicants who converted: 53.76%


In [84]:
# Treatment group (3-field form): converted count and conversion rate
treatment_rate = treatment_conv / treatment_total
print('Number of treatment applicants who converted with 3-field form: {:,}'.format(treatment_conv))
print('Percentage of treatment applicants who converted: {:.2%}'.format(treatment_rate))

Number of treatment applicants who converted with 3-field form: 4,696
Percentage of treatment applicants who converted: 54.55%


# A/B Testing

Now let's set some parameters for the A/B test.

In [70]:
# Calculating the baseline conversion rate: conversions in the control group divided by control group size.
baseline = control_conv / control_total
baseline

0.5375842035855579

In [71]:
# Assigning practical significance (the smallest change in conversion probability considered worth acting on).
# This is subjective and user-defined; a 1% change in conversion probability can be large in the real world.
practical_sig = 0.01

In [72]:
# Calculating the effect size between the baseline rate and the baseline plus the practical
# significance using statsmodels; the required sample size is solved from it in the next cell.
e_size = sms.proportion_effectsize(baseline, baseline + practical_sig)
e_size

-0.02007327798961067

In [73]:
# Solve for the per-group sample size with power (sensitivity) 0.8 and alpha 0.05
# (95% confidence level); ratio=1 means both groups are assumed to be the same size.
power_analysis = sms.NormalIndPower()
sample_size = power_analysis.solve_power(effect_size=e_size, alpha=0.05, power=0.8, ratio=1)

print('Sample size (n) for each group: {:,}'.format(round(sample_size)))

Sample size (n) for each group: 38,958


The test and control group assignment is not ideal. It is best to split the control and the treatment groups 50/50 so that each group has the same exposure. Here the control group is 76.52% of the data while the treatment group is 23.48% of the data. An uneven split does limit how many users are exposed to a possibly inferior variant, but it also reduces the statistical power of the test: for a fixed total number of users, the standard error of the difference between the two conversion rates (and therefore the width of the confidence interval calculated below) is smallest when the two groups are equal in size.

It is apparent that there is an issue when comparing the required sample size with the data available. The power calculation above shows that the required sample size is 38,958 per group. However, there are only 8,609 users in the treatment group, which is about 4.5x fewer than required (and the 28,057 users in the control group also fall short), so the test is underpowered: it is unlikely to detect a true 1% change, and a non-significant result is inconclusive. To avoid this situation, it is best to calculate the required sample size per group prior to assigning control and treatment groups. Also, the test should have continued until we had received 38,958 observations in each group.

Despite these issues, let's continue A/B testing anyway with the given data for the purpose of this project.

In [74]:
# Calculating the pooled probability of control and treatment groups:
# total number of users who converted divided by total number of users
pool_prob = (control_conv + treatment_conv) / (control_total + treatment_total)
pool_prob

0.539437080674194

In [75]:
# Calculating pooled standard error of the difference between the two conversion rates:
# sqrt(p * (1 - p) * (1 / n_control + 1 / n_treatment)), with p the pooled probability
pool_se = math.sqrt(pool_prob * ( 1 - pool_prob) * (1 / control_total + 1 / treatment_total))
pool_se

0.006141149640239014

In [76]:
# Calculating the critical z-score for a two-tailed test.
# alpha is the significance level (0.05 for a 95% confidence interval); a two-tailed
# test puts alpha / 2 in each tail, so the critical value is the 1 - alpha / 2 quantile
# (0.975) of the standard normal distribution.
alpha = 0.05
z_score = stats.norm.ppf(1 - alpha / 2)
z_score

1.959963984540054

In [77]:
# Calculate margin of error: critical z-score times the pooled standard error
moe = z_score * pool_se
moe

0.012036432118539577

In [78]:
# Calculate "d hat"--the estimated difference between the treatment (experiment) conversion rate
# and the control conversion rate; a positive value means the treatment converted more often.
d_hat = (treatment_conv / treatment_total) - (control_conv / control_total)
d_hat

0.007891461416184464

In [79]:
# Testing the hypothesis and calculating the confidence interval
lower_bound = d_hat - moe
upper_bound = d_hat + moe

# The null hypothesis states that the difference is 0, so it is rejected only when
# the confidence interval around d_hat excludes 0. d_hat is the centre of the
# interval, so (for a positive margin of error) comparing d_hat itself against the
# bounds can never be true and cannot be used as the decision rule.
if lower_bound > 0 or upper_bound < 0:
    print('Reject the null hypothesis.')
else:
    print('Fail to reject the null hypothesis.')

print('The confidence interval is: [{}, {}]'.format(round(lower_bound, 4), round(upper_bound, 4)))

Fail to reject the null hypothesis.
The confidence interval is: [-0.0041, 0.0199]


# Conclusion

Based on the testing, we fail to reject the null hypothesis: the 95% confidence interval for the difference contains 0, so the result is not statistically significant and we are unable to provide enough evidence for the alternative hypothesis. This does not prove that there is no difference; it means that the data do not show a detectable difference between the 3-field and the 5-field web form. The observed lift of 0.79% is also below the 1% practical significance threshold, although the upper end of the confidence interval (1.99%) means that a practically significant change cannot be ruled out.

The confidence interval is between -0.0041 and 0.0199 at the 95% confidence level. In other words, the true change in conversion rate plausibly lies anywhere between a 0.41 percentage point decrease and a 1.99 percentage point increase. If we also consider the practical significance, a 1% change (a common industry threshold), the observed difference does not meet the 1% threshold. In addition, we did not meet the required sample size for the test, and therefore we should continue testing, as these results are not reliable.

