In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Data Reading and Cleaning

In [2]:
def read_raw_data():
    import pandas as pd
    """
    Summary: Read 4 files df_final_demo.txt, df_final_experiment_clients.txt, df_final_web_data_pt_1.txt and 
    df_final_web_data_pt_2.txt into separate data frames
    parameters: None
    Return: 4 Data Frame of each file, the last one being the concat of pt_1 and pt_2
    """
    df_demo = pd.read_csv(r"..\..\data_files\raw\df_final_demo.txt")
    df_exp = pd.read_csv(r"..\..\data_files\raw\df_final_experiment_clients.txt")
    df_web_1 = pd.read_csv(r"..\..\data_files\raw\df_final_web_data_pt_1.txt")
    df_web_2 = pd.read_csv(r"..\..\data_files\raw\df_final_web_data_pt_2.txt")
    df_web = pd.concat([df_web_1,df_web_2])
    return df_demo, df_exp, df_web 

In [3]:
# read data from given raw files. WARNING: Do not move csv files that are on ..\..\data_files folder
df_demo, df_exp, df_web = read_raw_data()

In [4]:
def drop_na (df, threshold=0):
    """
    Summary: Given a DataFrame df, drop na values
    Parameters: DataFrame df, threshold int (optional) default is 0
    Return: Data Frame df with drop null values as specified with treshold
    """
    if  threshold==0:
        df.dropna(inplace=True)
        
    else:
        
        try:
            # drop where there is more than threshold (int) columns with null values
            df.dropna(inplace=True,thresh=threshold)
            
        except:
            print ("Error, please check optional Parameters")
            
    return df

In [5]:
# drop na for experminet DF, as the client ids are not part of the experiment
df_exp = drop_na(df_exp)

In [6]:
def fill_na_mean(df,col):
    """
    Summary: Fill null values with mean of column col
    Parameters: DataFrame DF, string col (numerical column of DF)
    Return: Data frame with column col filled with mean for null values
    """
    # fill row that its missing age with mean value
    clnt_age_mean = df[col].mean()
    df[col].fillna(clnt_age_mean,inplace=True)
    return df


In [7]:
# drop where there is more than 2 columns with null values and fill with mean (only one occurence)
df_demo = drop_na(df_demo, 3)
df_demo = fill_na_mean(df_demo,'clnt_age')

# Merging & EDA

In [8]:
df_demo['gendr'].value_counts() # Undetermined leads, cannot work with it if we want to use gender, need to exclude

gendr
U    24122
M    23724
F    22746
X        3
Name: count, dtype: int64

In [9]:
#NOT USING MIGHT DELETE
#df_merged = pd.merge(left = df_web, right =df_demo, on = 'client_id') 

In [10]:
# Get only clients that are part of the experiment and merge with them and drop the ones with null Variation
df_online = pd.merge(df_demo,df_exp,how='left',on='client_id')
df_online.dropna(inplace=True)
df_online.head()

Unnamed: 0,client_id,clnt_tenure_yr,clnt_tenure_mnth,clnt_age,gendr,num_accts,bal,calls_6_mnth,logons_6_mnth,Variation
0,836976,6.0,73.0,60.5,U,2.0,45105.3,6.0,9.0,Test
1,2304905,7.0,94.0,58.0,U,2.0,110860.3,6.0,9.0,Control
2,1439522,5.0,64.0,32.0,U,2.0,52467.79,6.0,9.0,Test
3,1562045,16.0,198.0,49.0,M,2.0,67454.65,3.0,6.0,Test
4,5126305,12.0,145.0,33.0,F,2.0,103671.75,0.0,3.0,Control


In [11]:
# check that there is no repetition of client_id
df_online['client_id'].nunique() == df_online.shape[0]

True

In [12]:
df_demo['clnt_age'].median()
# median of age is 47

47.0

In [13]:
df_online['clnt_age'].median()
# same median as well as average

48.0

In [14]:
df_online.describe() # numerical variables, client id info can be disregarded

Unnamed: 0,client_id,clnt_tenure_yr,clnt_tenure_mnth,clnt_age,num_accts,bal,calls_6_mnth,logons_6_mnth
count,50488.0,50488.0,50488.0,50488.0,50488.0,50488.0,50488.0,50488.0
mean,5006173.0,12.03173,150.415485,47.319778,2.254575,149514.7,3.093289,6.131873
std,2877417.0,6.860282,81.94483,15.518463,0.533671,302036.4,2.187991,2.175423
min,555.0,2.0,33.0,17.0,1.0,23789.44,0.0,3.0
25%,2515700.0,6.0,82.0,33.5,2.0,39878.41,1.0,4.0
50%,5025026.0,11.0,136.0,48.0,2.0,65733.6,3.0,6.0
75%,7477918.0,16.0,192.0,59.5,2.0,139956.5,5.0,8.0
max,9999832.0,55.0,669.0,96.0,7.0,16320040.0,6.0,9.0


In [15]:
df_online.describe(include='object') # categorical variables

Unnamed: 0,gendr,Variation
count,50488,50488
unique,4,2
top,U,Test
freq,17280,26961


In [16]:
# check Test and Control counts
df_online['Variation'].value_counts()

Variation
Test       26961
Control    23527
Name: count, dtype: int64

In [17]:
df_web.head(10) # KPI - check who got to confirm

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time
0,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:27:07
1,9988021,580560515_7732621733,781255054_21935453173_531117,step_2,2017-04-17 15:26:51
2,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:19:22
3,9988021,580560515_7732621733,781255054_21935453173_531117,step_2,2017-04-17 15:19:13
4,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:18:04
5,9988021,580560515_7732621733,781255054_21935453173_531117,step_2,2017-04-17 15:17:15
6,9988021,580560515_7732621733,781255054_21935453173_531117,step_1,2017-04-17 15:17:01
7,9988021,580560515_7732621733,781255054_21935453173_531117,start,2017-04-17 15:16:22
8,8320017,39393514_33118319366,960651974_70596002104_312201,confirm,2017-04-05 13:10:05
9,8320017,39393514_33118319366,960651974_70596002104_312201,step_3,2017-04-05 13:09:43


In [18]:
df_web['process_step'].value_counts()

process_step
start      243945
step_1     163193
step_2     133062
step_3     112242
confirm    102963
Name: count, dtype: int64

In [19]:
df_web.info() # date_time is an object, need to change to datetime 

<class 'pandas.core.frame.DataFrame'>
Index: 755405 entries, 0 to 412263
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   client_id     755405 non-null  int64 
 1   visitor_id    755405 non-null  object
 2   visit_id      755405 non-null  object
 3   process_step  755405 non-null  object
 4   date_time     755405 non-null  object
dtypes: int64(1), object(4)
memory usage: 34.6+ MB


In [20]:
# Merge df_online with df_web
df_ab = pd.merge(df_online,df_web)
df_ab.head(12)

Unnamed: 0,client_id,clnt_tenure_yr,clnt_tenure_mnth,clnt_age,gendr,num_accts,bal,calls_6_mnth,logons_6_mnth,Variation,visitor_id,visit_id,process_step,date_time
0,836976,6.0,73.0,60.5,U,2.0,45105.3,6.0,9.0,Test,427070339_1413275162,228976764_46825473280_96584,confirm,2017-04-02 11:51:13
1,836976,6.0,73.0,60.5,U,2.0,45105.3,6.0,9.0,Test,427070339_1413275162,228976764_46825473280_96584,confirm,2017-04-02 11:47:50
2,836976,6.0,73.0,60.5,U,2.0,45105.3,6.0,9.0,Test,427070339_1413275162,228976764_46825473280_96584,confirm,2017-04-02 11:46:45
3,836976,6.0,73.0,60.5,U,2.0,45105.3,6.0,9.0,Test,427070339_1413275162,228976764_46825473280_96584,step_3,2017-04-02 11:23:08
4,836976,6.0,73.0,60.5,U,2.0,45105.3,6.0,9.0,Test,427070339_1413275162,228976764_46825473280_96584,step_2,2017-04-02 11:22:24
5,836976,6.0,73.0,60.5,U,2.0,45105.3,6.0,9.0,Test,427070339_1413275162,228976764_46825473280_96584,step_1,2017-04-02 11:21:38
6,836976,6.0,73.0,60.5,U,2.0,45105.3,6.0,9.0,Test,427070339_1413275162,228976764_46825473280_96584,start,2017-04-02 11:21:28
7,836976,6.0,73.0,60.5,U,2.0,45105.3,6.0,9.0,Test,427070339_1413275162,104438405_2368283624_817211,start,2017-03-29 11:02:44
8,836976,6.0,73.0,60.5,U,2.0,45105.3,6.0,9.0,Test,427070339_1413275162,104438405_2368283624_817211,start,2017-03-29 11:01:40
9,836976,6.0,73.0,60.5,U,2.0,45105.3,6.0,9.0,Test,427070339_1413275162,104438405_2368283624_817211,start,2017-03-29 10:59:43


In [21]:
# Change date time col to dateime class
df_ab['date_time'] = pd.to_datetime(df_ab['date_time'])
df_ab.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 321207 entries, 0 to 321206
Data columns (total 14 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   client_id         321207 non-null  int64         
 1   clnt_tenure_yr    321207 non-null  float64       
 2   clnt_tenure_mnth  321207 non-null  float64       
 3   clnt_age          321207 non-null  float64       
 4   gendr             321207 non-null  object        
 5   num_accts         321207 non-null  float64       
 6   bal               321207 non-null  float64       
 7   calls_6_mnth      321207 non-null  float64       
 8   logons_6_mnth     321207 non-null  float64       
 9   Variation         321207 non-null  object        
 10  visitor_id        321207 non-null  object        
 11  visit_id          321207 non-null  object        
 12  process_step      321207 non-null  object        
 13  date_time         321207 non-null  datetime64[ns]
dtypes: d

In [22]:
def split_df (df,col='Variation', comp = 'Test'):
    mask = df[col]==comp
    df_1 = df.loc[mask]
    df_2 = df.loc[~mask]
    return df_1, df_2


In [23]:
# split DF by test and control groups
df_test, df_control = split_df(df_ab)

# Success Metric (KPIs)
### Completion Rate 
The proportion of users who reach the final ‘confirm’ step.

In [24]:
def completion_rate (df):
    confirm_step_count = df[df['process_step'] == 'confirm']['client_id'].nunique()
    total_users = df['client_id'].nunique()
    completion_rate = confirm_step_count / total_users
    return completion_rate, confirm_step_count, total_users

In [25]:
completion_rate_tst, confirm_step_count_tst, total_users_tst = completion_rate(df_test)
print(f"Completion Rate Test Group : {completion_rate_tst:.2%}, Test Total: {total_users_tst}, Confirm Count: {confirm_step_count_tst} ")

Completion Rate Test Group : 69.29%, Test Total: 26961, Confirm Count: 18682 


In [26]:
completion_rate_ctl, confirm_step_count_ctl, total_users_ctl = completion_rate(df_control)
print(f"Completion Rate Control Group: {completion_rate_ctl:.2%}, Test Total: {total_users_ctl}, Confirm Count: {confirm_step_count_ctl} ")

Completion Rate Control Group: 65.58%, Test Total: 23527, Confirm Count: 15429 


###  Time Spent on Each Step
The average duration users spend on each step. Need to check with Raiana

In [43]:
# transform colunm to datetime 
# order by client and date
# and then shift with diferrence for new column
#df_ab = df_ab.sort_values(['client_id','date_time'], key=lambda x: pd.to_datetime(x))
df_ab = df_ab.sort_values(['client_id','date_time'])
df_ab['first_time']= df_ab.groupby('client_id')['date_time'].shift(1)
df_ab['previous_step']= df_ab.groupby('client_id')['process_step'].shift(1)
df_ab.dtypes

client_id                     int64
clnt_tenure_yr              float64
clnt_tenure_mnth            float64
clnt_age                    float64
gendr                        object
num_accts                   float64
bal                         float64
calls_6_mnth                float64
logons_6_mnth               float64
Variation                    object
visitor_id                   object
visit_id                     object
process_step                 object
date_time            datetime64[ns]
first_time           datetime64[ns]
diff                timedelta64[ns]
previous_step                object
dtype: object

In [45]:
# take diff
df_ab['diff'] = df_ab['date_time'] - df_ab['first_time']

In [46]:
df_test, df_control = split_df(df_ab)

In [53]:
df_ab.groupby('process_step')['diff'].mean()
df_ab.shape

(321207, 17)

In [31]:
df_test.groupby('process_step')['diff'].mean()

process_step
confirm   1 days 05:00:41.146959592
start     3 days 02:55:37.780723214
step_1    0 days 00:22:20.321410097
step_2    0 days 00:01:51.238731757
step_3    0 days 00:12:19.069989559
Name: diff, dtype: timedelta64[ns]

In [32]:
df_control.groupby('process_step')['diff'].mean()

process_step
confirm   0 days 18:21:25.156749327
start     4 days 06:00:00.843642966
step_1    0 days 00:30:39.337504650
step_2    0 days 00:00:39.347493332
step_3    0 days 00:09:35.350907253
Name: diff, dtype: timedelta64[ns]

In [52]:
# df_test filter for time spend on each step
# NOTE: negation of the filter should be the errors
mask_step1 = (df_test['process_step'] == 'step_1') & (df_test['previous_step'] == 'start')
mask_step2 = (df_test['process_step'] == 'step_2') & (df_test['previous_step'] == 'step_1')
mask_step3 = (df_test['process_step'] == 'step_3') & (df_test['previous_step'] == 'step_2')
mask_confirm = (df_test['process_step'] == 'confirm') & (df_test['previous_step'] == 'step_3')
df_test_time_each_step = df_test[mask_step1 | mask_step2 | mask_step3 | mask_confirm]
df_test_time_each_step.shape

(177787, 17)

In [56]:
df_control

Unnamed: 0,client_id,clnt_tenure_yr,clnt_tenure_mnth,clnt_age,gendr,num_accts,bal,calls_6_mnth,logons_6_mnth,Variation,visitor_id,visit_id,process_step,date_time,first_time,diff,previous_step
302427,1028,12.0,145.0,36.0,M,3.0,103520.22,1.0,4.0,Control,42237450_62128060588,557292053_87239438319_391157,start,2017-04-08 18:51:28,NaT,NaT,
302426,1028,12.0,145.0,36.0,M,3.0,103520.22,1.0,4.0,Control,42237450_62128060588,557292053_87239438319_391157,step_1,2017-04-08 18:52:17,2017-04-08 18:51:28,0 days 00:00:49,start
302425,1028,12.0,145.0,36.0,M,3.0,103520.22,1.0,4.0,Control,42237450_62128060588,557292053_87239438319_391157,step_1,2017-04-08 18:53:20,2017-04-08 18:52:17,0 days 00:01:03,step_1
302424,1028,12.0,145.0,36.0,M,3.0,103520.22,1.0,4.0,Control,42237450_62128060588,557292053_87239438319_391157,step_2,2017-04-08 18:53:29,2017-04-08 18:53:20,0 days 00:00:09,step_1
302423,1028,12.0,145.0,36.0,M,3.0,103520.22,1.0,4.0,Control,42237450_62128060588,557292053_87239438319_391157,step_3,2017-04-08 18:58:04,2017-04-08 18:53:29,0 days 00:04:35,step_2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142971,9998346,50.0,602.0,61.5,F,2.0,149881.38,6.0,9.0,Control,292425655_16607136645,189177304_69869411700_783154,step_3,2017-03-29 15:30:25,2017-03-29 15:30:11,0 days 00:00:14,step_2
142968,9998346,50.0,602.0,61.5,F,2.0,149881.38,6.0,9.0,Control,292425655_16607136645,189177304_69869411700_783154,step_3,2017-03-29 15:37:28,2017-03-29 15:30:25,0 days 00:07:03,step_3
142969,9998346,50.0,602.0,61.5,F,2.0,149881.38,6.0,9.0,Control,292425655_16607136645,189177304_69869411700_783154,confirm,2017-03-29 15:37:28,2017-03-29 15:37:28,0 days 00:00:00,step_3
142970,9998346,50.0,602.0,61.5,F,2.0,149881.38,6.0,9.0,Control,292425655_16607136645,189177304_69869411700_783154,step_3,2017-03-29 15:37:28,2017-03-29 15:37:28,0 days 00:00:00,confirm


In [66]:
def filter_steps(df):
    mask_step1 = (df['process_step'] == 'step_1') & (df['previous_step'] == 'start')
    mask_step2 = (df['process_step'] == 'step_2') & (df['previous_step'] == 'step_1')
    mask_step3 = (df['process_step'] == 'step_3') & (df['previous_step'] == 'step_2')
    mask_confirm = (df['process_step'] == 'confirm') & (df['previous_step'] == 'step_3')
    df = df[mask_step1 | mask_step2 | mask_step3 | mask_confirm]
    return df

In [67]:
df_control_time_each_step = filter_steps(df_control)
df_test_time_each_step = filter_steps(df_test)


In [68]:
df_control_time_each_step.head(20)

Unnamed: 0,client_id,clnt_tenure_yr,clnt_tenure_mnth,clnt_age,gendr,num_accts,bal,calls_6_mnth,logons_6_mnth,Variation,visitor_id,visit_id,process_step,date_time,first_time,diff,previous_step
302426,1028,12.0,145.0,36.0,M,3.0,103520.22,1.0,4.0,Control,42237450_62128060588,557292053_87239438319_391157,step_1,2017-04-08 18:52:17,2017-04-08 18:51:28,0 days 00:00:49,start
302424,1028,12.0,145.0,36.0,M,3.0,103520.22,1.0,4.0,Control,42237450_62128060588,557292053_87239438319_391157,step_2,2017-04-08 18:53:29,2017-04-08 18:53:20,0 days 00:00:09,step_1
302423,1028,12.0,145.0,36.0,M,3.0,103520.22,1.0,4.0,Control,42237450_62128060588,557292053_87239438319_391157,step_3,2017-04-08 18:58:04,2017-04-08 18:53:29,0 days 00:04:35,step_2
302420,1028,12.0,145.0,36.0,M,3.0,103520.22,1.0,4.0,Control,42237450_62128060588,557292053_87239438319_391157,step_2,2017-04-08 19:00:17,2017-04-08 19:00:15,0 days 00:00:02,step_1
284895,1186,8.0,99.0,22.0,U,2.0,31662.52,0.0,3.0,Control,446844663_31615102958,795373564_99931517312_810896,step_1,2017-04-08 18:05:13,2017-04-08 18:05:02,0 days 00:00:11,start
284894,1186,8.0,99.0,22.0,U,2.0,31662.52,0.0,3.0,Control,446844663_31615102958,795373564_99931517312_810896,step_2,2017-04-08 18:05:24,2017-04-08 18:05:13,0 days 00:00:11,step_1
285574,1195,21.0,262.0,54.5,M,2.0,28457.96,2.0,5.0,Control,766842522_69992551638,393817425_39015278493_996341,step_1,2017-04-05 20:15:59,2017-04-05 20:15:26,0 days 00:00:33,start
285573,1195,21.0,262.0,54.5,M,2.0,28457.96,2.0,5.0,Control,766842522_69992551638,393817425_39015278493_996341,step_2,2017-04-05 20:17:37,2017-04-05 20:15:59,0 days 00:01:38,step_1
285572,1195,21.0,262.0,54.5,M,2.0,28457.96,2.0,5.0,Control,766842522_69992551638,393817425_39015278493_996341,step_3,2017-04-05 20:18:08,2017-04-05 20:17:37,0 days 00:00:31,step_2
285571,1195,21.0,262.0,54.5,M,2.0,28457.96,2.0,5.0,Control,766842522_69992551638,393817425_39015278493_996341,confirm,2017-04-05 20:19:31,2017-04-05 20:18:08,0 days 00:01:23,step_3


In [69]:
df_test_time_each_step.head(20)

Unnamed: 0,client_id,clnt_tenure_yr,clnt_tenure_mnth,clnt_age,gendr,num_accts,bal,calls_6_mnth,logons_6_mnth,Variation,visitor_id,visit_id,process_step,date_time,first_time,diff,previous_step
255407,555,3.0,46.0,29.5,U,2.0,25454.66,2.0,6.0,Test,402506806_56087378777,637149525_38041617439_716659,step_1,2017-04-15 12:58:03,2017-04-15 12:57:56,0 days 00:00:07,start
255406,555,3.0,46.0,29.5,U,2.0,25454.66,2.0,6.0,Test,402506806_56087378777,637149525_38041617439_716659,step_2,2017-04-15 12:58:35,2017-04-15 12:58:03,0 days 00:00:32,step_1
255405,555,3.0,46.0,29.5,U,2.0,25454.66,2.0,6.0,Test,402506806_56087378777,637149525_38041617439_716659,step_3,2017-04-15 13:00:14,2017-04-15 12:58:35,0 days 00:01:39,step_2
255404,555,3.0,46.0,29.5,U,2.0,25454.66,2.0,6.0,Test,402506806_56087378777,637149525_38041617439_716659,confirm,2017-04-15 13:00:34,2017-04-15 13:00:14,0 days 00:00:20,step_3
9748,647,12.0,151.0,57.5,M,2.0,30525.8,0.0,4.0,Test,66758770_53988066587,40369564_40101682850_311847,step_1,2017-04-12 15:41:35,2017-04-12 15:41:28,0 days 00:00:07,start
9747,647,12.0,151.0,57.5,M,2.0,30525.8,0.0,4.0,Test,66758770_53988066587,40369564_40101682850_311847,step_2,2017-04-12 15:41:53,2017-04-12 15:41:35,0 days 00:00:18,step_1
9746,647,12.0,151.0,57.5,M,2.0,30525.8,0.0,4.0,Test,66758770_53988066587,40369564_40101682850_311847,step_3,2017-04-12 15:45:02,2017-04-12 15:41:53,0 days 00:03:09,step_2
9745,647,12.0,151.0,57.5,M,2.0,30525.8,0.0,4.0,Test,66758770_53988066587,40369564_40101682850_311847,confirm,2017-04-12 15:47:45,2017-04-12 15:45:02,0 days 00:02:43,step_3
105315,1336,48.0,576.0,42.0,M,4.0,130537.18,6.0,9.0,Test,920624746_32603333901,583743392_96265099036_939815,step_1,2017-05-08 06:05:37,2017-05-08 06:05:12,0 days 00:00:25,start
105314,1336,48.0,576.0,42.0,M,4.0,130537.18,6.0,9.0,Test,920624746_32603333901,583743392_96265099036_939815,step_2,2017-05-08 06:06:03,2017-05-08 06:05:37,0 days 00:00:26,step_1


In [58]:
# df_control filter for time spend on each step
# NOTE: negation of the filter should be the errors
maskc_step1 = (df_control['process_step'] == 'step_1') & (df_control['previous_step'] == 'start')
maskc_step2 = (df_control['process_step'] == 'step_2') & (df_control['previous_step'] == 'step_1')
maskc_step3 = (df_control['process_step'] == 'step_3') & (df_control['previous_step'] == 'step_2')
maskc_confirm = (df_control['process_step'] == 'confirm') & (df_control['previous_step'] == 'step_3')
df_control_time_each_step = df_control[maskc_step1 | maskc_step2 | maskc_step3 | maskc_confirm]
df_control_time_each_step.shape
df_control_time_each_step.head(20)

Unnamed: 0,client_id,clnt_tenure_yr,clnt_tenure_mnth,clnt_age,gendr,num_accts,bal,calls_6_mnth,logons_6_mnth,Variation,visitor_id,visit_id,process_step,date_time,first_time,diff,previous_step
302426,1028,12.0,145.0,36.0,M,3.0,103520.22,1.0,4.0,Control,42237450_62128060588,557292053_87239438319_391157,step_1,2017-04-08 18:52:17,2017-04-08 18:51:28,0 days 00:00:49,start
302424,1028,12.0,145.0,36.0,M,3.0,103520.22,1.0,4.0,Control,42237450_62128060588,557292053_87239438319_391157,step_2,2017-04-08 18:53:29,2017-04-08 18:53:20,0 days 00:00:09,step_1
302423,1028,12.0,145.0,36.0,M,3.0,103520.22,1.0,4.0,Control,42237450_62128060588,557292053_87239438319_391157,step_3,2017-04-08 18:58:04,2017-04-08 18:53:29,0 days 00:04:35,step_2
302420,1028,12.0,145.0,36.0,M,3.0,103520.22,1.0,4.0,Control,42237450_62128060588,557292053_87239438319_391157,step_2,2017-04-08 19:00:17,2017-04-08 19:00:15,0 days 00:00:02,step_1
284895,1186,8.0,99.0,22.0,U,2.0,31662.52,0.0,3.0,Control,446844663_31615102958,795373564_99931517312_810896,step_1,2017-04-08 18:05:13,2017-04-08 18:05:02,0 days 00:00:11,start
284894,1186,8.0,99.0,22.0,U,2.0,31662.52,0.0,3.0,Control,446844663_31615102958,795373564_99931517312_810896,step_2,2017-04-08 18:05:24,2017-04-08 18:05:13,0 days 00:00:11,step_1
285574,1195,21.0,262.0,54.5,M,2.0,28457.96,2.0,5.0,Control,766842522_69992551638,393817425_39015278493_996341,step_1,2017-04-05 20:15:59,2017-04-05 20:15:26,0 days 00:00:33,start
285573,1195,21.0,262.0,54.5,M,2.0,28457.96,2.0,5.0,Control,766842522_69992551638,393817425_39015278493_996341,step_2,2017-04-05 20:17:37,2017-04-05 20:15:59,0 days 00:01:38,step_1
285572,1195,21.0,262.0,54.5,M,2.0,28457.96,2.0,5.0,Control,766842522_69992551638,393817425_39015278493_996341,step_3,2017-04-05 20:18:08,2017-04-05 20:17:37,0 days 00:00:31,step_2
285571,1195,21.0,262.0,54.5,M,2.0,28457.96,2.0,5.0,Control,766842522_69992551638,393817425_39015278493_996341,confirm,2017-04-05 20:19:31,2017-04-05 20:18:08,0 days 00:01:23,step_3


In [94]:
steps_mean_test = df_test_time_each_step.groupby('previous_step')['diff'].mean()
steps_mean_test

previous_step
start    0 days 00:15:03.846748440
step_1   0 days 00:00:48.226330451
step_2   0 days 00:07:42.580510383
step_3   0 days 00:25:04.896828408
Name: diff, dtype: timedelta64[ns]

In [95]:
steps_mean_control = df_control_time_each_step.groupby('previous_step')['diff'].mean()
steps_mean_control

previous_step
start    0 days 00:14:59.401621934
step_1   0 days 00:00:34.029359271
step_2   0 days 00:01:27.175144941
step_3   0 days 00:25:05.294793161
Name: diff, dtype: timedelta64[ns]

In [None]:
df_test_time_each_step['diff']

In [102]:
from datetime import datetime, timedelta
import numpy as np
from scipy.stats import ttest_ind

# Generate two sets of datetime values for demonstration purposes
#np.random.seed(42)
#start_date = datetime(2022, 1, 1)
#date_series1 = [start_date + timedelta(days=np.random.randint(1, 30)) for _ in range(50)]
#date_series2 = [start_date + timedelta(days=np.random.randint(1, 30)) for _ in range(50)]

# Convert datetime values to numerical values (e.g., Unix timestamps)
#numeric_series1 = [date.total_seconds() for date in steps_mean_test]
#numeric_series2 = [date.total_seconds() for date in steps_mean_control]
print(type(numeric_series1[0]))
# Perform t-test
numeric_series1 = [date.total_seconds() for date in df_test_time_each_step.loc[df_test_time_each_step['previous_step'] == 'step_2','diff'] ]
numeric_series2 = [date.total_seconds() for date in df_control_time_each_step.loc[df_control_time_each_step['previous_step'] == 'step_2','diff']]
#print(numeric_series1)

t_statistic, p_value = ttest_ind(numeric_series1, numeric_series2)

# Check if the p-value is less than a significance level (e.g., 0.05)
significance_level = 0.05
if p_value < significance_level:
    print("The means of the datetime series are significantly different.")
else:
    print("The means of the datetime series are not significantly different.")

<class 'float'>
The means of the datetime series are not significantly different.


In [47]:
mask = df_test['diff'].isna()
df_test[mask]
df_test[~mask]
mask_2 = df_test['process_step'] == 'start'
df_test[(~mask) & (mask_2)] 

Unnamed: 0,client_id,clnt_tenure_yr,clnt_tenure_mnth,clnt_age,gendr,num_accts,bal,calls_6_mnth,logons_6_mnth,Variation,visitor_id,visit_id,process_step,date_time,first_time,diff,previous_step
83840,934,9.0,109.0,51.0,F,2.0,32522.88,0.0,3.0,Test,810392784_45004760546,7076463_57954418406_971348,start,2017-04-18 02:37:02,2017-04-18 02:36:30,0 days 00:00:32,start
83839,934,9.0,109.0,51.0,F,2.0,32522.88,0.0,3.0,Test,810392784_45004760546,7076463_57954418406_971348,start,2017-04-18 02:38:24,2017-04-18 02:37:02,0 days 00:01:22,start
83838,934,9.0,109.0,51.0,F,2.0,32522.88,0.0,3.0,Test,810392784_45004760546,7076463_57954418406_971348,start,2017-04-18 02:38:52,2017-04-18 02:38:24,0 days 00:00:28,start
3022,1346,14.0,177.0,46.0,F,2.0,822512.91,3.0,6.0,Test,123474046_4204671056,27144337_83739845380_214282,start,2017-06-06 18:23:51,2017-06-06 18:20:36,0 days 00:03:15,step_1
3018,1346,14.0,177.0,46.0,F,2.0,822512.91,3.0,6.0,Test,123474046_4204671056,27144337_83739845380_214282,start,2017-06-06 18:26:15,2017-06-06 18:24:50,0 days 00:01:25,step_3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41623,9998156,8.0,98.0,25.5,U,2.0,31723.51,5.0,8.0,Test,255983388_34400418109,254203981_1117450921_444839,start,2017-04-02 23:42:30,2017-04-02 23:42:06,0 days 00:00:24,start
112564,9999150,5.0,66.0,30.0,U,3.0,97141.71,6.0,9.0,Test,665127594_52605538620,982803842_91891255980_123078,start,2017-05-29 16:55:30,2017-05-29 16:55:18,0 days 00:00:12,step_1
310104,9999729,10.0,124.0,31.0,F,3.0,107059.74,6.0,9.0,Test,843385170_36953471821,493310979_9209676464_421146,start,2017-04-20 14:21:27,2017-04-05 13:41:04,15 days 00:40:23,step_1
310101,9999729,10.0,124.0,31.0,F,3.0,107059.74,6.0,9.0,Test,843385170_36953471821,493310979_9209676464_421146,start,2017-04-20 14:28:57,2017-04-20 14:27:36,0 days 00:01:21,step_2


# Hypothesis Testing

## Completion Rate
Null Hypthesis: There is no difference between the completion rate between the Test Group and the control Group

Alt Hyphoesis: There is a difference between the completion rate of the Test Group and the control Group



In [34]:
# using chi-squaer ince its categorical variable
import scipy.stats as stats
data = [[confirm_step_count_tst, total_users_tst  * (1-completion_rate_tst)],
        [confirm_step_count_ctl, total_users_ctl * (1-completion_rate_ctl)]]
# Perform Chi-Square test
chi2, p_value, _, _ = stats.chi2_contingency(data)
# Output the results
print(f"Chi-Square Statistic: {chi2}")
print(f"P-Value: {p_value}")
# Check the significance level (e.g., 0.05)
alpha = 0.05
# Make a decision based on the p-value
if p_value < alpha:
    print("Reject the null hypothesis: There is a significant difference in completion rates between the test and control groups.")
else:
    print("Fail to reject the null hypothesis: There is no significant difference in completion rates between the test and control groups.")

Chi-Square Statistic: 78.8519479876901
P-Value: 6.694339983392046e-19
Reject the null hypothesis: There is a significant difference in completion rates between the test and control groups.


In [35]:
# Other way
crosstable = pd.DataFrame({ "Control": [total_users_ctl-confirm_step_count_ctl, confirm_step_count_ctl],
              "Test": [total_users_tst-confirm_step_count_tst, confirm_step_count_tst]}, index= ['Not Completed','Completed'])
# and use crosstable instead of data 
crosstable

Unnamed: 0,Control,Test
Not Completed,8098,8279
Completed,15429,18682


In [None]:
"""import statsmodels.api as sm

def completion_rate(df):
    confirm_step_count = df[df['process_step'] == 'confirm']['client_id'].nunique()
    total_users = df['client_id'].nunique()
    completion_rate = confirm_step_count / total_users
    return completion_rate, confirm_step_count, total_users  

# Calculate completion rate, confirm step count, and total users for control group
completion_rate_ctl, confirm_step_count_ctl, total_users_ctl = completion_rate(df_control)

# Calculate completion rate, confirm step count, and total users for test group
completion_rate_tst, confirm_step_count_tst, total_users_tst = completion_rate(df_test)

# Perform the proportions z-test
stat, p_value = sm.stats.proportions_ztest([confirm_step_count_ctl, confirm_step_count_tst], [total_users_ctl, total_users_tst])

# Print the test statistic and p-value
print('Test Statistic:', stat)
print('p-value:', p_value)"""

In [59]:
df_test.head()

Unnamed: 0,client_id,clnt_tenure_yr,clnt_tenure_mnth,clnt_age,gendr,num_accts,bal,calls_6_mnth,logons_6_mnth,Variation,visitor_id,visit_id,process_step,date_time,first_time,diff,previous_step
255408,555,3.0,46.0,29.5,U,2.0,25454.66,2.0,6.0,Test,402506806_56087378777,637149525_38041617439_716659,start,2017-04-15 12:57:56,NaT,NaT,
255407,555,3.0,46.0,29.5,U,2.0,25454.66,2.0,6.0,Test,402506806_56087378777,637149525_38041617439_716659,step_1,2017-04-15 12:58:03,2017-04-15 12:57:56,0 days 00:00:07,start
255406,555,3.0,46.0,29.5,U,2.0,25454.66,2.0,6.0,Test,402506806_56087378777,637149525_38041617439_716659,step_2,2017-04-15 12:58:35,2017-04-15 12:58:03,0 days 00:00:32,step_1
255405,555,3.0,46.0,29.5,U,2.0,25454.66,2.0,6.0,Test,402506806_56087378777,637149525_38041617439_716659,step_3,2017-04-15 13:00:14,2017-04-15 12:58:35,0 days 00:01:39,step_2
255404,555,3.0,46.0,29.5,U,2.0,25454.66,2.0,6.0,Test,402506806_56087378777,637149525_38041617439_716659,confirm,2017-04-15 13:00:34,2017-04-15 13:00:14,0 days 00:00:20,step_3


In [63]:
from statsmodels.stats.proportion import proportions_ztest

# Calculate completion rates
completion_rate_ctl, confirm_step_count_ctl, total_users_ctl = completion_rate(df_control)
completion_rate_tst, confirm_step_count_tst, total_users_tst = completion_rate(df_test)

# Number of successes (confirmations) and trials (total users) for each group
successes = np.array([confirm_step_count_ctl, confirm_step_count_tst])
trials = np.array([total_users_ctl, total_users_tst])

# Perform the two-sample proportion Z-test
z_score, p_value = proportions_ztest(successes, trials)

# Set the significance level
alpha = 0.05
print(p_value)
# Check if the p-value is less than the significance level
if p_value < alpha:
    print("Reject the null hypothesis. The observed increase in completion rate is statistically significant.")
    if completion_rate_tst - completion_rate_ctl >= 0.05:  # Check if the increase meets or exceeds the 5% threshold
        print("The increase meets or exceeds the 5% threshold.")
    else:
        print("The increase does not meet the 5% threshold.")
else:
    print("Fail to reject the null hypothesis. The observed increase in completion rate is not statistically significant.")

6.144491429497383e-19
Reject the null hypothesis. The observed increase in completion rate is statistically significant.
The increase does not meet the 5% threshold.
