In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import plotly.graph_objects as go
import scipy.stats as st
%matplotlib inline

In [3]:
# Root of the project's data folder. Every input path below is built from this
# single constant, so pointing the notebook at another location means editing one line.
DATA_DIR = r'/Users/eliskasimova/Desktop/data_analytics_course_2024/project_folder/labs/second_project/data'

# Table from the "clean" folder plus the two web-data parts from the "raw" folder.
df_1 = pd.read_csv(f'{DATA_DIR}/clean/df_merged.csv')
df_2 = pd.read_csv(f'{DATA_DIR}/raw/df_final_web_data_pt_1.txt')
df_3 = pd.read_csv(f'{DATA_DIR}/raw/df_final_web_data_pt_2.txt')
# Stack the two web-data parts row-wise into one frame.
merged_df = pd.concat([df_2, df_3], axis=0)

# Inner join: keep only client_ids present in both frames.
df_merged = df_1.merge(merged_df, on='client_id', how='inner')
# Parse timestamps so later sorting and comparisons work on datetimes, not strings.
df_merged['date_time'] = pd.to_datetime(df_merged['date_time'])

### Splitting into control/test groups and sorting

In [6]:
# Split the merged data by experiment arm
control_group = df_merged[df_merged['variation'].eq('Control')]
test_group = df_merged[df_merged['variation'].eq('Test')]

# Order each arm's rows by client, visit, step name and finally timestamp
control_group_sorted, test_group_sorted = (
    arm.sort_values(by=['client_id', 'visit_id', 'process_step', 'date_time'])
    for arm in (control_group, test_group)
)

### Adding age group

In [8]:
# Define age bins and categorize ages (adjust the edges to suit the data).
# Bins are left-closed -- [0, 30), [30, 40), [40, 50), [50, inf) -- so every label
# matches its interval: a client aged exactly 30 is '30-39', not 'Under 30'.
# The last bin is open-ended so clients aged 100 or more still land in '50 and above'.
bins = [0, 30, 40, 50, np.inf]
labels = ['Under 30', '30-39', '40-49', '50 and above']
df_merged['age_group'] = pd.cut(df_merged['clnt_age'], bins=bins, labels=labels, right=False)
control_group_sorted['age_group'] = pd.cut(control_group_sorted['clnt_age'], bins=bins, labels=labels, right=False)
test_group_sorted['age_group'] = pd.cut(test_group_sorted['clnt_age'], bins=bins, labels=labels, right=False)


### Function to get latest starts

In [11]:
# Function to get latest starts
def filter_latest_starts(group_df):
    """Keep, for every visit, only the rows logged from its latest 'start' onwards.

    Parameters
    ----------
    group_df : pandas.DataFrame
        Event rows with at least the columns 'visit_id', 'process_step' and
        'date_time' (comparable timestamps).

    Returns
    -------
    pandas.DataFrame
        Rows of ``group_df`` (same columns, fresh 0..n-1 index) whose
        'date_time' is at or after the latest 'start' of their visit.
        Visits that contain no 'start' row are dropped entirely.
    """
    starts_only = group_df[group_df['process_step'] == 'start']

    # Timestamp of the most recent 'start' within each visit.
    latest_starts = (
        starts_only.groupby('visit_id', as_index=False)['date_time']
        .max()
        .rename(columns={'date_time': '_latest_start'})
    )

    # Join on the visit (not on the exact timestamp) and against the group that was
    # passed in (not the global frame), so later steps of the same visit are kept.
    with_latest = group_df.merge(latest_starts, on='visit_id', how='inner')
    from_latest_start = with_latest[with_latest['date_time'] >= with_latest['_latest_start']]

    return from_latest_start.drop(columns='_latest_start').reset_index(drop=True)

# Run the latest-start filter on each experiment arm
filtered_control, filtered_test = map(filter_latest_starts, (control_group, test_group))

# Spot check on a single client: every logged row vs. the rows kept by the filter
client_total_entries = df_merged[df_merged["client_id"].eq(2304905)]
client_last_start = filtered_control[filtered_control['client_id'].eq(2304905)]

### Completion rate based on visit_id (Primary metric)

- Number of visits that reached each step, as a percentage of all visits that started

In [15]:
#completion rate based on visit_id
def calculate_within_visit_completion_rate(group):
    """Per-step completion rate measured in unique visits.

    Returns one row per non-'start' process step with the columns
    'process_step', 'completed_visits' (unique visit_ids seen at that step),
    'started_visits' (unique visit_ids with a 'start' row, same on every row)
    and 'completion_rate' (completed / started, in percent).
    """
    is_start = group['process_step'] == 'start'

    # Denominator: distinct visits that logged a 'start'
    n_started = group.loc[is_start, 'visit_id'].nunique()

    # Numerator: distinct visits seen at each of the remaining steps
    per_step = group.loc[~is_start].groupby('process_step')['visit_id'].nunique()
    result = per_step.reset_index(name='completed_visits')

    result['started_visits'] = n_started
    result['completion_rate'] = result['completed_visits'] / result['started_visits'] * 100
    return result

# Visit-based completion rates for each experiment arm
control_completion_rate, test_completion_rate = map(
    calculate_within_visit_completion_rate,
    (control_group_sorted, test_group_sorted),
)

# Show each table under its own heading
print("Control Group Within-Visit Completion Rates:")
display(control_completion_rate)

print("Test Group Within-Visit Completion Rates:")
display(test_completion_rate)

Control Group Within-Visit Completion Rates:


Unnamed: 0,process_step,completed_visits,started_visits,completion_rate
0,confirm,16039,30903,51.90111
1,step_1,23541,30903,76.17707
2,step_2,20131,30903,65.142543
3,step_3,18293,30903,59.1949


Test Group Within-Visit Completion Rates:


Unnamed: 0,process_step,completed_visits,started_visits,completion_rate
0,confirm,21725,33142,65.551264
1,step_1,28273,33142,85.308672
2,step_2,24493,33142,73.903204
3,step_3,22180,33142,66.924145


### Completion rate based on client_id (Secondary metric)

In [19]:
#completion rate based on client_id
def calculate_within_visit_completion_rate(group, id_col='client_id'):
    """Per-step completion rate measured in unique values of ``id_col``.

    NOTE: this definition replaces the earlier visit-based function of the same
    name; pass ``id_col='visit_id'`` to reproduce the visit-based figures.

    Parameters
    ----------
    group : pandas.DataFrame
        Event rows with a 'process_step' column and the ``id_col`` column.
    id_col : str, default 'client_id'
        Column whose distinct values are counted (e.g. 'client_id' or 'visit_id').

    Returns
    -------
    pandas.DataFrame
        One row per non-'start' step with the columns 'process_step',
        'completed_visits', 'started_visits' and 'completion_rate' (percent).
        The 'completed_visits' and 'started_visits' column names are kept for
        compatibility; they hold counts of distinct ``id_col`` values.
    """
    # Denominator: distinct ids that logged a 'start'
    started_visits = group[group['process_step'] == 'start'][id_col].nunique()

    # Numerator: distinct ids seen at each of the remaining steps
    completed_visits = (
        group[group['process_step'] != 'start']
        .groupby('process_step')[id_col]
        .nunique()
        .reset_index(name='completed_visits')
    )

    # Same denominator on every row
    completed_visits['started_visits'] = started_visits

    # Completion rate in percent
    completed_visits['completion_rate'] = (
        completed_visits['completed_visits'] / completed_visits['started_visits']
    ) * 100

    return completed_visits

# Calculate client-based completion rates for the control and test groups
control_completion_rate_id = calculate_within_visit_completion_rate(control_group_sorted)
test_completion_rate_id = calculate_within_visit_completion_rate(test_group_sorted)

# Display the results. The headings say "Per-Client" so these tables are not
# confused with the visit-based tables shown under the primary metric.
print("Control Group Per-Client Completion Rates:")
display(control_completion_rate_id)

print("Test Group Per-Client Completion Rates:")
display(test_completion_rate_id)

Control Group Within-Visit Completion Rates:


Unnamed: 0,process_step,completed_visits,started_visits,completion_rate
0,confirm,15428,23391,65.956992
1,step_1,20146,23391,86.127143
2,step_2,18644,23391,79.70587
3,step_3,17416,23391,74.455987


Test Group Within-Visit Completion Rates:


Unnamed: 0,process_step,completed_visits,started_visits,completion_rate
0,confirm,18682,26670,70.048744
1,step_1,24259,26670,90.95988
2,step_2,22252,26670,83.434571
3,step_3,20876,26670,78.275216


### Completion rates for control and test groups based on age group

In [22]:
def calculate_within_visit_completion_rate_by_age(group):
    """Per-step completion rate in unique clients, computed within each age group.

    Parameters
    ----------
    group : pandas.DataFrame
        Event rows with the columns 'process_step', 'age_group' and 'client_id'.

    Returns
    -------
    pandas.DataFrame
        One row per (non-'start' step, age group) with the columns
        'process_step', 'age_group', 'completed_visits' (unique clients of that
        age group seen at the step), 'started_visits' (unique clients of that
        age group with a 'start' row) and 'completion_rate' (percent).
        The rate is NaN for an age group with no starters.
    """
    is_start = group['process_step'] == 'start'

    # Denominator per age group. Dividing by the starters of ALL age groups would
    # make each figure depend on how large the age group is, not on how well it completes.
    # observed=False is passed explicitly: it keeps every age category in the output
    # and avoids the pandas FutureWarning about the changing default.
    started_by_age = (
        group[is_start]
        .groupby('age_group', observed=False)['client_id']
        .nunique()
        .reset_index(name='started_visits')
    )

    # Numerator: unique clients seen at each step, split by age group
    completed_visits = (
        group[~is_start]
        .groupby(['process_step', 'age_group'], observed=False)['client_id']
        .nunique()
        .reset_index(name='completed_visits')
    )

    # Attach the matching age-group denominator to every row
    completed_visits = completed_visits.merge(started_by_age, on='age_group', how='left')

    # Completion rate in percent
    completed_visits['completion_rate'] = (
        completed_visits['completed_visits'] / completed_visits['started_visits']
    ) * 100

    return completed_visits

# Age-group completion rates for each experiment arm
control_completion_rate_by_age, test_completion_rate_by_age = map(
    calculate_within_visit_completion_rate_by_age,
    (control_group_sorted, test_group_sorted),
)

  .groupby(['process_step', 'age_group'])['client_id']
  .groupby(['process_step', 'age_group'])['client_id']


### Completion rate hypothesis test using two-proportion z-test

Since the new design (Test group) had a higher completion rate compared to the old design (Control group), you are required to confirm if this difference is statistically significant

In [25]:
# Function to perform a two-proportion z-test
def two_proportion_z_test(p1, p2, n1, n2):
    """Two-sided two-proportion z-test using the pooled standard error.

    Parameters
    ----------
    p1, p2 : float
        Observed proportions (between 0 and 1) in sample 1 and sample 2.
    n1, n2 : int
        Sample sizes; both must be positive.

    Returns
    -------
    (z, p_value)
        z is (p1 - p2) / SE; p_value is the two-tailed normal p-value.
        When the pooled standard error is zero (both proportions are 0 or
        both are 1) the observed difference is zero and (0.0, 1.0) is returned.

    Raises
    ------
    ValueError
        If either sample size is not positive.
    """
    if n1 <= 0 or n2 <= 0:
        raise ValueError("n1 and n2 must be positive sample sizes")

    # Pooled proportion under the null hypothesis p1 == p2
    P = (p1 * n1 + p2 * n2) / (n1 + n2)

    # Standard error of the difference under the null
    SE = (P * (1 - P) * (1 / n1 + 1 / n2)) ** 0.5

    # Degenerate case: no variation in either sample, so there is nothing to test
    if SE == 0:
        return 0.0, 1.0

    z = (p1 - p2) / SE

    # Use the survival function rather than 1 - cdf: for large |z| the cdf rounds
    # to exactly 1.0 and the p-value would collapse to 0.0.
    p_value = 2 * st.norm.sf(abs(z))  # Two-tailed test

    return z, p_value

# Steps to test, in reporting order
steps = ['confirm', 'step_1', 'step_2', 'step_3']

for step in steps:
    # Row selectors for this step in each arm's completion table
    in_control = control_completion_rate['process_step'] == step
    in_test = test_completion_rate['process_step'] == step

    # Completion rate (percent) and number of started visits for each arm
    control_completions = control_completion_rate.loc[in_control, 'completion_rate'].iloc[0]
    test_completions = test_completion_rate.loc[in_test, 'completion_rate'].iloc[0]
    control_total = control_completion_rate.loc[in_control, 'started_visits'].iloc[0]
    test_total = test_completion_rate.loc[in_test, 'started_visits'].iloc[0]

    # Percentages -> proportions
    p_control = control_completions / 100
    p_test = test_completions / 100

    # Two-proportion z-test, control vs. test
    z_stat, p_value = two_proportion_z_test(p_control, p_test, control_total, test_total)

    # Report the statistic and p-value
    print(f"Step: {step}")
    print(f"Z-statistic: {z_stat:.4f}")
    print(f"P-value: {p_value:.4f}")

    # Verdict at the 5% significance level
    verdict = (
        f"Reject the null hypothesis: There is a difference in completion rates between control and test group for step: {step}."
        if p_value < 0.05
        else f"Fail to reject the null hypothesis: There is no significant difference in completion rates between control and test group for step: {step}."
    )
    print(verdict)
    print("\n")

Step: confirm
Z-statistic: -35.0921
P-value: 0.0000
Reject the null hypothesis: There is a difference in completion rates between control and test group for step: confirm.


Step: step_1
Z-statistic: -29.3782
P-value: 0.0000
Reject the null hypothesis: There is a difference in completion rates between control and test group for step: step_1.


Step: step_2
Z-statistic: -24.1018
P-value: 0.0000
Reject the null hypothesis: There is a difference in completion rates between control and test group for step: step_2.


Step: step_3
Z-statistic: -20.2670
P-value: 0.0000
Reject the null hypothesis: There is a difference in completion rates between control and test group for step: step_3.




### Completion Rate with a Cost-Effectiveness Threshold

Threshold: Vanguard has set this minimum increase in completion rate at 5%. This is the rate at which the projected benefits, in terms of increased user engagement and potential revenue, are estimated to outweigh the costs of the new design.
You are required to carry out another analysis, ensuring that the observed increase in completion rate from the A/B test meets or exceeds this 5% threshold. If the new design doesn’t lead to at least this level of improvement, it may not be justifiable from a cost perspective, regardless of its statistical significance.

In [28]:
from scipy.stats import ttest_ind

# Minimum increase in completion rate (percentage points) required by the brief.
MIN_COMPLETION_RATE_INCREASE = 5

# Average the per-step completion rates of each group
control_mean = control_completion_rate['completion_rate'].mean()
test_mean = test_completion_rate['completion_rate'].mean()

# Two-sample t-test on the per-step completion rates.
# NOTE(review): each sample here is just the handful of per-step rates in the
# completion tables, so this test ignores the underlying visit counts and has very
# little power; the per-step two-proportion z-test is the count-based test.
# The statistic gets a real name because assigning to `_` overrides IPython's
# last-output variable.
t_stat, p_value = st.ttest_ind(control_completion_rate['completion_rate'],
                               test_completion_rate['completion_rate'],
                               alternative='two-sided')

# Display the results for completion rate difference significance
print(f"Average completion rate per Control group: {control_mean:.2f}%")
print(f"Average completion rate per Test group: {test_mean:.2f}%")
print(f"Statistic: {t_stat:.4f}")  # difference in mean rates relative to their variability
print(f"P-value: {p_value:.4f}")  # significance of the difference in average completion rate

# Hypothesis test: is the completion rate difference significant?
alpha = 0.05
if p_value < alpha:
    print("Reject the null hypothesis: The completion rates are significantly different between the Test and Control groups.")
else:
    print("Fail to reject the null hypothesis: The completion rates are not significantly different between the Test and Control groups.")

# Observed increase, in percentage points
completion_rate_increase = test_mean - control_mean
print(f"Completion rate increase: {completion_rate_increase:.2f}%")

# Check whether the increase meets or exceeds the threshold
if completion_rate_increase >= MIN_COMPLETION_RATE_INCREASE:
    print(f"The completion rate increase meets the {MIN_COMPLETION_RATE_INCREASE}% threshold, justifying the cost of the new design.")
else:
    print(f"The completion rate increase does not meet the {MIN_COMPLETION_RATE_INCREASE}% threshold. The new design may not justify its cost.")


Average completion rate per Control group: 63.10%
Average completion rate per Test group: 72.92%
Statistic: -1.4365
P-value: 0.2009
Fail to reject the null hypothesis: The completion rates are not significantly different between the Test and Control groups.
Completion rate increase: 9.82%
The completion rate increase meets the 5% threshold, justifying the cost of the new design.


### Additional hypothesis test 

#### Tenure: Test if client tenure (how long they've been with Vanguard) influences engagement with the new design, using two-sample t-test

In [None]:
import scipy.stats as st

# One row per client, so each client contributes a single tenure value
control_uniqe = control_group_sorted.drop_duplicates(subset='client_id')
test_unique = test_group_sorted.drop_duplicates(subset='client_id')

# Drop missing tenures up front: ttest_ind propagates NaN by default, and a NaN
# p-value would fall into the "fail to reject" branch below without any warning.
control_tenure = control_uniqe['clnt_tenure_yr'].dropna()
test_tenure = test_unique['clnt_tenure_yr'].dropna()

# Two-sample t-test comparing mean tenure between the two groups (equal variances assumed).
# The statistic gets a real name because assigning to `_` overrides IPython's
# last-output variable.
t_stat, p_value = st.ttest_ind(control_tenure, test_tenure, equal_var=True)

# Results
print(f"Average tenure per Control group: {control_tenure.mean():.2f} years")
print(f"Average tenure per Test group: {test_tenure.mean():.2f} years")
print(f"Statistic: {t_stat:.4f}")
print(f"P-value: {p_value:.4f}")

# Hypothesis test: is there a significant difference in tenure between the two groups?
alpha = 0.05  # significance level

if p_value < alpha:
    print("Reject the null hypothesis: The average tenure is significantly different between the Test and Control groups.")
else:
    print("Fail to reject the null hypothesis: The average tenure is not significantly different between the Test and Control groups.")


#### Age: Test if the average age of clients engaging with the new design is different from those using the old design

In [None]:
# One row per client, so each client contributes a single age value
control_uniqe = control_group_sorted.drop_duplicates(subset='client_id')
test_unique = test_group_sorted.drop_duplicates(subset='client_id')

# Drop missing ages up front: ttest_ind propagates NaN by default, and a NaN
# p-value would fall into the "fail to reject" branch below without any warning.
control_age = control_uniqe['clnt_age'].dropna()
test_age = test_unique['clnt_age'].dropna()

# Two-sample t-test comparing mean age between the two groups (equal variances assumed).
# The statistic gets a real name because assigning to `_` overrides IPython's
# last-output variable.
t_stat, p_value = st.ttest_ind(control_age, test_age, equal_var=True)

# Results
print(f"Control Group Mean Age: {control_age.mean():.2f} years")
print(f"Test Group Mean Age: {test_age.mean():.2f} years")
print(f"Statistic: {t_stat:.4f}")
print(f"P-value: {p_value:.4f}")

# Hypothesis test: is there a significant difference in age between the two groups?
alpha = 0.05  # significance level

if p_value < alpha:
    print("Reject the null hypothesis: The average age is different between the Test and Control groups.")
else:
    print("Fail to reject the null hypothesis: The average age is not significantly different between the Test and Control groups.")

The statistical test (t-test) suggests that the average ages of the two groups are indeed different. Since the p-value is less than 0.05, this indicates a statistically significant difference in age between the Test and Control groups.

### Test & Control grouped by age group table

In [None]:
# One row per client, so each client is counted once
control_uniqe = control_group_sorted.drop_duplicates(subset='client_id')
test_unique = test_group_sorted.drop_duplicates(subset='client_id')

# Number of unique clients in each age group, for the Control and Test groups
control_age_group = control_uniqe["age_group"].value_counts()
test_age_group = test_unique["age_group"].value_counts()

# Side-by-side counts, ordered by age group, with the age group turned into a regular
# column by reset_index. The earlier rename of an "index" column was removed: no such
# column existed at that point, so it never had any effect.
age_groups_concat = (
    pd.concat(
        [control_age_group, test_age_group],
        axis=1,
        keys=["Control Group Count", "Test Group Count"],
    )
    .sort_index()
    .reset_index()
)

### Age group x Gender table

In [None]:
# Gender counts within each age group: one row per age group, one column per gender value.
# observed=False is passed explicitly so the result keeps its current shape and pandas
# does not emit a FutureWarning about the changing default for categorical groupers.
control_age_group_gender = (
    control_uniqe.groupby("age_group", observed=False)["gender"]
    .value_counts()
    .unstack()
    .reset_index()  # turn the age group into a regular column
)
test_age_group_gender = (
    test_unique.groupby("age_group", observed=False)["gender"]
    .value_counts()
    .unstack()
    .reset_index()
)

### Age groups x Balances table

In [None]:
# Mean balance per age group, computed on the one-row-per-client frames.
# observed=False is passed explicitly so the result keeps its current shape and pandas
# does not emit a FutureWarning about the changing default for categorical groupers.
control_age_group_balance = control_uniqe.groupby("age_group", observed=False)["balance"].mean().round(2)
test_age_group_balance = test_unique.groupby("age_group", observed=False)["balance"].mean().round(2)

# Convert each grouped Series to a DataFrame and give the columns readable names.
# rename() returns a new frame, so no in-place mutation is needed.
control_age_group_balance_df = control_age_group_balance.reset_index().rename(
    columns={"age_group": "Age Group", "balance": "Control Group Balance"}
)
test_age_group_balance_df = test_age_group_balance.reset_index().rename(
    columns={"age_group": "Age Group", "balance": "Test Group Balance"}
)
