In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import plotly.graph_objects as go
import scipy.stats as st
%matplotlib inline

In [2]:
df_1 = pd.read_csv(r'/Users/eliskasimova/Desktop/data_analytics_course_2024/project_folder/labs/second_project/data/clean/df_merged.csv')
df_2 = pd.read_csv(r'/Users/eliskasimova/Desktop/data_analytics_course_2024/project_folder/labs/second_project/data/raw/df_final_web_data_pt_1.txt')
df_3 = pd.read_csv(r'/Users/eliskasimova/Desktop/data_analytics_course_2024/project_folder/labs/second_project/data/raw/df_final_web_data_pt_2.txt')
merged_df = pd.concat([df_2, df_3], axis=0)

df_merged = df_1.merge(merged_df, on='client_id', how='inner')
df_merged['date_time'] = pd.to_datetime(df_merged['date_time'])

### Groups sorting to control/test

In [4]:
control_group = df_merged[df_merged['variation'] == 'Control']
test_group = df_merged[df_merged['variation'] == 'Test']

# Sort control group
control_group_sorted = control_group.sort_values(by=['client_id', 'visit_id', 'process_step', 'date_time'])

# Sort test group
test_group_sorted = test_group.sort_values(by=['client_id', 'visit_id', 'process_step', 'date_time'])

print(control_group_sorted.shape)
print(test_group_sorted.shape)

(143408, 14)
(177779, 14)


In [5]:
# define age bins and categorize ages
bins = [0, 30, 40, 50, 100] 
labels = ['Under 30', '30-39', '40-49', '50 and above']
df_merged['age_group'] = pd.cut(df_merged['clnt_age'], bins=bins, labels=labels)
control_group_sorted['age_group'] = pd.cut(control_group_sorted['clnt_age'], bins=bins, labels=labels)
test_group_sorted['age_group'] = pd.cut(test_group_sorted['clnt_age'], bins=bins, labels=labels)

### Calculate Error Rates
The error rates are calculated using the formula:  
- **Control Group Error Rate** = (Clients with errors in control) / (Total clients in control)  
- **Test Group Error Rate** = (Clients with errors in test) / (Total clients in test)
- an analysis ensuring that the observed increase in completion rate from the A/B test meets or exceeds this 5% threshold

In [None]:
# Error rate calculation function
def calculate_errors(group):
    # Map process steps to step indices
    group['step_index'] = group['process_step'].map({'start': 0, 'step_1': 1, 'step_2': 2, 'step_3': 3, 'confirm': 4})
    
    # Calculate error: Negative diff indicates backward movement
    group['error'] = group['step_index'].diff().apply(lambda x: x < 0)
    
    return group

# Apply the error calculation to both control and test groups
control_group_sorted = calculate_errors(control_group_sorted)
test_group_sorted = calculate_errors(test_group_sorted)

# Calculate the overall error rates (as percentages)
control_error_rate = control_group_sorted['error'].mean() * 100
test_error_rate = test_group_sorted['error'].mean() * 100

# Print the overall error rates
print(f"Control Group Error Rate: {control_error_rate:.2f}%")
print(f"Test Group Error Rate: {test_error_rate:.2f}%")

# Calculate the difference in error rates between control and test group
error_rate_difference = control_error_rate - test_error_rate  # Control error rate minus Test error rate

# Set the threshold for a 5% difference
threshold = 5

# Check if the test group error rate is at least 5% smaller than the control group
if error_rate_difference >= threshold:
    print(f"The test group has an error rate that is at least {threshold}% smaller than the control group.")
else:
    print(f"The test group does not have an error rate that is at least {threshold}% smaller than the control group.")


### Calculating errors per each step

In [12]:
# Mapping process steps to step indices
step_index_map = {'start': 0, 'step_1': 1, 'step_2': 2, 'step_3': 3, 'confirm': 4}

# Function to calculate backward steps (errors)
def calculate_errors(group):
    # Map process_step to step_index
    group['step_index'] = group['process_step'].map(step_index_map)
    
    # Calculate the difference between consecutive step indices
    group['error'] = group['step_index'].diff().apply(lambda x: x < 0)  # Negative diff indicates backward step
    
    # Fill NaN values for the first row in each group (no previous step to compare)
    group['error'] = group['error'].fillna(False)
    
    return group

# Apply the error calculation to the control and test groups
control_group_sorted = calculate_errors(control_group_sorted)
test_group_sorted = calculate_errors(test_group_sorted)

# Calculate the overall error rates for control and test groups
control_error_rate = control_group_sorted['error'].mean() * 100  # Multiply by 100 to get percentage
test_error_rate = test_group_sorted['error'].mean() * 100

# Print overall error rates
print(f"Control Group Error Rate: {control_error_rate:.2f}%")
print(f"Test Group Error Rate: {test_error_rate:.2f}%")

# Calculate error rates per step (process step by process step)
def error_rate_per_step(group):
    return group.groupby('process_step')['error'].mean() * 100  # Multiply by 100 to get percentage

# Calculate error rates per step for both groups
control_error_rates_per_step = error_rate_per_step(control_group_sorted)
test_error_rates_per_step = error_rate_per_step(test_group_sorted)

# Display the error rates per step for both groups
print("\nControl Group Error Rates Per Step:")
print(control_error_rates_per_step)

print("\nTest Group Error Rates Per Step:")
print(test_error_rates_per_step)


Control Group Error Rate: 19.21%
Test Group Error Rate: 17.64%

Control Group Error Rates Per Step:
process_step
confirm     0.000000
start      56.941730
step_1      0.792393
step_2      0.332341
step_3      0.574889
Name: error, dtype: float64

Test Group Error Rates Per Step:
process_step
confirm     0.000000
start      54.635322
step_1      0.588387
step_2      0.342178
step_3      0.753245
Name: error, dtype: float64


### Erorr rates by age group

In [None]:
def calculate_average_error_by_age(group, age_column='age_group'):
    # Group data by age group and calculate the average error rate
    error_summary = (
        group.groupby(age_column)['error']
        .mean()
        .reset_index(name='average_error_rate')  # Calculate mean error rate
    )
    # Sort the results by age group
    error_summary['average_error_rate'] *= 100 # get the percentage
    error_summary = error_summary.sort_values(by=age_column)
    return error_summary

# Calculate for control and test groups
control_error_by_age = calculate_average_error_by_age(control_group_sorted)
test_error_by_age = calculate_average_error_by_age(test_group_sorted)

control_error_by_age['average_error_rate'] = control_error_by_age['average_error_rate'].round(2)
test_error_by_age['average_error_rate'] = test_error_by_age['average_error_rate'].round(2)

### Hypothesis testing

### Binominal hypothesis test

In [25]:
from scipy.stats import binomtest  #import the binomial test function

# calculate the total errors and total steps for each group
control_errors = control_group_sorted['error'].sum()  # total errors in the control group
control_total = len(control_group_sorted)  # total steps (rows) in the control group
test_errors = test_group_sorted['error'].sum()  # total errors in the test group
test_total = len(test_group_sorted)  # total steps (rows) in the test group

# calculate the error rates for each group
control_error_rate = (control_errors / control_total) * 100  # Error rate as a percentage for control group
test_error_rate = (test_errors / test_total) * 100  # Error rate as a percentage for test group

# calculate the percentage difference in error rates between the two groups
percentage_difference = control_error_rate - test_error_rate  # Difference in error rates (control - test)

# perform a one-tailed binomial test
# H0 =  Test group error rate >= Control group error rate
# H1 = Test group error rate < Control group error rate
result = binomtest(test_errors, test_total, control_errors / control_total, alternative='less')

# extract the p-value from the test result
p_value = result.pvalue 

# display the calculated error rates, percentage difference, and p-value
print(f"Control group error rate: {control_error_rate:.2f}%")  # display control group error rate
print(f"Test group error rate: {test_error_rate:.2f}%")  # display test group error rate
print(f"Percentage difference: {percentage_difference:.2f}%")  # display difference in error rates
print(f"P-value: {p_value:.4f}")  # display the p-value for the binomial test

# interpret the results of the binomial test
alpha = 0.05  # Set the significance level (threshold for p-value)
if p_value < alpha:
    # if p-value is less than alpha, reject the null hypothesis
    print("The test group has significantly lower error rate than the control group.")
else:
    # if p-value is greater than or equal to alpha, fail to reject the null hypothesis
    print("The test group does not have a significantlly lower error rate than the control group.")

# interpret the practical difference between error rates
if percentage_difference >= 5:
    # if the percentage difference meets or exceeds the 5% threshold
    print("The test group's error rate is at least 5% lower than the control group.")
else:
    # if the percentage difference is less than the 5% threshold
    print("The test group's error rate is not at least 5% lower than the control group.")


Control group error rate: 19.21%
Test group error rate: 17.64%
Percentage difference: 1.57%
P-value: 0.0000
The test group has significantly lower error rate than the control group.
The test group's error rate is not at least 5% lower than the control group.


### Two Sample T-test

In [27]:

#identifying error columns
control_error_rate = control_group_sorted['error']
test_error_rate = test_group_sorted['error']

# converting boolean values to integers (1 for True, 0 for False)
control_error_rate = control_error_rate.astype(int)
test_error_rate = test_error_rate.astype(int)

# droping nan values
control_error_rate = control_error_rate.dropna()
test_error_rate = test_error_rate.dropna()

# check if there is data after dropping NaN values
print(f"Control Error Rate Length: {len(control_error_rate)}")
print(f"Test Error Rate Length: {len(test_error_rate)}")

# Perform the independent t-test
try:
    _, p_value = st.ttest_ind(control_error_rate, test_error_rate, equal_var=False, alternative='two-sided')
    print(f"Statistic: {_:.4f}")
    print(f"P-value: {p_value:.4f}")

    # Set significance level
    alpha = 0.05  # Significance level

    # Decision rule
    if p_value < alpha:
        print("Reject the null hypothesis: There is a significant difference in error rates between the control and test groups.")
    else:
        print("Fail to reject the null hypothesis: There is no significant difference in error rates between the control and test groups.")
except Exception as e:
    print(f"An error occurred: {e}")

Control Error Rate Length: 143408
Test Error Rate Length: 177779
Statistic: 11.3619
P-value: 0.0000
Reject the null hypothesis: There is a significant difference in error rates between the control and test groups.


### Conclusion and Comparison:

Statistical Significance: Both the binomial test and the two-sample t-test show that there is a statistically significant difference between the error rates of the control and test groups (p-value = 0.0000 in both cases).

Practical Significance: The percentage difference in error rates between the two groups is 1.57%, which is not above 5% threshold. This means the test group is better (in terms of error rate), but the improvement is small, and it may not be large enough to justify significant action.

The tests show a statistically significant difference in error rates between the groups. However, the 1.57% improvement in the test group may not be large enough to justify changes, especially if we expect at least a 5% improvement. If even a small reduction in errors is valuable, the result can still be useful, otherwise it might not be worth acting on.

### Creating data frame to convert to csv

In [None]:
# Create a DataFrame combining the error rates for both groups
error_rates_df = pd.DataFrame({
    'process_step': control_error_rates_per_step.index,
    'control_group_error_rate (%)': control_error_rates_per_step.values,
    'test_group_error_rate (%)': test_error_rates_per_step.values
})

# Round the error rates to 2 decimal places
error_rates_df['control_group_error_rate (%)'] = error_rates_df['control_group_error_rate (%)'].round(2)
error_rates_df['test_group_error_rate (%)'] = error_rates_df['test_group_error_rate (%)'].round(2)

# Display the DataFrame for inspection
print(error_rates_df)

In [10]:
# Error rate calculation function
def calculate_errors(group):
    # Map process steps to step indices
    group['step_index'] = group['process_step'].map({'start': 0, 'step_1': 1, 'step_2': 2, 'step_3': 3, 'confirm': 4})
    
    # Calculate error: Negative diff indicates backward movement
    group['error'] = group['step_index'].diff().apply(lambda x: x < 0)
    
    # Calculate total errors and total steps
    total_errors = group['error'].sum()
    total_steps = len(group)
    
    return total_errors, total_steps

# Calculate errors and steps for both control and test groups
control_errors, control_steps = calculate_errors(control_group_sorted)
test_errors, test_steps = calculate_errors(test_group_sorted)

# Calculate error rates
control_error_rate = (control_errors / control_steps) * 100
test_error_rate = (test_errors / test_steps) * 100

# Print the overall error rates
print(f"Control Group Error Rate: {control_error_rate:.2f}%")
print(f"Test Group Error Rate: {test_error_rate:.2f}%")

# Calculate the difference in error rates
error_rate_difference = control_error_rate - test_error_rate

# Set the threshold for a 5% difference
threshold = 5

# Hypothesis test: Is the test group's error rate at least 5% smaller?
if error_rate_difference >= threshold:
    print(f"The test group has an error rate that is at least {threshold}% smaller than the control group.")
else:
    print(f"The test group does not have an error rate that is at least {threshold}% smaller than the control group.")


Control Group Error Rate: 19.21%
Test Group Error Rate: 17.64%
The test group does not have an error rate that is at least 5% smaller than the control group.
