In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
%matplotlib inline

In [None]:
# NOTE: the paths below are absolute and machine-specific; adjust them when running elsewhere.
# df_1 comes from the clean/ folder, df_2 and df_3 are the two parts of the raw web data.
df_1 = pd.read_csv(r'/Users/eliskasimova/Desktop/data_analytics_course_2024/project_folder/labs/second_project/data/clean/df_merged.csv')
df_2 = pd.read_csv(r'/Users/eliskasimova/Desktop/data_analytics_course_2024/project_folder/labs/second_project/data/raw/df_final_web_data_pt_1.txt')
df_3 = pd.read_csv(r'/Users/eliskasimova/Desktop/data_analytics_course_2024/project_folder/labs/second_project/data/raw/df_final_web_data_pt_2.txt')

# Stack the two web-data parts row-wise into one frame
merged_df = pd.concat([df_2, df_3], axis=0)

# Keep only clients present in both frames, then parse the event timestamps
df_merged = df_1.merge(merged_df, on='client_id', how='inner')
df_merged['date_time'] = pd.to_datetime(df_merged['date_time'])

# Frequency of each distinct row (the stray trailing "C" that made this line a syntax error is removed)
df_merged.value_counts()

### Groups sorting to control/test

In [None]:
# Split the merged events by experiment arm
control_group = df_merged.loc[df_merged['variation'].eq('Control')]
test_group = df_merged.loc[df_merged['variation'].eq('Test')]

# Both arms are ordered with the same sequence of sort keys
group_sort_keys = ['client_id', 'visit_id', 'process_step', 'date_time']
control_group_sorted = control_group.sort_values(by=group_sort_keys)
test_group_sorted = test_group.sort_values(by=group_sort_keys)

### Function to get latest starts

In [None]:
# Function to get latest starts
def filter_latest_starts(group_df):
    """Return the rows of ``group_df`` that coincide with each client's latest 'start'.

    For every ``client_id`` the most recent ``date_time`` among rows whose
    ``process_step`` is ``'start'`` is located; every row of ``group_df`` that
    shares that exact (``client_id``, ``date_time``) pair is returned. Rows
    with later timestamps are *not* included.

    Parameters
    ----------
    group_df : pandas.DataFrame
        Must contain ``client_id``, ``process_step`` and ``date_time`` columns.

    Returns
    -------
    pandas.DataFrame
        Same columns as ``group_df``. Clients without a dated 'start' row are absent.
    """
    starts_only = group_df[group_df['process_step'] == 'start']
    # A start without a timestamp cannot be ranked, and a client whose starts are
    # all NaT would make idxmax fail (or yield NaN), so drop those rows first.
    starts_only = starts_only.dropna(subset=['date_time'])
    latest_index = starts_only.groupby('client_id')['date_time'].idxmax()
    latest_keys = starts_only.loc[latest_index, ['client_id', 'date_time']].drop_duplicates()
    # Join against the frame that was passed in rather than the notebook-level
    # df_merged, so the result depends only on the argument.
    return group_df.merge(latest_keys, on=['client_id', 'date_time'], how='inner')

# Latest-start rows for each experiment arm
filtered_control, filtered_test = (
    filter_latest_starts(arm) for arm in (control_group, test_group)
)

# Spot check on a single client: every recorded row vs. the rows kept for the control arm
client_total_entries = df_merged.loc[df_merged["client_id"] == 2304905]
client_last_start = filtered_control.loc[filtered_control['client_id'] == 2304905]
display(client_total_entries)
display(client_last_start)

### Completion time without outliers for control group per each step

In [None]:
# Re-parse the timestamps; values that cannot be parsed become NaT instead of raising
df_merged['date_time'] = pd.to_datetime(df_merged['date_time'], errors='coerce')  # Coerce invalid formats to NaT
# Filter for the control group only (variation == 'Control')
control_group = df_merged[df_merged['variation'] == 'Control']
# Sort by client_id and date_time so each client's events are in chronological order
control_group = control_group.sort_values(by=['client_id', 'date_time'])
# Function to calculate, for each event, the time until the same client's next event
def calculate_completion_time(group_df):
    """Compute how long each client stayed on a step before their next event.

    Events are ordered by ``client_id`` and ``date_time`` inside the function,
    so the caller does not have to pre-sort. The input frame is not modified.

    Parameters
    ----------
    group_df : pandas.DataFrame
        Must contain ``client_id``, ``process_step`` and ``date_time`` columns.

    Returns
    -------
    pandas.DataFrame
        Columns ``client_id``, ``process_step``, ``date_time``,
        ``next_step_time`` and ``completion_time`` (``next_step_time`` minus
        ``date_time``). Each client's last event has no successor and is dropped.

    Notes
    -----
    The "next event" is looked up per client, not per visit, so the gap between
    two separate visits of the same client is also reported as a completion time.
    """
    ordered = group_df.sort_values(by=['client_id', 'date_time'], kind='stable')
    # Timestamp of the following event of the same client (NaT for the client's last event)
    upcoming = ordered.groupby('client_id')['date_time'].shift(-1)
    # assign() builds new frames, so neither the caller's frame nor a slice is written to
    timed = ordered.assign(next_step_time=upcoming).dropna(subset=['next_step_time'])
    timed = timed.assign(completion_time=timed['next_step_time'] - timed['date_time'])
    # Return an independent frame so callers can add columns without a SettingWithCopyWarning
    return timed[['client_id', 'process_step', 'date_time', 'next_step_time', 'completion_time']].copy()
# Per-step durations for the control group
control_group_completion_times = calculate_completion_time(control_group)

# Express the durations in minutes for easier interpretation
control_group_completion_times['completion_time_minutes'] = (
    control_group_completion_times['completion_time'].dt.total_seconds() / 60
)

# Quartiles and interquartile range of the per-step durations
completion_minutes = control_group_completion_times['completion_time_minutes']
Q1, Q3 = completion_minutes.quantile(0.25), completion_minutes.quantile(0.75)
IQR = Q3 - Q1

# Fences at 1.5 * IQR beyond the quartiles
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Keep only the rows whose duration lies within the fences (both ends inclusive)
filtered_data = control_group_completion_times[
    completion_minutes.between(lower_bound, upper_bound)
]

# Mean duration in minutes per process step once outliers are removed
average_completion_time_minutes_filtered = (
    filtered_data
    .groupby('process_step')['completion_time_minutes']
    .mean()
    .reset_index()
)

# Display the result
print(average_completion_time_minutes_filtered)


### Completion time analysis with outliers

In [None]:
# Most recent 'start' row per client
starts_only = df_merged.loc[df_merged['process_step'].eq('start')]
latest_start_index = starts_only.groupby('client_id')['date_time'].idxmax()
latest_starts = starts_only.loc[latest_start_index]

# Most recent 'confirm' row per client
confirmation_only = df_merged.loc[df_merged['process_step'].eq('confirm')]
latest_confirm_index = confirmation_only.groupby('client_id')['date_time'].idxmax()
latest_confirms = confirmation_only.loc[latest_confirm_index]

# One row per client that has both a start and a confirm; shared columns get
# the '_start' / '_confirm' suffixes
latest_start_confirms = latest_starts.merge(
    latest_confirms, on='client_id', suffixes=('_start', '_confirm')
)

# Duration from the latest start to the latest confirm
# NOTE(review): this is negative when a client's latest start is later than their latest confirm
latest_start_confirms['process_duration'] = (
    latest_start_confirms['date_time_confirm'] - latest_start_confirms['date_time_start']
)

# Compare the average duration with the mode and the median
durations = latest_start_confirms['process_duration']
print("Average duration:", durations.mean())
print("Duration mode:", durations.mode())
print("Duration median:", durations.median())


### Completion time analysis without outliers in general (not divided by A/B)

In [None]:
# Convert the timedelta to seconds for easier manipulation
latest_start_confirms['process_duration_seconds'] = latest_start_confirms['process_duration'].dt.total_seconds()

# Calculate the IQR (Interquartile Range)
Q1 = latest_start_confirms['process_duration_seconds'].quantile(0.25)
Q3 = latest_start_confirms['process_duration_seconds'].quantile(0.75)
IQR = Q3 - Q1

# Define the upper and lower bounds for outliers
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Filter out the outliers based on IQR (both bounds inclusive).
# .copy() makes the filtered frame independent, so the column assignment below
# writes to it directly instead of to a slice (avoids SettingWithCopyWarning).
within_bounds = latest_start_confirms['process_duration_seconds'].between(lower_bound, upper_bound)
filtered_data = latest_start_confirms[within_bounds].copy()

# Convert process_duration back to Timedelta
filtered_data['process_duration'] = pd.to_timedelta(filtered_data['process_duration_seconds'], unit='s')

# Calculate the average process duration again after removing outliers
print("Average duration in total for both groups without outliers:", filtered_data['process_duration'].mean())
print("Median duration in total for both groups without outliers:", filtered_data['process_duration'].median())


### Completion rate analysis

- number of customers who reached the end of a step out of total who started

In [None]:
#completion rate based on visit_id
def calculate_within_visit_completion_rate(group):
    """Per-step completion rate, counted in distinct visits.

    Parameters
    ----------
    group : pandas.DataFrame
        Must contain ``process_step`` and ``visit_id`` columns.

    Returns
    -------
    pandas.DataFrame
        One row per non-'start' step with columns ``process_step``,
        ``completed_visits`` (distinct visits containing that step),
        ``started_visits`` (distinct visits containing a 'start' row, repeated
        on every row) and ``completion_rate`` (their ratio in percent).
    """
    step = group['process_step']

    # Denominator: distinct visits with a 'start' row
    n_started = group.loc[step.eq('start'), 'visit_id'].nunique()

    # Numerator: distinct visits seen at each of the remaining steps
    visits_per_step = group.loc[step.ne('start')].groupby('process_step')['visit_id'].nunique()
    summary = visits_per_step.rename('completed_visits').reset_index()

    summary['started_visits'] = n_started
    summary['completion_rate'] = (summary['completed_visits'] / summary['started_visits']) * 100
    return summary

# Per-step completion rates for each experiment arm
control_completion_rate, test_completion_rate = map(
    calculate_within_visit_completion_rate,
    (control_group_sorted, test_group_sorted),
)

# Show both tables, each under its own heading
for heading, rate_table in (
    ("Control Group Within-Visit Completion Rates:", control_completion_rate),
    ("Test Group Within-Visit Completion Rates:", test_completion_rate),
):
    print(heading)
    display(rate_table)

### Calculate Error Rates
The error rates are calculated using the formula:  
- **Control Group Error Rate** = (Clients with errors in control) / (Total clients in control)  
- **Test Group Error Rate** = (Clients with errors in test) / (Total clients in test)
- The analysis then checks whether the test group's error rate is lower than the control group's by at least a 5% threshold

In [None]:
#Error rate calculation

def calculate_errors(group):
    """Flag events where a client moved back to an earlier step of the process.

    Steps are mapped to their position in the funnel
    (start=0, step_1=1, step_2=2, step_3=3, confirm=4). An event is an error
    when its position is lower than that of the previous event.

    "Previous" is evaluated chronologically (by ``date_time``) and separately
    per ``client_id`` / ``visit_id`` whenever those columns are present. The
    row order of the input therefore does not matter, and the first event of
    one client or visit is never compared with the last event of another. If
    none of those columns exist, rows are compared in the order given.

    Parameters
    ----------
    group : pandas.DataFrame
        Must contain ``process_step``; ``client_id``, ``visit_id`` and
        ``date_time`` are used when available.

    Returns
    -------
    pandas.DataFrame
        Copy of ``group`` (same rows, same order, same index) with two added
        columns: ``step_index`` and boolean ``error``. The input is not modified.
    """
    step_order = {'start': 0, 'step_1': 1, 'step_2': 2, 'step_3': 3, 'confirm': 4}

    result = group.copy()
    result['step_index'] = result['process_step'].map(step_order)

    # Work on positional labels so the flags can be mapped back to the original
    # row order even when the caller's index contains duplicates.
    events = result.reset_index(drop=True)
    journey_keys = [col for col in ('client_id', 'visit_id') if col in events.columns]
    if 'date_time' in events.columns:
        events = events.sort_values(journey_keys + ['date_time'], kind='stable')

    if journey_keys:
        step_change = events.groupby(journey_keys)['step_index'].diff()
    else:
        step_change = events['step_index'].diff()

    # A negative change means a backward step; NaN (first event of a journey) is not an error.
    went_back = step_change.lt(0).reindex(range(len(result)), fill_value=False)
    result['error'] = went_back.to_numpy()
    return result

# Add the step_index / error columns to both arms
control_group_sorted, test_group_sorted = (
    calculate_errors(arm) for arm in (control_group_sorted, test_group_sorted)
)

# Share of rows flagged as an error, in percent
# NOTE(review): the markdown above defines the rate per client, while this is the
# share of event rows flagged — confirm which definition is intended.
control_error_rate = control_group_sorted['error'].mean() * 100
test_error_rate = test_group_sorted['error'].mean() * 100

print(f"Control Group Error Rate: {control_error_rate:.2f}%")
print(f"Test Group Error Rate: {test_error_rate:.2f}%")

# Is the test arm's error rate lower than the control arm's by at least the threshold?
threshold = 5  # compared against the difference of the two percentages
error_rate_difference = control_error_rate - test_error_rate

if not (error_rate_difference >= threshold):
    print(f"The test group does not have an error rate that is at least {threshold}% smaller than the control group.")
else:
    print(f"The test group has an error rate that is at least {threshold}% smaller than the control group.")

### Drop-off Rate (Bounce rate)
- percentage of clients who left the process at each step without progressing to the next step

In [None]:
# Calculate Bounce rate for Control and Test groups
def calculate_dropoff_rate(group):
    """Drop-off percentage between each pair of consecutive funnel steps.

    For every step except the last, the number of distinct clients seen at
    that step is compared with the number seen at the following step.

    Parameters
    ----------
    group : pandas.DataFrame
        Must contain ``process_step`` and ``client_id`` columns.

    Returns
    -------
    dict
        Maps 'start', 'step_1', 'step_2' and 'step_3' to the drop-off rate in
        percent; 0 when no client was seen at the step.
    """
    funnel = ['start', 'step_1', 'step_2', 'step_3', 'confirm']

    def clients_at(step_name):
        # Distinct clients having at least one row at the given step
        return group.loc[group['process_step'] == step_name, 'client_id'].nunique()

    rates = {}
    for step_name, following in zip(funnel, funnel[1:]):
        entered = clients_at(step_name)
        advanced = clients_at(following)
        rates[step_name] = ((entered - advanced) / entered) * 100 if entered > 0 else 0
    return rates

# Drop-off rates per step for each experiment arm
control_dropoff_rate = calculate_dropoff_rate(control_group_sorted)
test_dropoff_rate = calculate_dropoff_rate(test_group_sorted)

# Print one "step: rate%" line per step under a heading for each arm
for heading, rates in (
    ("Control group bounce rates(%):", control_dropoff_rate),
    ("\nTest group bounce rates(%):", test_dropoff_rate),
):
    print(heading)
    for step, rate in rates.items():
        print(f"{step}: {rate:.2f}%")

### Additional KPI
### Calculate Bounce rate for Control and Test groups


### Adding age groups

In [None]:
# Define age bins and categorize ages.
# The labels describe left-closed ranges ([0, 30), [30, 40), [40, 50), [50, inf)),
# so right=False is required: with pd.cut's default right-closed bins an age of
# exactly 30 / 40 / 50 would fall into 'Under 30' / '30-39' / '40-49'.
# The last edge is open-ended so ages of 100 and more are still '50 and above'
# instead of becoming NaN.
bins = [0, 30, 40, 50, np.inf]  # You can decide the intervals based on your data
labels = ['Under 30', '30-39', '40-49', '50 and above']
df_merged['age_group'] = pd.cut(df_merged['clnt_age'], bins=bins, labels=labels, right=False)