In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
%matplotlib inline

In [None]:
# Base folder of the project data. Every input below is read relative to it,
# so this is the only line to edit when the data lives somewhere else.
DATA_DIR = r'/Users/eliskasimova/Desktop/data_analytics_course_2024/project_folder/labs/second_project/data'

# Pre-cleaned table from data/clean; it is joined to the web data on `client_id`.
df_1 = pd.read_csv(f'{DATA_DIR}/clean/df_merged.csv')

# The raw web data ships as two files that are stacked row-wise.
df_2 = pd.read_csv(f'{DATA_DIR}/raw/df_final_web_data_pt_1.txt')
df_3 = pd.read_csv(f'{DATA_DIR}/raw/df_final_web_data_pt_2.txt')
# ignore_index=True gives the stacked frame a fresh 0..n-1 index instead of
# repeating each part's own row labels (duplicate labels make .loc ambiguous).
merged_df = pd.concat([df_2, df_3], axis=0, ignore_index=True)

# Keep only clients present in both tables, then parse the event timestamps
# so they can be compared and subtracted later on.
df_merged = df_1.merge(merged_df, on='client_id', how='inner')
df_merged['date_time'] = pd.to_datetime(df_merged['date_time'])
df_merged

In [None]:
# Split the merged event log into the two experiment arms.
control_group = df_merged[df_merged['variation'] == 'Control']
test_group = df_merged[df_merged['variation'] == 'Test']

# Order events chronologically inside each client visit.
# `process_step` is deliberately NOT a sort key: it is a string column, so
# sorting on it before `date_time` would arrange every visit alphabetically
# (confirm, start, step_1, ...) and destroy the time order that any
# row-order-based analysis (e.g. detecting backward steps) relies on.
chronological_keys = ['client_id', 'visit_id', 'date_time']

# Sort control group
control_group_sorted = control_group.sort_values(by=chronological_keys)

# Sort test group
test_group_sorted = test_group.sort_values(by=chronological_keys)

In [None]:
# Function to get latest starts
def filter_latest_starts(group_df):
    """Keep, for every client, the events from their most recent 'start' onwards.

    Parameters
    ----------
    group_df : pandas.DataFrame
        Event log with at least the columns `client_id`, `process_step`
        and `date_time` (`date_time` must be comparable, e.g. datetime64).

    Returns
    -------
    pandas.DataFrame
        The rows of `group_df` (same columns, fresh 0..n-1 index) whose
        `date_time` is at or after that client's latest 'start' event.
        Clients without any 'start' row are dropped.
    """
    # Timestamps of 'start' rows only; every other row becomes missing.
    start_times = group_df['date_time'].where(group_df['process_step'] == 'start')

    # Broadcast each client's latest start time onto all of that client's rows.
    # Clients that never started get a missing cut-off, which compares as
    # False below and therefore removes their rows.
    latest_start = start_times.groupby(group_df['client_id']).transform('max')

    # Filter the frame that was passed in (not a notebook-level global), and
    # keep the whole run that follows the last start rather than only rows
    # sharing its exact timestamp.
    return group_df[group_df['date_time'] >= latest_start].reset_index(drop=True)

# Run the latest-start filter on each experiment arm
filtered_control = filter_latest_starts(control_group)
filtered_test = filter_latest_starts(test_group)

# Spot check on a single client: every row they have in the merged data,
# followed by the rows the filter kept for them in the control arm
sample_client_id = 2304905
client_total_entries = df_merged[df_merged["client_id"] == sample_client_id]
client_last_start = filtered_control[filtered_control['client_id'] == sample_client_id]
display(client_total_entries, client_last_start)

In [None]:
# Latest 'start' event for each client
starts_only = df_merged[df_merged['process_step'] == 'start']
latest_starts = starts_only.loc[starts_only.groupby('client_id')['date_time'].idxmax()]

# Latest 'confirm' event for each client
confirmation_only = df_merged[df_merged['process_step'] == 'confirm']
latest_confirms = confirmation_only.loc[confirmation_only.groupby('client_id')['date_time'].idxmax()]

# One row per client that has both a start and a confirm; columns shared by
# the two frames are disambiguated with the _start / _confirm suffixes
latest_start_confirms = pd.merge(latest_starts, latest_confirms, on='client_id', suffixes=('_start', '_confirm'))

# Time between the latest start and the latest confirm of each client
latest_start_confirms['process_duration'] = latest_start_confirms['date_time_confirm'] - latest_start_confirms['date_time_start']

# A client who started the process again after their last confirmation gets a
# negative duration here: their latest start does not belong to the confirmed
# run. Those rows stay in `latest_start_confirms`, but they are left out of
# the summary statistics so they cannot drag the figures down.
completed_durations = latest_start_confirms.loc[
    latest_start_confirms['process_duration'] >= pd.Timedelta(0),
    'process_duration',
]

# Calculate the average duration and compare it with the mode and the median
print("Average duration:", completed_durations.mean())
print("Duration mode:", completed_durations.mode())
print("Duration median:", completed_durations.median())

In [None]:
#completion rate based on visit_id
def calculate_within_visit_completion_rate(group):
    """Summarise, per process step, how many visits reached it relative to visits that started.

    Parameters
    ----------
    group : pandas.DataFrame
        Event log with at least the columns `process_step` and `visit_id`.

    Returns
    -------
    pandas.DataFrame
        One row per non-'start' step with the columns `process_step`,
        `completed_visits` (distinct visits containing that step),
        `started_visits` (distinct visits containing a 'start' row, the same
        value on every row) and `completion_rate` (their ratio in percent).
    """
    # NOTE(review): the numerator is not restricted to visits that also have a
    # 'start' row, so a rate above 100 is possible - confirm this is intended.
    is_start = group['process_step'] == 'start'

    # Denominator: distinct visits with a recorded start
    n_started = group.loc[is_start, 'visit_id'].nunique()

    # Numerator: distinct visits seen at each of the remaining steps
    visits_per_step = group.loc[~is_start].groupby('process_step')['visit_id'].nunique()
    summary = visits_per_step.reset_index(name='completed_visits')

    return summary.assign(
        started_visits=n_started,
        completion_rate=lambda frame: frame['completed_visits'] / frame['started_visits'] * 100,
    )

# Per-step completion rates, computed the same way for both experiment arms
control_completion_rate, test_completion_rate = (
    calculate_within_visit_completion_rate(arm)
    for arm in (control_group_sorted, test_group_sorted)
)

# Show the control table first, then the test table
print("Control Group Within-Visit Completion Rates:")
display(control_completion_rate)

print("Test Group Within-Visit Completion Rates:")
display(test_completion_rate)

In [None]:
def calculate_errors(group):
    """Flag events where a client moved backwards in the process within a visit.

    Parameters
    ----------
    group : pandas.DataFrame
        Event log with the columns `client_id`, `visit_id`, `process_step`
        and `date_time`. The rows may be in any order.

    Returns
    -------
    pandas.DataFrame
        A copy of `group` (same rows, order and index) with two extra columns:
        `step_index`, the position of the step in the funnel (0 = 'start' ...
        4 = 'confirm', missing for unknown step names), and `error`, True when
        the step is earlier in the funnel than the previous event of the same
        client visit.
    """
    step_order = {'start': 0, 'step_1': 1, 'step_2': 2, 'step_3': 3, 'confirm': 4}

    # Work on a copy so the caller's frame is not modified behind its back.
    result = group.copy()
    result['step_index'] = result['process_step'].map(step_order)

    # Use positional labels so the result can be written back even when the
    # incoming index contains duplicates.
    by_position = result.reset_index(drop=True)
    chronological = by_position.sort_values(['client_id', 'visit_id', 'date_time'])

    # Compare each event with the previous one of the SAME client visit, in time
    # order. Diffing across the whole frame would also compare the last event of
    # one visit with the first event of the next and report a bogus backward step.
    step_change = chronological.groupby(['client_id', 'visit_id'], sort=False)['step_index'].diff()

    # First event of a visit has no predecessor (missing diff) and is not an error.
    result['error'] = step_change.sort_index().lt(0).to_numpy()
    return result

# Add the step_index / error columns to each experiment arm
control_group_sorted = calculate_errors(control_group_sorted)
test_group_sorted = calculate_errors(test_group_sorted)

# Share of events flagged as errors, expressed as a percentage
control_error_rate = 100 * control_group_sorted['error'].mean()
test_error_rate = 100 * test_group_sorted['error'].mean()

print(f"Control Group Error Rate: {control_error_rate:.2f}%")
print(f"Test Group Error Rate: {test_error_rate:.2f}%")
