In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import plotly.graph_objects as go
%matplotlib inline

In [4]:
df_1 = pd.read_csv(r'/Users/eliskasimova/Desktop/data_analytics_course_2024/project_folder/labs/second_project/data/clean/df_merged.csv')
df_2 = pd.read_csv(r'/Users/eliskasimova/Desktop/data_analytics_course_2024/project_folder/labs/second_project/data/raw/df_final_web_data_pt_1.txt')
df_3 = pd.read_csv(r'/Users/eliskasimova/Desktop/data_analytics_course_2024/project_folder/labs/second_project/data/raw/df_final_web_data_pt_2.txt')
merged_df = pd.concat([df_2, df_3], axis=0)

df_merged = df_1.merge(merged_df, on='client_id', how='inner')
df_merged['date_time'] = pd.to_datetime(df_merged['date_time'], errors='coerce')  # coerce invalid formats to NaT

### Groups sorting to control/test

In [6]:
# define the custom sorting order for the process steps
process_step_order = ['start', 'step_1', 'step_2', 'step_3', 'confirm']
# apply the custom order to the process_step column for both groups
df_merged['process_step'] = pd.Categorical(df_merged['process_step'], categories=process_step_order, ordered=True)

#filter groups based on test/control
control_group = df_merged[df_merged['variation'] == 'Control']
test_group = df_merged[df_merged['variation'] == 'Test']

# sort control group
control_group_sorted = control_group.sort_values(by=['client_id', 'visit_id', 'process_step', 'date_time'])
# sort test group
test_group_sorted = test_group.sort_values(by=['client_id', 'visit_id', 'process_step', 'date_time'])

In [8]:
# Function to get latest starts
def filter_latest_starts(group_df):
    starts_only = group_df[group_df['process_step'] == 'start']
    latest_starts = starts_only.loc[starts_only.groupby('visit_id')['date_time'].idxmax()]
    # Merge back to get the full process after the last start
    return df_merged.merge(latest_starts[['visit_id', 'date_time']], on=['visit_id', 'date_time'], how='inner')

# Apply to both groups
filtered_control = filter_latest_starts(control_group)
filtered_test = filter_latest_starts(test_group)

#Check if it works
client_total_entries = df_merged[df_merged["client_id"] == 2304905]
client_last_start = filtered_control[filtered_control['client_id'] == 2304905 ]

### Completion time without outliers for control group per each step

In [11]:
# Convert 'date_time' column to datetime, coercing invalid formats to NaT
df_merged['date_time'] = pd.to_datetime(df_merged['date_time'], errors='coerce')

# Function to calculate completion time for each step in the control group, considering multiple visits
def calculate_completion_time_with_visits(group_df):
    # Sort by client_id, visit_id, and date_time to ensure chronological order
    group_df = group_df.sort_values(by=['client_id', 'visit_id', 'date_time'])
    
    # Create a new column to store the next step time for each visit
    group_df['next_step_time'] = group_df.groupby(['client_id', 'visit_id'])['date_time'].shift(-1)
    
    # Only keep rows where the next step exists (i.e., not NaT)
    group_df = group_df.dropna(subset=['next_step_time'])
    
    # Calculate the completion time as the time difference between current and next step
    group_df['completion_time'] = group_df['next_step_time'] - group_df['date_time']
    
    return group_df[['client_id', 'visit_id', 'process_step', 'date_time', 'next_step_time', 'completion_time']]

# Apply the function to the control group considering multiple visits
control_group_completion_times_with_visits = calculate_completion_time_with_visits(control_group_sorted)

# Convert completion_time to minutes for easier interpretation
control_group_completion_times_with_visits['completion_time_minutes'] = control_group_completion_times_with_visits['completion_time'].dt.total_seconds() / 60

# Calculate the IQR (Interquartile Range) for completion time
Q1 = control_group_completion_times_with_visits['completion_time_minutes'].quantile(0.25)
Q3 = control_group_completion_times_with_visits['completion_time_minutes'].quantile(0.75)
IQR = Q3 - Q1

# Define the upper and lower bounds for outliers
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Filter out the outliers based on IQR
filtered_data = control_group_completion_times_with_visits[
    (control_group_completion_times_with_visits['completion_time_minutes'] >= lower_bound) &
    (control_group_completion_times_with_visits['completion_time_minutes'] <= upper_bound)
]

# calculate the average completion time in minutes for each process step, after removing outliers
average_completion_time_minutes_filtered_control = filtered_data.groupby('process_step')['completion_time_minutes'].mean().reset_index()
display(average_completion_time_minutes_filtered_control)

  average_completion_time_minutes_filtered_control = filtered_data.groupby('process_step')['completion_time_minutes'].mean().reset_index()


Unnamed: 0,process_step,completion_time_minutes
0,start,0.483977
1,step_1,0.526296
2,step_2,1.083293
3,step_3,1.112592
4,confirm,0.733098


### Completion time without outliers for test group per each step

In [40]:
# Function to calculate completion time for each step in the test group, considering multiple visits
def calculate_completion_time_with_visits(group_df):
    # Sort by client_id, visit_id, and date_time to ensure chronological order
    group_df = group_df.sort_values(by=['client_id', 'visit_id', 'date_time'])
    
    # Create a new column to store the next step time for each visit
    group_df['next_step_time'] = group_df.groupby(['client_id', 'visit_id'])['date_time'].shift(-1)
    
    # Only keep rows where the next step exists (i.e., not NaT)
    group_df = group_df.dropna(subset=['next_step_time'])
    
    # Calculate the completion time as the time difference between current and next step
    group_df['completion_time'] = group_df['next_step_time'] - group_df['date_time']
    
    return group_df[['client_id', 'visit_id', 'process_step', 'date_time', 'next_step_time', 'completion_time']]

# Apply the function to the test group considering multiple visits
test_group_completion_times_with_visits = calculate_completion_time_with_visits(test_group_sorted)

# Convert completion_time to minutes for easier interpretation
test_group_completion_times_with_visits['completion_time_minutes'] = test_group_completion_times_with_visits['completion_time'].dt.total_seconds() / 60

# Calculate the IQR (Interquartile Range) for completion time
Q1 = test_group_completion_times_with_visits['completion_time_minutes'].quantile(0.25)
Q3 = test_group_completion_times_with_visits['completion_time_minutes'].quantile(0.75)
IQR = Q3 - Q1

# Define the upper and lower bounds for outliers
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Filter out the outliers based on IQR
filtered_data = test_group_completion_times_with_visits[
    (test_group_completion_times_with_visits['completion_time_minutes'] >= lower_bound) &
    (test_group_completion_times_with_visits['completion_time_minutes'] <= upper_bound)
]

#calculate the average completion time in minutes for each process step, after removing outliers
average_completion_time_minutes_filtered_test = filtered_data.groupby('process_step')['completion_time_minutes'].mean().reset_index()
display(average_completion_time_minutes_filtered_test)

  average_completion_time_minutes_filtered_test = filtered_data.groupby('process_step')['completion_time_minutes'].mean().reset_index()


Unnamed: 0,process_step,completion_time_minutes
0,start,0.463561
1,step_1,0.607196
2,step_2,1.040396
3,step_3,0.982517
4,confirm,1.093128


### Completion time analysis with outliers

In [43]:
# filter to get the latest start for each client
starts_only = df_merged[df_merged['process_step'] == 'start']
latest_starts = starts_only.loc[starts_only.groupby('client_id')['date_time'].idxmax()]

# filter to get the last confirmation for each client
confirmation_only = df_merged[df_merged['process_step'] == 'confirm']
latest_confirms = confirmation_only.loc[confirmation_only.groupby('client_id')['date_time'].idxmax()]

# merge to have both latest start and confirm per client
# Confirming datetime columns are of Timestamp type in merged DataFrame
latest_start_confirms = pd.merge(latest_starts, latest_confirms, on='client_id', suffixes=('_start', '_confirm'))

# calculate process duration for those who completed the process
latest_start_confirms['process_duration'] = latest_start_confirms['date_time_confirm'] - latest_start_confirms['date_time_start']

# Scalculate the average duration and compare with mode and mean
print("Average duration:", latest_start_confirms['process_duration'].mean())
print("Duration mode:", latest_start_confirms['process_duration'].mode())
print("Duration median:", latest_start_confirms['process_duration'].median())


Average duration: 0 days 22:42:16.093893650
Duration mode: 0   0 days 00:02:16
Name: process_duration, dtype: timedelta64[ns]
Duration median: 0 days 00:03:48


### Completion time analysis without outliers in general (not divided by A/B)

In [46]:
# Convert the timedelta to seconds for easier manipulation
latest_start_confirms['process_duration_seconds'] = latest_start_confirms['process_duration'].dt.total_seconds()

# Calculate the IQR (Interquartile Range)
Q1 = latest_start_confirms['process_duration_seconds'].quantile(0.25)
Q3 = latest_start_confirms['process_duration_seconds'].quantile(0.75)
IQR = Q3 - Q1

# Define the upper and lower bounds for outliers
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Filter out the outliers based on IQR
filtered_data = latest_start_confirms[(latest_start_confirms['process_duration_seconds'] >= lower_bound) &
                                      (latest_start_confirms['process_duration_seconds'] <= upper_bound)]

# Convert process_duration back to Timedelta
filtered_data['process_duration'] = pd.to_timedelta(filtered_data['process_duration_seconds'], unit='s')

# Calculate the average process duration again after removing outliers
print("Average duration in total for both groups without outliers:", filtered_data['process_duration'].mean())
print("Median duration in total for both groups without outliers:", filtered_data['process_duration'].median())


Average duration in total for both groups without outliers: 0 days 00:04:18.338016588
Median duration in total for both groups without outliers: 0 days 00:03:33


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['process_duration'] = pd.to_timedelta(filtered_data['process_duration_seconds'], unit='s')


### Completion time analysis only for users that have done the whole process (from start to confirm)

In [16]:
# Define the required process steps
required_steps = ['start', 'step_1', 'step_2', 'step_3', 'confirm']

# Function to check if a session went through all required steps
def has_completed_all_steps(group_df, required_steps):
    steps_in_session = group_df['process_step'].unique()
    return all(step in steps_in_session for step in required_steps)

# Filter for clients that went through all required steps (i.e., complete sessions)
full_sessions = df_merged.groupby('client_id').filter(lambda group: has_completed_all_steps(group, required_steps))

# Filter for the test group only from full sessions
test_group = full_sessions[full_sessions['variation'] == 'Test']
# Sort by client_id, visit_id, and process_step (in the custom order), followed by date_time
test_group = test_group.sort_values(by=['client_id', 'visit_id', 'process_step', 'date_time'])

# Filter for the control group only from full sessions
control_group = full_sessions[full_sessions['variation'] == 'Control']
# Sort by client_id, visit_id, and process_step (in the custom order), followed by date_time
control_group = control_group.sort_values(by=['client_id', 'visit_id', 'process_step', 'date_time'])

# Function to calculate completion time for each step, only considering complete sessions
def calculate_completion_time_with_visits(group_df):
    # Create a new column to store the next step time for each visit
    group_df['next_step_time'] = group_df.groupby(['client_id', 'visit_id'])['date_time'].shift(-1)
    
    # Only keep rows where the next step exists (i.e., not NaT)
    group_df = group_df.dropna(subset=['next_step_time'])
    
    # Calculate the completion time as the time difference between current and next step
    group_df['completion_time'] = group_df['next_step_time'] - group_df['date_time']
    
    return group_df[['client_id', 'visit_id', 'process_step', 'date_time', 'next_step_time', 'completion_time']]

# Apply the function to the test group considering multiple visits
test_group_completion_times_with_visits = calculate_completion_time_with_visits(test_group)

# Apply the same function to the control group considering multiple visits
control_group_completion_times_with_visits = calculate_completion_time_with_visits(control_group)

# Convert completion_time to minutes for easier interpretation
test_group_completion_times_with_visits['completion_time_minutes'] = test_group_completion_times_with_visits['completion_time'].dt.total_seconds() / 60
control_group_completion_times_with_visits['completion_time_minutes'] = control_group_completion_times_with_visits['completion_time'].dt.total_seconds() / 60

# Calculate the IQR (Interquartile Range) for completion time for test group
Q1_test = test_group_completion_times_with_visits['completion_time_minutes'].quantile(0.25)
Q3_test = test_group_completion_times_with_visits['completion_time_minutes'].quantile(0.75)
IQR_test = Q3_test - Q1_test

# Define the upper and lower bounds for outliers in test group
lower_bound_test = Q1_test - 1.5 * IQR_test
upper_bound_test = Q3_test + 1.5 * IQR_test

# Filter out the outliers based on IQR for test group
filtered_test_data = test_group_completion_times_with_visits[
    (test_group_completion_times_with_visits['completion_time_minutes'] >= lower_bound_test) &
    (test_group_completion_times_with_visits['completion_time_minutes'] <= upper_bound_test)
]

# Now calculate the average completion time in minutes for each process step, after removing outliers for test group
average_completion_whole_process_test = filtered_test_data.groupby('process_step')['completion_time_minutes'].mean().reset_index()

# Calculate the IQR (Interquartile Range) for completion time for control group
Q1_control = control_group_completion_times_with_visits['completion_time_minutes'].quantile(0.25)
Q3_control = control_group_completion_times_with_visits['completion_time_minutes'].quantile(0.75)
IQR_control = Q3_control - Q1_control

# Define the upper and lower bounds for outliers in control group
lower_bound_control = Q1_control - 1.5 * IQR_control
upper_bound_control = Q3_control + 1.5 * IQR_control

# Filter out the outliers based on IQR for control group
filtered_control_data = control_group_completion_times_with_visits[
    (control_group_completion_times_with_visits['completion_time_minutes'] >= lower_bound_control) &
    (control_group_completion_times_with_visits['completion_time_minutes'] <= upper_bound_control)
]

# Now calculate the average completion time in minutes for each process step, after removing outliers for control group
average_completion_whole_process_control = filtered_control_data.groupby('process_step')['completion_time_minutes'].mean().reset_index()

# Display the results for both groups
print("Test Group Average Completion Time (Filtered):")
print(average_completion_whole_process_test)
print("\nControl Group Average Completion Time (Filtered):")
print(average_completion_whole_process_control)


Test Group Average Completion Time (Filtered):
  process_step  completion_time_minutes
0        start                 0.607488
1       step_1                 0.754744
2       step_2                 1.266677
3       step_3                 1.111256
4      confirm                 1.267647

Control Group Average Completion Time (Filtered):
  process_step  completion_time_minutes
0        start                 0.558910
1       step_1                 0.647344
2       step_2                 1.215537
3       step_3                 1.288399
4      confirm                 0.601297


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  group_df['completion_time'] = group_df['next_step_time'] - group_df['date_time']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  group_df['completion_time'] = group_df['next_step_time'] - group_df['date_time']
  average_completion_whole_process_test = filtered_test_data.groupby('process_step')['completion_time_minutes'].mean().reset_index()
  average_completion_whole_process_control = filtered_control_data.groupby('process_step')['completion_time_minutes'].mean().reset_index()
