In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import plotly.graph_objects as go
import scipy.stats as stats
from scipy import stats
%matplotlib inline

In [None]:
# Single root for every data file this notebook reads. It is still an absolute,
# machine-specific path; change it here (one place) to run the notebook elsewhere.
DATA_DIR = r'/Users/eliskasimova/Desktop/data_analytics_course_2024/project_folder/labs/second_project/data'

df_1 = pd.read_csv(f'{DATA_DIR}/clean/df_merged.csv')
df_2 = pd.read_csv(f'{DATA_DIR}/raw/df_final_web_data_pt_1.txt')
df_3 = pd.read_csv(f'{DATA_DIR}/raw/df_final_web_data_pt_2.txt')

# Stack the two web-data parts row-wise into one table.
merged_df = pd.concat([df_2, df_3], axis=0)

# Inner join: keep only rows whose client_id is present in both tables.
df_merged = df_1.merge(merged_df, on='client_id', how='inner')
# Parse the timestamp column so it sorts chronologically rather than as text.
df_merged['date_time'] = pd.to_datetime(df_merged['date_time'])

### Splitting and sorting the control and test groups

In [None]:
# Split the merged data by experiment arm, using the 'variation' column.
control_group = df_merged[df_merged['variation'].eq('Control')]
test_group = df_merged[df_merged['variation'].eq('Test')]

# Both arms are ordered by the same keys: client, visit, step, then timestamp.
sort_keys = ['client_id', 'visit_id', 'process_step', 'date_time']
control_group_sorted = control_group.sort_values(by=sort_keys)
test_group_sorted = test_group.sort_values(by=sort_keys)

# Show (rows, columns) for each arm: control first, then test.
for arm in (control_group_sorted, test_group_sorted):
    print(arm.shape)

### Adding age groups

In [None]:
# Define age bins and categorize ages (adjust the edges to suit your data).
# Bins are left-closed (right=False) so each label matches its interval exactly:
#   [0, 30) -> 'Under 30', [30, 40) -> '30-39', [40, 50) -> '40-49', [50, inf) -> '50 and above'.
# With pandas' default right-closed bins, a 30-year-old would be labelled 'Under 30',
# a 40-year-old '30-39' and a 50-year-old '40-49'.
bins = [0, 30, 40, 50, np.inf]  # open-ended top bin: ages above 100 still count as '50 and above'
labels = ['Under 30', '30-39', '40-49', '50 and above']
df_merged['age_group'] = pd.cut(df_merged['clnt_age'], bins=bins, labels=labels, right=False)
control_group_sorted['age_group'] = pd.cut(control_group_sorted['clnt_age'], bins=bins, labels=labels, right=False)
test_group_sorted['age_group'] = pd.cut(test_group_sorted['clnt_age'], bins=bins, labels=labels, right=False)

### Additional KPI
### Calculate Bounce rate for Control and Test groups

- Bounce (drop-off) rate: the percentage of clients who reached a step but did not progress to the next step.

In [None]:
# Calculate Bounce rate for Control and Test groups
def calculate_dropoff_rate(group):
    """Return the step-to-step drop-off percentage for one group of events.

    Parameters
    ----------
    group : pandas.DataFrame
        Event rows with at least the columns 'process_step' and 'client_id'.

    Returns
    -------
    dict
        Maps each step except the last ('start', 'step_1', 'step_2',
        'step_3') to ``(clients at step - clients at next step) / clients at
        step * 100``. Clients are counted as distinct 'client_id' values per
        step. A step with no clients maps to 0.
    """
    funnel = ['start', 'step_1', 'step_2', 'step_3', 'confirm']

    def unique_clients(step_name):
        # Distinct clients with at least one event recorded at this step.
        at_step = group['process_step'] == step_name
        return group[at_step]['client_id'].nunique()

    rates = {}
    # Walk consecutive pairs: (start, step_1), (step_1, step_2), ...
    for step_name, following_step in zip(funnel, funnel[1:]):
        entered = unique_clients(step_name)
        advanced = unique_clients(following_step)
        # Guard the division: a step nobody reached reports 0.
        rates[step_name] = ((entered - advanced) / entered) * 100 if entered > 0 else 0

    return rates

# Overall drop-off rates for each experiment arm
control_dropoff_rate = calculate_dropoff_rate(control_group_sorted)
test_dropoff_rate = calculate_dropoff_rate(test_group_sorted)

# Print one "step: rate%" line per step under each arm's heading
for heading, rates in (
    ("Control group bounce rates(%):", control_dropoff_rate),
    ("\nTest group bounce rates(%):", test_dropoff_rate),
):
    print(heading)
    for step, rate in rates.items():
        print(f"{step}: {rate:.2f}%")

### Calculate Bounce rate for Control and Test groups per Age group

In [None]:
import pandas as pd

# Define custom order for age groups
age_group_order = ['Under 30', '30-39', '40-49', '50 and above']

# Calculate Drop-off Rate for each Age Group in Control and Test Groups
def calculate_dropoff_rate_by_age(group):
    """Return step-to-step drop-off percentages for every age group in ``group``.

    The input frame is only read, never modified. Rows whose 'age_group' is
    null are ignored, because they cannot be attributed to any age group.

    Parameters
    ----------
    group : pandas.DataFrame
        Event rows with at least the columns 'age_group', 'process_step'
        and 'client_id'.

    Returns
    -------
    dict
        ``{age_group: {step: rate}}``. Age groups appear in order of first
        appearance in ``group``. For each step except the last, ``rate`` is
        ``(clients at step - clients at next step) / clients at step * 100``,
        counting distinct 'client_id' values; a step with no clients gives 0.
    """
    # Define the steps in the process
    steps = ['start', 'step_1', 'step_2', 'step_3', 'confirm']
    dropoff_rates = {}  # drop-off percentages at each step, keyed by age group

    # dropna(): a null age group matches no rows and would only add an
    # all-zero entry under a NaN key.
    for age_group in group['age_group'].dropna().unique():
        group_by_age = group[group['age_group'] == age_group]

        # Distinct clients seen at each step within this age group
        clients_per_step = {
            step: group_by_age[group_by_age['process_step'] == step]['client_id'].nunique()
            for step in steps
        }

        age_group_dropoff = {}  # drop-off rates for this particular age group
        for current_step, next_step in zip(steps, steps[1:]):
            started = clients_per_step[current_step]
            reached_next = clients_per_step[next_step]
            # Guard the division: a step nobody reached reports 0.
            if started > 0:
                age_group_dropoff[current_step] = ((started - reached_next) / started) * 100
            else:
                age_group_dropoff[current_step] = 0

        dropoff_rates[age_group] = age_group_dropoff

    return dropoff_rates

# Per-age-group drop-off rates for each experiment arm (control first, then test)
control_dropoff_rate_by_age, test_dropoff_rate_by_age = (
    calculate_dropoff_rate_by_age(arm)
    for arm in (control_group_sorted, test_group_sorted)
)

# Function to print drop-off rates in the correct order
def print_dropoff_rates_by_age(dropoff_rates):
    """Print per-step drop-off rates, one section per age group.

    Sections follow the module-level ``age_group_order``; age groups that are
    not keys of ``dropoff_rates`` are skipped.

    Parameters
    ----------
    dropoff_rates : dict
        ``{age_group: {step: rate}}`` mapping of percentages to print.
    """
    present_groups = (label for label in age_group_order if label in dropoff_rates)
    for label in present_groups:
        print(f"\nAge Group: {label}")
        for step, rate in dropoff_rates[label].items():
            print(f"{step}: {rate:.2f}%")

# Report each arm's per-age-group rates under its own heading
for heading, rates_by_age in (
    ("Control Group Bounce Rates by Age Group:", control_dropoff_rate_by_age),
    ("\nTest Group Bounce Rates by Age Group:", test_dropoff_rate_by_age),
):
    print(heading)
    print_dropoff_rates_by_age(rates_by_age)


### Function to calculate the bounce rate

In [None]:

def calculate_dropoff_rate_by_age_group(group, age_column='age_group'):
    """Build a table of step-to-step drop-off percentages, one row per age group.

    Parameters
    ----------
    group : pandas.DataFrame
        Event rows with at least the columns ``age_column``, 'process_step'
        and 'client_id'.
    age_column : str, default 'age_group'
        Column of ``group`` that holds the age-group label.

    Returns
    -------
    pandas.DataFrame
        Columns 'age_group', 'start', 'step_1', 'step_2', 'step_3'. Each step
        column holds ``(clients at step - clients at next step) / clients at
        step * 100``, counting distinct 'client_id' values; a step with no
        clients gives 0. Rows follow the order in which age groups first
        appear in ``group``. An input without any age group yields an empty
        frame that still has these columns.
    """
    # Define the sequence of steps in the process
    steps = ['start', 'step_1', 'step_2', 'step_3', 'confirm']
    # Initialize a list to store drop-off rates for all age groups
    dropoff_data = []

    # dropna(): a null age group matches no rows and would only add an
    # all-zero row labelled NaN to the table (and to the charts built from it).
    for age in group[age_column].dropna().unique():
        # Filter the dataset for the current age group
        age_group_data = group[group[age_column] == age]
        dropoff_rates = {}

        # Walk consecutive pairs of steps: (start, step_1), (step_1, step_2), ...
        for current_step, next_step in zip(steps, steps[1:]):
            # Distinct clients seen at current_step for the current age group
            started = age_group_data[age_group_data['process_step'] == current_step]['client_id'].nunique()
            # Distinct clients seen at next_step for the current age group
            reached_next = age_group_data[age_group_data['process_step'] == next_step]['client_id'].nunique()

            # Calculate the drop-off rate as a percentage (0 when nobody reached the step)
            if started > 0:
                dropoff_rate = ((started - reached_next) / started) * 100
            else:
                dropoff_rate = 0

            dropoff_rates[current_step] = dropoff_rate

        # Append the drop-off rates for this age group
        dropoff_data.append({'age_group': age, **dropoff_rates})

    # Explicit columns keep the schema stable even when dropoff_data is empty.
    return pd.DataFrame(dropoff_data, columns=['age_group', *steps[:-1]])

# Drop-off tables (one row per age group) for each experiment arm.
# Note: these reuse the names assigned earlier to the dict-based results.
control_dropoff_rate_by_age = calculate_dropoff_rate_by_age_group(control_group_sorted)
test_dropoff_rate_by_age = calculate_dropoff_rate_by_age_group(test_group_sorted)

# Display the tables, each under its heading
print("Control Group Drop-off Rates by Age Group:", control_dropoff_rate_by_age, sep="\n")
print("\nTest Group Drop-off Rates by Age Group:", test_dropoff_rate_by_age, sep="\n")


### Sorting the drop-off tables by age group

In [None]:
# Youngest-to-oldest order used to sort the drop-off tables
age_group_order = ['Under 30', '30-39', '40-49', '50 and above']

# Make 'age_group' an ordered categorical in both tables so that sorting
# follows age_group_order instead of alphabetical order.
for rates_table in (control_dropoff_rate_by_age, test_dropoff_rate_by_age):
    rates_table['age_group'] = pd.Categorical(
        rates_table['age_group'], categories=age_group_order, ordered=True
    )

# Sorted copies of each table
control_dropoff_rate_by_age_sorted = control_dropoff_rate_by_age.sort_values('age_group')
test_dropoff_rate_by_age_sorted = test_dropoff_rate_by_age.sort_values('age_group')

### Visualise bar chart

In [None]:


# Set up the bar chart for drop-off rates by age group
def plot_grouped_bar_chart(dropoff_rate_df, title, ax):
    """Draw one cluster of bars per age group, one bar per process step.

    Parameters
    ----------
    dropoff_rate_df : pandas.DataFrame
        Table with an 'age_group' column and the rate columns 'start',
        'step_1', 'step_2' and 'step_3'. Rows are plotted in the order given.
    title : str
        Title placed on the axes.
    ax : matplotlib axes-like object
        Target axes; this function only draws on it and returns nothing.
    """
    step_columns = ['start', 'step_1', 'step_2', 'step_3']
    group_labels = dropoff_rate_df['age_group']

    width = 0.2
    slots = range(len(group_labels))  # one integer slot per age group

    # Each step's bars are shifted right by one bar width per step
    for offset, column in enumerate(step_columns):
        x_positions = [slot + width * offset for slot in slots]
        ax.bar(x_positions, dropoff_rate_df[column], width, label=column)

    # Labeling
    ax.set_xlabel('Age Group')
    ax.set_ylabel('Drop-off Rate (%)')
    ax.set_title(title)
    # 1.5 bar widths is the midpoint of the four bars in a cluster
    ax.set_xticks([slot + width * 1.5 for slot in slots])
    ax.set_xticklabels(group_labels)
    ax.legend()

# Create a figure to plot Control and Test group drop-off rates side by side
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Plot for Control group
plot_grouped_bar_chart(control_dropoff_rate_by_age_sorted,
                       "Control group Bounce Rates by Age Group", axes[0])

# Plot for Test group
plot_grouped_bar_chart(test_dropoff_rate_by_age_sorted,
                       "Test group Bounce rates by Age group", axes[1])

# Apply the layout BEFORE saving, so the PNG on disk matches the figure shown
# in the notebook (saving first writes the file without the tightened layout).
fig.tight_layout()
fig.savefig("grouped_bar_chart.png", dpi=300)
plt.show()

### Visualise heatmaps

In [None]:

# Prepare the data for heatmap
def plot_heatmap(dropoff_rate_df, title):
    """Show a heatmap of drop-off rates with steps as rows and age groups as columns.

    Parameters
    ----------
    dropoff_rate_df : pandas.DataFrame
        Table with an 'age_group' column and the rate columns 'start',
        'step_1', 'step_2' and 'step_3'.
    title : str
        Title of the figure.
    """
    step_columns = ['start', 'step_1', 'step_2', 'step_3']

    # Indexing by age group gives age groups as rows; transpose so that steps
    # become the rows (y axis) and age groups the columns (x axis), which is
    # what the axis labels below state.
    heatmap_data = dropoff_rate_df.set_index('age_group')[step_columns].T

    # Create the heatmap in its own figure; each cell shows the rate to 2 decimals
    plt.figure(figsize=(8, 6))
    sns.heatmap(heatmap_data, annot=True, cmap='YlGnBu', fmt='.2f', linewidths=0.5)
    plt.title(title)
    plt.ylabel('Steps')
    plt.xlabel('Age Group')
    plt.show()
# One heatmap per experiment arm: control first, then test
for rates_table, heading in (
    (control_dropoff_rate_by_age_sorted, "Control group Bounce rates Heatmap"),
    (test_dropoff_rate_by_age_sorted, "Test group Bounce rates Heatmap"),
):
    plot_heatmap(rates_table, heading)
