In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import plotly.graph_objects as go
%matplotlib inline

In [None]:
# Pre-merged client table (stored under data/clean).
df_1 = pd.read_csv(
    r'/Users/eliskasimova/Desktop/data_analytics_course_2024/project_folder/labs/second_project/data/clean/df_merged.csv'
)

# The raw web data ships as two files (pt_1 / pt_2); stack them row-wise.
df_2 = pd.read_csv(
    r'/Users/eliskasimova/Desktop/data_analytics_course_2024/project_folder/labs/second_project/data/raw/df_final_web_data_pt_1.txt'
)
df_3 = pd.read_csv(
    r'/Users/eliskasimova/Desktop/data_analytics_course_2024/project_folder/labs/second_project/data/raw/df_final_web_data_pt_2.txt'
)
merged_df = pd.concat((df_2, df_3))

# Keep only clients present in both tables and parse the timestamp column.
df_merged = pd.merge(df_1, merged_df, how='inner', on='client_id').assign(
    date_time=lambda frame: pd.to_datetime(frame['date_time'])
)

In [None]:

# Function to show demographics analysis
def show_demographics_analysis(df):
    """Render client demographic summaries through the ``st`` display API.

    Writes, in order: age-group counts, gender counts and Test/Control counts
    (each taken from the first non-null value per ``client_id``), the first
    five distinct demographic rows, the input's column index, and the first
    five raw demographic rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'client_id', 'clnt_age', 'gender' and 'variation'.
        The frame is not modified.

    Raises
    ------
    KeyError
        If any required column is missing.

    Notes
    -----
    NOTE(review): ``st`` is not imported anywhere in the visible notebook;
    presumably ``import streamlit as st`` is expected -- confirm before use.
    """
    required_columns = ['client_id', 'clnt_age', 'gender', 'variation']
    missing = [col for col in required_columns if col not in df.columns]
    if missing:
        raise KeyError(f"Missing required columns: {missing}")

    def _variation_label(value):
        # Accept the numeric flag (1) as well as the 'Test' string label that
        # the notebook's Control/Test split filters on.
        return 'Test' if value == 1 or value == 'Test' else 'Control'

    bins = [18, 30, 40, 50, 60, 100]
    labels = ['18-29', '30-39', '40-49', '50-59', '60+']

    # Work on a narrow copy so the derived columns never leak into (or
    # overwrite columns of) the caller's DataFrame.
    demo = df[required_columns].copy()
    demo['age_group'] = pd.cut(demo['clnt_age'], bins=bins, labels=labels, right=False)
    demo['gender_group'] = demo['gender']
    demo['variation_group'] = demo['variation'].apply(_variation_label)

    per_client = demo.groupby('client_id')

    st.subheader("Demographics: Age Groups")
    st.write(per_client['age_group'].first().value_counts())
    st.subheader("Demographics: Gender Distribution")
    st.write(per_client['gender_group'].first().value_counts())
    st.subheader("Demographics: Test/Control Group Distribution")
    st.write(per_client['variation'].first().apply(_variation_label).value_counts())
    st.subheader("Top 5 Rows of Client Demographics (Age, Gender, Group)")
    st.write(demo[['client_id', 'age_group', 'gender_group', 'variation_group']].drop_duplicates().head())
    st.write(f"Available columns in the dataset: {df.columns}")
    st.subheader("Client Demographic Details")
    # Uses 'gender' (the column validated above) rather than 'clnt_gender',
    # which no other line of this function reads.
    st.write(demo[['client_id', 'clnt_age', 'gender', 'variation']].head())

### Splitting the data into Control and Test groups (and sorting each)

In [None]:
# Split rows by experiment arm using the 'variation' label.
control_group = df_merged.loc[df_merged['variation'].eq('Control')]
test_group = df_merged.loc[df_merged['variation'].eq('Test')]

# Both arms are ordered with the same key sequence.
sort_columns = ['client_id', 'visit_id', 'process_step', 'date_time']
control_group_sorted = control_group.sort_values(by=sort_columns)
test_group_sorted = test_group.sort_values(by=sort_columns)

# Report (rows, columns) for each arm: control first, then test.
for sorted_arm in (control_group_sorted, test_group_sorted):
    print(sorted_arm.shape)

### Adding age groups

In [None]:
# Define age bins and categorize ages.
# Bins are left-closed (right=False) so every label matches its interval:
#   [0, 30) -> 'Under 30', [30, 40) -> '30-39', [40, 50) -> '40-49', [50, inf) -> '50 and above'.
# With pd.cut's default (right=True) the edges land in the wrong bucket:
# age 30 would be labelled 'Under 30', 40 -> '30-39' and 50 -> '40-49'.
# The open-ended last edge keeps ages of 100 and above in '50 and above'.
bins = [0, 30, 40, 50, float('inf')]  # You can decide the intervals based on your data
labels = ['Under 30', '30-39', '40-49', '50 and above']
df_merged['age_group'] = pd.cut(df_merged['clnt_age'], bins=bins, labels=labels, right=False)
control_group_sorted['age_group'] = pd.cut(control_group_sorted['clnt_age'], bins=bins, labels=labels, right=False)
test_group_sorted['age_group'] = pd.cut(test_group_sorted['clnt_age'], bins=bins, labels=labels, right=False)

### Additional KPI
### Calculate Bounce rate for Control and Test groups

- Bounce (drop-off) rate: the percentage of clients who reached a given step but did not progress to the next step.

In [None]:
# Calculate Bounce rate for Control and Test groups
def calculate_dropoff_rate(group):
    """Compute the per-step drop-off (bounce) rate for one group of events.

    For each step except the last, the rate is the percentage of distinct
    clients seen at that step who were *not* seen at the following step.

    Parameters
    ----------
    group : pandas.DataFrame
        Must contain 'client_id' and 'process_step' columns.

    Returns
    -------
    dict
        Maps each of 'start', 'step_1', 'step_2', 'step_3' to its drop-off
        percentage (0 when no client was seen at that step).
    """
    steps = ['start', 'step_1', 'step_2', 'step_3', 'confirm']

    # Distinct clients observed at each step (missing ids are ignored).
    clients_at = {
        step: set(group.loc[group['process_step'] == step, 'client_id'].dropna().unique())
        for step in steps
    }

    dropoff_rates = {}  # To store the drop-off percentages at each step
    for current_step, next_step in zip(steps, steps[1:]):
        started = clients_at[current_step]
        if started:
            # Only clients who were at current_step can drop off from it.
            # Clients seen at next_step but never at current_step must not
            # offset the count, otherwise the rate is understated and can
            # even go negative.
            dropped = started - clients_at[next_step]
            dropoff_rates[current_step] = len(dropped) / len(started) * 100
        else:
            dropoff_rates[current_step] = 0

    return dropoff_rates

# Drop-off rates per experiment arm.
control_dropoff_rate = calculate_dropoff_rate(control_group_sorted)
test_dropoff_rate = calculate_dropoff_rate(test_group_sorted)

# Print one heading per arm followed by its per-step rates (two decimals).
for heading, rates_by_step in (
    ("Control group bounce rates(%):", control_dropoff_rate),
    ("\nTest group bounce rates(%):", test_dropoff_rate),
):
    print(heading)
    for step, rate in rates_by_step.items():
        print(f"{step}: {rate:.2f}%")

### Calculate Bounce rate for Control and Test groups per Age group

In [None]:
import pandas as pd

# Define custom order for age groups
age_group_order = ['Under 30', '30-39', '40-49', '50 and above']

# Calculate Drop-off Rate for each Age Group in Control and Test Groups
def calculate_dropoff_rate_by_age(group):
    """Compute per-step drop-off (bounce) rates separately for each age group.

    Parameters
    ----------
    group : pandas.DataFrame
        Must contain 'client_id', 'process_step' and 'age_group' columns.
        The frame is not modified.

    Returns
    -------
    dict
        ``{age_group: {step: drop-off %}}`` for every label of
        ``age_group_order`` that occurs in ``group``, in that order. Rows
        with a missing age group, or a label outside ``age_group_order``,
        are left out. A step with no clients gets a rate of 0.
    """
    # Define the steps in the process
    steps = ['start', 'step_1', 'step_2', 'step_3', 'confirm']
    dropoff_rates = {}  # To store the drop-off percentages at each step, grouped by age

    # Labels actually present; NaN is dropped because `== NaN` never matches
    # and would only produce an all-zero entry keyed by NaN.
    observed = set(group['age_group'].dropna().unique())

    # Iterate in the custom order so the result is deterministically ordered.
    for age_group in age_group_order:
        if age_group not in observed:
            continue
        group_by_age = group[group['age_group'] == age_group]

        # Distinct clients of this age group observed at each step.
        clients_at = {
            step: set(
                group_by_age.loc[group_by_age['process_step'] == step, 'client_id'].dropna().unique()
            )
            for step in steps
        }

        age_group_dropoff = {}  # To store drop-off rates for this particular age group
        for current_step, next_step in zip(steps, steps[1:]):
            started = clients_at[current_step]
            if started:
                # Count only clients who were at current_step and are absent
                # from next_step; this keeps the rate within 0-100 %.
                dropped = started - clients_at[next_step]
                age_group_dropoff[current_step] = len(dropped) / len(started) * 100
            else:
                age_group_dropoff[current_step] = 0

        # Store the drop-off rates for this age group
        dropoff_rates[age_group] = age_group_dropoff

    return dropoff_rates

# Per-age-group drop-off rates, computed for the control arm first and then
# the test arm; the control result is left as the cell's displayed value.
control_dropoff_rate_by_age, test_dropoff_rate_by_age = (
    calculate_dropoff_rate_by_age(arm)
    for arm in (control_group_sorted, test_group_sorted)
)
control_dropoff_rate_by_age

### Hypothesis test using two-proportion z-test

- Null Hypothesis (H₀): The bounce rate of the Test group is equal to or higher than that of the Control group.
- Alternative Hypothesis (H₁): The bounce rate of the Test group is lower than that of the Control group.
- Each step is tested separately with a one-tailed test at a significance level of α = 0.05.

In [None]:
from scipy.stats import norm
import pandas as pd

# Function to calculate counts for z-test
def calculate_counts(group):
    """Count, per step, how many clients were at the step and how many dropped.

    Parameters
    ----------
    group : pandas.DataFrame
        Must contain 'client_id' and 'process_step' columns.

    Returns
    -------
    list of tuple
        One ``(N_started, N_dropped)`` pair for each of 'start', 'step_1',
        'step_2', 'step_3', in that order. ``N_dropped`` is the number of
        distinct clients seen at the step but not at the following step, so
        ``0 <= N_dropped <= N_started`` always holds.
    """
    steps = ['start', 'step_1', 'step_2', 'step_3', 'confirm']

    # Distinct clients observed at each step (missing ids are ignored).
    clients_at = {
        step: set(group.loc[group['process_step'] == step, 'client_id'].dropna().unique())
        for step in steps
    }

    counts = []  # Store counts as tuples (N_started, N_dropped)
    for current_step, next_step in zip(steps, steps[1:]):
        started_ids = clients_at[current_step]
        # Clients seen at next_step but never at current_step are excluded;
        # subtracting them could make the dropped count negative, which is
        # not a valid numerator for a proportion test.
        dropped_ids = started_ids - clients_at[next_step]
        counts.append((len(started_ids), len(dropped_ids)))

    return counts

def two_proportion_z_test(n1, x1, n2, x2):
    """One-tailed two-proportion z-test of H1: p2 < p1 (pooled variance).

    Group 1 is the reference group and group 2 the group expected to have
    the lower proportion; in this notebook the call passes Control as group 1
    and Test as group 2.

    Parameters
    ----------
    n1, x1 : int
        Sample size and number of "successes" in group 1.
    n2, x2 : int
        Sample size and number of "successes" in group 2.

    Returns
    -------
    tuple
        ``(z, p_value)`` where ``z = (p1 - p2) / SE`` and ``p_value`` is the
        upper-tail probability ``P(Z >= z)``. ``(nan, nan)`` is returned when
        the statistic is undefined: an empty group, or a pooled proportion of
        exactly 0 or 1 (zero variance).
    """
    undefined = (float('nan'), float('nan'))

    # An empty group has no proportion; the previous code raised
    # ZeroDivisionError on 1 / n here.
    if n1 <= 0 or n2 <= 0:
        return undefined

    # Calculate proportions
    p1 = x1 / n1
    p2 = x2 / n2

    # Pooled proportion
    p = (x1 + x2) / (n1 + n2)

    variance = p * (1 - p) * (1 / n1 + 1 / n2)
    if variance <= 0:
        # Both groups are all-success or all-failure: no variability to test.
        return undefined

    # Calculate z-statistic; positive when group 2's proportion is lower.
    z = (p1 - p2) / variance ** 0.5

    # H1 is p2 < p1, i.e. large positive z, so the p-value is the upper tail.
    # norm.cdf(z) would be the lower tail and test the opposite direction.
    p_value = norm.sf(z)

    return z, p_value

# Calculate counts for Control and Test groups
control_counts = calculate_counts(control_group_sorted)
test_counts = calculate_counts(test_group_sorted)

# Perform z-tests for each step and decide on hypothesis.
# (A first, identical loop used to run here; its results were discarded when
# z_test_results was reset, so it has been removed.)
steps = ['start', 'step_1', 'step_2', 'step_3']
z_test_results = []

# Each counts entry is (N_started, N_dropped): Control is group 1, Test is group 2.
for step, (n1, x1), (n2, x2) in zip(steps, control_counts, test_counts):
    # Perform z-test for proportions
    z_stat, p_value = two_proportion_z_test(n1, x1, n2, x2)

    # One-tailed decision at alpha = 0.05
    reject_null = p_value < 0.05

    # Store results
    z_test_results.append({
        'Step': step,
        'Control Bounce Rate (%)': (x1 / n1) * 100 if n1 > 0 else 0,
        'Test Bounce Rate (%)': (x2 / n2) * 100 if n2 > 0 else 0,
        'Z-Statistic': z_stat,
        'P-Value (One-tailed)': p_value,
        'Reject Null Hypothesis': reject_null
    })

# Convert results to DataFrame for display
z_test_results_df = pd.DataFrame(z_test_results)

# Display results
print(z_test_results_df)
      