In [None]:
import functions as f
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import math
import scipy.stats as st
from palette import vanguard_palette

In [None]:
# Read the cleaned, joined dataset from disk and report its dimensions.
clean_data_path = "../data/clean/df_join_clean.csv"
df = pd.read_csv(clean_data_path)
print(f"Shape of final DataFrame: {df.shape}")

### Completion Rates

In [None]:
# A session is identified by the (client_id, visitor_id, visit_id) triple.
session_steps = df.groupby(['client_id', 'visitor_id', 'visit_id'])['process_step']

# Flag every row of a session with 1 when any row of that session has process_step == 4, else 0.
df['session_completed'] = session_steps.transform(lambda steps: 1 if (steps == 4).any() else 0)

# Split the rows into step-4 rows and everything else.
reached_step_4 = df['process_step'] == 4
df_non_step_4 = df[~reached_step_4]

# Among the step-4 rows, keep a single row per session: the first one in ascending date_time order.
df_step_4_cleaned = (
    df[reached_step_4]
    .sort_values(by='date_time', ascending=True)
    .drop_duplicates(subset=['client_id', 'visit_id', 'visitor_id'], keep='first')
)

# Stitch both parts back together under a fresh 0..n-1 index.
df = pd.concat([df_non_step_4, df_step_4_cleaned]).reset_index(drop=True)

# Verify the result
print(f"Shape of final DataFrame: {df.shape}")


# Overall completion rate: share of sessions whose highest process_step equals 4.
completion_status = (
    df.groupby(['client_id', 'visitor_id', 'visit_id'])['process_step'].max() == 4
)
completion_rate = completion_status.mean().round(2)
print(f"The completion rate for the whole group is: {completion_rate:.2f}")


### Control Group

In [None]:
# Rows belonging to the control arm of the experiment.
is_control = df['Variation'].eq('Control')
df_control = df[is_control]

# One boolean per session: True when the session's highest process_step equals 4.
control_completion_status = (
    df_control
    .groupby(['client_id', 'visitor_id', 'visit_id'])['process_step']
    .max()
    .eq(4)
)

# Share of completed sessions, rounded to two decimals.
control_completion_avg = control_completion_status.mean().round(2)
print(f"The completion average for the control group is: {control_completion_avg:.2f}")

### Test Group

In [None]:
# Rows belonging to the test arm of the experiment.
is_test = df['Variation'].eq('Test')
df_test = df[is_test]

# One boolean per session: True when the session's highest process_step equals 4.
test_completion_status = (
    df_test
    .groupby(['client_id', 'visitor_id', 'visit_id'])['process_step']
    .max()
    .eq(4)
)

# Share of completed sessions, rounded to two decimals.
test_completion_avg = test_completion_status.mean().round(2)
print(f"The completion average for the test group is: {test_completion_avg:.2f}")

### Hypothesis Tests

##### Threshold: Vanguard has set the minimum required increase in completion rate at 5%. This is the threshold at which the projected benefits, in terms of increased user engagement and potential revenue, are estimated to outweigh the costs of the new design.

#### Completion Rate Hypothesis Test

In [None]:
# Row counts for every (Variation, session_completed) pair, laid out as a Variation x flag table.
completion_frequency = (
    df.groupby(['Variation', 'session_completed'])
    .size()
    .unstack(fill_value=0)
)

# Convert each Variation row of the table into shares that sum to 1.
variation_totals = completion_frequency.sum(axis=1)
completion_percentage = completion_frequency.div(variation_totals, axis=0).round(4)

# Counts and shares of the completion flag across the whole frame.
# NOTE(review): these are counts of rows in df, not of distinct sessions - confirm that is intended.
completed_flag = df['session_completed']
frequency_table = completed_flag.value_counts()
frequency_table_proportion = completed_flag.value_counts(normalize=True)
display(completion_percentage)
display(frequency_table)

# Grouped count plot: one pair of bars (not completed / completed) per Variation.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, x='Variation', hue='session_completed', palette=vanguard_palette, ax=ax)

# Annotate each bar with its count, using thousands separators.
for bars in ax.containers:
    ax.bar_label(bars, fmt='{:,.0f}', fontsize=10, color='black')

ax.legend(labels=['Not Completed', 'Completed'])
plt.show()

$$H_{0}: \text{completion\_rate\_test} \leq \text{completion\_rate\_control}$$
$$H_{1}: \text{completion\_rate\_test} > \text{completion\_rate\_control}$$

In [None]:
# One-sided two-proportion z-test on the completion rate.
# H0: completion_rate_test <= completion_rate_control
# H1: completion_rate_test >  completion_rate_control

alpha = 0.05  # significance level

# df_test / df_control carry one row per logged step, with session_completed repeated on
# every row of a session. Collapse to one flag per session so that each session is
# counted exactly once and the proportions match the session-level completion rates
# printed earlier in the notebook.
session_keys = ['client_id', 'visitor_id', 'visit_id']
test_sessions = df_test.groupby(session_keys)['session_completed'].max()
control_sessions = df_control.groupby(session_keys)['session_completed'].max()

x_test = test_sessions.sum()  # Completed sessions in test group
x_control = control_sessions.sum()  # Completed sessions in control group

n_test = len(test_sessions)  # Total test sessions
n_control = len(control_sessions)  # Total control sessions

p1_test = x_test / n_test  # Completion rate test group
p1_control = x_control / n_control  # Completion rate control group

# Pooled proportion and standard error; pooling is appropriate because the
# boundary of H0 is "no difference between the two rates".
p_pooled = (x_test + x_control) / (n_test + n_control)
q_pooled = 1 - p_pooled

SE = np.sqrt(p_pooled * q_pooled * (1/n_test + 1/n_control))

# Z-score of the observed difference
z = (p1_test - p1_control) / SE

# Right-tailed p-value; sf() avoids the precision loss of 1 - cdf() far out in the tail.
p_value = st.norm.sf(z)

# Hypothesis test decision
if p_value < alpha:
    print("Reject the null hypothesis: The test group has a significantly higher completion rate than the control group.")
else:
    print("Fail to reject the null hypothesis: There is no statistically significant evidence that the test group performs better than the control group.")

print(f"Z-score: {z:.4f}, p-value: {p_value:.4f}")

$$H_{0}: \text{completion\_rate\_test} - \text{completion\_rate\_control} \geq 0.05$$
$$H_{1}: \text{completion\_rate\_test} - \text{completion\_rate\_control} < 0.05$$

In [None]:
# One-sided z-test of the completion-rate uplift against the 5% threshold.
# H0: completion_rate_test - completion_rate_control >= 0.05
# H1: completion_rate_test - completion_rate_control <  0.05
# Uses `alpha` as set in the previous hypothesis-test cell.

# df_test / df_control carry one row per logged step, with session_completed repeated on
# every row of a session. Collapse to one flag per session so that each session is
# counted exactly once.
session_keys = ['client_id', 'visitor_id', 'visit_id']
test_sessions = df_test.groupby(session_keys)['session_completed'].max()
control_sessions = df_control.groupby(session_keys)['session_completed'].max()

x_test = test_sessions.sum()  # Completed sessions in test group
x_control = control_sessions.sum()  # Completed sessions in control group

n_control = len(control_sessions)  # Total control sessions
n_test = len(test_sessions)  # Total test sessions

p1_test = x_test / n_test  # Completion rate test group
p1_control = x_control / n_control  # Completion rate control group

# Difference assumed at the boundary of H0
delta = 0.05  # 5% increase

# Unpooled standard error: a pooled estimate assumes both groups share one
# proportion, which contradicts an H0 that places the rates `delta` apart.
SE = np.sqrt(
    p1_test * (1 - p1_test) / n_test
    + p1_control * (1 - p1_control) / n_control
)

# Z-score of the observed difference relative to the threshold
z = (p1_test - p1_control - delta) / SE

# Left-tailed p-value
p_value = st.norm.cdf(z)

if p_value < alpha:
    print("Reject the null hypothesis: The completion rate increase is less than 5%.")
else:
    # Failing to reject does not demonstrate H0, so the message only reports lack of evidence.
    print("Fail to reject the null hypothesis: There is not enough evidence that the completion rate increase is less than 5%.")

print(f"Z-score: {z:.4f}, p-value: {p_value:.4f}")

#### Average Client Age Hypothesis Test

In [None]:
# Completion proportions per (age_group, Variation), each row normalised to sum to 1.
frequency_crosstab = pd.crosstab(
    [df['age_group'], df['Variation']], df['session_completed'], normalize='index'
).round(2)

# Reset index for better plotting with seaborn
frequency_crosstab = frequency_crosstab.reset_index()

# Rename columns for clarity (session_completed 0 -> 'Not Completed', 1 -> 'Completed')
frequency_crosstab.columns = ['Age Group', 'Variation', 'Not Completed', 'Completed']

# Convert from wide to long format for seaborn
frequency_crosstab_melted = frequency_crosstab.melt(
    id_vars=['Age Group', 'Variation'],
    value_vars=['Not Completed', 'Completed'],
    var_name='Completion Status',
    value_name='Proportion'
)

# Two stacked panels sharing the x axis: Test on top, Control below.
fig, axes = plt.subplots(2, 1, figsize=(10, 12), sharex=True)

# Plotting subsets get their own names so that the row-level df_test / df_control
# frames used by the hypothesis-test cells are not overwritten.
age_props_test = frequency_crosstab_melted[frequency_crosstab_melted['Variation'] == 'Test']
age_props_control = frequency_crosstab_melted[frequency_crosstab_melted['Variation'] == 'Control']

# Plot Test Group
sns.barplot(
    data=age_props_test, x='Age Group', y='Proportion', hue='Completion Status',
    palette=vanguard_palette, ax=axes[0], hue_order=['Completed', 'Not Completed']
)
axes[0].set_title("Test Group", fontsize=14)
axes[0].set_xlabel("Age Group", fontsize=12)
axes[0].set_ylabel("Proportion", fontsize=12)
axes[0].legend(title="Completion Status", loc='lower right')

# Plot Control Group
sns.barplot(
    data=age_props_control, x='Age Group', y='Proportion', hue='Completion Status',
    palette=vanguard_palette, ax=axes[1], hue_order=['Completed', 'Not Completed']
)
axes[1].set_title("Control Group", fontsize=14)
axes[1].set_xlabel("Age Group", fontsize=12)
axes[1].set_ylabel("")
axes[1].legend(title="Completion Status", loc='lower right')

# Add bar labels for better readability
for ax in axes:
    for container in ax.containers:
        ax.bar_label(container, fmt='%.2f', fontsize=10, color='black')

# Improve layout
sns.despine()
plt.tight_layout()
plt.show()


$$H_{0}: \text{avg\_age\_test} \leq \text{avg\_age\_control}$$
$$H_{1}: \text{avg\_age\_test} > \text{avg\_age\_control}$$

In [None]:
# Chi-square test of independence between experiment arm and age group.
# H0: the age_group distribution is the same in the Test and Control groups.
# H1: the age_group distribution differs between the Test and Control groups.
# Uses `alpha` as set in the completion-rate hypothesis-test cell.
# TODO: run the same chi-square test for tenure groups.

# df holds several rows per client (one per logged step), which would count every
# client many times and violate the independence assumption of the test.
# Keep a single row per client and arm before building the contingency table.
clients = df.drop_duplicates(subset=['client_id', 'Variation'])

crosstab_age = pd.crosstab(clients['Variation'], clients['age_group'])
chi2_stat, chi2_p_value, _, expected_frequency = st.chi2_contingency(crosstab_age)

if chi2_p_value > alpha:
    print(f"Fail to reject the null hypothesis (H0). There is no significant difference in age distribution between the test and control groups (p-value: {chi2_p_value:.4f}).")
else:
    print(f"Reject the null hypothesis (H0). There is a significant difference in age distribution between the test and control groups (p-value: {chi2_p_value:.4f}).")

In [None]:
# Rebuild the per-arm frames from df, then compare the mean client age of each arm.
# NOTE(review): the means are taken over all rows of df, not over unique clients - confirm this weighting is intended.
df_test = df.loc[df['Variation'].eq('Test')]
df_control = df.loc[df['Variation'].eq('Control')]

age_column = 'clnt_age'
test_mean = df_test[age_column].mean()
control_mean = df_control[age_column].mean()

# Displayed as the cell output: (test mean, control mean)
test_mean, control_mean

#### Average Client Tenure Hypothesis Test

In [None]:
# Violin plot of client tenure (clnt_tenure_yr) for each experiment arm.
fig, ax = plt.subplots(figsize=(8, 5))

sns.violinplot(x=df['Variation'], y=df['clnt_tenure_yr'], palette=vanguard_palette, ax=ax)
ax.set_title('Tenure Distribution by Group', fontsize=14)
ax.set_xlabel('Group', fontsize=12)
ax.set_ylabel('Client Tenure', fontsize=12)
plt.show()

$$H_{0}: \text{avg\_tenure\_test} = \text{avg\_tenure\_control}$$
$$H_{1}: \text{avg\_tenure\_test} \neq \text{avg\_tenure\_control}$$

In [None]:
# Rebuild the per-arm frames from df, then compare the mean client tenure of each arm.
# NOTE(review): the means are taken over all rows of df, not over unique clients - confirm this weighting is intended.
df_test = df.loc[df['Variation'].eq('Test')]
df_control = df.loc[df['Variation'].eq('Control')]

tenure_column = 'clnt_tenure_yr'
test_mean = df_test[tenure_column].mean()
control_mean = df_control[tenure_column].mean()

# Displayed as the cell output: (test mean, control mean)
test_mean, control_mean

In [None]:
# Two-sided Welch t-test on average client tenure.
# H0: avg_tenure_test =  avg_tenure_control
# H1: avg_tenure_test != avg_tenure_control
# Uses `alpha` as set in the completion-rate hypothesis-test cell.

# df holds several rows per client (one per logged step); testing on those rows would
# weight clients by how many steps they logged and overstate the sample size.
# Keep a single row per client and arm so each client's tenure enters the test once.
clients = df.drop_duplicates(subset=['client_id', 'Variation'])

# Missing tenures are dropped: a single NaN would turn the p-value into NaN,
# which the comparison below would silently treat as "reject".
tenure_control_df = clients.loc[clients['Variation'] == 'Control', 'clnt_tenure_yr'].dropna()
tenure_test_df = clients.loc[clients['Variation'] == 'Test', 'clnt_tenure_yr'].dropna()

# equal_var=False selects Welch's t-test (no equal-variance assumption).
stat, p_value = st.ttest_ind(tenure_test_df, tenure_control_df, equal_var=False, alternative='two-sided')

if p_value > alpha:
    print(f"We fail to reject the null hypothesis (H0). There is no significant difference in average tenure between the test and control groups (p-value: {p_value}).")
else:
    print(f"We reject the null hypothesis (H0). There is a significant difference in average tenure between the test and control groups (p-value: {p_value}).")
