In [None]:
import math
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as st
import seaborn as sns

import functions as f

In [None]:
# Read the three cleaned CSV files (demo, web data, experiment clients) in one
# pass; the tuple order below matches the order of the names being assigned.
df_demo, df_wd, df_experiment_clients = map(
    pd.read_csv,
    (
        "../data/clean/df_demo_clean.csv",
        "../data/clean/df_web_data_clean.csv",
        "../data/clean/df_experiment_clients_clean.csv",
    ),
)

In [None]:
# Change date_time from object to datetime.
# errors='coerce' turns any value that cannot be parsed into NaT instead of raising.
df_wd['date_time'] = pd.to_datetime(df_wd['date_time'], errors='coerce')

# Preview the frame this cell just modified so the converted column can be checked.
df_wd.head()

### Completion Rates

In [None]:
# A visit is treated as completed when the highest process_step recorded for
# it (per client_id / visitor_id / visit_id) equals 4.
visit_keys = ['client_id', 'visitor_id', 'visit_id']
furthest_step = df_wd.groupby(visit_keys)['process_step'].max()
completion_status = furthest_step == 4

# Share of completed visits over all visits, rounded to two decimals.
completion_rate = completion_status.mean().round(2)
print(f"The completion rate for the whole group is: {completion_rate:.2f}")

### Control Group

In [None]:
# Attach the experiment roster to every web-event row. No key is given, so
# pandas joins on all columns the two frames share (default inner join).
# NOTE(review): presumably the only shared column is client_id — confirm.
df_wd = df_wd.merge(df_experiment_clients)

# Bring in all demographic columns with a single inner merge instead of four
# separate ones. validate='many_to_one' raises if client_id is duplicated in
# df_demo, rather than silently multiplying web-event rows.
demo_columns = ['client_id', 'age_group', 'tenure', 'clnt_age', 'clnt_tenure_yr']
df_wd = df_wd.merge(
    df_demo[demo_columns],
    on='client_id',
    how='inner',
    validate='many_to_one',
)

# This cell reassigns df_wd, so it must be run exactly once per kernel session;
# re-running it would merge the same columns again with _x/_y suffixes.
df_wd.head()

In [None]:
# Web-event rows belonging to clients in the Control arm.
df_wd_control = df_wd[df_wd['Variation'] == 'Control']


# Identify completed sessions (where process_step = 4 exists for a given visit_id)
control_completion_status = df_wd_control.groupby(['client_id', 'visitor_id', 'visit_id'])['process_step'].max() == 4

# Calculate Completion Rate from the control sessions only.
# (Averaging the whole-population `completion_status` here would just repeat
# the overall rate instead of the control group's rate.)
control_completion_rate = control_completion_status.mean().round(2)
print(f"The completion rate for the control group is: {control_completion_rate:.2f}")

### Test Group

In [None]:
# Keep only the web-event rows whose Variation is 'Test'.
is_test_row = df_wd['Variation'] == 'Test'
df_wd_test = df_wd.loc[is_test_row]

# A visit is completed when the highest process_step recorded for it equals 4.
test_visit_keys = ['client_id', 'visitor_id', 'visit_id']
test_furthest_step = df_wd_test.groupby(test_visit_keys)['process_step'].max()
test_completion_status = test_furthest_step == 4

# Share of completed visits in the test group, rounded to two decimals.
test_completion_rate = test_completion_status.mean().round(2)
print(f"The completion rate for the test group is: {test_completion_rate:.2f}")

### Hypothesis Tests

##### Threshold: Vanguard has set the minimum required increase in completion rate at 5%. This is the level at which the projected benefits, in terms of increased user engagement and potential revenue, are estimated to outweigh the costs of the new design.

#### Completion Rate Hypothesis Test

In [None]:
# Colour constants reused by the plots in this notebook.
vanguard_red = '#BA0C2F'
vanguard_gray = '#4D4D4F'
vanguard_beige = '#F2E5D5'
vanguard_dark = '#D9D9D6'
vanguard_palette = [vanguard_red,vanguard_gray,vanguard_dark,vanguard_beige]

# Flag every web-event row with 1 when its session (client_id / visitor_id /
# visit_id) contains process_step 4 at least once, otherwise 0.
df_wd['session_completed'] = df_wd.groupby(['client_id', 'visitor_id', 'visit_id'])['process_step'].transform(lambda x: 1 if 4 in x.values else 0)

# df_wd holds one row per web event, so counting its rows would weight each
# session by the number of steps it logged. Collapse to one row per session so
# every session counts exactly once.
session_outcomes = df_wd.drop_duplicates(subset=['client_id', 'visitor_id', 'visit_id'])

frequency_table = session_outcomes['session_completed'].value_counts()
frequency_table_proportion = session_outcomes['session_completed'].value_counts(normalize=True)

display(frequency_table_proportion)

# hue_order pins 0 before 1 so the manual legend labels below cannot be
# swapped; only two palette colours are passed because there are two hue levels.
sns.countplot(
    data=session_outcomes,
    x='Variation',
    hue='session_completed',
    hue_order=[0, 1],
    palette=vanguard_palette[:2],
)
plt.title('Sessions by Group and Completion Status')
plt.xlabel('Group')
plt.ylabel('Number of Sessions')
plt.legend(labels=['Not Completed','Completed'])
plt.show()

In [None]:
#Set Hypothesis

#The test group completion rate is performing better on average than the control group.

# Significance level; the later hypothesis-test cells reuse this value.
alpha = 0.05

#H0: test_completion_rate <= control_completion_rate
#H1: test_completion_rate > control_completion_rate

# Compare completion *rates*: build one 0/1 outcome per session. df_wd has one
# row per web event, so it is collapsed to one row per session first.
# (Comparing process_step values of already-completed sessions, as before,
# says nothing about how often sessions complete.)
session_outcomes = df_wd.drop_duplicates(subset=['client_id', 'visitor_id', 'visit_id'])

control_df = session_outcomes.loc[session_outcomes['Variation'] == 'Control', 'session_completed']
test_df = session_outcomes.loc[session_outcomes['Variation'] == 'Test', 'session_completed']

# The mean of a 0/1 indicator is the completion rate, so a one-sided Welch
# t-test on the indicators tests whether the test rate exceeds the control rate.
stat, p_value = st.ttest_ind(test_df,control_df,equal_var=False,alternative='greater')

# Report the observed rates: statistical significance alone does not show
# whether the uplift reaches the 5% business threshold.
observed_uplift = test_df.mean() - control_df.mean()
print(f"Control completion rate: {control_df.mean():.3f} | Test completion rate: {test_df.mean():.3f} | Difference: {observed_uplift:+.3f}")

# Branch on `<=` so that a NaN p-value (e.g. an empty group) can never be
# reported as a rejection of H0.
if p_value <= alpha:
    print(f"We reject the null hypothesis: the completion rate of the test group is significantly higher than that of the control group, because the p_value ({p_value}) is lower than the chosen alpha ({alpha}) ")
else:
    print("we are not able to reject the null hypothesis")

##### The new design found in the test group leads to the required level of improvement.

#### Average Client Age Hypothesis Test

In [None]:
# Box plot of clnt_age for each Variation value, drawn on an explicit 8x5 axes.
fig, ax = plt.subplots(figsize=(8, 5))

sns.boxplot(
    data=df_wd,
    x='Variation',
    y='clnt_age',
    palette=[vanguard_red, vanguard_beige],
    ax=ax,
)
ax.set_title('Age Distribution by Group', fontsize=14)
ax.set_xlabel('Group', fontsize=12)
ax.set_ylabel('Client Age', fontsize=12)
plt.show()

In [None]:
#Set Hypothesis

#The average age of clients engaging with the new process is the same as those engaging with the old process.

#H0  avg_age_test = avg_age_control
#H1: avg_age_test != avg_age_control

# df_wd holds one row per web event, so a client's age is repeated once per
# logged event. Keep one row per client so each client contributes a single
# observation and the sample size is not inflated.
clients_for_age = df_wd.drop_duplicates(subset='client_id')

# Missing ages are dropped explicitly; otherwise ttest_ind would return NaN.
age_control_df = clients_for_age.loc[clients_for_age['Variation'] == 'Control', 'clnt_age'].dropna()
age_test_df = clients_for_age.loc[clients_for_age['Variation'] == 'Test', 'clnt_age'].dropna()

# Two-sided Welch t-test (unequal variances); uses `st` and `alpha` defined
# earlier in the notebook.
stat, p_value = st.ttest_ind(age_test_df, age_control_df, equal_var=False, alternative='two-sided')

# Branch on `<=` so that a NaN p-value (e.g. an empty group) can never be
# reported as a rejection of H0.
if p_value <= alpha:
    print(f"We reject the null hypothesis (H0). There is a significant difference in average age between the test and control groups (p-value: {p_value}).")
else:
    print(f"We fail to reject the null hypothesis (H0). There is no significant difference in average age between the test and control groups (p-value: {p_value}).")


#### Average Client Tenure Hypothesis Test

In [None]:
# Violin plot of clnt_tenure_yr for each Variation value, drawn on an explicit 8x5 axes.
fig, ax = plt.subplots(figsize=(8, 5))

sns.violinplot(
    data=df_wd,
    x='Variation',
    y='clnt_tenure_yr',
    palette=[vanguard_gray, vanguard_dark],
    ax=ax,
)
ax.set_title('Tenure Distribution by Group', fontsize=14)
ax.set_xlabel('Group', fontsize=12)
ax.set_ylabel('Client Tenure', fontsize=12)
plt.show()

In [None]:
#Set Hypothesis

#The average client tenure engaging with the new process is the same as those engaging with the old process.

#H0  avg_tenure_test = avg_tenure_control
#H1: avg_tenure_test != avg_tenure_control

# df_wd holds one row per web event, so a client's tenure is repeated once per
# logged event. Keep one row per client so each client contributes a single
# observation and the sample size is not inflated.
clients_for_tenure = df_wd.drop_duplicates(subset='client_id')

# Missing tenures are dropped explicitly; otherwise ttest_ind would return NaN.
tenure_control_df = clients_for_tenure.loc[clients_for_tenure['Variation'] == 'Control', 'clnt_tenure_yr'].dropna()
tenure_test_df = clients_for_tenure.loc[clients_for_tenure['Variation'] == 'Test', 'clnt_tenure_yr'].dropna()

# Two-sided Welch t-test (unequal variances); uses `st` and `alpha` defined
# earlier in the notebook.
stat, p_value = st.ttest_ind(tenure_test_df, tenure_control_df, equal_var=False, alternative='two-sided')

# Branch on `<=` so that a NaN p-value (e.g. an empty group) can never be
# reported as a rejection of H0.
if p_value <= alpha:
    print(f"We reject the null hypothesis (H0). There is a significant difference in average tenure between the test and control groups (p-value: {p_value}).")
else:
    print(f"We fail to reject the null hypothesis (H0). There is no significant difference in average tenure between the test and control groups (p-value: {p_value}).")


### Rest of Notes EDA 

In [None]:
# df_wd holds one row per web event; collapse to one row per session so each
# session is counted once regardless of how many steps it logged.
sessions_by_age = df_wd.drop_duplicates(subset=['client_id', 'visitor_id', 'visit_id'])

# Row-normalised shares: for each age_group, the fraction of sessions that
# were not completed (0) and completed (1), rounded to two decimals.
frequency_crosstab = pd.crosstab(sessions_by_age['age_group'],sessions_by_age['session_completed'],normalize='index').round(2)
frequency_crosstab_sorted = frequency_crosstab.sort_values(by=frequency_crosstab.columns.tolist(),ascending=False)
display(frequency_crosstab_sorted)

# hue_order pins 0 before 1 so the manual legend labels below cannot be
# swapped; only two palette colours are passed because there are two hue levels.
sns.countplot(
    data=sessions_by_age,
    x='age_group',
    hue='session_completed',
    hue_order=[0, 1],
    palette=vanguard_palette[:2],
)
plt.title('Sessions by Age Group and Completion Status')
plt.xlabel('Age Group')
plt.ylabel('Number of Sessions')
plt.legend(labels=['Not Completed','Completed'])
plt.show()

In [None]:
# Stacked bar chart of frequency_crosstab (one bar per index value), with each
# stacked segment labelled at its centre.
ax = frequency_crosstab.plot.bar(stacked=True, rot=0, colormap='Set2')
for segment_group in ax.containers:
    ax.bar_label(segment_group, label_type='center')
plt.legend(labels=['Not Completed', 'Completed'])
plt.show()