In [None]:
project/
│
├── app.py                # Main script to run the Streamlit app
├── data_loader.py        # Handles loading data
├── pages/
│   ├── about.py          # About the project page
│   ├── summary.py        # Data summary page
│   ├── unique_values.py  # Unique values page
│   ├── stats.py          # Basic statistics page
│   ├── demographics.py   # Demographics analysis page
│   ├── duration.py       # Process duration analysis page
│   ├── hypothesis.py     # Hypothesis testing page
│   └── completion.py     # Completion time analysis page
└── utils/
    └── display.py        # Helper functions for displaying data

In [None]:
# data_loader.py
import pandas as pd
import streamlit as st

# Default location of the cleaned dataset. It is machine-specific, so pass a
# different ``path`` to load_data() when running the app elsewhere.
DEFAULT_DATA_PATH = r"C:\Users\Cecilia\Downloads\ironhack\coursework\group_work\group_project_week5_6\second_project\data\clean\combined_cleaned_data1.csv"


@st.cache_data
def load_data(path=DEFAULT_DATA_PATH):
    """Load the cleaned, combined dataset from a CSV file.

    The result is cached by Streamlit (keyed on ``path``), so the file is
    not re-read on every rerun of the app.

    Args:
        path: Location of the CSV file. Defaults to ``DEFAULT_DATA_PATH``.

    Returns:
        The loaded ``DataFrame``, or ``None`` when the file is missing,
        empty, or cannot be parsed. In those cases an error message is
        shown in the app instead of raising.
    """
    try:
        return pd.read_csv(path)
    except FileNotFoundError:
        st.error("File not found. Please check the file path.")
    except (pd.errors.EmptyDataError, pd.errors.ParserError) as exc:
        # An empty or malformed file is reported the same way as a missing
        # one, so the caller only has to handle the ``None`` result.
        st.error(f"The data file could not be read: {exc}")
    return None

In [None]:
# app.py
import streamlit as st
from data_loader import load_data  # assuming load_data is in the data_loader.py file
from pages import about, summary, unique_values, stats, demographics, hypothesis, duration, completion

def main():
    """
    Entry point of the Streamlit A/B Test Demo app.

    Configures the page, loads the dataset once, and renders whichever page
    is selected in the sidebar. Stops early when the data cannot be loaded.
    """
    st.set_page_config(page_title="A/B Test Demo for Group 7")

    df = load_data()
    if df is None:
        # Nothing can be rendered without data
        st.error("Data could not be loaded.")
        return

    # Page label -> renderer. Insertion order is the order of the sidebar
    # options. The lambdas defer the lookup of each page function until the
    # page is actually selected.
    routes = {
        "About the Project": lambda: about.show_about_project(),
        "Data Summary": lambda: summary.show_data_summary(df),
        "Unique Values": lambda: unique_values.show_unique_values_in_categorical_columns(df),
        "Basic Statistics": lambda: stats.show_basic_statistics(df),
        "Demographics Analysis": lambda: demographics.show_demographics(df),
        "Hypothesis Testing": lambda: hypothesis.show_hypothesis_testing_page(df),
        "Process Duration Analysis": lambda: duration.show_process_duration(df),
        "Completion Time Analysis": lambda: completion.show_completion_time(df),
    }

    st.sidebar.title("Navigation")
    page = st.sidebar.radio("Select a page:", list(routes))

    render_page = routes.get(page)
    if render_page is not None:
        render_page()


if __name__ == "__main__":
    main()

In [None]:
# pages/about.py
import streamlit as st

def show_about_project():
    """Render the "About the Project" page: project overview and metadata."""
    # Page copy is kept in local variables so the layout calls below stay short
    overview_text = (
        """
        An A/B test was set into motion from 3/15/2017 to 6/20/2017 by the Vanguard team.

        Control Group: Clients interacted with Vanguard’s traditional online process.
        Test Group: Clients experienced the new, spruced-up digital interface.

        * **Day 1 & 2 (Week 5)**  
          EDA & Data Cleaning  
          Client behavior analysis - explained below (trying to find relations and come up with hypotheses)

        * **Day 3 (Week 5)**  
          Performance Metrics  
          Success Indicators  
          Redesign Outcome

        * **Day 4 & 5 (Week 5)**  
          Hypothesis Testing  
          Completion Rate  
          Completion Rate with a Cost-Effectiveness Threshold  
          Other Hypothesis Examples  
          Experiment Evaluation  
          Design Effectiveness  
          Duration Assessment  
          Additional Data Needs

        * **Day 1 & 2 (Week 6)**  
          Tableau  
          Tableau Tasks

        * **Day 3 & 4 (Week 6)**
        """
    )
    getting_started_text = (
        """
        ## Metadata
        This comprehensive set of fields will guide your analysis, helping you unravel the intricacies of client behavior and preferences.

        - **client_id**: Every client’s unique ID.
        - **variation**: Indicates if a client was part of the experiment.
        - **visitor_id**: A unique ID for each client-device combination.
        - **visit_id**: A unique ID for each web visit/session.
        - **process_step**: Marks each step in the digital process.
        - **date_time**: Timestamp of each web activity.
        - **clnt_tenure_yr**: Represents how long the client has been with Vanguard, measured in years.
        - **clnt_tenure_mnth**: Further breaks down the client’s tenure with Vanguard in months.
        - **clnt_age**: Indicates the age of the client.
        - **gendr**: Specifies the client’s gender.
        - **num_accts**: Denotes the number of accounts the client holds with Vanguard.
        - **bal**: Gives the total balance spread across all accounts for a particular client.
        - **calls_6_mnth**: Records the number of times the client reached out over a call in the past six months.
        - **logons_6_mnth**: Reflects the frequency with which the client logged onto Vanguard’s platform over the last six months.

        ## Bonus: Additional Tasks (Optional)
        If you complete all of the tasks and have some extra time before the presentation, you can explore the following additional questions and tasks:

        - Client Behavior Analysis
        - Power and Effect Size
        - Streamlit  
          Add Streamlit to your project to achieve Customization and Real-time Analysis
        """
    )

    st.title("About the Project")

    st.header("Project Overview")
    st.write(overview_text)

    st.header("Getting Started")
    st.write(getting_started_text)

In [None]:
# pages/summary.py
import streamlit as st
import pandas as pd

def show_data_summary(df):
    """Show the size of ``df`` (rows and columns) followed by its first rows."""
    st.subheader("CSV Data Overview")

    row_count, column_count = df.shape[0], df.shape[1]
    st.write(f"Number of rows: {row_count}")
    st.write(f"Number of columns: {column_count}")

    st.write("First 5 rows of the dataset:")
    preview = df.head()
    st.dataframe(preview)

# Stand-alone entry point: lets this page be run on its own, outside app.py
if __name__ == "__main__":
    # Change the path below to your actual file location
    data_path = "path_to_your_data.csv"

    try:
        df = pd.read_csv(data_path)  # Load data from CSV
    except FileNotFoundError:
        # Report the problem in the app instead of crashing with a traceback
        st.error(f"File not found: {data_path}. Please check the file path.")
    else:
        # Data is available: render the summary for it
        show_data_summary(df)

In [None]:
# pages/unique_values.py
import streamlit as st

def show_unique_values_in_categorical_columns(df):
    """List the distinct values of every object/category column of ``df``.

    Shows a warning and stops when ``df`` has no such columns.
    """
    st.title("Unique Values in Categorical Columns")

    # Columns whose dtype is object or category
    categorical_frame = df.select_dtypes(include=['object', 'category'])
    column_names = list(categorical_frame.columns)

    if len(column_names) == 0:
        st.warning("No categorical columns found in the file.")
        return

    st.subheader("Unique Values in Categorical Columns:")
    for name in column_names:
        distinct = df[name].unique()
        st.write(f"Column: {name}")
        st.write(f"Unique values: {distinct}")

In [None]:
# pages/stats.py
import streamlit as st

def show_basic_statistics(df):
    """Show ``describe()`` statistics for the numeric columns of ``df``.

    Shows a warning instead when the numeric selection is empty.
    """
    numeric_df = df.select_dtypes(include=['number'])

    if not numeric_df.empty:
        st.subheader("Basic Statistics for Numeric Columns:")
        # One row per column reads better than one column per statistic
        summary_table = numeric_df.describe().transpose()
        st.write(summary_table)
    else:
        st.warning("No numeric columns found in the file.")

In [None]:
# pages/demographics.py
import pandas as pd
import plotly.express as px
import streamlit as st

# Function to perform demographic analysis
def analyze_demographics(df):
    """
    Aggregate account and contact metrics by gender and age group and plot them.

    Clients are bucketed into age groups from 'clnt_age'. The mean of
    'num_accts', 'calls_6_mnth' and 'logons_6_mnth' is then computed per
    ('gender', 'age_group') and shown as a table plus one interactive Plotly
    line chart per metric.

    The input DataFrame is not modified. If a required column is missing, or
    'clnt_age' is not numeric, an error is shown and nothing is plotted.
    """
    metric_columns = ['num_accts', 'calls_6_mnth', 'logons_6_mnth']

    # Ensure 'clnt_age' is present and numeric
    if 'clnt_age' not in df.columns:
        st.error("The DataFrame does not contain the 'clnt_age' column.")
        return

    if not pd.api.types.is_numeric_dtype(df['clnt_age']):
        st.error("The 'clnt_age' column is not numeric.")
        return

    # The aggregation below needs these columns too; report them up front
    # rather than failing with a KeyError halfway through the page.
    missing = [name for name in ['gender'] + metric_columns if name not in df.columns]
    if missing:
        st.error(f"The DataFrame is missing required column(s): {', '.join(missing)}.")
        return

    # Right-closed bins so the buckets match their labels: (0, 18] -> '0-18',
    # (18, 30] -> '19-30', ... The open upper bound keeps every client older
    # than 60 in '60+', and include_lowest keeps age 0 in the first bucket.
    bins = [0, 18, 30, 40, 50, 60, float('inf')]
    labels = ['0-18', '19-30', '31-40', '41-50', '51-60', '60+']
    age_group = pd.cut(
        df['clnt_age'], bins=bins, labels=labels, right=True, include_lowest=True
    )

    # Work on a narrow copy so the caller's DataFrame is left untouched
    data = df[['gender', 'clnt_age'] + metric_columns].assign(age_group=age_group)

    # Debugging: Show first few rows to confirm the 'age_group' assignment
    st.write("First few rows of the dataframe with 'age_group':")
    st.write(data[['clnt_age', 'age_group']].head())

    # Aggregating based on 'gender' and 'age_group'. observed=False keeps
    # every age group in the result even when it has no clients.
    logs_calls_accounts = (
        data.groupby(['gender', 'age_group'], observed=False)[metric_columns]
        .mean()
        .reset_index()
        .round(2)
    )

    # Debugging: Show the aggregated result
    st.write("Aggregated data (grouped by 'gender' and 'age_group'):")
    st.write(logs_calls_accounts)

    # (metric column, axis label, chart title) for each line chart
    charts = [
        ('num_accts', 'Average Number of Accounts',
         "Average Number of Accounts by Age Group and Gender"),
        ('calls_6_mnth', 'Average Calls in Last 6 Months',
         "Average Calls in Last 6 Months by Age Group and Gender"),
        ('logons_6_mnth', 'Average Logons in Last 6 Months',
         "Average Logons in Last 6 Months by Age Group and Gender"),
    ]
    for column, axis_label, title in charts:
        fig = px.line(
            logs_calls_accounts,
            x='age_group',
            y=column,
            color='gender',
            title=title,
            labels={column: axis_label},
            line_shape='linear',
            markers=True,
        )
        st.plotly_chart(fig)

# Function to display the demographics analysis in Streamlit
def show_demographics(df):
    """
    Render the "Demographics Analysis" page for ``df``.

    Writes the page title, delegates the aggregation and plotting to
    ``analyze_demographics``, then writes a closing line of text.
    """
    st.title("Demographics Analysis")

    # Aggregation tables and charts are produced by analyze_demographics
    analyze_demographics(df)

    # Static text shown below the analysis output
    st.write("Demographics analysis will be displayed here, including charts and tables.")

In [None]:
#pages/duration.py
import pandas as pd
import streamlit as st

# Define the custom sorting order for the process steps
process_step_order = ['start', 'step_1', 'step_2', 'step_3', 'confirm']


# Function to get the latest starts
def filter_latest_starts(group_df):
    """Return the rows of ``group_df`` logged at each visit's latest 'start'.

    For every 'visit_id' the 'start' row with the greatest 'date_time' is
    located. The result holds all rows of ``group_df`` that share that
    visit_id and that exact timestamp, in the original column layout.

    Args:
        group_df: Events with at least 'visit_id', 'process_step' and
            'date_time' columns. 'date_time' must be orderable.

    Returns:
        A new DataFrame; empty when ``group_df`` has no 'start' rows.
    """
    # Filter the 'start' process step
    starts_only = group_df[group_df['process_step'] == 'start']
    if starts_only.empty:
        return group_df.iloc[0:0].copy()

    # Get the latest 'start' for each 'visit_id' (row with the max date_time)
    latest_starts = starts_only.loc[starts_only.groupby('visit_id')['date_time'].idxmax()]

    # Keep only the rows matching a visit's latest start timestamp
    return group_df.merge(
        latest_starts[['visit_id', 'date_time']],
        on=['visit_id', 'date_time'],
        how='inner',
    )


def show_process_duration(df, client_id=2304905):
    """Render the "Process Duration Analysis" page.

    Splits the events into Control and Test groups, sorts them, and shows the
    latest 'start' of every visit for each group. The same tables are then
    shown filtered to a single client as a spot check.

    Args:
        df: Event-level data with 'process_step', 'variation', 'client_id',
            'visit_id' and 'date_time' columns. It is not modified.
        client_id: Client used for the spot-check tables.
    """
    required = ['process_step', 'variation', 'client_id', 'visit_id', 'date_time']
    missing = [name for name in required if name not in df.columns]
    if missing:
        st.error(f"Missing required column(s) for process duration analysis: {', '.join(missing)}.")
        return

    # Order the steps by their place in the process rather than alphabetically,
    # and parse timestamps so "latest" is a chronological comparison.
    events = df.assign(
        process_step=pd.Categorical(df['process_step'], categories=process_step_order, ordered=True),
        date_time=pd.to_datetime(df['date_time'], errors='coerce'),
    ).dropna(subset=['date_time'])

    sort_keys = ['client_id', 'visit_id', 'process_step', 'date_time']
    filtered = {}
    for label in ('Control', 'Test'):
        group_sorted = events[events['variation'] == label].sort_values(by=sort_keys)
        filtered[label] = filter_latest_starts(group_sorted)

    # Display the complete tables for the filtered groups
    for label in ('Control', 'Test'):
        st.title(f"{label} Group Sorted and Filtered")
        st.dataframe(filtered[label])

    # Spot check: the same tables restricted to one client
    st.title(f"Total Entries for Client {client_id}")
    st.dataframe(events[events['client_id'] == client_id])

    for label in ('Control', 'Test'):
        group_table = filtered[label]
        st.title(f"Last Start for Client {client_id} in {label} Group")
        st.dataframe(group_table[group_table['client_id'] == client_id])

In [21]:
#pages/hypothesis.py
import streamlit as st
import scipy.stats as stats

# Function for two-proportion z-test
def two_proportion_z_test(p1, p2, n1, n2):
    """Run a two-tailed, pooled two-proportion z-test.

    Args:
        p1: Observed proportion in the first sample (between 0 and 1).
        p2: Observed proportion in the second sample (between 0 and 1).
        n1: Size of the first sample.
        n2: Size of the second sample.

    Returns:
        A ``(z, p_value)`` tuple. When the pooled proportion is exactly 0 or
        1 the standard error is zero and the samples cannot differ, so
        ``(0.0, 1.0)`` is returned.
    """
    P = (p1 * n1 + p2 * n2) / (n1 + n2)  # pooled proportion
    SE = (P * (1 - P) * (1 / n1 + 1 / n2)) ** 0.5
    if SE == 0:
        # Degenerate case: every observation in both samples is identical
        return 0.0, 1.0
    z = (p1 - p2) / SE
    # The survival function keeps precision in the far tail, where
    # 1 - cdf(|z|) would round to exactly 0.
    p_value = 2 * stats.norm.sf(abs(z))  # Two-tailed test
    return z, p_value

# Helpers that turn one group's rows into z-test inputs for a single step
def _aggregated_step_inputs(group, step):
    """Return ``(proportion, n)`` for ``step`` from pre-aggregated columns.

    The proportion is the mean 'completion_rate' (a percentage) of the
    step's rows divided by 100; ``n`` is the sum of their 'started_visits'.
    """
    rows = group[group['process_step'] == step]
    return rows['completion_rate'].mean() / 100, rows['started_visits'].sum()


def _event_step_inputs(group, step):
    """Return ``(proportion, n)`` for ``step`` from event-level rows.

    ``n`` is the number of distinct visits with a 'start' event. The
    proportion is the share of those visits that also have a ``step`` event.
    Visits without a recorded start are ignored, so the proportion never
    exceeds 1.
    """
    is_start = group['process_step'] == 'start'
    started_ids = group.loc[is_start, 'visit_id'].dropna().unique()
    n_started = len(started_ids)
    if n_started == 0:
        return float('nan'), 0

    reached_step = (group['process_step'] == step) & group['visit_id'].isin(started_ids)
    n_reached = group.loc[reached_step, 'visit_id'].nunique()
    return n_reached / n_started, n_started


# Function to show hypothesis testing page
def show_hypothesis_testing_page(df):
    """Render the hypothesis-testing page: completion rates and client tenure.

    For each process step a two-proportion z-test compares the Control and
    Test completion rates. The rates come from the 'completion_rate' /
    'started_visits' columns when present. If they are absent, they are
    derived from 'completed_visits' / 'started_visits', or failing that
    from event-level rows ('visit_id' plus 'process_step'). A two-sample
    t-test then compares 'clnt_tenure_yr' between the unique clients of
    both groups.

    Args:
        df: Data with a 'variation' column holding 'Control' / 'Test' and a
            'process_step' column. It is not modified.
    """
    st.title("Hypothesis Testing for Completion Rates")

    if 'variation' not in df.columns:
        st.error("Missing 'variation' column to distinguish between control and test groups.")
        return
    if 'process_step' not in df.columns:
        st.error("Missing 'process_step' column to identify the steps of the process.")
        return

    # Pick how per-step proportions are obtained from the columns available
    columns = set(df.columns)
    if {'completion_rate', 'started_visits'} <= columns:
        step_inputs = _aggregated_step_inputs
    elif {'completed_visits', 'started_visits'} <= columns:
        # assign() returns a new frame, leaving the caller's data untouched
        df = df.assign(completion_rate=df['completed_visits'] / df['started_visits'] * 100)
        step_inputs = _aggregated_step_inputs
    elif 'visit_id' in columns:
        step_inputs = _event_step_inputs
    else:
        st.error("Missing required columns ('completed_visits' or 'started_visits') to calculate 'completion_rate'.")
        return

    # Separate control and test groups based on 'variation' column
    control_group = df[df['variation'] == 'Control']
    test_group = df[df['variation'] == 'Test']

    if control_group.empty or test_group.empty:
        st.error("Missing data for control or test group.")
        return

    steps = ['confirm', 'step_1', 'step_2', 'step_3']

    # Iterate through the steps to perform hypothesis testing for completion rates
    for step in steps:
        st.subheader(f"Step: {step}")

        p_control, control_total = step_inputs(control_group, step)
        p_test, test_total = step_inputs(test_group, step)

        if control_total == 0 or test_total == 0:
            st.warning(f"No visits started for control or test group at step {step}. Skipping hypothesis test for this step.")
            continue

        # NaN fails both comparisons, so missing rates are caught here too
        if not (0 <= p_control <= 1 and 0 <= p_test <= 1):
            st.warning(f"Completion rates for step {step} are missing or outside 0-100%. Skipping hypothesis test for this step.")
            continue

        try:
            z_stat, p_value = two_proportion_z_test(p_control, p_test, control_total, test_total)
        except ZeroDivisionError:
            z_stat = p_value = float('nan')

        # x != x is true only for NaN: the statistic is undefined when both
        # groups sit at the same 0% or 100% completion rate.
        if z_stat != z_stat or p_value != p_value:
            st.warning(f"The test statistic is undefined for step {step}. Skipping hypothesis test for this step.")
            continue

        st.write(f"Z-statistic: {z_stat:.4f}")
        st.write(f"P-value: {p_value:.4f}")

        if p_value < 0.05:
            st.write(f"**Reject the null hypothesis**: There is a significant difference in completion rates between control and test group for step: {step}.")
        else:
            st.write(f"**Fail to reject the null hypothesis**: There is no significant difference in completion rates between control and test group for step: {step}.")
        st.write("\n")

    st.subheader("Hypothesis Test: Tenure")

    # Ensure the columns needed for the tenure comparison exist
    if 'clnt_tenure_yr' not in df.columns:
        st.error("Missing 'clnt_tenure_yr' column for tenure analysis.")
        return
    if 'client_id' not in df.columns:
        st.error("Missing 'client_id' column for tenure analysis.")
        return

    # One row per client, so clients with many events are not over-weighted.
    # Missing tenures are dropped because they would turn the t-test into NaN.
    control_tenure = control_group.drop_duplicates(subset='client_id')['clnt_tenure_yr'].dropna()
    test_tenure = test_group.drop_duplicates(subset='client_id')['clnt_tenure_yr'].dropna()

    if len(control_tenure) < 2 or len(test_tenure) < 2:
        st.warning("Not enough tenure data in the control or test group to run the t-test.")
        return

    t_stat, p_value_tenure = stats.ttest_ind(control_tenure, test_tenure, equal_var=True)

    st.write(f"Average Tenure in Control group: {control_tenure.mean():.2f} years")
    st.write(f"Average Tenure in Test group: {test_tenure.mean():.2f} years")
    st.write(f"T-statistic: {t_stat:.4f}")
    st.write(f"P-value: {p_value_tenure:.4f}")

    if p_value_tenure < 0.05:
        st.write("**Reject the null hypothesis**: There is a significant difference in tenure between control and test groups.")
    else:
        st.write("**Fail to reject the null hypothesis**: There is no significant difference in tenure between control and test groups.")

In [None]:
#pages/completion.py
import streamlit as st
import pandas as pd  # Ensure pandas is imported

def _completion_rates(group, id_column, by=None):
    """Return the per-step completion rate of ``group``.

    Args:
        group: Event rows of one variation.
        id_column: Column whose distinct values are counted ('visit_id' or
            'client_id').
        by: Optional extra columns to break the rate down by. The
            denominator is then computed per combination of these columns.

    Returns:
        One row per process step other than 'start' (and per ``by`` value)
        with 'completed_visits', 'started_visits' and 'completion_rate'
        (a percentage).
    """
    by = list(by or [])
    started_rows = group[group['process_step'] == 'start']
    later_rows = group[group['process_step'] != 'start']

    # Distinct ids seen at each step after 'start'
    rates = (
        later_rows.groupby(['process_step'] + by, observed=True)[id_column]
        .nunique()
        .reset_index(name='completed_visits')
    )

    if by:
        # Each breakdown value gets its own denominator; otherwise the result
        # would be a share of all starters rather than a rate.
        started = (
            started_rows.groupby(by, observed=True)[id_column]
            .nunique()
            .reset_index(name='started_visits')
        )
        rates = rates.merge(started, on=by, how='left')
    else:
        rates['started_visits'] = started_rows[id_column].nunique()

    rates['completion_rate'] = rates['completed_visits'] / rates['started_visits'] * 100
    return rates


def show_completion_time(df):
    """Render the "Completion Time Analysis" page.

    Shows the average time spent on each process step (time until the next
    event of the same visit), then the completion rate of every step for the
    Control and Test groups by visit, by client, and by client age group.

    Args:
        df: Event-level data with 'date_time', 'client_id', 'visit_id',
            'process_step' and 'variation' columns. It is not modified.
            'age_group' is used when present and otherwise derived from
            'clnt_age'.
    """
    st.title("Completion Time Analysis")

    if 'date_time' not in df.columns:
        st.error("Missing 'date_time' column in the dataset.")
        return

    required = ['client_id', 'visit_id', 'process_step', 'variation']
    missing = [name for name in required if name not in df.columns]
    if missing:
        st.error(f"Missing required column(s) in the dataset: {', '.join(missing)}.")
        return

    # Parse timestamps on a copy; unparseable values become NaT and are dropped
    events = df.assign(date_time=pd.to_datetime(df['date_time'], errors='coerce'))
    events = events.dropna(subset=['date_time'])

    # Time on a step = gap to the next event of the same visit. Events are
    # sorted first so "next" means chronologically next, not next in the file.
    ordered = events.sort_values(['client_id', 'visit_id', 'date_time'])
    next_step_time = ordered.groupby(['client_id', 'visit_id'])['date_time'].shift(-1)
    durations = ordered.assign(
        next_step_time=next_step_time,
        completion_time=next_step_time - ordered['date_time'],
    ).dropna(subset=['next_step_time'])

    st.subheader("Average Completion Time Per Process Step")
    st.write(durations.groupby('process_step', observed=True)['completion_time'].mean())

    # Rates use every event. The last event of a visit has no "next step" but
    # still counts as having reached its step.
    has_age_group = 'age_group' in events.columns
    if not has_age_group and 'clnt_age' in events.columns:
        # Right-closed bins so each bucket matches its label
        age_bins = [0, 18, 30, 40, 50, 60, float('inf')]
        age_labels = ['0-18', '19-30', '31-40', '41-50', '51-60', '60+']
        events = events.assign(
            age_group=pd.cut(
                pd.to_numeric(events['clnt_age'], errors='coerce'),
                bins=age_bins,
                labels=age_labels,
                right=True,
                include_lowest=True,
            )
        )
        has_age_group = True

    groups = [
        ("Control", events[events['variation'] == 'Control']),
        ("Test", events[events['variation'] == 'Test']),
    ]

    st.subheader("Completion Rate by Visit")
    for label, group in groups:
        st.write(f"{label} Group Completion Rate by Visit:")
        st.dataframe(_completion_rates(group, 'visit_id'))

    for label, group in groups:
        st.write(f"{label} Group Completion Rate by Client:")
        st.dataframe(_completion_rates(group, 'client_id'))

    if not has_age_group:
        st.warning("No 'age_group' or 'clnt_age' column available. Skipping completion rate by age.")
        return

    for label, group in groups:
        st.write(f"{label} Group Completion Rate by Age:")
        st.dataframe(_completion_rates(group, 'client_id', by=['age_group']))

In [None]:
# utils/display.py
import streamlit as st

# Function to show an error message
def show_error(message: str) -> None:
    """Show ``message`` as an error in the Streamlit app."""
    st.error(message)

# Function to display a dataframe
def display_dataframe(df, rows=5):
    """Show the first ``rows`` rows of ``df``, or an error when ``df`` is None."""
    if df is None:
        show_error("Data is not available.")
        return

    preview = df.head(rows)
    st.dataframe(preview)

# Function to display basic statistics of the dataframe
def show_basic_statistics(df):
    """Show ``describe()`` statistics for the numeric columns of ``df``.

    Shows an error instead when ``df`` is None or the numeric selection is
    empty.
    """
    if df is None:
        show_error("Data is not available for statistics.")
        return

    numeric_df = df.select_dtypes(include=['number'])
    if numeric_df.empty:
        show_error("No numeric columns found for statistics.")
        return

    st.subheader("Basic Statistics")
    summary_table = numeric_df.describe().T
    st.write(summary_table)

# Function to show unique values for categorical columns
def show_unique_values(df):
    """List the distinct values of every object/category column of ``df``.

    Shows an error instead when ``df`` is None or has no such columns.
    """
    if df is None:
        show_error("Data is not available for unique value display.")
        return

    column_names = df.select_dtypes(include=['object', 'category']).columns.tolist()
    if not column_names:
        show_error("No categorical columns found.")
        return

    for name in column_names:
        st.write(f"Column: {name}")
        st.write(f"Unique values: {df[name].unique()}")