In [98]:
#structure
your_project_directory/
│
├── aggregateapp.py              # Main entry point of the application
├── file_info.py                 # Contains the code for file information
├── unique_values.py             # Contains the code for showing unique values
├── basic_statistics.py          # Contains the code for basic statistics
├── demographics.py              # Contains the code for demographics analysis
├── hypothesis_testing.py        # Contains the code for hypothesis testing
└── utils.py                     # Utility functions (e.g., for Z-test)

SyntaxError: invalid character '│' (U+2502) (4237427503.py, line 3)

In [100]:
#file_info.py

import streamlit as st

def show_file_info(df):
    st.subheader("File Information:")
    st.write(f"Number of Columns: {df.shape[1]}")
    st.write(f"Number of Rows: {df.shape[0]}")
    st.write(f"Column Names: {df.columns.tolist()}")
    st.subheader("Preview of the File:")
    st.write(df.head())

In [None]:
#unique_values.py

import streamlit as st

def show_unique_values_in_categorical_columns(df):
    st.title("Unique Values in Categorical Columns")
    categorical_columns = df.select_dtypes(include=['object', 'category']).columns.tolist()
    
    if not categorical_columns:
        st.warning("No categorical columns found in the file.")
        return

    st.subheader("Unique Values in Categorical Columns:")
    for column in categorical_columns:
        unique_values = df[column].unique()
        st.write(f"Column: {column}")
        st.write(f"Unique values: {unique_values}")

In [102]:
#basic_statistics.py

import streamlit as st

def show_basic_statistics(df):
    numeric_df = df.select_dtypes(include=['number'])

    if numeric_df.empty:
        st.warning("No numeric columns found in the file.")
        return
    
    st.subheader("Basic Statistics for Numeric Columns:")
    statistics = numeric_df.describe().T  # Transpose for better readability
    st.write(statistics)

In [None]:
#demographics.py

import pandas as pd
import streamlit as st

def show_demographics_analysis(df):
    # Ensure 'age_column' exists in your DataFrame, for example 'clnt_age'
    age_column = 'clnt_age'  # Adjust to your dataset column name

    # Create age groups for demographics analysis
    bins = [18, 30, 40, 50, 60, 100]  # Adjust the age ranges as needed
    labels = ['18-29', '30-39', '40-49', '50-59', '60+']
    
    # Create a new column 'age_group' in the DataFrame
    df['age_group'] = pd.cut(df[age_column], bins=bins, labels=labels, right=False)

    # Display age group counts
    st.subheader("Demographics: Age Groups")
    st.write(df['age_group'].value_counts())
    
    # Check if 'clnt_gender' and 'clnt_region' columns exist in the DataFrame
    available_columns = df.columns
    st.write(f"Available columns in the dataset: {available_columns}")

    # Display 'clnt_gender' and 'clnt_region' if they exist
    if 'clnt_gender' in available_columns and 'clnt_region' in available_columns:
        st.subheader("Additional Demographics Info")
        st.write(df[['clnt_age', 'clnt_gender', 'clnt_region']].head())  # Modify based on your dataset
    else:
        st.warning("Some demographic columns ('clnt_gender', 'clnt_region') are missing in the dataset.")
        st.write("Displaying available demographic columns:")
        # Modify this part to display any other columns you want to analyze
        st.write(df[['clnt_age']].head())  # Adjust as needed

In [None]:
#hypothesis_testing.py

import pandas as pd
import streamlit as st
from scipy import stats

def two_proportion_z_test(p1, p2, n1, n2):
    # Calculate the pooled proportion
    P = (p1 * n1 + p2 * n2) / (n1 + n2)
    
    # Calculate the standard error
    SE = (P * (1 - P) * (1 / n1 + 1 / n2)) ** 0.5
    
    # Calculate the z-statistic
    z = (p1 - p2) / SE
    
    # Calculate the p-value
    p_value = 2 * (1 - stats.norm.cdf(abs(z)))  # Two-tailed test
    
    return z, p_value

def show_hypothesis_testing_page(df):
    st.title("Hypothesis Testing for Completion Rates")
    
    # Ensure the relevant columns are available in the DataFrame
    available_columns = df.columns
    st.write(f"Available columns in the dataset: {available_columns}")

    # Check if 'completion_rate' is present, and if not, calculate it
    if 'completion_rate' not in available_columns:
        # Assuming 'completed_visits' and 'started_visits' are available for calculation
        if 'completed_visits' in available_columns and 'started_visits' in available_columns:
            df['completion_rate'] = df['completed_visits'] / df['started_visits'] * 100
        else:
            st.error("Missing required columns ('completed_visits' or 'started_visits') to calculate 'completion_rate'.")
            return

    # Separate the control and test groups
    control_group = df[df['variation'] == 'Control']
    test_group = df[df['variation'] == 'Test']

    # Assuming 'process_step' is a column to group by
    steps = ['confirm', 'step_1', 'step_2', 'step_3']

    # Loop through each step and perform hypothesis testing
    for step in steps:
        st.subheader(f"Step: {step}")
        
        # Get completion rates for both control and test groups for the current step
        control_completions = control_group[control_group['process_step'] == step]['completion_rate'].values[0]
        test_completions = test_group[test_group['process_step'] == step]['completion_rate'].values[0]
        
        control_total = control_group[control_group['process_step'] == step]['started_visits'].values[0]
        test_total = test_group[test_group['process_step'] == step]['started_visits'].values[0]

        # Calculate proportions (completion rate)
        p_control = control_completions / 100  # Convert completion rate to a proportion
        p_test = test_completions / 100  # Convert completion rate to a proportion
        
        # Perform the two-proportion z-test
        z_stat, p_value = two_proportion_z_test(p_control, p_test, control_total, test_total)
        
        # Displaying the results
        st.write(f"Z-statistic: {z_stat:.4f}")
        st.write(f"P-value: {p_value:.4f}")
        
        # Hypothesis testing interpretation
        if p_value < 0.05:
            st.write(f"**Reject the null hypothesis**: There is a significant difference in completion rates between control and test group for step: {step}.")
        else:
            st.write(f"**Fail to reject the null hypothesis**: There is no significant difference in completion rates between control and test group for step: {step}.")
        st.write("\n")

    # ----------------------------------------
    # Additional Hypothesis Testing: Tenure
    st.subheader("Hypothesis Test: Tenure")
    
    # Remove duplicates based on 'client_id' to get unique clients
    control_uniqe = control_group.drop_duplicates(subset='client_id')
    test_unique = test_group.drop_duplicates(subset='client_id')

    # Extract the tenure data for both groups
    control_tenure = control_uniqe['clnt_tenure_yr']
    test_tenure = test_unique['clnt_tenure_yr']
        
    # Perform two-sample t-test for tenure
    _, p_value_tenure = st.ttest_ind(control_tenure, test_tenure, equal_var=True)  # assuming equal variance

    # Display the results
    st.write(f"Average Tenure in Control group: {control_tenure.mean():.2f} years")
    st.write(f"Average Tenure in Test group: {test_tenure.mean():.2f} years")
    st.write(f"T-statistic: {_:.4f}")
    st.write(f"P-value: {p_value_tenure:.4f}")

    # Hypothesis test: Is there a significant difference in tenure between the two groups?
    if p_value_tenure < 0.05:
        st.write("**Reject the null hypothesis**: The average tenure is significantly different between the Test and Control groups.")
    else:
        st.write("**Fail to reject the null hypothesis**: The average tenure is not significantly different between the two groups.")
    
    st.write("\n")

    # ----------------------------------------
    # Additional Hypothesis Testing: Age
    st.subheader("Hypothesis Test: Age")
    
    # Extract the age data for both groups
    control_age = control_uniqe['clnt_age']
    test_age = test_unique['clnt_age']

    # Perform two-sample t-test for age
    _, p_value_age = st.ttest_ind(control_age, test_age, equal_var=True)  # assuming equal variance

    # Display the results
    st.write(f"Control Group Mean Age: {control_age.mean():.2f} years")
    st.write(f"Test Group Mean Age: {test_age.mean():.2f} years")
    st.write(f"T-statistic: {_:.4f}")
    st.write(f"P-value: {p_value_age:.4f}")

    # Hypothesis test: Is there a significant difference in age between the two groups?
    if p_value_age < 0.05:
        st.write("**Reject the null hypothesis**: The average age is different between the Test and Control groups.")
    else:
        st.write("**Fail to reject the null hypothesis**: The average age is not significantly different between the Test and Control groups.")
    
    st.write("\n")

In [None]:
#utils.py

import pandas as pd
from scipy.stats import norm

def two_proportion_z_test(p1, p2, n1, n2):
    P = (p1 * n1 + p2 * n2) / (n1 + n2)
    SE = (P * (1 - P) * (1 / n1 + 1 / n2)) ** 0.5
    z = (p1 - p2) / SE
    p_value = 2 * (1 - norm.cdf(abs(z)))  # Two-tailed test
    return z, p_value

In [None]:
#aggregateapp.py

import streamlit as st
import pandas as pd
from file_info import show_file_info
from unique_values import show_unique_values_in_categorical_columns
from basic_statistics import show_basic_statistics
from demographics import show_demographics_analysis
from hypothesis_testing import show_hypothesis_testing_page

# Set up Streamlit app layout
st.set_page_config(page_title="A/B Test Demo for Group 7")

# Initialize session state to manage pages
if 'page' not in st.session_state:
    st.session_state.page = 'home'
if 'df' not in st.session_state:
    st.session_state.df = None

# Navigation buttons function
def navigation_buttons():
    col1, col2, col3, col4, col5, col6 = st.columns(6)
    
    with col1:
        if st.button("Home"):
            st.session_state.page = 'home'
    
    with col2:
        if st.button("File Info"):
            st.session_state.page = 'file_info'
    
    with col3:
        if st.button("Unique Values"):
            st.session_state.page = 'unique_values'
    
    with col4:
        if st.button("Basic Statistics"):
            st.session_state.page = 'basic_statistics'
    
    with col5:
        if st.button("Demographics"):
            st.session_state.page = 'demographics'
    
    with col6:
        if st.button("Hypothesis Testing"):
            st.session_state.page = 'hypothesis_testing'

# Home page: File Upload and Project Presentation
def show_home_page():
    st.title("Welcome to A/B Test Demo for Group 7")
    
    # Add project presentation text
    st.subheader("The Digital Challenge")
    st.write("""
        The digital world is evolving, and so are Vanguard’s clients. Vanguard believed that a more intuitive and modern User Interface (UI), 
        coupled with timely in-context prompts (cues, messages, hints, or instructions provided to users directly within the context of their 
        current task or action), could make the online process smoother for clients. The critical question was: 
        Would these changes encourage more clients to complete the process?
    """)
    
    # File upload section
    st.write("Please upload your file to start")
    uploaded_file = st.file_uploader("Upload your file (CSV or TXT)", type=['csv', 'txt'])

    if uploaded_file is not None:
        df = pd.read_csv(uploaded_file)
        st.session_state.df = df  # Store the dataframe in session state
        navigation_buttons()  # Display navigation buttons
        show_file_info(df)

# File Info Page
def show_file_info_page(df):
    navigation_buttons()  # Display navigation buttons
    show_file_info(df)

# Unique Values Page
def show_unique_values_page(df):
    navigation_buttons()  # Display navigation buttons
    show_unique_values_in_categorical_columns(df)

# Basic Statistics Page
def show_basic_statistics_page(df):
    navigation_buttons()  # Display navigation buttons
    show_basic_statistics(df)

# Demographics Analysis Page
def show_demographics_page(df):
    navigation_buttons()  # Display navigation buttons
    show_demographics_analysis(df)

# Hypothesis Testing Page
def show_hypothesis_testing_page_function(df):
    navigation_buttons()  # Display navigation buttons
    show_hypothesis_testing_page(df)

# Main app logic to render the selected page
if st.session_state.page == 'home':
    show_home_page()

elif st.session_state.page == 'file_info':
    if st.session_state.df is not None:
        show_file_info_page(st.session_state.df)
    else:
        st.warning("Please upload a file to proceed.")

elif st.session_state.page == 'unique_values':
    if st.session_state.df is not None:
        show_unique_values_page(st.session_state.df)
    else:
        st.warning("Please upload a file to proceed.")

elif st.session_state.page == 'basic_statistics':
    if st.session_state.df is not None:
        show_basic_statistics_page(st.session_state.df)
    else:
        st.warning("Please upload a file to proceed.")

elif st.session_state.page == 'demographics':
    if st.session_state.df is not None:
        show_demographics_page(st.session_state.df)
    else:
        st.warning("Please upload a file to proceed.")

elif st.session_state.page == 'hypothesis_testing':
    if st.session_state.df is not None:
        show_hypothesis_testing_page_function(st.session_state.df)
    else:
        st.warning("Please upload a file to proceed.")