In [None]:
# Import things
import pandas as pd

# Project Template

This document serves as a barebones template that you can start from when creating a new project. Please adjust it as you see fit to suit your project's needs.

## Problem Statement and Your Outline

### Load, Explore, and Clean Data

Load your dataset, explore it, perform data cleaning, and then examine the ready-to-use data you've prepared.

In [None]:
# Import Libraries

import os

import numpy as np
import pandas as pd
from scipy import stats

In [6]:
# Load Data
# The dataset location is kept in one named constant instead of being buried
# in the read call. It can be overridden through the BANK_DATA_PATH
# environment variable so the notebook can run on a machine where the file
# lives somewhere else; when the variable is not set, the original local path
# is used unchanged.
DATA_PATH = os.environ.get(
    "BANK_DATA_PATH",
    "C:\\Users\\User\\Downloads\\BSIP_Project_Students\\bank-full.csv",
)

# The file is parsed with ';' as the field separator rather than the default ','.
df = pd.read_csv(DATA_PATH, sep=";")

# Record the starting row count so later cleaning steps can be compared against it.
print("--- Original Data Information ---")
print(f"Original Total Records: {len(df)}")


--- Original Data Information ---
Original Total Records: 45211


### Prepare Data for Modeling

If there is a need to do further processing, such as feature engineering, you can do it here.

In [8]:
# Outlier analysis and removal: 1.5 * IQR rule applied to the 'age' column

ages = df['age']

# First and third quartiles of age, and the spread between them
Q1 = ages.quantile(0.25)
Q3 = ages.quantile(0.75)
IQR = Q3 - Q1

# Fences sit 1.5 interquartile ranges outside the quartiles
fence_width = 1.5 * IQR
lower_bound = Q1 - fence_width
upper_bound = Q3 + fence_width

# Keep only the rows whose age lies within the fences (both ends inclusive)
df_cleaned = df[ages.between(lower_bound, upper_bound)]

# Summarise how many rows the filter dropped
num_removed = df.shape[0] - df_cleaned.shape[0]
print(f"IQR Lower Bound: {lower_bound:.2f} years")
print(f"IQR Upper Bound: {upper_bound:.2f} years")
print(f"Outliers Removed: {num_removed}")
print(f"Cleaned Total Records: {len(df_cleaned)}")


IQR Lower Bound: 10.50 years
IQR Upper Bound: 70.50 years
Outliers Removed: 487
Cleaned Total Records: 44724


In [11]:
# Build the two age samples from the cleaned data, split on the 'y' column
# Group 1: rows where y == 'yes' (subscribed)
age_yes_cleaned = df_cleaned.loc[df_cleaned['y'] == 'yes', 'age']

# Group 2: rows where y == 'no' (did not subscribe)
age_no_cleaned = df_cleaned.loc[df_cleaned['y'] == 'no', 'age']

# One summary row per group: sample size, mean age and standard deviation of age
age_groups = (age_yes_cleaned, age_no_cleaned)
stats_summary_cleaned = pd.DataFrame({
    'Group': ['Subscribed (y=yes)', 'Not Subscribed (y=no)'],
    'N (Cleaned)': [len(grp) for grp in age_groups],
    'Mean Age (Cleaned)': [grp.mean() for grp in age_groups],
    'Std Dev (Cleaned)': [grp.std() for grp in age_groups],
})

print("\n--- Descriptive Statistics (Cleaned Data) ---")
# index=False drops the row labels so the output is a plain text table
summary_table = stats_summary_cleaned.to_string(index=False)
print(summary_table)


--- Descriptive Statistics (Cleaned Data) ---
                Group  N (Cleaned)  Mean Age (Cleaned)  Std Dev (Cleaned)
   Subscribed (y=yes)         5071           40.159535          11.557334
Not Subscribed (y=no)        39653           40.594886           9.756943


In [12]:
# Two-sample t-test comparing age between the subscribed and not-subscribed
# groups. equal_var=False selects Welch's test, which does not assume the two
# groups share the same variance.
welch_result = stats.ttest_ind(age_yes_cleaned, age_no_cleaned, equal_var=False)

# The result unpacks into the test statistic followed by the p-value
t_stat_cleaned, p_value_cleaned = welch_result

### Create Model(s)

In [None]:
# Modelling

### Evaluate Results

In [14]:
# Report the t-test outcome and the decision at the chosen significance level

print("\nTwo-Sample T-Test Results")
print(f"T-Statistic: {t_stat_cleaned:.4f}")
print(f"P-value: {p_value_cleaned:.5f}")

# Significance level for the decision; the null hypothesis is rejected only
# when the p-value is strictly below it
alpha = 0.05
reject_null = p_value_cleaned < alpha

decision = (
    f"\nDecision (at alpha={alpha}): Reject the Null Hypothesis ($H_0$)"
    if reject_null
    else f"\nDecision (at alpha={alpha}): Fail to Reject the Null Hypothesis ($H_0$)"
)
conclusion = (
    "Conclusion: There is sufficient evidence to conclude that the average age is significantly different."
    if reject_null
    else "Conclusion: There is NOT sufficient evidence to conclude that the average age is significantly different."
)

print(decision)
print(conclusion)


Two-Sample T-Test Results
T-Statistic: -2.5680
P-value: 0.01025

Decision (at alpha=0.05): Reject the Null Hypothesis ($H_0$)
Conclusion: There is sufficient evidence to conclude that the average age is significantly different.


## Summary and Conclusion

In [None]:
# Finish it

## Completion Check

Before you submit your project, check the following to make sure it is complete and easy to understand:
<ul>
<li> Are the sections labeled and organized in a logical way? </li>
<li> Are there descriptions that give an outline of what is being done in each section in plain language? </li>
<li> Do you explain your thought process and logic behind any decisions you make? </li>
<li> What do your findings mean, and what evidence from your project supports that? </li>
<li> Do things run, end-to-end, without errors? </li>
<li> Is the code commented, easy to read, and easy to follow? </li>
</ul>

In short, I should be able to read through this, end-to-end, and understand what you did, why you did it, and what you found.
