
 Financial Wellness Scoring System

 TYPE: Behavioral Scoring Prototype (NOT predictive)
 
 PURPOSE: Compute an interpretable financial wellness
          score (0â€“100) using behavioral ratios and
          exploratory clustering.

 IMPORTANT:
 - Scores for new users are COMPUTED, not predicted
 - No outcome labels exist (defaults, stress events)
 - This is a decision-support framework

In [151]:
!pip install joblib

'pip' is not recognized as an internal or external command,
operable program or batch file.


In [152]:
# Imports
import joblib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from math import pi

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans


In [153]:
# 2. Data Loading & Cleaning

df = pd.read_csv("financial_data_google_form.csv")

df.columns = df.columns.str.strip().str.lower()

for col in df.columns:
    df[col] = (
        df[col]
        .astype(str)
        .str.replace(",", "", regex=False)
        .astype(float)
    )

print("Data Loaded:", df.shape)



Data Loaded: (200, 8)


Data Cleaning
 
 Remove whitespace from column names
 
 Convert numeric strings (including comma-formatted values) to floats

In [154]:
# Clean column names
df.columns = df.columns.str.strip().str.lower()

# Convert all columns to numeric safely
for col in df.columns:
    df[col] = (
        df[col]
        .astype(str)
        .str.replace(",", "", regex=False)
        .astype(float)
    )

df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 8 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   income                   200 non-null    float64
 1   monthly_spend            200 non-null    float64
 2   savings_percent          200 non-null    float64
 3   emergency_fund           200 non-null    float64
 4   debt                     200 non-null    float64
 5   savings_amount           200 non-null    float64
 6   expense_to_income_ratio  200 non-null    float64
 7   debt_to_income_ratio     200 non-null    float64
dtypes: float64(8)
memory usage: 12.6 KB


In [155]:
print("\n--- Info ---")
print(df.info())


--- Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 8 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   income                   200 non-null    float64
 1   monthly_spend            200 non-null    float64
 2   savings_percent          200 non-null    float64
 3   emergency_fund           200 non-null    float64
 4   debt                     200 non-null    float64
 5   savings_amount           200 non-null    float64
 6   expense_to_income_ratio  200 non-null    float64
 7   debt_to_income_ratio     200 non-null    float64
dtypes: float64(8)
memory usage: 12.6 KB
None


In [156]:
print("\n--- Describe ---")
print(df.describe(include='all'))


--- Describe ---
              income  monthly_spend  savings_percent  emergency_fund  \
count     200.000000     200.000000       200.000000    2.000000e+02   
mean    82563.055000   60803.990000        14.580000    2.487952e+05   
std     34816.402535   27963.112392        10.241977    2.842280e+05   
min     21695.000000   15252.000000         0.000000    2.293000e+03   
25%     52266.500000   36828.000000         7.000000    7.508750e+04   
50%     82071.500000   56967.000000        14.000000    1.649475e+05   
75%    113759.000000   83640.000000        19.000000    3.045422e+05   
max    149761.000000  131157.000000        44.000000    1.540960e+06   

                debt  savings_amount  expense_to_income_ratio  \
count     200.000000      200.000000               200.000000   
mean   284458.130000    11938.565000                 0.734900   
std    215610.560178    10233.578523                 0.126622   
min      1580.000000        0.000000                 0.460000   
25%    1

In [157]:
features = [
    "income",
    "monthly_spend",
    "savings_percent",
    "emergency_fund",
    "debt",
    "savings_amount",
    "expense_to_income_ratio",
    "debt_to_income_ratio"
]

X = df[features]


In [158]:
#Scaling & Clustering

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

kmeans = KMeans(n_clusters=4, random_state=42, n_init=10)
df["cluster"] = kmeans.fit_predict(X_scaled)

cluster_means = df.groupby("cluster")[features].mean()



In [159]:
#Dynamic Cluster Naming
fragile_cluster = cluster_means["debt_to_income_ratio"].idxmax()
wealthy_cluster = cluster_means["income"].idxmax()

remaining = cluster_means.drop(
    [fragile_cluster, wealthy_cluster], errors="ignore"
)

savers_cluster = remaining["savings_percent"].idxmax()

assigned = {fragile_cluster, wealthy_cluster, savers_cluster}
moderate_cluster = list(set(df["cluster"]) - assigned)[0]

cluster_map = {
    fragile_cluster: "Financially Fragile",
    wealthy_cluster: "High Income / High Spend",
    savers_cluster: "Disciplined Savers",
    moderate_cluster: "Tight Budget / Moderate"
}

df["cluster_name"] = df["cluster"].map(cluster_map)

print("\nCluster Mapping:")
print(cluster_map)



Cluster Mapping:
{np.int32(2): 'High Income / High Spend', np.int32(1): 'Disciplined Savers', 0: 'Tight Budget / Moderate'}


In [160]:
def calculate_wellness_score(row):

    score = 0

    income = row["monthly_income"] + 1
    spend = row["monthly_spend"]
    savings_pct = row["savings_percent"]
    debt = row["total_debt"]
    emergency = row["emergency_fund"]

    # -----------------------------
    # 1. Expense Discipline (25%)
    # Smooth decay, no free passes
    # -----------------------------
    er = spend / income

    if er <= 0.4:
        expense_score = 25
    elif er >= 1.0:
        expense_score = 0
    else:
        expense_score = (1 - ((er - 0.4) / 0.6)) * 25

    # -----------------------------
    # 2. Debt Burden (30%)
    # Uses annual income, harsher curve
    # -----------------------------
    annual_income = income * 12
    dti = debt / annual_income

    if dti <= 0.5:
        debt_score = 30
    elif dti >= 3.0:
        debt_score = 0
    else:
        debt_score = (1 - ((dti - 0.5) / 2.5)) * 30

    # -----------------------------
    # 3. Emergency Fund (25%)
    # This is survival, not optimization
    # -----------------------------
    months_cover = emergency / (spend + 1)

    if months_cover >= 6:
        emergency_score = 25
    else:
        emergency_score = (months_cover / 6) * 25

    # -----------------------------
    # 4. Savings Quality (20%)
    # Penalized if debt is high
    # -----------------------------
    base_savings_score = min(savings_pct / 25, 1.0) * 20

    # Savings penalty if debt exists
    if dti > 1:
        base_savings_score *= 0.6
    elif dti > 2:
        base_savings_score *= 0.4

    savings_score = base_savings_score

    # -----------------------------
    # Final Score
    # -----------------------------
    
    score = (
        expense_score +
        debt_score +
        emergency_score +
        savings_score
    )

    return round(score, 2)



In [161]:
def wellness_label(score):
    if score < 35:
        return "Critical"
    elif score < 55:
        return "At Risk"
    elif score < 70:
        return "Stable"
    elif score < 85:
        return "Strong"
    else:
        return "Excellent"


In [162]:
def score_explanation(row):
    reasons = []

    er = row["expense_to_income_ratio"]
    dr = row["debt_to_income_ratio"]
    savings = row["savings_percent"]
    months = row["emergency_fund"] / (row["monthly_spend"] + 1)

    # Expense behavior
    if er > 0.7:
        reasons.append("Spending consumes most of income, leaving little room for savings")
    elif er > 0.5:
        reasons.append("Spending level limits long-term financial growth")

    # Savings behavior
    if savings < 10:
        reasons.append("Savings rate is too low to build financial resilience")
    elif savings < 20:
        reasons.append("Savings rate is moderate but could be stronger")

    # Debt burden
    if dr > 2:
        reasons.append("Debt is dangerously high relative to income")
    elif dr > 1:
        reasons.append("Debt level increases financial risk and stress")

    # Emergency fund
    if months < 1:
        reasons.append("Emergency fund is critically insufficient")
    elif months < 3:
        reasons.append("Emergency fund would not cover short-term shocks")

    if not reasons:
        reasons.append("Financial habits are balanced and resilient")

    return reasons


Financial Wellness Scoring Logic

SCORING PHILOSOPHY:

Rule-based, interpretable, heuristic

Designed as a BEHAVIORAL PROXY, not outcome prediction

Weights reflect a conservative, risk-sensitive stance

WEIGHTS (sum to 100%):

Savings Percent: 30% (positive)

Expense-to-Income: 25% (negative)

Debt-to-Income: 30% (negative)

Emergency Fund Ratio: 15% (positive)

In [168]:
# Sanity Check
corr = df["score"].corr(df["debt_to_income_ratio"])
print(f"Sanity Check: Correlation with Debt Ratio = {corr:.2f}")


KeyError: 'score'

In [169]:
# 11. Scope & Limitations
# ------------------------------
# - Behavioral scoring framework, not predictive
# - Heuristic, interpretable weights
# - Cross-sectional data only
# - No outcome validation
# - Stable scoring suitable for new users (Streamlit)

print("\nFinal Preview:")
print(df.head())


Final Preview:
     income  monthly_spend  savings_percent  emergency_fund      debt  \
0  137875.0       104507.0             16.0        271292.0  411276.0   
1   77586.0        63308.0              0.0         46069.0  486119.0   
2   86096.0        66189.0             17.0        205050.0  224773.0   
3  125371.0        94915.0             11.0        363038.0  334566.0   
4  127202.0       101609.0             10.0        241005.0  382276.0   

   savings_amount  expense_to_income_ratio  debt_to_income_ratio  cluster  \
0         22060.0                     0.76                  2.98        0   
1             0.0                     0.82                  6.27        2   
2         14636.0                     0.77                  2.61        0   
3         13790.0                     0.76                  2.67        0   
4         12720.0                     0.80                  3.01        0   

               cluster_name  
0   Tight Budget / Moderate  
1  High Income / High 

In [172]:

joblib.dump(kmeans, "kmeans.pkl")

['kmeans.pkl']

In [173]:
joblib.dump(scaler, "scaler.pkl") 

['scaler.pkl']