# Customer Churn Prediction (Modernized)

This notebook is updated for Python 3.13+ with modern APIs, safe data handling, and a clean ML pipeline.

**Highlights**:
- Uses `pathlib` for file paths.
- Uses `seaborn.histplot` for distributions.
- Applies `try/except` for data loading and training.
- Prevents data leakage by splitting first, then resampling and fitting the scaler on the training data only (split → resample → scale).
- Compatible with packages in `requirements.txt`.

In [None]:
from __future__ import annotations

from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from imblearn.over_sampling import SMOTE

import dtale

# Notebook-wide presentation options
pd.options.display.max_columns = None  # never truncate columns when rendering frames
plt.style.use("seaborn-v0_8")          # seaborn-flavoured matplotlib stylesheet

In [None]:
# Location of the raw dataset, relative to the notebook's working directory.
data_path = Path("BankChurners.csv")

try:
    # Fail early and report the resolved absolute path, so a wrong working
    # directory is obvious from the error message.
    if not data_path.exists():
        raise FileNotFoundError(f"CSV not found: {data_path.resolve()}")

    data = pd.read_csv(data_path)
except (FileNotFoundError, pd.errors.ParserError, pd.errors.EmptyDataError) as exc:
    # Re-raise under a single exception type, explicitly chained so the
    # original traceback stays attached as the cause.
    raise RuntimeError(f"Failed to load dataset: {exc}") from exc
else:
    # Reporting lives outside the guarded region: only the load itself can
    # raise the errors handled above.
    print(f"Loaded: {data.shape[0]} rows × {data.shape[1]} columns")
    display(data.head())

In [None]:
# Quick EDA
# Select columns by numeric dtype instead of "anything that is not object":
# string, categorical, boolean and datetime dtypes are not `object` either,
# but a histogram with a KDE overlay is not meaningful for them.
numerical_cols = data.select_dtypes(include="number").columns.tolist()

# Plot a few numeric distributions (first six numeric columns, one figure each)
for col in numerical_cols[:6]:
    fig, ax = plt.subplots()
    sns.histplot(data[col].dropna(), kde=True, bins=25, ax=ax)
    ax.set_title(f"Distribution: {col}")
    fig.tight_layout()
    plt.show()
    # Release the figure so repeated runs of this cell do not accumulate open figures.
    plt.close(fig)

# Optional: interactive exploration (opens a local web UI)
# dtale.show(data)

In [None]:
# Feature/target split
#
# Produces, for the cells below:
#   X_train_scaled / y_train_resampled  - SMOTE-balanced, standardised training data
#   X_test_scaled  / y_test             - untouched hold-out data, scaled with the
#                                         statistics learned from the training data
try:
    # One-hot encode the label and keep the single remaining indicator column.
    # `drop_first=True` drops the first category in sorted order, so the class
    # that ends up encoded as positive is the *last* label in sorted order.
    # NOTE(review): if the labels are "Attrited Customer" / "Existing Customer",
    # the positive class here is the retained customer, and the F1 scores
    # computed from `y_test` describe that class - confirm this is intended.
    target_dummies = pd.get_dummies(data["Attrition_Flag"], drop_first=True)
    if target_dummies.shape[1] != 1:
        raise ValueError(
            "Attrition_Flag must contain exactly two classes, found "
            f"{data['Attrition_Flag'].nunique()}"
        )
    y = target_dummies.iloc[:, 0]

    # Positional slice of the feature columns.
    # assumes columns 2..20 of the CSV are the model features - TODO confirm
    X = data.iloc[:, 2:21].copy()

    # Encode categoricals. "Gender" and "Marital_Status" are always encoded (and
    # must exist - a missing one raises KeyError); any other non-numeric column
    # in the slice is encoded too, because SMOTE and StandardScaler cannot
    # operate on raw strings.
    other_categoricals = X.select_dtypes(exclude=["number", "bool"]).columns
    categorical_cols = list(
        dict.fromkeys(["Gender", "Marital_Status", *other_categoricals])
    )
    X = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

    # Train/test split FIRST to prevent leakage
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.2,
        random_state=42,
        stratify=y,
    )

    # Resample training data only; the test set keeps its original class balance
    smote = SMOTE(sampling_strategy="auto", random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

    # Scale: fit on the (resampled) training data, then apply to the test data
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_resampled)
    X_test_scaled = scaler.transform(X_test)

    print("Data prepared successfully.")
except KeyError as exc:
    raise RuntimeError(f"Missing expected column: {exc}") from exc
except ValueError as exc:
    # Covers the binary-label check above as well as value errors raised by
    # the split, resampling and scaling steps.
    raise RuntimeError(f"Data preparation failed: {exc}") from exc

In [None]:
# Train models and evaluate
def _evaluate_model(name, model, X_eval, y_eval):
    """Score a fitted classifier on evaluation data and print the results.

    Prints the F1 score (rounded to 4 decimals) followed by the full
    classification report.

    Parameters:
        name: Human-readable model name used as the prefix of the F1 line.
        model: Fitted estimator exposing ``predict``.
        X_eval: Feature matrix to predict on.
        y_eval: True labels aligned with ``X_eval``.

    Returns:
        A ``(predictions, f1)`` tuple.
    """
    predictions = model.predict(X_eval)
    score = f1_score(y_eval, predictions)

    print(f"{name} F1:", round(float(score), 4))
    print(classification_report(y_eval, predictions))
    return predictions, score


try:
    rf_model = RandomForestClassifier(
        n_estimators=200,
        random_state=42,
        n_jobs=-1,
        class_weight="balanced",
    )
    rf_model.fit(X_train_scaled, y_train_resampled)

    # `n_jobs` is intentionally not passed: it does not parallelise a single
    # binary logistic-regression fit, and the parameter is, as far as I recall,
    # deprecated in scikit-learn 1.8 - verify against the release notes.
    lr_model = LogisticRegression(
        max_iter=1000,
        random_state=42,
        class_weight="balanced",
    )
    lr_model.fit(X_train_scaled, y_train_resampled)

    # Evaluate both models on the same hold-out set, Random Forest first
    rf_pred, rf_f1 = _evaluate_model("Random Forest", rf_model, X_test_scaled, y_test)
    lr_pred, lr_f1 = _evaluate_model(
        "Logistic Regression", lr_model, X_test_scaled, y_test
    )
except Exception as exc:
    # Re-raise under one exception type, chained so the real cause stays in
    # the traceback.
    raise RuntimeError(f"Training failed: {exc}") from exc