In [None]:
import kagglehub

# Fetch the most recent published version of the churn dataset through
# kagglehub; the call returns the local directory that holds the files.
dataset_handle = "chetanmittal033/bank-dataset-for-customer-churn-prediction"
path = kagglehub.dataset_download(dataset_handle)

# Show where the files ended up so later cells can be pointed at them.
print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/chetanmittal033/bank-dataset-for-customer-churn-prediction?dataset_version_number=1...


100%|██████████| 262k/262k [00:00<00:00, 16.4MB/s]

Extracting files...
Path to dataset files: /root/.cache/kagglehub/datasets/chetanmittal033/bank-dataset-for-customer-churn-prediction/versions/1





In [1]:
!pip install opendatasets


Collecting opendatasets
  Downloading opendatasets-0.1.22-py3-none-any.whl.metadata (9.2 kB)
Downloading opendatasets-0.1.22-py3-none-any.whl (15 kB)
Installing collected packages: opendatasets
Successfully installed opendatasets-0.1.22


In [2]:
import opendatasets as od
# Download and unpack the churn dataset with opendatasets. The call prompts
# for Kaggle credentials and writes into a folder named after the dataset
# under the current working directory.
dataset_url = "https://www.kaggle.com/datasets/chetanmittal033/bank-dataset-for-customer-churn-prediction"
od.download(dataset_url)

Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username: shubhangiagarwal0
Your Kaggle Key: ··········
Dataset URL: https://www.kaggle.com/datasets/chetanmittal033/bank-dataset-for-customer-churn-prediction
Downloading bank-dataset-for-customer-churn-prediction.zip to ./bank-dataset-for-customer-churn-prediction


100%|██████████| 262k/262k [00:00<00:00, 346MB/s]







In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import (
    accuracy_score, roc_auc_score, precision_score,
    recall_score, f1_score, matthews_corrcoef
)
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import joblib
import os
from sklearn.metrics import confusion_matrix, classification_report

# ===============================
# Load Dataset
# ===============================
# opendatasets unpacks the archive into a folder named after the dataset,
# relative to the current working directory. Resolve the CSV the same way
# instead of hard-coding an absolute, environment-specific prefix.
DATA_DIR = "bank-dataset-for-customer-churn-prediction"
df = pd.read_csv(os.path.join(DATA_DIR, "Churn_Modelling.csv"))   # <-- update filename if needed

# ===============================
# Drop unnecessary columns
# ===============================
# Reassign rather than mutate in place so the step reads as a plain
# transformation of the loaded frame.
df = df.drop(columns=["RowNumber", "CustomerId", "Surname"])

# ===============================
# Encode categorical features
# ===============================
# One encoder per column: refitting a single LabelEncoder on Geography would
# discard the Gender mapping, leaving nothing to encode new inputs with later.
# The integer codes produced are the same as before.
gender_encoder = LabelEncoder()
geography_encoder = LabelEncoder()
df["Gender"] = gender_encoder.fit_transform(df["Gender"])
df["Geography"] = geography_encoder.fit_transform(df["Geography"])

# Backward-compatible alias: the old name ended up holding the Geography fit.
label_encoder = geography_encoder

# ===============================
# Split features and target
# ===============================
X = df.drop("Exited", axis=1)
y = df["Exited"]

# ===============================
# Train-Test Split
# ===============================
# Stratify on the target so both splits keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# ===============================
# Feature Scaling
# ===============================
# Fit the scaler on the training split only, then apply it to the test split.
# Note: both names are rebound to the arrays returned by the scaler.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Save preprocessing artifacts for Streamlit: the scaler, plus the fitted
# encoders so the app can reproduce the same Gender/Geography codes.
os.makedirs("model", exist_ok=True)
joblib.dump(scaler, "model/scaler.pkl")
joblib.dump(
    {"Gender": gender_encoder, "Geography": geography_encoder},
    "model/label_encoders.pkl",
)

# ===============================
# Models
# ===============================
# Display name -> unfitted estimator. The names are reused later as result
# labels and (lower-cased, spaces replaced by underscores) as file names.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    # `use_label_encoder` is intentionally not passed: the installed XGBoost
    # reports it as an unused parameter and emits a warning on every fit.
    "XGBoost": XGBClassifier(
        eval_metric="logloss",
        random_state=42
    )
}

# ===============================
# Evaluation Function
# ===============================
def evaluate_model(model, X_test, y_test, model_name):
    """Score a fitted classifier on held-out data and print a short report.

    Parameters
    ----------
    model : fitted estimator exposing ``predict`` and ``predict_proba``.
    X_test : feature matrix passed to the estimator as-is.
    y_test : true labels matching ``X_test``.
    model_name : label used in the printed confusion-matrix header.

    Returns
    -------
    dict
        Keys ``"Accuracy"``, ``"AUC"``, ``"Precision"``, ``"Recall"``,
        ``"F1 Score"`` and ``"MCC"`` mapped to the corresponding scores.

    Side effects: prints the confusion matrix and the classification report.
    """
    predicted_labels = model.predict(X_test)
    # Column 1 of predict_proba is used as the positive-class score for AUC.
    positive_scores = model.predict_proba(X_test)[:, 1]

    # Build the score table in a fixed key order; only AUC needs the
    # probabilities, every other metric works on the hard predictions.
    scores = {"Accuracy": accuracy_score(y_test, predicted_labels)}
    scores["AUC"] = roc_auc_score(y_test, positive_scores)
    label_based_scorers = (
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1 Score", f1_score),
        ("MCC", matthews_corrcoef),
    )
    for metric_name, scorer in label_based_scorers:
        scores[metric_name] = scorer(y_test, predicted_labels)

    # Human-readable diagnostics.
    matrix = confusion_matrix(y_test, predicted_labels)
    print(f"\nConfusion Matrix for {model_name}:")
    print(matrix)

    print("\nClassification Report:")
    report = classification_report(y_test, predicted_labels)
    print(report)

    return scores


# ===============================
# Train, Evaluate, Save Models
# ===============================
# Maps each model's display name to the metric dict returned by evaluate_model.
results = {}

for name, model in models.items():
    # Fit on the scaled training split, then score on the held-out split.
    model.fit(X_train, y_train)
    results[name] = evaluate_model(model, X_test, y_test, name)

    # Persist the fitted estimator, e.g. "Random Forest" -> model/random_forest.pkl
    filename = name.lower().replace(" ", "_") + ".pkl"
    joblib.dump(model, f"model/{filename}")

# ===============================
# Display Results
# ===============================
# One row per model, one column per metric.
results_df = pd.DataFrame(results).transpose()
print("\nModel Performance Comparison:\n")
print(results_df.round(4))



Confusion Matrix for Logistic Regression:
[[1552   41]
 [ 349   58]]

Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.97      0.89      1593
           1       0.59      0.14      0.23       407

    accuracy                           0.81      2000
   macro avg       0.70      0.56      0.56      2000
weighted avg       0.77      0.81      0.75      2000


Confusion Matrix for Decision Tree:
[[1357  236]
 [ 213  194]]

Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.85      0.86      1593
           1       0.45      0.48      0.46       407

    accuracy                           0.78      2000
   macro avg       0.66      0.66      0.66      2000
weighted avg       0.78      0.78      0.78      2000


Confusion Matrix for KNN:
[[1513   80]
 [ 250  157]]

Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



Confusion Matrix for XGBoost:
[[1502   91]
 [ 215  192]]

Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.94      0.91      1593
           1       0.68      0.47      0.56       407

    accuracy                           0.85      2000
   macro avg       0.78      0.71      0.73      2000
weighted avg       0.83      0.85      0.84      2000


Model Performance Comparison:

                     Accuracy     AUC  Precision  Recall  F1 Score     MCC
Logistic Regression    0.8050  0.7710     0.5859  0.1425    0.2292  0.2167
Decision Tree          0.7755  0.6643     0.4512  0.4767    0.4636  0.3219
KNN                    0.8350  0.7724     0.6624  0.3857    0.4876  0.4180
Naive Bayes            0.8290  0.8146     0.7559  0.2359    0.3596  0.3573
Random Forest          0.8645  0.8469     0.7857  0.4595    0.5798  0.5315
XGBoost                0.8470  0.8330     0.6784  0.4717    0.5565  0.4789
