📝 Title + Introduction

# 🧠 Bank Customer Churn Prediction

This project predicts whether a bank customer will churn by training and comparing three classifiers: Logistic Regression, Random Forest, and XGBoost.


📥 Load Dataset

In [None]:
import pandas as pd
# Read the raw churn dataset; the path is relative to the notebook's working directory.
csv_path = "churn_modeling.csv"
df = pd.read_csv(csv_path)

# Leave the preview as the last expression so it renders as the cell output.
df.head()


🧹 Data Cleaning + Preprocessing

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Build the model-ready feature matrix and target from the raw frame `df`:
# drop identifier columns, encode the categorical columns, split into
# train/test, then standardise using statistics from the training split only.

# Identifier columns are removed before modelling.
# NOTE(review): if the CSV also carries a row-index column (e.g. 'RowNumber'),
# it should be dropped here as well — confirm against df.head().
df_cleaned = df.drop(columns=['CustomerId', 'Surname'])

# Gender becomes integer codes; Geography is one-hot encoded with the first
# level dropped to avoid a redundant column.
df_cleaned['Gender'] = LabelEncoder().fit_transform(df_cleaned['Gender'])
df_cleaned = pd.get_dummies(df_cleaned, columns=['Geography'], drop_first=True)

X = df_cleaned.drop(columns=['Churn'])
y = df_cleaned['Churn']

# Split BEFORE scaling. Fitting the scaler on the full matrix would let the
# test rows influence the mean/std used to transform the training rows
# (data leakage), which makes the evaluation optimistic.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# Learn scaling parameters from the training split only, then apply the same
# transformation to the held-out split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept so any later cell that references X_scaled still works; it is the full
# feature matrix transformed with the train-fitted scaler.
X_scaled = scaler.transform(X)


🤖 Model Training + Evaluation

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Register the candidate classifiers under the display names used in the
# printed report; later cells look models up by these keys.
models = {}
models["Logistic Regression"] = LogisticRegression(max_iter=1000)
models["Random Forest"] = RandomForestClassifier(n_estimators=100, random_state=42)
models["XGBoost"] = XGBClassifier(
    n_estimators=50,
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
)

# Fit each classifier on the training split, predict the held-out split, and
# print accuracy, the per-class report and the confusion matrix.
for name in models:
    model = models[name]
    print(f"\n🚀 Training: {name}")
    y_pred = model.fit(X_train, y_train).predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"✅ Accuracy: {accuracy:.4f}")
    print("📊 Classification Report:", classification_report(y_test, y_pred), sep="\n")
    print("📉 Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")


📈 ROC Curve Comparison

In [None]:
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# Overlay one ROC curve per fitted classifier on a single set of axes so the
# models can be compared at every decision threshold.
fig, ax = plt.subplots(figsize=(8, 6))
for name, model in models.items():
    # ROC needs a continuous score, so skip any model without predict_proba.
    if hasattr(model, "predict_proba"):
        # Column 1 is the probability of the second class in model.classes_
        # (presumably churn = 1 — confirm against the target encoding).
        y_proba = model.predict_proba(X_test)[:, 1]
        # Thresholds are not needed for the plot; a named throwaway avoids
        # overwriting IPython's `_` (last cell output).
        fpr, tpr, _thresholds = roc_curve(y_test, y_proba)
        roc_auc = auc(fpr, tpr)
        ax.plot(fpr, tpr, label=f"{name} (AUC = {roc_auc:.2f})")

# Diagonal reference line: the expected curve of a random classifier.
ax.plot([0, 1], [0, 1], 'k--')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
# Plain-text title: Matplotlib's default font has no glyph for the chart
# emoji, which produced a "Glyph missing" warning and an empty box.
ax.set_title('ROC Curve Comparison')
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()


Feature Importance (XGBoost)

In [None]:
# Plot the fitted XGBoost model's feature importances as a ranked bar chart.
xgb_model = models["XGBoost"]
importances = xgb_model.feature_importances_
# The model was trained on the scaled array built from X, so the importance
# vector is paired with X's column labels in the same order.
feature_names = X.columns

# Sort ascending: barh draws the first entry at the bottom, so the most
# important feature ends up at the top of the chart.
ranked_importances = pd.Series(importances, index=feature_names).sort_values()

fig, ax = plt.subplots(figsize=(10, 5))
ax.barh(ranked_importances.index, ranked_importances.values)
# Plain-text title: Matplotlib's default font has no glyph for the magnifier
# emoji, which produced a "Glyph missing" warning and an empty box.
ax.set_title("XGBoost Feature Importance")
ax.set_xlabel("Importance")
fig.tight_layout()
plt.show()


In [None]:
# NOTE: this is narrative text sitting in a code cell. The bare bullet lines
# were not valid Python and raised SyntaxError on "Restart & Run All", so they
# are kept here as comments. Convert this cell to Markdown to render it.
#
# ## ✅ Conclusion
#
# - Logistic Regression was weakest at detecting churn
# - XGBoost had the best balance of accuracy and recall
# - Feature Importance revealed Age, Activity, Geography mattered most
