In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler

from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    classification_report,
    confusion_matrix,
)

In [2]:
data = pd.read_csv('Churn_Modelling.csv')

# ========== 3. Quick Exploration ==========
print("Rows, Columns:", data.shape)
print(data.dtypes)
print("Missing values per column:\n", data.isna().sum())
print(data.head())

Rows, Columns: (10000, 14)
RowNumber            int64
CustomerId           int64
Surname             object
CreditScore          int64
Geography           object
Gender              object
Age                  int64
Tenure               int64
Balance            float64
NumOfProducts        int64
HasCrCard            int64
IsActiveMember       int64
EstimatedSalary    float64
Exited               int64
dtype: object
Missing values per column:
 RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64
   RowNumber  CustomerId   Surname  CreditScore Geography  Gender  Age  \
0          1    15634602  Hargrave          619    France  Female   42   
1          2    15647311      Hill          608     Spain  Female   41   
2          3    15619304

In [4]:
X = data.drop('Exited', axis=1)
y = data['Exited']

# ========== 6. Train–Test Split ==========
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

In [None]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test  = scaler.transform(X_test)

base_learners = [
    ('rf',  RandomForestClassifier(n_estimators=100, random_state=42)),
    ('xgb', XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42))
]
meta_learner = LogisticRegression()

stack_model = StackingClassifier(
    estimators=base_learners,
    final_estimator=meta_learner,
    cv=5
)


In [6]:
stack_model.fit(X_train, y_train)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [7]:
for name, Xs, ys in [('Train', X_train, y_train), ('Test', X_test, y_test)]:
    preds = stack_model.predict(Xs)
    proba = stack_model.predict_proba(Xs)[:, 1]
    print(f"\n--- {name} Metrics ---")
    print("Accuracy:", accuracy_score(ys, preds))
    print("ROC AUC :", roc_auc_score(ys, proba))
    print(classification_report(ys, preds))
    print("Confusion Matrix:\n", confusion_matrix(ys, preds))


--- Train Metrics ---
Accuracy: 0.998125
ROC AUC : 0.9999992295171962
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      6370
           1       1.00      0.99      1.00      1630

    accuracy                           1.00      8000
   macro avg       1.00      1.00      1.00      8000
weighted avg       1.00      1.00      1.00      8000

Confusion Matrix:
 [[6370    0]
 [  15 1615]]

--- Test Metrics ---
Accuracy: 0.8635
ROC AUC : 0.848162492230289
              precision    recall  f1-score   support

           0       0.88      0.96      0.92      1593
           1       0.76      0.48      0.59       407

    accuracy                           0.86      2000
   macro avg       0.82      0.72      0.75      2000
weighted avg       0.85      0.86      0.85      2000

Confusion Matrix:
 [[1532   61]
 [ 212  195]]


In [12]:
import pandas as pd
import joblib

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix

# === 1. Load the data ===
df = pd.read_csv('Churn_Modelling.csv')

In [None]:

df = df.drop(['RowNumber', 'CustomerId', 'Surname'], axis=1)

num_cols = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']
imputer = SimpleImputer(strategy='median')
df[num_cols] = imputer.fit_transform(df[num_cols])

le_geo = LabelEncoder()
df['Geography'] = le_geo.fit_transform(df['Geography'])

le_gender = LabelEncoder()
df['Gender'] = le_gender.fit_transform(df['Gender'])

In [None]:
X = df.drop('Exited', axis=1)
y = df['Exited']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test  = scaler.transform(X_test)

In [None]:
base_learners = [
    ('rf',  RandomForestClassifier(n_estimators=100, random_state=42)),
    ('xgb', XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42))
]
stack_model = StackingClassifier(
    estimators=base_learners,
    final_estimator=LogisticRegression(),
    cv=5
)
stack_model.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [None]:
for split_name, Xs, ys in [('Train', X_train, y_train), ('Test', X_test, y_test)]:
    preds = stack_model.predict(Xs)
    probs = stack_model.predict_proba(Xs)[:, 1]
    print(f"\n--- {split_name} set metrics ---")
    print(f"Accuracy: {accuracy_score(ys, preds):.4f}")
    print(f"ROC AUC : {roc_auc_score(ys, probs):.4f}")
    print(classification_report(ys, preds))
    print("Confusion Matrix:")
    print(confusion_matrix(ys, preds))


--- Train set metrics ---
Accuracy: 0.9981
ROC AUC : 1.0000
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      6370
           1       1.00      0.99      1.00      1630

    accuracy                           1.00      8000
   macro avg       1.00      1.00      1.00      8000
weighted avg       1.00      1.00      1.00      8000

Confusion Matrix:
[[6370    0]
 [  15 1615]]

--- Test set metrics ---
Accuracy: 0.8635
ROC AUC : 0.8482
              precision    recall  f1-score   support

           0       0.88      0.96      0.92      1593
           1       0.76      0.48      0.59       407

    accuracy                           0.86      2000
   macro avg       0.82      0.72      0.75      2000
weighted avg       0.85      0.86      0.85      2000

Confusion Matrix:
[[1532   61]
 [ 212  195]]


In [None]:
joblib.dump(stack_model, 'churn_model.pkl')
joblib.dump(scaler,      'scaler.pkl')
joblib.dump({'Geography': le_geo, 'Gender': le_gender}, 'label_encoders.pkl')

print("\nSaved artifacts:")
print(" • churn_model.pkl")
print(" • scaler.pkl")
print(" • label_encoders.pkl")



Saved artifacts:
 • churn_model.pkl
 • scaler.pkl
 • label_encoders.pkl


In [None]:
import pandas as pd
import joblib

# 1. Load artifacts
model    = joblib.load('churn_model.pkl')
scaler   = joblib.load('scaler.pkl')
encoders = joblib.load('label_encoders.pkl')

new_customer = {
    'CreditScore'     : 600,
    'Geography'       : 'France',
    'Gender'          : 'Female',
    'Age'             : 40,
    'Tenure'          : 3,
    'Balance'         : 50000.0,
    'NumOfProducts'   : 1,
    'HasCrCard'       : 1,
    'IsActiveMember'  : 1,
    'EstimatedSalary' : 60000.0
}

df_new = pd.DataFrame([new_customer])

df_new['Geography'] = encoders['Geography'].transform(df_new['Geography'])
df_new['Gender']    = encoders['Gender'].transform(df_new['Gender'])

X_new = scaler.transform(df_new)

pred_class = model.predict(X_new)[0]
pred_prob  = model.predict_proba(X_new)[0,1]

print(f"Churn prediction: {pred_class}  (1 = WILL churn)")
print(f"Churn probability: {pred_prob:.2f}")


Churn prediction: 0  (1 = WILL churn)
Churn probability: 0.26
