In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier

import joblib
import os

In [7]:
# Load the raw churn dataset from a CSV in the current working directory.
csv_path = 'Bank Customer Churn Prediction.csv'
df = pd.read_csv(csv_path)

# Quick inspection of the frame.
# NOTE: only df.info() (which prints) and the cell's final expression produce
# visible output; the head() and describe() results are computed and discarded
# because they are not the last expression in the cell.
df.head()
df.info()
df.describe()

# Share of each target class, shown as the cell's output.
df['churn'].value_counts(normalize=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customer_id       10000 non-null  int64  
 1   credit_score      10000 non-null  int64  
 2   country           10000 non-null  object 
 3   gender            10000 non-null  object 
 4   age               10000 non-null  int64  
 5   tenure            10000 non-null  int64  
 6   balance           10000 non-null  float64
 7   products_number   10000 non-null  int64  
 8   credit_card       10000 non-null  int64  
 9   active_member     10000 non-null  int64  
 10  estimated_salary  10000 non-null  float64
 11  churn             10000 non-null  int64  
dtypes: float64(2), int64(8), object(2)
memory usage: 937.6+ KB


churn
0    0.7963
1    0.2037
Name: proportion, dtype: float64

In [8]:
# Cleaning

# Drop the identifier column.
# errors='ignore' keeps this cell re-runnable: `df` is rebound to the result,
# so on a second execution 'customer_id' is already gone and a plain drop
# would raise KeyError.
df = df.drop(columns=['customer_id'], errors='ignore')

# If churn is stored as Yes/No text, convert it to 1/0.
# Any label other than 'Yes'/'No' maps to NaN and is then removed by the
# dropna() below.
if df['churn'].dtype == 'object':
    df['churn'] = df['churn'].map({'Yes': 1, 'No': 0})

# Handle missing values by discarding every row that contains one.
df = df.dropna()

# Check class balance after cleaning (printed as proportions).
print(df['churn'].value_counts(normalize=True))
    

churn
0    0.7963
1    0.2037
Name: proportion, dtype: float64


In [9]:
# One-hot encode the categorical columns. drop_first=True omits the first
# level of each category, so k levels become k-1 indicator columns.
cat_cols = ['country', 'gender']
df_encoded = pd.get_dummies(df, columns=cat_cols, drop_first=True)

# Separate the target from the feature matrix.
y = df_encoded['churn']
X = df_encoded.drop(columns=['churn'])

# Hold out 20% of the rows for testing. Stratifying on y keeps the churn
# proportion the same in both splits; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [10]:
# Scale numeric features.
# Only the columns listed here are standardised; every other column in the
# feature matrix is left untouched.
num_cols = [
    'credit_score',
    'age',
    'tenure',
    'balance',
    'products_number',
    'estimated_salary',
]

# Learn the scaling statistics from the training split only, then apply the
# same fitted transform to both splits (train first, then test).
scaler = StandardScaler()
scaler.fit(X_train[num_cols])
for frame in (X_train, X_test):
    frame[num_cols] = scaler.transform(frame[num_cols])

In [11]:
# Train model (Random Forest)
# Hyperparameters are collected in one place; class_weight="balanced" is
# passed so the classifier reweights the two target classes.
rf_params = {
    "n_estimators": 200,
    "max_depth": None,
    "random_state": 42,
    "class_weight": "balanced",
}
rf = RandomForestClassifier(**rf_params)
rf.fit(X_train, y_train)

# Score the fitted model on the held-out test split.
y_pred = rf.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)
test_report = classification_report(y_test, y_pred)

print('Accuracy:', test_accuracy)
print('\nConfusion matrix:\n', test_confusion)
print("\nClassification report:\n", test_report)

Accuracy: 0.861

Confusion matrix:
 [[1545   48]
 [ 230  177]]

Classification report:
               precision    recall  f1-score   support

           0       0.87      0.97      0.92      1593
           1       0.79      0.43      0.56       407

    accuracy                           0.86      2000
   macro avg       0.83      0.70      0.74      2000
weighted avg       0.85      0.86      0.84      2000



In [12]:
# Save model, scaler and feature column order.
os.makedirs("../models", exist_ok=True)

# Each artifact goes to its own file. The model was previously dumped to
# scaler.pkl and then immediately overwritten by the scaler, so the trained
# classifier was never persisted. Any code that loads the model must read
# model.pkl.
joblib.dump(rf, "../models/model.pkl")
joblib.dump(scaler, "../models/scaler.pkl")

# Column names of X in order, stored as a plain list. Kept as the last
# expression so the written path is shown as the cell's output.
joblib.dump(X.columns.tolist(), "../models/features_columns.pkl")

['../models/features_columns.pkl']