In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder

# Load the "Predicting Churn for Bank Customers" dataset
data = pd.read_csv("Churn_Modelling.csv")

In [2]:
data.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [3]:
# Remove the "RowNumber", "CustomerId", and "Surname" features
data = data.drop(["RowNumber", "CustomerId", "Surname"], axis=1)

# One-hot encode the "geography" and "gender" features
encoder = OneHotEncoder(handle_unknown='ignore')
encoded_geography = encoder.fit_transform(data[["Geography"]]).toarray()
encoded_gender = encoder.fit_transform(data[["Gender"]]).toarray()

In [4]:
data.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [5]:
# Concatenate the one-hot encoded features with the original data
encoded_features = np.concatenate([encoded_geography, encoded_gender], axis=1)
encoded_data = pd.concat([data.drop(["Geography", "Gender"], axis=1), pd.DataFrame(encoded_features)], axis=1)
encoded_data.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,0,1,2,3,4
0,619,42,2,0.0,1,1,1,101348.88,1,1.0,0.0,0.0,1.0,0.0
1,608,41,1,83807.86,1,0,1,112542.58,0,0.0,0.0,1.0,1.0,0.0
2,502,42,8,159660.8,3,1,0,113931.57,1,1.0,0.0,0.0,1.0,0.0
3,699,39,1,0.0,2,0,0,93826.63,0,1.0,0.0,0.0,1.0,0.0
4,850,43,2,125510.82,1,1,1,79084.1,0,0.0,0.0,1.0,1.0,0.0


In [6]:
# Split the data into features (X) and target (y)
X = encoded_data.drop("Exited", axis=1)
y = encoded_data["Exited"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
y_train.values

array([0, 0, 0, ..., 0, 0, 0])

In [7]:
# Train the Random Forest Classifier on the training data
clf = RandomForestClassifier()
clf.fit(X_train.values, y_train.values)

In [8]:
# Make predictions on the testing data
y_pred = clf.predict(X_test.values)

In [9]:
# Evaluate the model accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.878


In [10]:
# Evaluate the model's performance
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.97      0.93      1605
           1       0.79      0.52      0.63       395

    accuracy                           0.88      2000
   macro avg       0.84      0.74      0.78      2000
weighted avg       0.87      0.88      0.87      2000



In [18]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, recall_score

# Define the hyperparameters to search over
param_grid = {'n_estimators': [100, 200,300], 'max_depth': [None,20,40]}

In [24]:
# Define the scoring metric (in this case, recall for the positive class)
recall_scorer = make_scorer(recall_score, pos_label=1)

# Create a Random Forest Classifier
clf = RandomForestClassifier(random_state=42,class_weight='balanced')

# Create the grid search object
grid_search = GridSearchCV(clf, param_grid, scoring=recall_scorer, cv=5)

# Fit the grid search to the data
grid_search.fit(X_train.values, y_train.values)

In [25]:
# Get the best parameters
best_params = grid_search.best_params_
print('Best parameters:', best_params)

# Get the best model
best_model = grid_search.best_estimator_

Best parameters: {'max_depth': 20, 'n_estimators': 200}


In [29]:
# Train the Random Forest Classifier on the training data
clf = RandomForestClassifier(random_state=42,max_depth=20,n_estimators=200,class_weight='balanced')
clf.fit(X_train.values, y_train.values)

In [30]:
# Make predictions on the testing data
y_pred = clf.predict(X_test.values)

In [31]:
# Evaluate the model's performance
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.97      0.92      1605
           1       0.78      0.50      0.61       395

    accuracy                           0.87      2000
   macro avg       0.83      0.73      0.77      2000
weighted avg       0.87      0.87      0.86      2000

