In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder

# Load the "Predicting Churn for Bank Customers" dataset
data = pd.read_csv("Churn_Modelling.csv")

In [3]:
data.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [4]:
# Remove the "RowNumber", "CustomerId", and "Surname" features
data = data.drop(["RowNumber", "CustomerId", "Surname"], axis=1)

# One-hot encode the "geography" and "gender" features
encoder = OneHotEncoder(handle_unknown='ignore')
encoded_geography = encoder.fit_transform(data[["Geography"]]).toarray()
encoded_gender = encoder.fit_transform(data[["Gender"]]).toarray()

In [5]:
data.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [8]:
# Concatenate the one-hot encoded features with the original data
encoded_features = np.concatenate([encoded_geography, encoded_gender], axis=1)
encoded_data = pd.concat([data.drop(["Geography", "Gender"], axis=1), pd.DataFrame(encoded_features)], axis=1)
encoded_data.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,0,1,2,3,4
0,619,42,2,0.0,1,1,1,101348.88,1,1.0,0.0,0.0,1.0,0.0
1,608,41,1,83807.86,1,0,1,112542.58,0,0.0,0.0,1.0,1.0,0.0
2,502,42,8,159660.8,3,1,0,113931.57,1,1.0,0.0,0.0,1.0,0.0
3,699,39,1,0.0,2,0,0,93826.63,0,1.0,0.0,0.0,1.0,0.0
4,850,43,2,125510.82,1,1,1,79084.1,0,0.0,0.0,1.0,1.0,0.0


In [14]:
# Split the data into features (X) and target (y)
X = encoded_data.drop("Exited", axis=1)
y = encoded_data["Exited"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
y_train.values

array([0, 0, 0, ..., 0, 0, 0])

In [16]:
# Train the Random Forest Classifier on the training data
clf = RandomForestClassifier()
clf.fit(X_train.values, y_train.values)

In [18]:
# Make predictions on the testing data
y_pred = clf.predict(X_test.values)

In [19]:
# Evaluate the model accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8645


In [23]:
# Evaluate the model's performance
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.96      0.92      1600
           1       0.74      0.49      0.59       400

    accuracy                           0.86      2000
   macro avg       0.81      0.73      0.76      2000
weighted avg       0.86      0.86      0.85      2000

