In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found anywhere beneath the Kaggle
# input directory, then print the paths one per line.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np 
import pandas as pd 
from scipy.stats import mode 
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.preprocessing import LabelEncoder 
from sklearn.model_selection import train_test_split, cross_val_score 
from sklearn.svm import SVC 
from sklearn.naive_bayes import GaussianNB 
from sklearn.ensemble import RandomForestClassifier 
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, roc_auc_score
  
%matplotlib inline

In [None]:
# Build the modelling frame in a single chain: read the CSV, discard every column
# that contains a missing value, drop the Surname / RowNumber / CustomerId columns,
# and one-hot encode Geography and Gender.
data = (
    pd.read_csv('/kaggle/input/bank-customer-churn-prediction/Churn_Modelling.csv')
    .dropna(axis=1)
    .drop(columns=['Surname', 'RowNumber', 'CustomerId'])
    .pipe(pd.get_dummies, columns=['Geography', 'Gender'])
)
data.info()

# Count the rows for each value of 'Exited' and reshape the counts into a
# two-column frame that seaborn can plot directly.
churn_count = data['Exited'].value_counts()
temp_df = pd.DataFrame({'Exited': churn_count.index, 'Counts': churn_count.values})

# Bar chart of the 'Exited' value counts.
plt.figure(figsize=(10, 6))
sns.barplot(data=temp_df, x='Exited', y='Counts')
plt.xticks(rotation=90)
plt.show()

In [None]:
# Separate the feature matrix from the target.
# The target has to be selected by name: pd.get_dummies appends the one-hot columns
# after the remaining columns, so the last column of `data` is a Gender dummy rather
# than 'Exited'. Slicing by position would make `y` that dummy and leave 'Exited'
# inside `X` as a feature.
X = data.drop(columns='Exited')
y = data['Exited']

# Hold out 20% of the rows; the remaining 80% is the training split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train.shape)
print(X_test.shape)

In [None]:
# Divide the hold-out rows evenly into a test split and a validation split.
# NOTE: X_test / y_test are rebound from their own previous values, so running this
# cell a second time would halve the test split again; re-run the preceding split
# cell first if this one needs to be repeated.
holdout_parts = train_test_split(X_test, y_test, test_size=0.5, random_state=42)
X_test, X_val, y_test, y_val = holdout_parts
for subset in (X_test, X_val):
    print(subset.shape)

In [None]:
def cv_scoring(estimator, X, y):
    """Return the accuracy of `estimator` on the samples `X` with true labels `y`.

    Calls `estimator.predict(X)` and compares the predictions with `y` using
    `accuracy_score`. The (estimator, X, y) signature lets it be passed as the
    `scoring` callable of `cross_val_score`.
    """
    predicted = estimator.predict(X)
    return accuracy_score(y, predicted)
  
# Candidate classifiers to compare, keyed by the label printed in the report.
models = {
    "SVC": SVC(),
    "Gaussian NB": GaussianNB(),
    "Random Forest": RandomForestClassifier(random_state=18),
}

# Compare the candidates with 10-fold cross-validation on the training split only.
# Cross-validating on the full X / y would let the held-out validation and test rows
# take part in model selection, so the later hold-out scores would no longer be an
# independent estimate.
for model_name, model in models.items():
    scores = cross_val_score(model, X_train, y_train, cv=10,
                             n_jobs=-1,
                             scoring=cv_scoring)
    print("==" * 30)
    print(model_name)
    print(f"Scores: {scores}")
    print(f"Mean Score: {np.mean(scores)}")

From the above output we see that the Random Forest Classifier performs very well, and its mean score across the K-fold cross-validation is also very high, so we will fit the Random Forest Classifier to the training data.

In [None]:
# Fit the selected model on the training split.
rf_model = RandomForestClassifier(random_state=18)
rf_model.fit(X_train, y_train)

# Predict on the validation split (X_val / y_val); the report below is based on it.
preds = rf_model.predict(X_val)
print(f"Accuracy on train data by Random Forest Classifier: {accuracy_score(y_train, rf_model.predict(X_train))*100}")

# This score is computed on the validation split, so it is labelled as validation
# accuracy rather than test accuracy.
print(f"Accuracy on validation data by Random Forest Classifier: {accuracy_score(y_val, preds)*100}")

cf_matrix = confusion_matrix(y_val, preds)
plt.figure(figsize=(12,8))
# fmt="d" prints the integer counts in full; the default annotation format would
# abbreviate larger counts (for example 7.7e+02).
sns.heatmap(cf_matrix, annot=True, fmt="d")
plt.title("Confusion Matrix for Random Forest Classifier on Validation Data")
plt.show()

In [None]:
# Final evaluation of the fitted Random Forest on the held-out test split.
y_pred = rf_model.predict(X_test)
# Reported as a percentage so it reads on the same scale as the train-accuracy
# figure printed when the model was fitted.
print(f"Accuracy of the model on test dataset: {accuracy_score(y_test, y_pred)*100}")

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
print(f'Confusion Matrix:\n{cm}')

# Classification report
report = classification_report(y_test, y_pred)
print(f'Classification Report:\n{report}')

# ROC curve and AUC, computed from the predicted probability of the second class
# (column 1 of predict_proba).
y_proba = rf_model.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, y_proba)
fpr, tpr, _ = roc_curve(y_test, y_proba)

# matplotlib.pyplot is already imported as `plt` in the notebook's import cell,
# so it is not re-imported here.
plt.figure()
plt.plot(fpr, tpr, label=f'ROC curve (area = {auc:.2f})')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc='lower right')
plt.show()