In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

# Load the raw churn table from the Kaggle input directory; the following cell
# derives the model features and target from `dataset`.
dataset = pd.read_csv("/kaggle/input/churn-modelling/Churn_Modelling.csv")
# Bare last expression so the notebook renders the loaded frame.
dataset

In [None]:
# Work on a copy: the label encoding below assigns into df_data, and a plain
# `df_data = dataset` alias would silently rewrite the raw frame as well.
df_data = dataset.copy()

le = LabelEncoder()
dummy_columns = [] #array for multiple value columns

# Identifier / free-text columns: never encoded, and dropped from the features.
# (The previous check compared against 'customerID', which does not match the
# `CustomerId` column that is dropped below.)
id_columns = ["RowNumber", "CustomerId", "Surname"]

for column in df_data.columns:
    if df_data[column].dtype == object and column not in id_columns:
        if df_data[column].nunique() == 2:
            #apply Label Encoder for binary ones
            df_data[column] = le.fit_transform(df_data[column])
        else:
            dummy_columns.append(column)
#apply get dummies for selected columns
df_data = pd.get_dummies(data = df_data,columns = dummy_columns, drop_first=True)

# Features are everything except the identifiers and the target.
X = df_data.drop(id_columns + ["Exited"], axis=1)
y = df_data.Exited

#Splitting into test set and training set
# NOTE(review): the split is not stratified on y; consider stratify=y if the
# target turns out to be imbalanced.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

#Use stratified KFold as cross-validation strategy
# shuffle=True needs a fixed random_state, otherwise every cross_val_score call
# below sees different folds and the model comparison is not reproducible.
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

In [None]:
#Logistic Regression Model

#feature scaling
# The scaler is fitted on the training split only and then applied to the test
# split; the later model cells reuse X_train_scaled / X_test_scaled.
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train_scaled = sc_X.fit_transform(X_train)
X_test_scaled = sc_X.transform(X_test)

#Fitting Logistic Regression to the Training set
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0)
fit_1 = classifier.fit(X_train_scaled,y_train)

#Predicting the Test set results
y_pred = classifier.predict(X_test_scaled)

#applying k-Fold Cross Validation
# Cross-validate a scaler+model pipeline on the *unscaled* training data so the
# scaler is re-fitted inside every fold. Scoring on X_train_scaled would let each
# validation fold contribute to the mean/variance used to scale its own training
# fold (preprocessing leakage). cross_val_score clones the estimator it is
# given, so the fitted `classifier` above is left untouched.
from sklearn.pipeline import make_pipeline
cv_pipeline = make_pipeline(StandardScaler(), classifier)
accuracies_log = cross_val_score(estimator=cv_pipeline, X=X_train, y=y_train, cv=kfold)

#confusion matrix
cm_log = confusion_matrix(y_test, y_pred)

#classification report
cr_log = classification_report(y_test, y_pred)

In [None]:
#KNN model

#Fitting classifier to the Training set
# p=2 with the minkowski metric is the Euclidean distance, so the model is fitted
# on the standardized features.
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=5,metric='minkowski',p=2)
classifier.fit(X_train_scaled,y_train)

#Predicting the test set value
y_pred = classifier.predict(X_test_scaled)

#applying k-Fold Cross Validation
# Cross-validate a scaler+model pipeline on the *unscaled* training data so the
# scaler is re-fitted inside every fold; scoring on X_train_scaled would leak
# each validation fold into the scaling statistics. cross_val_score clones the
# estimator it is given, so the fitted `classifier` above is left untouched.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
cv_pipeline = make_pipeline(StandardScaler(), classifier)
accuracies_knn = cross_val_score(estimator=cv_pipeline, X=X_train, y=y_train, cv=kfold)

#confusion matrix
cm_knn = confusion_matrix(y_test, y_pred)

#classification report
cr_knn = classification_report(y_test, y_pred)

In [None]:
#Support Vector Machine Model

from sklearn.svm import SVC

#Fitting classifier to the Training set
classifier = SVC(kernel='rbf', random_state=0)
classifier.fit(X_train_scaled,y_train)

#Predicting the test set value
y_pred = classifier.predict(X_test_scaled)

#applying k-Fold Cross Validation
# Cross-validate a scaler+model pipeline on the *unscaled* training data so the
# scaler is re-fitted inside every fold; scoring on X_train_scaled would leak
# each validation fold into the scaling statistics. cross_val_score clones the
# estimator it is given, so the fitted `classifier` above is left untouched.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
cv_pipeline = make_pipeline(StandardScaler(), classifier)
accuracies_svm_gaussian = cross_val_score(estimator=cv_pipeline, X=X_train, y=y_train, cv=kfold)

#confusion matrix
cm_svm_gaussian = confusion_matrix(y_test, y_pred)

#classification report
cr_svm_gaussian = classification_report(y_test, y_pred)

In [None]:
# --- Gaussian Naive Bayes ---
from sklearn.naive_bayes import GaussianNB

# Train on the standardized training features (fit() hands back the estimator).
classifier = GaussianNB().fit(X_train_scaled, y_train)

# Hold-out evaluation: predictions, confusion matrix and text report.
y_pred = classifier.predict(X_test_scaled)
cm_naivebayes = confusion_matrix(y_test, y_pred)
cr_naivebayes = classification_report(y_test, y_pred)

# Per-fold accuracy on the training split using the shared `kfold` strategy.
accuracies_naivebayes = cross_val_score(classifier, X_train_scaled, y_train, cv=kfold)

In [None]:
# --- Decision Tree (entropy criterion) ---
from sklearn.tree import DecisionTreeClassifier

# Build the tree on the standardized training features with a fixed seed.
classifier = DecisionTreeClassifier(random_state=0, criterion='entropy')
classifier.fit(X_train_scaled, y_train)

# Hold-out evaluation: predictions, confusion matrix and text report.
y_pred = classifier.predict(X_test_scaled)
cm_dectree = confusion_matrix(y_test, y_pred)
cr_dectree = classification_report(y_test, y_pred)

# Per-fold accuracy on the training split using the shared `kfold` strategy.
accuracies_dectree = cross_val_score(classifier, X_train_scaled, y_train, cv=kfold)

In [None]:
# --- Random Forest (10 entropy trees) ---
from sklearn.ensemble import RandomForestClassifier

# Train the ensemble on the standardized training features with a fixed seed.
classifier = RandomForestClassifier(
    n_estimators=10,
    criterion='entropy',
    random_state=0,
)
classifier.fit(X_train_scaled, y_train)

# Hold-out evaluation: predictions, confusion matrix and text report.
y_pred = classifier.predict(X_test_scaled)
cm_randfor = confusion_matrix(y_test, y_pred)
cr_randfor = classification_report(y_test, y_pred)

# Per-fold accuracy on the training split using the shared `kfold` strategy.
accuracies_randfor = cross_val_score(classifier, X_train_scaled, y_train, cv=kfold)

In [None]:
# --- XGBoost (default hyper-parameters) ---
from xgboost import XGBClassifier

# This model is trained on the unscaled feature frame, unlike the cells that
# use X_train_scaled.
xgbmodel = XGBClassifier()
xgbmodel.fit(X_train, y_train)

# Hold-out evaluation: predictions, confusion matrix and text report.
y_pred = xgbmodel.predict(X_test)
cm_xgboost = confusion_matrix(y_test, y_pred)
cr_xgboost = classification_report(y_test, y_pred)

# Per-fold accuracy on the training split using the shared `kfold` strategy.
accuracies_xgboost = cross_val_score(xgbmodel, X_train, y_train, cv=kfold)

In [None]:
# Cross-validation summary: mean and standard deviation of the per-fold
# accuracies for every model, best mean first.

cv_scores = {
    "Logistic": accuracies_log,
    "KNN": accuracies_knn,
    "SVM_Gaussian": accuracies_svm_gaussian,
    "Naive Bayes": accuracies_naivebayes,
    "Decision Tree": accuracies_dectree,
    "Random Forest": accuracies_randfor,
    "XGBoost": accuracies_xgboost,
}

# One row per model, built in a single constructor call.
acc_df = pd.DataFrame(
    [(scores.mean(), scores.std()) for scores in cv_scores.values()],
    index=list(cv_scores),
    columns=["skfold_mean", "skfold_std"],
)
acc_df = acc_df.sort_values(by="skfold_mean", ascending=False)
acc_df

In [None]:
def cla_report(matrix,name):
    """Summarise a binary confusion matrix as per-class precision/recall plus accuracy.

    Cells are read as ``matrix[true_label, predicted_label]``, which is the
    layout of the ``confusion_matrix(y_test, y_pred)`` results the notebook
    passes in. Label 0 is reported as "Stay" and label 1 as "Leave".

    Parameters
    ----------
    matrix : object indexable as ``matrix[i, j]`` for i, j in {0, 1}
        Confusion-matrix counts.
    name : str
        Value stored under the "Model" key of the result.

    Returns
    -------
    dict
        Keys "Model", "Precision Stay", "Precision Leave", "Recall Stay",
        "Recall Leave" and "Accuracy". Any ratio whose denominator is zero
        (a class that is never predicted, or never present) is NaN.
    """
    def _ratio(numerator, denominator):
        # Guard the 0/0 case explicitly instead of raising (plain ints) or
        # emitting a RuntimeWarning (numpy ints).
        return numerator / denominator if denominator != 0 else float("nan")

    # Name each cell by what it actually holds; the earlier TP/FP/FN/TN labels
    # did not match the cells they were assigned from.
    stay_as_stay = matrix[0,0]    # true 0, predicted 0
    stay_as_leave = matrix[0,1]   # true 0, predicted 1
    leave_as_stay = matrix[1,0]   # true 1, predicted 0
    leave_as_leave = matrix[1,1]  # true 1, predicted 1

    # Precision divides by a predicted-label column, recall by a true-label row.
    precision_class_1 = _ratio(leave_as_leave, leave_as_leave + stay_as_leave)
    recall_class_1 = _ratio(leave_as_leave, leave_as_leave + leave_as_stay)
    precision_class_0 = _ratio(stay_as_stay, stay_as_stay + leave_as_stay)
    recall_class_0 = _ratio(stay_as_stay, stay_as_stay + stay_as_leave)
    accuracy = _ratio(
        stay_as_stay + leave_as_leave,
        stay_as_stay + stay_as_leave + leave_as_stay + leave_as_leave,
    )
    return {"Model":name,"Precision Stay":precision_class_0, "Precision Leave":precision_class_1, 
            "Recall Stay":recall_class_0, "Recall Leave":recall_class_1, "Accuracy":accuracy}



In [None]:
# Test-set comparison: one cla_report row per model, ranked by how well each
# model finds the customers who leave (recall first, then precision).

named_matrices = [
    (cm_svm_gaussian, "SVM Gaussian"),
    (cm_log, "Logistic"),
    (cm_randfor, "Random Forest"),
    (cm_xgboost, "XGBoost"),
    (cm_knn, "KNN"),
    (cm_naivebayes, "Naive Bayes"),
    (cm_dectree, "Decision Tree"),
]
cml = [cla_report(matrix, label) for matrix, label in named_matrices]

cml_df = pd.DataFrame(cml).sort_values(
    by=["Recall Leave", "Precision Leave"],
    ascending=False,
)
cml_df

In [None]:
# Feature-importance chart for the fitted XGBoost model (`xgbmodel`).

import matplotlib.pyplot as plt
from xgboost import plot_importance

# Create the figure/axes explicitly so the chart size is controlled here, then
# let plot_importance draw onto that axes; `p` keeps whatever it returns.
fig, ax = plt.subplots(figsize=(10,8))
p = plot_importance(xgbmodel, ax=ax)

In [None]:
# Score every row of the encoded frame with the XGBoost model, selecting the
# same feature columns the model was trained on.
# NOTE(review): this includes the rows the model was fitted on, so those
# probabilities are likely optimistic.
feature_frame = df_data[X_train.columns]
class_probabilities = xgbmodel.predict_proba(feature_frame)
df_data['ChurnProb'] = class_probabilities[:, 1]  # column 1 of predict_proba
df_data[['CustomerId', 'ChurnProb']].head()