In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Collect the full path of every file found under the Kaggle input directory,
# then print them one per line in walk order.
input_paths = [
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB,BernoulliNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier

from sklearn.metrics import accuracy_score,confusion_matrix,roc_auc_score,ConfusionMatrixDisplay,precision_score,recall_score,f1_score,classification_report,roc_curve,plot_roc_curve,auc,precision_recall_curve,plot_precision_recall_curve,average_precision_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

from lightgbm import LGBMClassifier
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the churn CSV from the Kaggle input directory into the working frame `df`.
df = pd.read_csv('../input/churn-modelling/Churn_Modelling.csv')

In [None]:
df.head()

In [None]:
df.info()

In [None]:
df.describe()

### Missing Values

In [None]:
df.isnull().sum()

In [None]:
# Missing-value map: one heatmap row per column of `df`, one heatmap column
# per record, shaded wherever the value is NaN.
f, ax = plt.subplots(nrows=1, ncols=1, figsize=(16, 5))

# Draw onto the axes created above rather than relying on the implicit
# "current axes".
sns.heatmap(df.T.isna(), cmap='Blues', ax=ax)
ax.set_title('Missing Values')

# `Tick.label` is deprecated and absent from current Matplotlib releases;
# `tick_params` sets the same y tick-label size through the supported API.
ax.tick_params(axis='y', labelsize=14)
plt.show()

## Data Visualization
### 2.1 Correlation Heat Map

In [None]:
# Compute the correlation matrix over numeric columns only.
# `df` still holds string columns at this point; recent pandas no longer drops
# them silently in `DataFrame.corr()` and raises instead, so they are filtered
# out explicitly (this form also works on older pandas).
numeric_df = df.select_dtypes(include=['number', 'bool'])
corr = numeric_df.corr()

# Generate a mask for the upper triangle (the matrix is symmetric, so only the
# lower triangle needs to be shown).
mask = np.triu(np.ones_like(corr, dtype=bool))

# Set up the matplotlib figure.
f, ax = plt.subplots(figsize=(11, 9))

# Generate a custom diverging colormap.
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio.
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0, square=True,
            linewidths=.5, cbar_kws={'shrink': .5}, ax=ax)

In [None]:
# Exited correlations
# Order the rows of the correlation matrix by their 'Exited' value (largest
# first), keep the top row, and list that row's entries in descending order.
rows_by_exited = corr.sort_values(by=['Exited'], ascending=False)
strongest_row = rows_by_exited.iloc[0]
strongest_row.sort_values(ascending=False)

### 2.2 CountPlot

In [None]:
# Gender: tabulate the category counts, then chart them.
sns.set_theme(style='darkgrid')
gender_counts = df['Gender'].value_counts()
print(gender_counts)
ax = sns.countplot(x='Gender', data=df)

In [None]:
# Tenure: tabulate the category counts, then chart them.
# The printed table previously showed the Gender counts (copy-paste slip)
# while the chart showed Tenure; both now describe Tenure.
sns.set_theme(style='darkgrid')
print(df['Tenure'].value_counts())
ax = sns.countplot(data=df, x='Tenure')

In [None]:
# NumOfProducts: tabulate the category counts, then chart them.
sns.set_theme(style='darkgrid')
product_counts = df['NumOfProducts'].value_counts()
print(product_counts)
ax = sns.countplot(x='NumOfProducts', data=df)

In [None]:
# HasCrCard: tabulate the category counts, then chart them.
sns.set_theme(style='darkgrid')
card_counts = df['HasCrCard'].value_counts()
print(card_counts)
ax = sns.countplot(x='HasCrCard', data=df)

In [None]:
# IsActiveMember: tabulate the category counts, then chart them.
sns.set_theme(style='darkgrid')
active_counts = df['IsActiveMember'].value_counts()
print(active_counts)
ax = sns.countplot(x='IsActiveMember', data=df)

In [None]:
# Exited (the target): tabulate the class counts, then chart them.
sns.set_theme(style='darkgrid')
exited_counts = df['Exited'].value_counts()
print(exited_counts)
ax = sns.countplot(x='Exited', data=df)

In [None]:
# Geography: tabulate the category counts, then chart them.
sns.set_theme(style='darkgrid')
geography_counts = df['Geography'].value_counts()
print(geography_counts)
ax = sns.countplot(x='Geography', data=df)

### 2.3 Distribution Plot

In [None]:
# Histogram + KDE of CreditScore.
# `sns.displot` is figure-level: it always opens its own figure, so a
# preceding `plt.figure(...)` only produces an extra empty canvas. The
# axes-level `sns.histplot` draws onto the figure sized here instead.
fig, ax = plt.subplots(figsize=(7, 7))
sns.histplot(df.CreditScore, color='green', label='CreditScore', kde=True, ax=ax)
ax.legend();

In [None]:
# Histogram + KDE of Age.
# `sns.displot` is figure-level: it always opens its own figure, so a
# preceding `plt.figure(...)` only produces an extra empty canvas. The
# axes-level `sns.histplot` draws onto the figure sized here instead.
# `kde` is a boolean flag; the string 'True' only worked because it is truthy.
fig, ax = plt.subplots(figsize=(7, 7))
sns.histplot(df.Age, color='blue', label='Age', kde=True, ax=ax)
ax.legend();

In [None]:
# Histogram + KDE of Balance.
# `sns.displot` is figure-level: it always opens its own figure, so a
# preceding `plt.figure(...)` only produces an extra empty canvas. The
# axes-level `sns.histplot` draws onto the figure sized here instead.
fig, ax = plt.subplots(figsize=(7, 7))
sns.histplot(df.Balance, color='red', label='Balance', kde=True, ax=ax)
ax.legend();

In [None]:
# Histogram + KDE of EstimatedSalary.
# `sns.displot` is figure-level: it always opens its own figure, so a
# preceding `plt.figure(...)` only produces an extra empty canvas. The
# axes-level `sns.histplot` draws onto the figure sized here instead.
fig, ax = plt.subplots(figsize=(7, 7))
sns.histplot(df.EstimatedSalary, label='EstimatedSalary', color='green', kde=True, ax=ax)
ax.legend();

### 2.4 Violin Plot

In [None]:
# 2x3 grid of violin plots relating six features to the Exited target.
sns.set_theme(style='darkgrid')
fig, axes = plt.subplots(2, 3, figsize=(13, 13))

# (x, y) column pair for each panel, in row-major order.
# When both variables are numeric seaborn treats `x` as the grouping variable,
# so a continuous feature on `x` yields one violin per distinct value. For the
# continuous features (CreditScore, Age, Balance) the target therefore goes on
# `x`, giving one violin per Exited class. The low-cardinality features keep
# their original layout.
panels = [
    ('Exited', 'CreditScore'),
    ('Gender', 'Exited'),
    ('Exited', 'Age'),
    ('Tenure', 'Exited'),
    ('Exited', 'Balance'),
    ('NumOfProducts', 'Exited'),
]
for ax, (x_col, y_col) in zip(axes.flat, panels):
    sns.violinplot(x=x_col, y=y_col, data=df, ax=ax)
plt.show()

In [None]:
df.head()

### Descriptive Statistics

In [None]:
# Render the summary statistics of selected variables as an annotated table.
stat_rows = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts',
             'HasCrCard', 'IsActiveMember', 'EstimatedSalary', 'Exited']
stat_cols = ["count", "mean", "std", "min", "25%", "50%", "75%", "max"]

# One row per variable, one column per statistic, restricted to the labels
# above (labels absent from the summary would appear as NaN rows/columns).
desc = df.describe().T
df1 = desc.reindex(index=stat_rows, columns=stat_cols)

f, ax = plt.subplots(figsize=(12, 12))

# The colour bar is hidden: the cell annotations carry the information.
sns.heatmap(
    df1,
    ax=ax,
    annot=True,
    annot_kws={"size": 16},
    fmt='.0f',
    cmap="Blues",
    linewidths=5,
    cbar=False,
)

ax.set_title("Descriptive Statistics", size=16)
ax.set_ylabel("Variables")
plt.xticks(size=18)
plt.yticks(size=12, rotation=0)
plt.show()

## Data Preprocessing

In [None]:
# Remove columns that identify a record rather than describe the customer, so
# they cannot be picked up as model features.
# NOTE(review): 'RowNumber' is assumed to be the file's running row index —
# confirm against the CSV. `errors='ignore'` skips any listed column that is
# absent, which also makes this cell safe to re-run.
df = df.drop(columns=['RowNumber', 'CustomerId', 'Surname'], errors='ignore')

In [None]:
df.head()

In [None]:
# Replace the Geography and Gender values with integer class codes.
# The same encoder object is refit for each column, so after the loop it only
# remembers the mapping of the last column processed.
le = LabelEncoder()
for label_col in ('Geography', 'Gender'):
    df[label_col] = le.fit_transform(df[label_col])

In [None]:
df.head()

In [None]:
# Cut EstimatedSalary into 5 equal-width bands (kept as a helper column) and
# show the mean of Exited within each band, ordered by band.
df['EstimatedSalaryBand'] = pd.cut(df['EstimatedSalary'], 5)
salary_band_means = (
    df[['EstimatedSalaryBand', 'Exited']]
    .groupby(['EstimatedSalaryBand'], as_index=False)
    .mean()
)
salary_band_means.sort_values(by='EstimatedSalaryBand', ascending=True)

In [None]:
df.head()

In [None]:
# Replace EstimatedSalary with an ordinal band code 0-4.
# Bands are right-inclusive: (-inf, 40007] -> 0, (40007, 80003] -> 1,
# (80003, 120000] -> 2, (120000, 159996] -> 3, (159996, inf) -> 4.
salary_edges = [-np.inf, 40007, 80003, 120000, 159996, np.inf]
salary_codes = pd.cut(df['EstimatedSalary'], bins=salary_edges, labels=False)
# Keep the column's existing dtype so downstream cells see the same type.
df['EstimatedSalary'] = salary_codes.astype(df['EstimatedSalary'].dtype)

df.head()

In [None]:
df.EstimatedSalary.unique()

In [None]:
# The helper band column is no longer needed once the salary has been coded.
del df['EstimatedSalaryBand']

In [None]:
df.head()

In [None]:
# Cut Balance into 5 equal-width bands (kept as a helper column) and show the
# mean of Exited within each band, ordered by band.
df['BalanceBand'] = pd.cut(df['Balance'], 5)
balance_band_means = (
    df[['BalanceBand', 'Exited']]
    .groupby(['BalanceBand'], as_index=False)
    .mean()
)
balance_band_means.sort_values(by='BalanceBand', ascending=True)

In [None]:
df.head()

In [None]:
# Replace Balance with an ordinal band code 0-6.
# Bands are right-inclusive: (-inf, 0] -> 0, (0, 251] -> 1, (251, 50179] -> 2,
# (50179, 100359] -> 3, (100359, 150538] -> 4, (150538, 200718] -> 5,
# (200718, inf) -> 6.
# The top band is open-ended. It was previously capped at 250000, so any
# larger balance kept its raw value and ended up mixed in with the 0-6 codes
# (the salary and credit-score binning cells already use an open top band).
balance_edges = [-np.inf, 0, 251, 50179, 100359, 150538, 200718, np.inf]
balance_codes = pd.cut(df['Balance'], bins=balance_edges, labels=False)
# Keep the column's existing dtype so downstream cells see the same type.
df['Balance'] = balance_codes.astype(df['Balance'].dtype)
df.head()

In [None]:
# The helper band column is no longer needed once the balance has been coded.
del df['BalanceBand']

In [None]:
df.head()

In [None]:
# Cut CreditScore into 5 equal-width bands (kept as a helper column) and show
# the mean of Exited within each band, ordered by band.
df['CreditScoreBand'] = pd.cut(df['CreditScore'], 5)
credit_band_means = (
    df[['CreditScoreBand', 'Exited']]
    .groupby(['CreditScoreBand'], as_index=False)
    .mean()
)
credit_band_means.sort_values(by='CreditScoreBand', ascending=True)

In [None]:
# Replace CreditScore with an ordinal band code 0-5.
# Bands are right-inclusive: (-inf, 450] -> 0, (450, 550] -> 1,
# (550, 650] -> 2, (650, 750] -> 3, (750, 850] -> 4, (850, inf) -> 5.
credit_edges = [-np.inf, 450, 550, 650, 750, 850, np.inf]
credit_codes = pd.cut(df['CreditScore'], bins=credit_edges, labels=False)
# Keep the column's existing dtype so downstream cells see the same type.
df['CreditScore'] = credit_codes.astype(df['CreditScore'].dtype)

In [None]:
# The helper band column is no longer needed once the score has been coded.
del df['CreditScoreBand']

In [None]:
df.head()

In [None]:
# Target is the Exited column; the features are every remaining column.
y = df['Exited']
X = df.drop(columns='Exited')

In [None]:
# Hold out 33% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [None]:
# Report the shape of the full frame and of each split, with a row of
# asterisks between consecutive entries.
shape_report = [
    ('DataFrame Shape:', df),
    ('X Train Shape:', X_train),
    ('X Test Shape:', X_test),
    ('Y Train Shape:', y_train),
    ('Y Test Shape:', y_test),
]
for position, (caption, data) in enumerate(shape_report):
    if position > 0:
        print('*'*25)
    print(caption, data.shape)

## Model Training and Evaluation

In [None]:
# Candidate classifiers as [display name, estimator] pairs; the evaluation
# loop reads the name at index 0 and the estimator at index 1.
models = [
    # 'logloss' is the binary log-loss metric matching 'binary:logistic';
    # 'mlogloss' is its multiclass counterpart.
    ['XGBClassifier', XGBClassifier(learning_rate=0.1, objective='binary:logistic',
                                    random_state=0, eval_metric='logloss')],
    # Warnings are silenced notebook-wide, so a solver that stops before
    # converging would go unnoticed; allow it more iterations than the default.
    ['Logistic Regression', LogisticRegression(random_state=0, max_iter=1000)],
    ['SVM', SVC(random_state=0)],
    ['KNeigbors', KNeighborsClassifier()],
    ['GaussianNB', GaussianNB()],
    ['BernoulliNB', BernoulliNB()],
    ['DecisionTree', DecisionTreeClassifier(random_state=0)],
    ['RandomForest', RandomForestClassifier(random_state=0)],
    # Seeded like the other randomised estimators so reruns are comparable.
    ['AdaBoostClassifier', AdaBoostClassifier(random_state=0)],
]

In [None]:
def _positive_class_scores(model, X):
    """Return a continuous score for the positive class of a fitted classifier.

    Uses the probability of class 1 when the estimator exposes `predict_proba`
    and falls back to `decision_function` otherwise (e.g. an SVC fitted
    without probability estimates).
    """
    if hasattr(model, 'predict_proba'):
        return model.predict_proba(X)[:, 1]
    return model.decision_function(X)


# Fit every candidate on the training split, report hold-out and 10-fold
# metrics, and collect one result row per model in `lst_1`.
lst_1 = []
for name, model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_score = _positive_class_scores(model, X_test)
    cm = confusion_matrix(y_test, y_pred)

    # k-fold validation on the training split (the estimator is cloned per fold)
    accuracies = cross_val_score(estimator=model, X=X_train, y=y_train, cv=10)

    test_accuracy = accuracy_score(y_test, y_pred)
    # ROC AUC needs ranking scores; hard 0/1 predictions collapse the curve
    # to a single operating point.
    roc = roc_auc_score(y_test, y_score)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    print(name, ':')
    print(cm)
    print('Accuracy Score: ', test_accuracy)
    print('')
    print('K-Fold Validation Mean Accuracy: {:.2f} %'.format(accuracies.mean()*100))
    print('')
    print('Standard Deviation: {:.2f} %'.format(accuracies.std()*100))
    print('')
    # The metrics below are fractions in [0, 1]; scale them so the printed
    # value matches the '%' suffix.
    print('ROC AUC Score: {:.2f} %'.format(roc*100))
    print('')
    print('Precision: {:.2f} %'.format(precision*100))
    print('')
    print('Recall: {:.2f} %'.format(recall*100))
    print('')
    print('F1 Score: {:.2f} %'.format(f1*100))
    print('-'*40)
    print('')

    # Row layout expected by the results table: accuracy figures in percent,
    # the remaining metrics as fractions.
    lst_1.append([
        name,
        test_accuracy*100,
        accuracies.mean()*100,
        accuracies.std()*100,
        roc,
        precision,
        recall,
        f1,
    ])

In [None]:
# Assemble the per-model result rows into a table, best accuracy first
# (ties broken by the k-fold mean accuracy).
result_columns = ['Model', 'Accuracy', 'K-Fold Mean Accuracy', 'Std.Deviation',
                  'ROC_AUC', 'Precision', 'Recall', 'F1 Score']
df2 = (
    pd.DataFrame(lst_1, columns=result_columns)
    .sort_values(by=['Accuracy', 'K-Fold Mean Accuracy'], ascending=False)
)
df2

### Model Comparison

In [None]:
# Horizontal bars of hold-out accuracy, one per model, in the table's order.
sns.barplot(data=df2, y='Model', x='Accuracy', color='b').set_title('Model Compare Graphic');

## Model Tuning

In [None]:
# (estimator, parameter grid) pairs consumed by the grid-search loop.
grid_models = [
    (XGBClassifier(),
     [{'learning_rate': [0.01, 0.05, 0.1], 'eval_metric': ['error']}]),
    # 'minkowski' is omitted: with the estimator's default p=2 it is the
    # Euclidean distance, so it only repeated the 'euclidean' candidates.
    (KNeighborsClassifier(),
     [{'n_neighbors': [5, 7, 8, 10],
       'metric': ['euclidean', 'manhattan', 'chebyshev']}]),
    (DecisionTreeClassifier(),
     [{'criterion': ['gini', 'entropy'], 'random_state': [0]}]),
    (RandomForestClassifier(),
     [{'n_estimators': [100, 150, 200], 'criterion': ['gini', 'entropy'],
       'random_state': [0]}]),
]

In [None]:
# Exhaustive 10-fold grid search (scored on accuracy) for each estimator,
# fitted on the training split only.
for estimator, param_grid in grid_models:
    grid = GridSearchCV(estimator=estimator, param_grid=param_grid,
                        scoring='accuracy', cv=10)
    grid.fit(X_train, y_train)
    best_accuracy = grid.best_score_
    best_param = grid.best_params_
    print(' {}: \n Best Accuracy: {:.2f} %'.format(estimator, best_accuracy*100))
    # The winning parameter set was computed but never shown, so the search
    # result could not be used afterwards.
    print(' Best Parameters:', best_param)
    print('')
    print('-'*25)
    print('')

In [None]:
# Refit a single XGBoost model and report its performance on the hold-out set.
# NOTE(review): this uses XGBoost's default hyper-parameters rather than the
# values selected by the grid search — confirm that is intended.
# The original call spelled out every argument as None (i.e. "use the library
# default"). Only the two non-None settings are kept, which also avoids
# passing `gpu_id`, an argument newer XGBoost releases deprecate.
classifier = XGBClassifier(n_estimators=100, importance_type='gain')
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
y_prob = classifier.predict_proba(X_test)[:, 1]  # probability of class 1 (Exited)
cm = confusion_matrix(y_test, y_pred)

print(classification_report(y_test, y_pred))
print(f'ROC AUC score: {roc_auc_score(y_test, y_prob)}')
print('Accuracy Score: ', accuracy_score(y_test, y_pred))

# Visualizing Confusion Matrix
# Rows are the true class and columns the predicted class. The labels name the
# churn outcome; they previously read "stroke", left over from another dataset.
plt.figure(figsize=(8, 5))
sns.heatmap(cm, cmap='Blues', annot=True, fmt='d', linewidths=5, cbar=False,
            annot_kws={'fontsize': 15},
            yticklabels=['Stayed', 'Exited'],
            xticklabels=['Predicted stayed', 'Predicted exited'])
plt.yticks(rotation=0)
plt.show()

# Roc Curve
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(false_positive_rate, true_positive_rate)

sns.set_theme(style='white')
plt.figure(figsize=(8, 8))
plt.plot(false_positive_rate, true_positive_rate, color='#b01717',
         label='AUC = %0.3f' % roc_auc)
plt.legend(loc='lower right')
# Diagonal reference line: the ROC of a classifier that guesses at random.
plt.plot([0, 1], [0, 1], linestyle='--', color='#174ab0')
plt.axis('tight')
plt.title('ROC Curve')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

***Thank you for looking. Please provide your comments.***