In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from pylab import rcParams
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

In [None]:
# Load the churn-modelling CSV from the relative input directory and preview it.
churn_csv_path = '../input/churn-modelling/Churn_Modelling.csv'
data = pd.read_csv(churn_csv_path, index_col = None)
data.head()

In [None]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
data.shape

In [None]:
#data.drop([0,1], axis = 0).head()

In [None]:
# Print a concise summary of the frame: columns, non-null counts and dtypes.
data.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
data.describe()

In [None]:
# Number of distinct values in the Geography column.
data.Geography.nunique()

In [None]:
# Number of distinct values in the NumOfProducts column.
data.NumOfProducts.nunique()

In [None]:
# Number of distinct values in the target column Exited.
data.Exited.nunique()

In [None]:
# Class balance of the target: how many rows fall into each Exited value.
data.Exited.value_counts()

In [None]:
# Count of missing values per column.
data.isna().sum()

In [None]:
# Histogram of the Age column with matplotlib's default binning.
_, age_ax = plt.subplots()
age_ax.hist(data['Age'])
plt.show()

In [None]:
# Draw one histogram per selected numeric column, each in its own figure.
# Every figure is labelled on both axes and titled so it can be read on its
# own; the bare print() that only emitted blank output lines was removed.
for column in ['CreditScore', 'Age', 'Tenure', 'Balance']:
    plt.hist(data[column])
    plt.xlabel(column)
    plt.ylabel('Number of customers')
    plt.title('Distribution of ' + column)
    plt.show()

In [None]:
# Derive a working frame with a categorical Age_Band column; `data` itself is
# left unmodified because assign() returns a new frame.
# With include_lowest=True the bands are [0, 18], (18, 40], (40, 60], (60, 100].
# NOTE(review): an age of exactly 18 is therefore labelled 'Minor' — confirm
# that this is the intended cut-off.
bins = [0, 18, 40, 60, 100]
labels = ['Minor', 'Adult', 'Middle_Age', 'Senior']

data1 = data.assign(
    Age_Band = pd.cut(data['Age'], bins = bins, labels = labels, include_lowest = True)
)

In [None]:
# Spot-check the banding against the raw age on the last five rows.
data1.loc[:, ['Age', 'Age_Band']].tail()

In [None]:
# Remove the identifier columns and the raw Age column from the working frame.
# The result is reassigned instead of using inplace=True, and errors='ignore'
# skips columns that are already gone, so re-running this cell is a no-op
# rather than a KeyError.
data1 = data1.drop(columns = ['CustomerId', 'RowNumber', 'Surname', 'Age'], errors = 'ignore')
data1.head()

In [None]:
# One-hot encode the three categorical columns, dropping the first level of
# each so that level acts as the baseline.
# dtype=int pins the indicator columns to integers: pandas >= 2.0 returns
# booleans by default, and a frame mixing float, int and bool columns turns
# into an object-dtype array when `.values` is taken from it later.
geography = pd.get_dummies(data1['Geography'], drop_first = True, dtype = int)
gender = pd.get_dummies(data1['Gender'], drop_first = True, dtype = int)
age_band = pd.get_dummies(data1['Age_Band'], drop_first = True, dtype = int)

# Replace the original categorical columns with their indicator columns.
data2 = pd.concat(
    [data1.drop(columns = ['Geography', 'Gender', 'Age_Band']), geography, gender, age_band],
    axis = 1,
)

data2.head()

In [None]:
# Column labels of the encoded frame, used to pick the modelling columns.
data2.columns

In [None]:
# Build the modelling frame: the feature columns first, the target 'Exited' last.
feature_columns = [
    'CreditScore', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
    'IsActiveMember', 'EstimatedSalary', 'Germany', 'Spain',
    'Male', 'Adult', 'Middle_Age', 'Senior',
]
df = data2.loc[:, feature_columns + ['Exited']]
df.head()

In [None]:
# Split the modelling frame positionally: every column but the last becomes
# the feature matrix X, the last column becomes the integer target vector Y.
features_frame = df.iloc[:, :-1]
target_series = df.iloc[:, -1]

X = features_frame.values
Y = target_series.values.astype('int')

In [None]:
# Hold out 20% of the rows for evaluation with a fixed seed.
# stratify=Y keeps the class proportions of the target identical in the train
# and test partitions, so per-class precision/recall on the test set are not
# skewed by an unlucky draw of the minority class.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size = 0.2, random_state = 10, stratify = Y
)

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Logistic regression on standardised features.
# The raw columns span very different ranges, so the default lbfgs solver
# stops at its iteration limit before converging, and the resulting
# ConvergenceWarning is hidden by the notebook-wide warnings filter.
# The scaler is fitted on the training rows only and re-applied automatically
# in predict / predict_proba, so callers keep passing the raw X_test.
# The fitted estimator itself is available as lr[-1] (e.g. lr[-1].coef_).
lr = make_pipeline(StandardScaler(), LogisticRegression(max_iter = 1000))
lr.fit(X_train, Y_train)
Y_pred = lr.predict(X_test)

In [None]:
# Evaluate the logistic-regression predictions on the held-out rows.
lr_confusion = confusion_matrix(Y_test, Y_pred)
lr_report = classification_report(Y_test, Y_pred)
lr_accuracy = accuracy_score(Y_test, Y_pred)

print(lr_confusion)
print()
print('Classification Report')
print(lr_report)
print()
print('Accuracy of Model :', round(lr_accuracy, 4))

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees, each limited to depth 10, with a fixed seed.
rf_params = dict(n_estimators = 100, max_depth = 10, random_state = 100)
rf = RandomForestClassifier(**rf_params)
rf.fit(X_train, Y_train)

# Hard class predictions for the held-out rows.
Y_pred_rf = rf.predict(X_test)

In [None]:
# Evaluate the random-forest predictions on the held-out rows.
rf_confusion = confusion_matrix(Y_test, Y_pred_rf)
rf_report = classification_report(Y_test, Y_pred_rf)
rf_accuracy = accuracy_score(Y_test, Y_pred_rf)

print(rf_confusion)
print()
print('Classification Report')
print(rf_report)
print()
print('Accuracy of Model :', round(rf_accuracy, 4))

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# Extra-trees ensemble: 400 entropy-split trees limited to depth 10, fixed seed.
et_params = dict(n_estimators = 400, criterion = 'entropy', max_depth = 10, random_state = 10)
et = ExtraTreesClassifier(**et_params)
et.fit(X_train, Y_train)

# Hard class predictions for the held-out rows.
Y_pred_et = et.predict(X_test)

In [None]:
# Evaluate the extra-trees predictions on the held-out rows.
et_confusion = confusion_matrix(Y_test, Y_pred_et)
et_report = classification_report(Y_test, Y_pred_et)
et_accuracy = accuracy_score(Y_test, Y_pred_et)

print(et_confusion)
print()
print('Classification Report')
print(et_report)
print()
print('Accuracy of Model :', round(et_accuracy, 4))

In [None]:
from sklearn.metrics import roc_curve

# Class-probability estimates on the held-out rows. Column 1 is the
# probability of the second class label (Exited = 1, assuming 0/1 labels).
Y_pred_prob_lr = lr.predict_proba(X_test)
Y_pred_prob_et = et.predict_proba(X_test)
Y_pred_prob_rf = rf.predict_proba(X_test)

# ROC coordinates per model; these arrays are reused by later plotting cells.
fpr_lr, tpr_lr, thresholds_lr = roc_curve(Y_test, Y_pred_prob_lr[:,1])
fpr_et, tpr_et, thresholds_et = roc_curve(Y_test, Y_pred_prob_et[:,1])
fpr_rf, tpr_rf, thresholds_rf = roc_curve(Y_test, Y_pred_prob_rf[:,1])

# Left panel: logistic regression vs. extra trees.
# Right panel: logistic regression vs. random forest.
fig, (ax_et, ax_rf) = plt.subplots(1, 2, figsize = (10, 4))

ax_et.plot(fpr_lr, tpr_lr, label = 'Logistic Regression')
ax_et.plot(fpr_et, tpr_et, label = 'Extra Trees')

ax_rf.plot(fpr_lr, tpr_lr, label = 'Logistic Regression')
ax_rf.plot(fpr_rf, tpr_rf, 'g', label = 'Random Forest')

# Shared formatting. The title names the churn task (it previously said
# "Heart disease"), and the ExtraTreesClassifier curve is labelled as such
# rather than as 'Bagging'.
for ax in (ax_et, ax_rf):
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.0])
    ax.set_title('ROC curve for customer churn classifiers')
    ax.set_xlabel('False positive rate (1-Specificity)')
    ax.set_ylabel('True positive rate (Sensitivity)')
    ax.legend()
    ax.grid(True)

# Keep the two panels' labels from overlapping.
fig.tight_layout()
plt.show()

In [None]:
# Overlay the ROC curves of the three fitted models on a single set of axes,
# using the fpr_*/tpr_* arrays computed in the preceding ROC cell.
plt.plot(fpr_lr, tpr_lr, label = 'Logistic Regression')
plt.plot(fpr_et, tpr_et, label = 'Extra Trees')
plt.plot(fpr_rf, tpr_rf, label = 'Random Forest')

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])

# The title names the churn task (it previously said "Heart disease"), and the
# ExtraTreesClassifier curve is labelled as such rather than as 'Bagging'.
plt.title('ROC curve for customer churn classifiers')
plt.xlabel('False positive rate (1-Specificity)')
plt.ylabel('True positive rate (Sensitivity)')
plt.legend()
plt.grid(True)

plt.show()

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Single Gini decision tree with no depth limit and a fixed seed.
dt_params = dict(criterion = 'gini', max_depth = None, random_state = 10)
dt = DecisionTreeClassifier(**dt_params)
dt.fit(X_train, Y_train)
Y_pred_dt = dt.predict(X_test)

# Evaluate the tree's predictions on the held-out rows.
dt_confusion = confusion_matrix(Y_test, Y_pred_dt)
dt_report = classification_report(Y_test, Y_pred_dt)
dt_accuracy = accuracy_score(Y_test, Y_pred_dt)

print(dt_confusion)
print()
print('Classification Report')
print(dt_report)
print()
print('Accuracy of Model :', round(dt_accuracy, 4))

In [None]:
# ROC coordinates for the decision tree, from its class-1 probability column.
Y_pred_prob_dt = dt.predict_proba(X_test)
fpr_dt, tpr_dt, thresholds_dt = roc_curve(Y_test, Y_pred_prob_dt[:,1])

# Overlay all four models; the other fpr_*/tpr_* arrays come from the earlier
# ROC cell.
plt.plot(fpr_lr, tpr_lr, label = 'Logistic Regression')
plt.plot(fpr_dt, tpr_dt, label = 'Decision Tree')
plt.plot(fpr_et, tpr_et, label = 'Extra Trees')
plt.plot(fpr_rf, tpr_rf, label = 'Random Forest')

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])

# The title names the churn task (it previously said "Heart disease"), and the
# ExtraTreesClassifier curve is labelled as such rather than as 'Bagging'.
plt.title('ROC curve for customer churn classifiers')
plt.xlabel('False positive rate (1-Specificity)')
plt.ylabel('True positive rate (Sensitivity)')
plt.legend()
plt.grid(True)

plt.show()

In [None]:
from sklearn.metrics import roc_auc_score

# Area under the ROC curve per model, computed from the class-1 probability
# column. The ExtraTreesClassifier entry is labelled 'Extra Trees' instead of
# the earlier, inaccurate 'Bagging'.
auc_inputs = [
    ('Logistic Regression', Y_pred_prob_lr),
    ('Decision Tree', Y_pred_prob_dt),
    ('Extra Trees', Y_pred_prob_et),
    ('Random Forest', Y_pred_prob_rf),
]
for model_name, probabilities in auc_inputs:
    auc_value = roc_auc_score(Y_test, probabilities[:,1])
    print('AUC value for ' + model_name + ' is ', round(auc_value, 4))

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# 3-nearest-neighbours on standardised features.
# Neighbour distances are computed across all columns, so without scaling the
# columns with the largest numeric ranges dominate and the 0/1 indicator
# columns contribute almost nothing. The scaler is fitted on the training rows
# only and re-applied inside predict, so the raw X_test is still passed in.
knn = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors = 3))
knn.fit(X_train, Y_train)
Y_pred_knn = knn.predict(X_test)

# Evaluate the predictions on the held-out rows.
print(confusion_matrix(Y_test,Y_pred_knn))
print()
print('Classification Report')
print(classification_report(Y_test, Y_pred_knn))
print()
print('Accuracy of Model :',round(accuracy_score(Y_test, Y_pred_knn),4))

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Sigmoid-kernel SVM on standardised features.
# The kernel is built from inner products of the feature vectors, so without
# scaling the columns with the largest numeric ranges dominate it. The scaler
# is fitted on the training rows only and re-applied inside predict, so the
# raw X_test is still passed in.
svm = make_pipeline(StandardScaler(), SVC(kernel = 'sigmoid', C = 0.6, random_state = 10))
svm.fit(X_train, Y_train)
Y_pred_svm = svm.predict(X_test)

# Evaluate the predictions on the held-out rows.
print(confusion_matrix(Y_test,Y_pred_svm))
print()
print('Classification Report')
print(classification_report(Y_test, Y_pred_svm))
print()
print('Accuracy of Model :',round(accuracy_score(Y_test, Y_pred_svm),4))