In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# Import the required libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from pylab import rcParams
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

# Read Data

In [None]:
# Load the churn dataset from the Kaggle input directory and preview its last rows.
csv_path = '../input/credit-card-customer-churn-prediction/Churn_Modelling.csv'
df = pd.read_csv(csv_path)
df.tail()

# Let's explore our data:

# What is the dimension of the data?

In [None]:
# (rows, columns) of the raw frame.
df.shape

In [None]:
# Column dtypes, non-null counts and memory usage of the raw frame.
df.info()

In [None]:
# Report how many distinct values each column holds (NaN is not counted).
for col_name, n_unique in df.nunique().items():
    print(col_name,'has : ',n_unique,'Unique Values')

In [None]:
# Class balance of the target column: number of rows per `Exited` value.
print(df['Exited'].value_counts())

# Are there any missing values in the Data?


# Checking for Null values:

In [None]:
# Number of missing values per column.
df.isnull().sum()

### *No nulls in data*

# What are the Min and Max for the Continuous Columns?

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

# Basic EDA on Data:

In [None]:
# Histogram of customer ages, drawn on an explicit Axes object.
fig, ax = plt.subplots()
ax.hist(df['Age'])
ax.set_xlabel("Age")
ax.set_title("Age Distribution")
plt.show()

In [None]:
# Scatter of account balance against age, one marker per customer.
fig, ax = plt.subplots()
ax.scatter(df['Age'], df['Balance'], edgecolors='Red')
ax.set_title("Age Vs Balance")
ax.set_xlabel("Age")
ax.set_ylabel('Balance')
plt.show()

In [None]:
# Data to plot: number of customers per country, most frequent first.
sizes = df['Geography'].value_counts(sort = True)
# The legend labels must come from the index of `sizes` so that each label lines up
# with its wedge. Passing the raw column instead would label the wedges with the
# values of the first few rows, which are unrelated to the wedge order.
labels = sizes.index.tolist()
colors = ["grey","purple","red"]
rcParams['figure.figsize'] = 5,5

# Plot
plt.pie(sizes,colors=colors,autopct='%1.1f%%',shadow=True,startangle=270)
plt.title('Geographical Area - Churn in Dataset')
plt.legend(labels)
plt.show()

In [None]:
# Data to plot: number of customers per `Exited` value, most frequent first.
sizes = df['Exited'].value_counts(sort = True)
# Take the legend labels from the index of `sizes` so each label matches its wedge.
# The raw `Exited` column would label the wedges with the first rows' values instead.
# Labels are converted to strings because the class values are not text.
labels = [str(value) for value in sizes.index]
colors = ["purple","red"]
rcParams['figure.figsize'] = 5,5

# Plot
plt.pie(sizes,colors=colors,autopct='%1.1f%%',shadow=True,startangle=270)
plt.title('Exit Customers - Churn in Dataset')
plt.legend(labels)
plt.show()

# What is the Age Band with the Highest Number of Customers?

 ### *The columns RowNumber and Surname are not significant to our analysis, so we drop them.*
 ### *The columns Geography and Gender are categorical, so we replace them with dummy (one-hot) columns in order to be able to use them.*

In [None]:
# Build the modelling frame: remove the bookkeeping columns and the raw categorical
# columns (Geography and Gender are re-added later as dummy columns).
# NOTE(review): any other identifier column still present in `df` (e.g. a customer ID,
# if the file has one) stays in `dataset` and becomes a model feature; confirm intent.
dataset = df.drop(columns=['RowNumber', 'Surname', 'Geography', 'Gender'])

# Banding Age :

In [None]:
# Bucket ages into named bands. Intervals are left-closed/right-open
# (right=False), i.e. [0, 18), [18, 40), [40, 60), [60, 100).
age_edges = [0, 18, 40, 60, 100]
age_band_names = ['Minor', 'Adult', 'Middle Age', 'Senior Citizen']
dataset['AgeGroup'] = pd.cut(
    dataset['Age'],
    bins=age_edges,
    labels=age_band_names,
    right=False,
)
dataset[['Age', 'AgeGroup']].head(10)

In [None]:
# Column list after adding the AgeGroup band.
dataset.columns

In [None]:
# The raw Age column is superseded by AgeGroup, so remove it from the modelling frame.
dataset = dataset.drop(columns=['Age'])

# Dummification of the Categorical Column :

In [None]:
def _dummies_without_first(values):
    """One-hot encode `values` and drop the first indicator column (the baseline level)."""
    return pd.get_dummies(values).iloc[:, 1:]


Geography = _dummies_without_first(df['Geography'])
Gender = _dummies_without_first(df['Gender'])
Age = _dummies_without_first(dataset['AgeGroup'])

# Model Prediction using Logistic Regression

In [None]:
# Append the dummy columns side by side to the modelling frame (aligned on the row index).
dataset = pd.concat([dataset, Geography, Gender, Age], axis='columns')

In [None]:
# Preview the modelling frame after the dummy columns were appended.
dataset.head()

In [None]:
# AgeGroup is now represented by its dummy columns; remove the categorical original in place.
dataset.drop(columns=['AgeGroup'], inplace=True)

In [None]:
# Final column list that feeds the models (features plus the Exited target).
dataset.columns

In [None]:
# Separate the target (Exited) from the feature matrix.
y = dataset['Exited']
X = dataset.drop(columns=['Exited'])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
split_parts = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = split_parts

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Fit a logistic regression with default settings on the training split,
# then predict hard class labels for the held-out rows.
lr = LogisticRegression().fit(X_train, y_train)
y_pred = lr.predict(X_test)

# What is the Accuracy of Logistic Regression ?

In [None]:
# Fraction of held-out rows whose predicted label equals the true label.
print(accuracy_score(y_test,y_pred))

In [None]:
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
# Per-class precision, recall, F1 and support for the logistic regression predictions.
print(classification_report(y_test, y_pred))

### Our main priority here is high precision; a lower recall is acceptable for our business case.

# Confusion Matrix : 

In [None]:
from sklearn.metrics import confusion_matrix

# Rows are the actual classes, columns the predicted classes.
cm = confusion_matrix(y_test, y_pred)
conf_matrix = pd.DataFrame(
    cm,
    index=['Actual:0', 'Actual:1'],
    columns=['Predicted:0', 'Predicted:1'],
)

# Annotated heatmap of the raw counts.
plt.figure(figsize=(8, 5))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='gist_ncar')

### The confusion matrix shows 1553 + 25 = 1578 correct predictions and 42 + 380 = 422 incorrect ones.

### True Positives: 25

### True Negatives: 1553

### False Positives: 42 (Type I error)

### False Negatives: 380 (Type II error)

In [None]:
# Flatten the 2x2 confusion matrix row by row: [[TN, FP], [FN, TP]].
TN, FP, FN, TP = cm.ravel()

# Recall on the positive class and on the negative class, respectively.
sensitivity = TP / float(TP + FN)
specificity = TN / float(TN + FP)

In [None]:
# Headline metrics derived from the confusion-matrix counts TP, TN, FP and FN.
accuracy = (TP + TN) / float(TP + TN + FP + FN)
precision = TP / float(TP + FP)
recall = TP / float(TP + FN)
true_negative_rate = TN / float(TN + FP)

# One report line per metric, joined with blank lines. The previous version relied on
# an accidental implicit string concatenation (a missing comma), which made the spacing
# inconsistent between lines. The accuracy label is also parenthesised now so that it
# states the formula that is actually computed.
report_lines = [
    f'The ACCURACY of the model = (TP+TN) / (TP+TN+FP+FN) = {accuracy}',
    f'The PRECISION of the model = TP / (TP+FP) = {precision}',
    f'Sensitivity or True Positive Rate/RECALL = TP / (TP+FN) = {recall}',
    f'Specificity or True Negative Rate = TN / (TN+FP) = {true_negative_rate}',
]
print('\n\n'.join(report_lines))

# The AUC ROC Curve :

In [None]:
# Predicted class probabilities for the held-out rows: column 0 is class 0, column 1 is class 1.
y_pred_prob = lr.predict_proba(X_test)
prob_column_names = ['Prob of Open Account (0)', 'Prb of Closed Account (1)']
y_pred_prob_df = pd.DataFrame(y_pred_prob, columns=prob_column_names)
y_pred_prob_df.head()

In [None]:
from sklearn.preprocessing import binarize

# The class probabilities do not depend on the decision threshold, so compute them once
# instead of on every loop iteration. `y_pred_prob_yes` is also read by the ROC/AUC cells.
y_pred_prob_yes=lr.predict_proba(X_test)

# Sweep decision thresholds 0.1 .. 0.4 and report the confusion matrix for each.
for i in range(1,5):
    # `threshold` must be passed by keyword: it is a keyword-only parameter in current
    # scikit-learn, and passing it positionally raises a TypeError.
    # Column 1 holds the thresholded probability of the positive class.
    y_pred2=binarize(y_pred_prob_yes,threshold=i/10)[:,1]
    cm2=confusion_matrix(y_test,y_pred2)
    print ('With',i/10,'threshold the Confusion Matrix is ','\n',cm2,'\n',
            'with',cm2[0,0]+cm2[1,1],'correct predictions and',cm2[1,0],'Type II errors( False Negatives)','\n\n',
          'Sensitivity: ',cm2[1,1]/(float(cm2[1,1]+cm2[1,0])),'Specificity: ',cm2[0,0]/(float(cm2[0,0]+cm2[0,1])),'\n\n\n')

In [None]:
from sklearn.metrics import roc_curve

# ROC curve of the logistic regression, using the predicted probability of the
# positive class (column 1 of `y_pred_prob_yes`).
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob_yes[:,1])
plt.plot(fpr,tpr)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
# The title previously referred to a heart-disease classifier (copy-paste leftover);
# this notebook models customer churn.
plt.title('ROC curve for customer churn classifier')
plt.xlabel('False positive rate (1-Specificity)')
plt.ylabel('True positive rate (Sensitivity)')
plt.grid(True)
plt.show()

In [None]:
from sklearn.metrics import roc_auc_score
# Area under the ROC curve, computed from the predicted probability of the positive class.
roc_auc_score(y_test,y_pred_prob_yes[:,1])

# Modelling with another classifier : 

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees, each capped at depth 10; the seed makes training reproducible.
rf_params = dict(n_estimators=100, max_depth=10, random_state=100)
classifier = RandomForestClassifier(**rf_params)

In [None]:
# Peek at the first three training rows.
X_train.head(3)

In [None]:
# Train the forest on the training split and predict labels for the held-out rows.
# `fit` returns the estimator itself, so the two calls can be chained.
predictions = classifier.fit(X_train, y_train).predict(X_test)

In [None]:
# Predicted class labels from the random forest for the held-out rows.
predictions

# What is the Accuracy of a RF Model ?


In [None]:
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
# Fraction of held-out rows the random forest labels correctly.
print("Accuracy",accuracy_score(y_test, predictions))

In [None]:
# Confusion matrix of the random forest (rows: actual class, columns: predicted class).
result = confusion_matrix(y_test, predictions)
print("Confusion Matrix:", result, sep="\n")

In [None]:
# Per-class precision, recall, F1 and support for the random forest predictions.
print("Classification Report\n",classification_report(y_test,predictions)) 

In [None]:
# Percentage of rows per `Exited` value, computed once and looked up by class label.
exit_share = dataset['Exited'].value_counts() / len(dataset) * 100
print('Open', round(exit_share[0], 2), '% of the dataset')
print('Closed account', round(exit_share[1], 2), '% of the dataset')