In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt

from imblearn.over_sampling import SMOTE as sm
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.metrics import precision_score, recall_score, confusion_matrix 
from sklearn.metrics import f1_score,precision_recall_curve, roc_curve,  roc_auc_score, accuracy_score


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for input_path in (os.path.join(dirname, filename) for filename in filenames):
        print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the BankChurners table from the Kaggle input directory.
df = pd.read_csv("../input/credit-card-customers/BankChurners.csv")
# Preview the first two rows.
df.head(2)

In [None]:
# Show the distinct values of each categorical column that is encoded later on.
for category_column in ('Education_Level', 'Marital_Status', 'Card_Category', 'Income_Category'):
    print(df[category_column].unique())

In [None]:
# Columns removed before any analysis: the two Naive_Bayes_Classifier_* columns
# (presumably precomputed model outputs shipped with the CSV - TODO confirm) and
# the CLIENTNUM column. Dropped one at a time, in place, in this order.
for unused_column in (
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2',
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1',
    'CLIENTNUM',
):
    df.drop(columns=[unused_column], inplace=True)

# EDA

In [None]:
# Summary statistics for every column (include='all'), transposed so each column is one row.
df.describe(include='all').T

In [None]:
# Checking for null values: draw the boolean null mask of the whole frame.
sns.heatmap(data=df.isnull(), cmap="Blues", cbar=False, yticklabels=False)
plt.show()

In [None]:
# Row counts per value of the target column.
sns.countplot(data=df, x='Attrition_Flag', label='Counts')
plt.show()

In [None]:
# Row counts per gender, split by attrition flag.
sns.countplot(data=df, x='Gender', hue='Attrition_Flag')
plt.show()

In [None]:
# Row counts per marital status, split by attrition flag.
sns.countplot(data=df, x='Marital_Status', hue='Attrition_Flag')
plt.show()

In [None]:
# Row counts per customer age, split by attrition flag (wide figure: one bar group per age).
plt.figure(figsize=(15, 8))
sns.countplot(data=df, x='Customer_Age', hue='Attrition_Flag')
plt.show()

In [None]:
# Histogram of the Dependent_count column.
# NOTE(review): bins=5 will merge neighbouring values if the column holds more than
# five distinct counts - confirm against df['Dependent_count'].unique().
df['Dependent_count'].hist(bins=5)
# Render explicitly, like the other plotting cells, instead of echoing the Axes repr.
plt.show()

In [None]:
# Row counts per card category, split by attrition flag.
sns.countplot(data=df, x='Card_Category', hue='Attrition_Flag')
plt.show()

In [None]:
# Row counts per income category, split by attrition flag.
plt.figure(figsize=(8, 8))
sns.countplot(data=df, x='Income_Category', hue='Attrition_Flag')
plt.show()

In [None]:
# Row counts per education level, split by attrition flag.
plt.figure(figsize=(8, 8))
sns.countplot(data=df, x='Education_Level', hue='Attrition_Flag')
plt.show()

In [None]:
def update_education_unknown(df):
    """Relabel an 'Unknown' education level as 'Edu_Unknown'.

    Intended for row-wise use with ``DataFrame.apply(..., axis=1)`` on a
    one-column frame, so ``df`` is a single row whose first element is the
    education level. Presumably the distinct label keeps this category from
    colliding with other 'Unknown' categories once columns are one-hot encoded.

    Parameters
    ----------
    df : pandas.Series or sequence
        One row; only its first element is read.

    Returns
    -------
    The first element unchanged, or 'Edu_Unknown' when it equals 'Unknown'.
    """
    # Read the first element by position. ``row[0]`` on a string-indexed Series
    # relies on a deprecated pandas fallback, so prefer ``.iloc`` when present
    # and keep plain sequences (lists, tuples) working.
    edu_level = df.iloc[0] if hasattr(df, 'iloc') else df[0]
    return 'Edu_Unknown' if edu_level == 'Unknown' else edu_level

def update_marital_unknown(df):
    """Relabel an 'Unknown' marital status as 'Marital_Unknown'.

    Intended for row-wise use with ``DataFrame.apply(..., axis=1)`` on a
    one-column frame, so ``df`` is a single row whose first element is the
    marital status. Presumably the distinct label keeps this category from
    colliding with other 'Unknown' categories once columns are one-hot encoded.

    Parameters
    ----------
    df : pandas.Series or sequence
        One row; only its first element is read.

    Returns
    -------
    The first element unchanged, or 'Marital_Unknown' when it equals 'Unknown'.
    """
    # Read the first element by position. ``row[0]`` on a string-indexed Series
    # relies on a deprecated pandas fallback, so prefer ``.iloc`` when present
    # and keep plain sequences (lists, tuples) working.
    marital = df.iloc[0] if hasattr(df, 'iloc') else df[0]
    return 'Marital_Unknown' if marital == 'Unknown' else marital

In [None]:
# Give the 'Unknown' category of each column its own distinct label by applying
# the matching helper row by row to a one-column frame.
for target_column, relabel in (
    ('Education_Level', update_education_unknown),
    ('Marital_Status', update_marital_unknown),
):
    df[target_column] = df[[target_column]].apply(relabel, axis=1)

In [None]:
# One-hot encode the target. drop_first=True drops the first category, leaving the
# single indicator column that later cells reference as 'Existing Customer'.
# dtype=int pins a 0/1 integer encoding regardless of the pandas version's default.
attir_flag = pd.get_dummies(df['Attrition_Flag'], drop_first=True, dtype=int)
# Build the new frame in one expression instead of mutating df in place first.
df = pd.concat([df.drop(columns=['Attrition_Flag']), attir_flag], axis=1)

In [None]:
# Correlate only numeric/boolean columns: df still holds string-valued categorical
# columns at this point, and DataFrame.corr() no longer skips them silently on
# recent pandas (it raises instead).
corr_matrix = df.select_dtypes(include=['number', 'bool']).corr()
# Correlation of every feature with the target indicator, strongest positive first.
corr_matrix['Existing Customer'].sort_values(ascending = False)

In [None]:
# Size the figure before drawing so the annotated heatmap is laid out at its final size.
fig = plt.figure(figsize=(10, 10))
# Restrict to numeric/boolean columns: df still contains string-valued categorical
# columns, which DataFrame.corr() rejects on recent pandas.
g = sns.heatmap(df.select_dtypes(include=['number', 'bool']).corr(),
                annot=True, fmt=".2f", cmap="coolwarm")
plt.show()

In [None]:
# Work on a copy so the column drops and encoding done on df2 leave df unchanged.
df2 = df.copy()

In [None]:
# dropping because of multicollinearity
df2 = df2.drop(columns=['Avg_Open_To_Buy'])

In [None]:
# Size the figure before drawing so the annotated heatmap is laid out at its final size.
fig = plt.figure(figsize=(10, 10))
# Restrict to numeric/boolean columns: df2 still contains string-valued categorical
# columns at this point, which DataFrame.corr() rejects on recent pandas.
g = sns.heatmap(df2.select_dtypes(include=['number', 'bool']).corr(),
                annot=True, fmt=".2f", cmap="coolwarm")
plt.show()

In [None]:
# ENCODING THE CATEGORICAL VARIABLES
# Each categorical column is replaced by its drop-first one-hot indicators, which are
# appended at the end of the frame in the order listed here. Indicator columns are
# named after the category values themselves (no column-name prefix).
# dtype=int pins a 0/1 integer encoding regardless of the pandas version's default.
for categorical_column in ['Gender', 'Marital_Status', 'Education_Level',
                           'Card_Category', 'Income_Category']:
    dummies = pd.get_dummies(df2[categorical_column], drop_first=True, dtype=int)
    df2 = pd.concat([df2.drop(columns=[categorical_column]), dummies], axis=1)

In [None]:
# Preview the first rows of df2 after encoding.
df2.head()

In [None]:
# List the column names of the encoded frame.
df2.columns

# TRAIN TEST SPLIT

In [None]:
from sklearn.model_selection import train_test_split

# Features: every column except the target indicator. Target: kept as a one-column frame.
X = df2.drop(columns=['Existing Customer'])
y = df2[['Existing Customer']]
# 80/20 split, stratified on the target so both parts keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=10
)

In [None]:
# Report the shape of each part of the split.
split_parts = (
    ("Number transactions X_train dataset: ", X_train),
    ("Number transactions y_train dataset: ", y_train),
    ("Number transactions X_test dataset: ", X_test),
    ("Number transactions y_test dataset: ", y_test),
)
for message, part in split_parts:
    print(message, part.shape)

In [None]:
# Class balance of the training target before any resampling.
sns.countplot(data=y_train, x='Existing Customer')
plt.show()

# HANDLING IMBALANCED CLASS

In [None]:
# Oversample the training split only (the test split is left as-is).
smo = sm(random_state = 2)
# fit_resample is the supported API; the old fit_sample alias has been removed from
# imbalanced-learn and raises AttributeError on current releases.
# The target is flattened to 1-D because y_train is a one-column DataFrame.
X_train_res, y_train_res = smo.fit_resample(X_train, y_train.values.ravel())

In [None]:
# Class balance of the training labels after resampling.
l = list(y_train_res)
sns.countplot(x = l)
plt.show()

# SCALING DATASET

In [None]:
# Fit the scaler on the resampled training features only, then standardise them.
# X_train_res is rebound to the transformed values.
sc = StandardScaler().fit(X_train_res)
X_train_res = sc.transform(X_train_res)

In [None]:
# Apply the training-set scaling to the test features (transform only, no refit).
# NOTE(review): this overwrites X_test, so running the cell a second time scales the
# already-scaled data again - restart and run all rather than re-running it.
X_test = sc.transform(X_test)

In [None]:
def display_scores(scores):
    """Print the given scores followed by their mean and standard deviation.

    ``scores`` must provide ``mean()`` and ``std()`` (e.g. a NumPy array).
    """
    report = (
        ('Scores', scores),
        ('Mean', scores.mean()),
        ('Standard Deviation', scores.std()),
    )
    for caption, value in report:
        print(caption, value)

In [None]:
def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    """Plot precision and recall against the decision threshold on the current axes.

    The last element of ``precisions`` and ``recalls`` is dropped so both line up
    with ``thresholds``. Does not call ``plt.show()``.
    """
    curves = (
        (precisions, "b--", "Precision"),
        (recalls, "g-", "Recall"),
    )
    for values, line_style, name in curves:
        plt.plot(thresholds, values[:-1], line_style, label=name)
    plt.xlabel("Threshold")
    plt.legend(loc="upper left")
    plt.ylim([0, 1])

def plot_roc_curve(fpr, tpr, label=None):
    """Draw a ROC curve plus the dashed diagonal on the current axes.

    Both axes are fixed to [0, 1]. Does not call ``plt.show()``.
    """
    axes = plt.gca()
    axes.plot(fpr, tpr, linewidth=2, label=label)
    # Dashed black diagonal from (0, 0) to (1, 1) as a visual reference.
    axes.plot([0, 1], [0, 1], 'k--')
    axes.axis([0, 1, 0, 1])
    axes.set_xlabel('False Positive Rate')
    axes.set_ylabel('True Positive Rate')

# MODELING

### K Neighbours Classifier

In [None]:
# k = KNeighborsClassifier()
# k_param_grid = {"n_neighbors":[5,10,15,20],
#                "metric" : ['minkowski','euclidean','manhattan',"chebyshev"]
#                }
# k_NN = GridSearchCV(k,param_grid = k_param_grid, cv=5, scoring="accuracy", n_jobs= -1, verbose = 2)
# k_NN.fit(X_train_res,y_train_res)
# k_NN.best_estimator_

In [None]:
# k_NN.best_score_

In [None]:
# 5-fold cross-validated accuracy of a Manhattan-distance KNN on the resampled, scaled training set.
k = KNeighborsClassifier(metric='manhattan')
k_score = cross_val_score(estimator=k, X=X_train_res, y=y_train_res, scoring="accuracy", cv=5)
display_scores(k_score)

In [None]:
# Out-of-fold class predictions for every row of the resampled training set (5 folds).
# NOTE(review): the folds are cut from data that was already oversampled, so validation
# folds contain synthetic rows - the resulting scores may be optimistic.
y_train_pred = cross_val_predict(k, X_train_res, y_train_res, cv=5)

In [None]:
# Precision, recall, F1 and ROC AUC of the out-of-fold predictions, printed in that order.
for metric in (precision_score, recall_score, f1_score, roc_auc_score):
    print(metric(y_train_res, y_train_pred))

In [None]:
# Refit on the full resampled training set, then report accuracy on the held-out test split.
knn = k.fit(X=X_train_res, y=y_train_res)
knn_pred = knn.predict(X=X_test)  #PREDICTION
print(accuracy_score(y_true=y_test, y_pred=knn_pred))

In [None]:
# Confusion matrix on the test split (rows: true class, columns: predicted class).
cfm_vpc = confusion_matrix(y_test,knn_pred)
# Size the figure before drawing instead of resizing it afterwards.
fig = plt.figure(figsize=(5, 5))
# The cells are integer counts, so annotate them as integers rather than "x.00".
sns.heatmap(cfm_vpc, annot=True, fmt="d", cmap="coolwarm")
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.show()

In [None]:
# Threshold-based metrics use the hard class predictions.
print(precision_score(y_test,knn_pred))
print(recall_score(y_test,knn_pred))
print(f1_score(y_test,knn_pred))
# ROC AUC and the ROC curve rank rows by score, so feed them the predicted probability
# of the positive class; hard 0/1 predictions collapse the curve to one operating point.
knn_proba = knn.predict_proba(X_test)[:, 1]
print(roc_auc_score(y_test, knn_proba))
fpr, tpr, thresholds = roc_curve(y_test, knn_proba)
plot_roc_curve(fpr, tpr)
plt.show()

### SVC

In [None]:
# svc_c = SVC(random_state = 2)
# svc_param_grid = {'kernel': ['sigmoid','rbf'], 
#                   'gamma': [ 0.001, 0.01, 0.1, 1],
#                   'C': [1, 10, 20],
#                  'probability': [True],
#                   'tol': [0.001, 0.01, 0.1, 1],
#                  'decision_function_shape':['ovr'],
#                  }
# gsSVMC = GridSearchCV(svc_c,param_grid = svc_param_grid, cv=5, scoring="accuracy", 
#                       n_jobs= -1, verbose = 1)
# gsSVMC.fit(X_train_res,y_train_res)
# gsSVMC.best_estimator_

In [None]:
# gsSVMC.best_score_

In [None]:
# 5-fold cross-validated accuracy of the SVC on the resampled, scaled training set.
# probability=True makes predict_proba available on the fitted model.
svc_c = SVC(C=20, gamma=0.01, tol=0.1, probability=True, random_state=2)
c_score = cross_val_score(estimator=svc_c, X=X_train_res, y=y_train_res, scoring="accuracy", cv=5)
display_scores(c_score)

In [None]:
# Out-of-fold class predictions for every row of the resampled training set (5 folds).
# NOTE(review): the folds are cut from data that was already oversampled, so validation
# folds contain synthetic rows - the resulting scores may be optimistic.
y_train_pred = cross_val_predict(svc_c, X_train_res, y_train_res, cv=5)

In [None]:
# Precision, recall, F1 and ROC AUC of the out-of-fold predictions, printed in that order.
for metric in (precision_score, recall_score, f1_score, roc_auc_score):
    print(metric(y_train_res, y_train_pred))

In [None]:
# Refit on the full resampled training set, then report accuracy on the held-out test split.
svc = svc_c.fit(X=X_train_res, y=y_train_res)
svc_pred = svc.predict(X=X_test)
print(accuracy_score(y_true=y_test, y_pred=svc_pred))

In [None]:
# Confusion matrix on the test split (rows: true class, columns: predicted class).
cfm_vpc = confusion_matrix(y_test,svc_pred)
# Size the figure before drawing instead of resizing it afterwards.
fig = plt.figure(figsize=(5, 5))
# The cells are integer counts, so annotate them as integers rather than "x.00".
sns.heatmap(cfm_vpc, annot=True, fmt="d", cmap="coolwarm")
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.show()

In [None]:
# Threshold-based metrics use the hard class predictions.
print(precision_score(y_test, svc_pred))
print(recall_score(y_test, svc_pred))
print(f1_score(y_test, svc_pred))
# ROC AUC and the ROC curve rank rows by score, so feed them the predicted probability
# of the positive class (available because the SVC is built with probability=True);
# hard 0/1 predictions collapse the curve to one operating point.
svc_proba = svc.predict_proba(X_test)[:, 1]
print(roc_auc_score(y_test, svc_proba))
fpr, tpr, thresholds = roc_curve(y_test, svc_proba)
plot_roc_curve(fpr, tpr)
plt.show()

### Decision Tree Classifier

In [None]:
# DTC = DecisionTreeClassifier()
# DTC_param_grid = {'criterion' : ['entropy'],
#                   'max_depth':[2,3,4,5],
#                   "min_samples_split": [1,2,3,4,5,6,7,8,9,10],
#                   "min_samples_leaf": [1,2,3,4,5,6,7,8,9,10]
#                  }
# gsDTC = GridSearchCV(DTC,param_grid = DTC_param_grid, cv=5, scoring="accuracy", n_jobs= -1, verbose = 2)
# gsDTC.fit(X_train_res,y_train_res)
# gsDTC.best_estimator_

In [None]:
# gsDTC.best_score_

In [None]:
# 5-fold cross-validated accuracy of the decision tree on the resampled, scaled training set.
# random_state is fixed (same seed as the SMOTE and SVC cells) so reruns give the same tree.
DTC = DecisionTreeClassifier(criterion='entropy', max_depth=5, min_samples_leaf=9,
                       min_samples_split=8, random_state=2)
c_score = cross_val_score(DTC, X_train_res, y_train_res, cv=5, scoring="accuracy")
display_scores(c_score)

In [None]:
# Out-of-fold class predictions for every row of the resampled training set (5 folds).
# NOTE(review): the folds are cut from data that was already oversampled, so validation
# folds contain synthetic rows - the resulting scores may be optimistic.
y_train_pred = cross_val_predict(DTC, X_train_res, y_train_res, cv=5)

In [None]:
# Precision, recall, F1 and ROC AUC of the out-of-fold predictions, printed in that order.
for metric in (precision_score, recall_score, f1_score, roc_auc_score):
    print(metric(y_train_res, y_train_pred))

In [None]:
# Refit on the full resampled training set, then report accuracy on the held-out test split.
dt = DTC.fit(X=X_train_res, y=y_train_res)
dtc_pred = dt.predict(X=X_test)  #PREDICTION
print(accuracy_score(y_true=y_test, y_pred=dtc_pred))


In [None]:
# Confusion matrix on the test split (rows: true class, columns: predicted class).
cfm_vpc = confusion_matrix(y_test, dtc_pred)
# Size the figure before drawing instead of resizing it afterwards.
fig = plt.figure(figsize=(5, 5))
# The cells are integer counts, so annotate them as integers rather than "x.00".
sns.heatmap(cfm_vpc, annot=True, fmt="d", cmap="coolwarm")
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.show()

In [None]:
# Threshold-based metrics use the hard class predictions.
print(precision_score(y_test, dtc_pred))
print(recall_score(y_test, dtc_pred))
print(f1_score(y_test, dtc_pred))
# ROC AUC and the ROC curve rank rows by score, so feed them the predicted probability
# of the positive class; hard 0/1 predictions collapse the curve to one operating point.
dtc_proba = dt.predict_proba(X_test)[:, 1]
print(roc_auc_score(y_test, dtc_proba))
fpr, tpr, thresholds = roc_curve(y_test, dtc_proba)
plot_roc_curve(fpr, tpr)
plt.show()

### Random Forest Classifier

In [None]:
# RFC = RandomForestClassifier()


# rf_param_grid = {"max_depth": [2,3,4,5],
#               "max_features": [1,2,3,4,5],
#               "min_samples_split": [2,3,4,5],
#               "min_samples_leaf": [1,2,3,4,5],
#               "bootstrap": [False],
#               "n_estimators" :[50,100,150,200],
#               "criterion": ["entropy"]}

# gsRFC = GridSearchCV(RFC,param_grid = rf_param_grid, cv=5, scoring="accuracy", n_jobs= -1, verbose = 2)
# gsRFC.fit(X_train_res,y_train_res)
# gsRFC.best_estimator_

In [None]:
# gsRFC.best_score_

In [None]:
# 5-fold cross-validated accuracy of the random forest on the resampled, scaled training set.
# random_state is fixed (same seed as the SMOTE and SVC cells): each split samples
# max_features=5 candidate features at random, so unseeded reruns give different forests.
RFC = RandomForestClassifier(bootstrap=False, criterion='entropy', max_depth=5,
                       max_features=5, n_estimators=200, random_state=2)
c_score = cross_val_score(RFC, X_train_res, y_train_res, cv=5, scoring="accuracy")
display_scores(c_score)

In [None]:
# Out-of-fold class predictions for every row of the resampled training set (5 folds).
# NOTE(review): the folds are cut from data that was already oversampled, so validation
# folds contain synthetic rows - the resulting scores may be optimistic.
y_train_pred = cross_val_predict(RFC, X_train_res, y_train_res, cv=5)

In [None]:
# Precision, recall, F1 and ROC AUC of the out-of-fold predictions, printed in that order.
for metric in (precision_score, recall_score, f1_score, roc_auc_score):
    print(metric(y_train_res, y_train_pred))

In [None]:
# Refit on the full resampled training set, then report accuracy on the held-out test split.
rf = RFC.fit(X=X_train_res, y=y_train_res)
rfc_pred = rf.predict(X=X_test)  #PREDICTION
print(accuracy_score(y_true=y_test, y_pred=rfc_pred))

In [None]:
# Confusion matrix on the test split (rows: true class, columns: predicted class).
cfm_vpc = confusion_matrix(y_test, rfc_pred)
# Size the figure before drawing instead of resizing it afterwards.
fig = plt.figure(figsize=(5, 5))
# The cells are integer counts, so annotate them as integers rather than "x.00".
sns.heatmap(cfm_vpc, annot=True, fmt="d", cmap="coolwarm")
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.show()


In [None]:
# Threshold-based metrics use the hard class predictions.
print(precision_score(y_test, rfc_pred))
print(recall_score(y_test, rfc_pred))
print(f1_score(y_test, rfc_pred))
# ROC AUC and the ROC curve rank rows by score, so feed them the predicted probability
# of the positive class; hard 0/1 predictions collapse the curve to one operating point.
rfc_proba = rf.predict_proba(X_test)[:, 1]
print(roc_auc_score(y_test, rfc_proba))
fpr, tpr, thresholds = roc_curve(y_test, rfc_proba)
plot_roc_curve(fpr, tpr)
plt.show()

### Logistic Regression

In [None]:
# lr_ = LogisticRegression() 
# lr_param_grid = {'penalty':['l1','l2'],
#                 'tol':[1e-4,1e-3,1e-2,1e-5],
#                  'C':[0.1, 1, 100],
#                  'multi_class':['ovr'],
#                  'max_iter':[1000],
#                  'solver':['newton_cg','sag','saga','lbfgs']
#                 }
# gslr = GridSearchCV(lr_,param_grid = lr_param_grid, cv=5, scoring="accuracy", n_jobs= -1, verbose = 2)
# gslr.fit(X_train_res,y_train_res)

In [None]:
# gslr_best = gslr.best_estimator_
# gslr_best

In [None]:
# 5-fold cross-validated accuracy of logistic regression on the resampled, scaled training set.
# multi_class='ovr' is omitted: the parameter is deprecated in scikit-learn and, for a
# binary target, the default already behaves as one-vs-rest.
# random_state is fixed (same seed as the SMOTE and SVC cells) because the saga solver
# shuffles the data, so unseeded reruns can give slightly different fits.
lr = LogisticRegression(C=0.1, max_iter=1000, solver='saga',
                   tol=0.001, random_state=2)
c_score = cross_val_score(lr, X_train_res, y_train_res, cv=5, scoring="accuracy")
display_scores(c_score)

In [None]:
# Out-of-fold class predictions for every row of the resampled training set (5 folds).
# NOTE(review): the folds are cut from data that was already oversampled, so validation
# folds contain synthetic rows - the resulting scores may be optimistic.
y_train_pred = cross_val_predict(lr, X_train_res, y_train_res, cv=5)

In [None]:
# Precision, recall, F1 and ROC AUC of the out-of-fold predictions, printed in that order.
for metric in (precision_score, recall_score, f1_score, roc_auc_score):
    print(metric(y_train_res, y_train_pred))


In [None]:
# Refit on the full resampled training set, then report accuracy on the held-out test split.
lrc = lr.fit(X=X_train_res, y=y_train_res)
lr_pred = lrc.predict(X=X_test)  #PREDICTION
print(accuracy_score(y_true=y_test, y_pred=lr_pred))

In [None]:
# Confusion matrix on the test split (rows: true class, columns: predicted class).
cfm_vpc = confusion_matrix(y_test,lr_pred)
# Size the figure before drawing instead of resizing it afterwards.
fig = plt.figure(figsize=(5, 5))
# The cells are integer counts, so annotate them as integers rather than "x.00".
sns.heatmap(cfm_vpc, annot=True, fmt="d", cmap="coolwarm")
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.show()

In [None]:
# Threshold-based metrics use the hard class predictions.
print(precision_score(y_test,lr_pred))
print(recall_score(y_test,lr_pred))
print(f1_score(y_test,lr_pred))
# ROC AUC and the ROC curve rank rows by score, so feed them the predicted probability
# of the positive class; hard 0/1 predictions collapse the curve to one operating point.
lr_proba = lrc.predict_proba(X_test)[:, 1]
print(roc_auc_score(y_test, lr_proba))
fpr, tpr, thresholds = roc_curve(y_test, lr_proba)
plot_roc_curve(fpr, tpr)
plt.show()

## *The Decision Tree and Random Forest classifiers were the only two that gave satisfactory results. The remaining classifiers overfit by a large margin.*