In [None]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the customer records; the path is relative to the notebook's working directory.
df = pd.read_csv("../input/credit-card-customers/BankChurners.csv")

In [None]:
# Preview the first rows of the raw data.
df.head()

In [None]:
# Encode the target as integers: 0 = existing customer, 1 = attrited customer.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the column selection: with pandas Copy-on-Write
# that in-place call only modifies a temporary and leaves `df` unchanged.
# astype(int) guarantees a numeric target, so the later get_dummies call does
# not one-hot encode it. Re-running the cell is harmless (ints are not replaced).
df['Attrition_Flag'] = (
    df['Attrition_Flag']
    .replace({'Existing Customer': 0, 'Attrited Customer': 1})
    .astype(int)
)


In [None]:
# Remove the last two columns from the analysis.
# NOTE: the slice is positional, so this cell is not idempotent: running it
# again drops two more columns.
df = df.iloc[:, :-2]
df.head(50)

In [None]:
# Check the shape of the DataFrame: (rows, columns)
df.shape

In [None]:
# Make every column name space-free by substituting underscores.
df.columns = df.columns.str.replace(' ', '_', regex=False)

In [None]:
# Column dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Show basic summary statistics
df.describe()

In [None]:
# Rows flagged by duplicated() are exact repeats of an earlier row.
is_repeat = df.duplicated()
duplicate = df.loc[is_repeat]

print("Duplicate Rows :")

# Display the resulting frame; an empty frame means no row is duplicated.
duplicate

In [None]:
# Use the client number as the row index (removes CLIENTNUM from the columns).
df = df.set_index('CLIENTNUM')

In [None]:
# Preview the frame after re-indexing.
df.head()

In [None]:
# Count the NA values in each column
df.isna().sum()

This shows that there are no NA values in the dataframe.

In [None]:
# Per-column count of cells holding the literal string 'Unknown'.
(df == 'Unknown').sum()

In [None]:
# Number of rows that contain 'Unknown' in at least one column.
rows_with_unknown = (df == 'Unknown').any(axis=1)
int(rows_with_unknown.sum())

In [None]:
# Distribution of income categories, leaving out the rows marked 'Unknown'.
known_income = df.loc[df['Income_Category'] != 'Unknown', 'Income_Category']
plt.hist(known_income)
plt.show()

In [None]:
# Most frequent marital status; mode() returns a Series, [0] takes its first entry.
# NOTE: 'Unknown' rows are included when the mode is computed.
mostfrequentcategory_Marital_Status = df['Marital_Status'].mode()[0]
mostfrequentcategory_Marital_Status

In [None]:
# Impute 'Unknown' marital status with the most frequent category.
# The result is assigned back to the column: calling replace(..., inplace=True)
# on a column selection does not update `df` under pandas Copy-on-Write.
df['Marital_Status'] = df['Marital_Status'].replace({'Unknown': mostfrequentcategory_Marital_Status})

In [None]:
# Spot-check the first ten values of the column.
df['Marital_Status'].head(10)

In [None]:
# Impute 'Unknown' income category with the most frequent category.
# mode() returns a Series; [0] takes its first entry.
mostfrequentcategory_Income_Category = df['Income_Category'].mode()[0]
# Assign back instead of replace(..., inplace=True) on a column selection,
# which does not update `df` under pandas Copy-on-Write.
df['Income_Category'] = df['Income_Category'].replace({'Unknown': mostfrequentcategory_Income_Category})

In [None]:
# Impute 'Unknown' education level with the most frequent category.
# mode() returns a Series; [0] takes its first entry.
mostfrequentcategory_Education_Level = df['Education_Level'].mode()[0]
# Assign back instead of replace(..., inplace=True) on a column selection,
# which does not update `df` under pandas Copy-on-Write.
df['Education_Level'] = df['Education_Level'].replace({'Unknown': mostfrequentcategory_Education_Level})

In [None]:
# Re-count 'Unknown' cells per column after the imputation cells above.
(df == 'Unknown').sum()

Feature Selection 

In [None]:
# One-hot encode the categorical columns; every option is left at its pandas default.
df = pd.get_dummies(df)

In [None]:
# Preview the frame after one-hot encoding.
df.head()

In [None]:
# Shape after encoding: (rows, columns).
df.shape

In [None]:
# Separate the target (Attrition_Flag) from the predictor columns.
y = df['Attrition_Flag']
X = df.drop(columns='Attrition_Flag')

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

In [None]:
# Rank features by the impurity-based importances of an extra-trees ensemble.
# The seed is fixed so the ranking is the same on every Restart & Run All.
model = ExtraTreesClassifier(random_state=42)
model.fit(X,y)
# feature_importances_ holds one value per column of X, in column order.
print(model.feature_importances_)
# Plot the 30 most important features for easier reading.
feat_importances = pd.Series(model.feature_importances_, index=X.columns)
feat_importances.nlargest(30).plot(kind='barh')
plt.show()

In [None]:
# Pairwise correlations between all columns of the encoded frame.
corrmat = df.corr()
top_corr_features = corrmat.index
plt.figure(figsize=(20,20))
# Plot the heat map from the matrix already computed above; re-selecting the
# same columns and calling .corr() again would only repeat the computation.
g=sns.heatmap(corrmat,annot=True,cmap="RdYlGn")

Pearson Correlation

In [None]:
# Number of features each selection method is asked to keep.
num_feats=20
# Column names of X, in column order.
feature_name = X.columns.tolist()
def cor_selector(X, y,num_feats):
    """Pick the columns of X with the largest absolute Pearson correlation with y.

    Parameters
    ----------
    X : DataFrame of candidate features.
    y : target values, one per row of X.
    num_feats : number of columns to keep.

    Returns
    -------
    (cor_support, cor_feature)
        cor_support is a list of booleans, one per column of X in column order,
        marking the selected columns; cor_feature lists the selected column
        names ordered by increasing absolute correlation.
    """
    columns = X.columns.tolist()

    # Correlation of each column with the target; an undefined (NaN)
    # correlation is counted as 0.
    scores = []
    for name in columns:
        r = np.corrcoef(X[name], y)[0, 1]
        scores.append(0 if np.isnan(r) else r)

    # Positions of the num_feats strongest correlations (weakest first).
    strongest = np.argsort(np.abs(scores))[-num_feats:]
    cor_feature = X.iloc[:, strongest].columns.tolist()

    # Boolean mask aligned with the columns of X.
    chosen = set(cor_feature)
    cor_support = [name in chosen for name in columns]
    return cor_support, cor_feature
# Run the Pearson selector and report how many features it kept.
cor_support, cor_feature = cor_selector(X, y, num_feats)
print(len(cor_feature), 'selected features')


In [None]:
# Names of the features kept by the Pearson selector.
cor_feature

Chi-Square Features

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
# Rescale the features with MinMaxScaler, then keep the num_feats features
# with the highest chi-square score against the target.
X_norm = MinMaxScaler().fit_transform(X)
chi_selector = SelectKBest(chi2, k=num_feats).fit(X_norm, y)
chi_support = chi_selector.get_support()
chi_feature = X.columns[chi_support].tolist()
print(len(chi_feature), 'selected features')

In [None]:
# Names of the features kept by the chi-square selector.
chi_feature

Recursive Feature Elimination

In [None]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
# Recursive feature elimination around a logistic regression: features are
# removed 10 per step until num_feats are left.
rfe_selector = RFE(
    estimator=LogisticRegression(solver = 'liblinear'),
    n_features_to_select=num_feats,
    step=10,
    verbose=5,
)
rfe_selector.fit(X_norm, y)

In [None]:
# Boolean mask of the features RFE kept, and their names.
rfe_support = rfe_selector.get_support()
rfe_feature = X.columns[rfe_support].tolist()
print(len(rfe_feature), 'selected features')

In [None]:
# Names of the features kept by RFE.
rfe_feature

Lasso: SelectFromModel

In [None]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression

# Embedded selection: fit an L1-penalised logistic regression on the scaled
# features and keep at most num_feats of them.
embeded_lr_selector = SelectFromModel(LogisticRegression(penalty="l1", solver = 'liblinear'), max_features=num_feats)
embeded_lr_selector.fit(X_norm, y)

In [None]:
# Boolean mask of the features the L1 model kept, and their names.
embeded_lr_support = embeded_lr_selector.get_support()
embeded_lr_feature = X.columns[embeded_lr_support].tolist()
print(len(embeded_lr_feature), 'selected features')

In [None]:
# Names of the features kept by the L1 logistic-regression selector.
embeded_lr_feature

In [None]:
# Show every row when frames are displayed (global pandas option).
pd.set_option('display.max_rows', None)
# Put the four selection masks side by side, one row per feature.
feature_selection_df = pd.DataFrame({'Feature':feature_name, 'Pearson':cor_support, 'Chi-2':chi_support, 'RFE':rfe_support, 'Logistics':embeded_lr_support,})
# Count how many selectors picked each feature. Only the boolean selector
# columns are summed: summing the whole frame would include the string
# 'Feature' column, which raises a TypeError on pandas >= 2.
selector_columns = ['Pearson', 'Chi-2', 'RFE', 'Logistics']
feature_selection_df['Total'] = feature_selection_df[selector_columns].sum(axis=1)
# Order by vote count (ties broken by name, descending) and display the top num_feats.
feature_selection_df = feature_selection_df.sort_values(['Total','Feature'] , ascending=False)
feature_selection_df.index = range(1, len(feature_selection_df)+1)
feature_selection_df.head(num_feats)

In [None]:
# Hand-picked feature subset used by every model below.
features = [
    'Total_Trans_Ct',
    'Total_Trans_Amt',
    'Total_Revolving_Bal',
    'Total_Relationship_Count',
    'Total_Ct_Chng_Q4_Q1',
    'Total_Amt_Chng_Q4_Q1',
    'Months_Inactive_12_mon',
    'Marital_Status_Single',
    'Gender_F',
    'Credit_Limit',
    'Contacts_Count_12_mon',
]

In [None]:
# Keep only the selected feature columns, in the order listed in `features`.
X = X[features]

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for testing; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.30, random_state=42)

MODEL BUILDING 

In [None]:
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
# plot_confusion_matrix and plot_roc_curve were removed in scikit-learn 1.2.
# Nothing in this notebook calls them, so import them only where they still
# exist instead of failing the whole cell with an ImportError.
try:
    from sklearn.metrics import plot_confusion_matrix, plot_roc_curve
except ImportError:
    pass


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

1) Logistic Regression

In [None]:
# Fit a logistic regression (liblinear solver) on the training split.
model_lr = LogisticRegression(solver = 'liblinear')
model_lr.fit(X_train, y_train)

In [None]:
# Print the fitted coefficients and intercept of the logistic regression.
print("Coefficients of the Logistic Regression model")
coef = model_lr.coef_
intercept = model_lr.intercept_
print("Coef:", coef)
print("Intercept:", intercept)

In [None]:
# Hard class predictions on the test split.
predicted_classes_lr = model_lr.predict(X_test)


In [None]:
# Class probabilities on the test split (one column per class).
predicted_classes_lr_prob = model_lr.predict_proba(X_test)


In [None]:
# Confusion matrix of the logistic regression on the test split,
# printed as an array and drawn as an annotated heat map.
print("Confusion matrix for LR model:")
conf_mat_lr = confusion_matrix(y_test.tolist(), predicted_classes_lr)
print(conf_mat_lr)
sns.heatmap(conf_mat_lr, annot=True, fmt='g').set(
    xlabel="Predicted classes",
    ylabel="Actual classes",
)
plt.show()

In [None]:
# Fraction of test rows the logistic regression classified correctly.
accuracy_lr = accuracy_score(y_test, predicted_classes_lr)
print("accuracy score for LR model::", accuracy_lr)

ROC curve, then calculate the threshold probability



In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

In [None]:
# Probability of the positive class (attrited = 1) for every test row.
lr_test_proba = model_lr.predict_proba(X_test)[:, 1]
# AUC is computed from the probabilities, i.e. from the same scores that draw
# the curve. Scoring hard predict() labels would collapse the curve to a single
# operating point and report a different number in the legend.
logit_roc_auc = roc_auc_score(y_test, lr_test_proba)
fpr, tpr, thresholds = roc_curve(y_test, lr_test_proba)
plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
# Diagonal reference line: a classifier with no discriminative power.
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
# Save the figure to the working directory before showing it.
plt.savefig('Log_ROC')
plt.show()

In [None]:
def find_best_threshold(threshould, fpr, tpr):
    """Return the threshold at which tpr * (1 - fpr) is largest.

    threshould, fpr and tpr are equal-length arrays (as returned by roc_curve).
    The product is large only when the true-positive rate is high and the
    false-positive rate is low at the same point. The winning value is also
    printed.
    """
    score = tpr * (1 - fpr)
    best_idx = np.argmax(score)
    t = threshould[best_idx]
    print("the maximum value of tpr*(1-fpr)", max(score), "for threshold", np.round(t,3))
    return t

In [None]:
from math import *

In [None]:
# calculate roc curves from the positive-class probabilities on the test split
fpr, tpr, thresholds = roc_curve(y_test, model_lr.predict_proba(X_test)[:,1])
# get the best threshold: the one maximising Youden's J statistic (tpr - fpr)
J = tpr - fpr
ix = np.argmax(J)
best_thresh = thresholds[ix]
print('Best Threshold=%f' % (best_thresh))

In [None]:
# Decision threshold for the logistic regression. It is taken from the value
# computed on the ROC curve rather than from a number pasted from an earlier
# run, so it stays in sync with the model and the data split.
THRESHOLD = best_thresh

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, roc_auc_score, precision_score

In [None]:
# Probability of the positive class (attrited = 1) for every test row.
lr_test_proba = model_lr.predict_proba(X_test)[:, 1]
# roc_curve thresholds classify a row as positive when score >= threshold, so
# the same inclusive comparison is used here to reproduce that operating point.
preds = np.where(lr_test_proba >= THRESHOLD, 1, 0)

# Accuracy, recall and precision describe the thresholded predictions.
# ROC AUC is threshold-independent and is therefore computed from the
# probabilities, not from the binarised labels.
pd.DataFrame(data=[accuracy_score(y_test, preds), recall_score(y_test, preds),
                   precision_score(y_test, preds), roc_auc_score(y_test, lr_test_proba)],
             index=["accuracy", "recall", "precision", "roc_auc_score"])

2) Gaussian Naive Bayes

In [None]:
# Fit a Gaussian naive Bayes classifier on the training split.
model_nb = GaussianNB()
model_nb.fit(X_train, y_train)

In [None]:
# Hard class predictions on the test split.
predicted_classes_nb = model_nb.predict(X_test)

In [None]:
# Confusion matrix of the naive Bayes model on the test split,
# printed as an array and drawn as an annotated heat map.
print("Confusion matrix for NB model:")
conf_mat_nb = confusion_matrix(y_test.tolist(), predicted_classes_nb)
print(conf_mat_nb)
sns.heatmap(conf_mat_nb, annot=True, fmt='g').set(
    xlabel="Predicted classes",
    ylabel="Actual classes",
)
plt.show()

In [None]:
# Fraction of test rows the naive Bayes model classified correctly.
accuracy_nb = accuracy_score(y_test, predicted_classes_nb)
print("accuracy score for NB model::", accuracy_nb)

3) KNN Model

In [None]:
# First attempt: k-nearest neighbours with k = 2.
model_knn = KNeighborsClassifier(n_neighbors = 2)
# fit the model on the training split
model_knn.fit(X_train,y_train)
# predict classes for the test split
predicted_classes_knn = model_knn.predict(X_test)

In [None]:
# Confusion matrix of the current KNN model on the test split,
# printed as an array and drawn as an annotated heat map.
print("Confusion matrix for KNN model:")
conf_mat_knn = confusion_matrix(y_test.tolist(), predicted_classes_knn)
print(conf_mat_knn)
sns.heatmap(conf_mat_knn, annot=True, fmt='g').set(
    xlabel="Predicted classes",
    ylabel="Actual classes",
)
plt.show()

In [None]:
# Fraction of test rows the current KNN model classified correctly.
accuracy_knn = accuracy_score(y_test, predicted_classes_knn)
print("accuracy score for KNN model::", accuracy_knn)

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
# Candidate values of k: the odd numbers 1, 3, 5, 7, 9.
neighbors = list(range(1,10,2))
# empty list that will hold the mean cv score of each k
cv_scores = [ ]
# perform 10-fold cross-validation on the training split for every k
for K in neighbors:
    model_knn = KNeighborsClassifier(n_neighbors = K)
    scores = cross_val_score(model_knn,X_train,y_train,cv = 10,scoring =
    "accuracy")
    # keep the mean accuracy over the 10 folds
    cv_scores.append(scores.mean())

In [None]:
def plot_accuracy(knn_list_scores, k_values=None):
    """Draw a bar chart of cross-validated accuracy per value of K.

    Parameters
    ----------
    knn_list_scores : sequence of accuracies, one per value of K.
    k_values : iterable of the K values that produced the scores, in the same
        order. Defaults to 1, 3, 5, 7, 9, the grid this function originally
        hard-coded, so existing single-argument calls behave as before.

    Raises
    ------
    ValueError
        If the number of K values differs from the number of scores.
    """
    if k_values is None:
        k_values = range(1, 10, 2)
    k_values = list(k_values)
    if len(k_values) != len(knn_list_scores):
        raise ValueError("k_values and knn_list_scores must have the same length")
    pd.DataFrame({"K": k_values,
                  "Accuracy": knn_list_scores}).set_index("K").plot.bar(figsize= (9,6),ylim=(0.78,1.00),rot=0)
    plt.show()
# Pass the grid that was actually searched so the axis cannot drift from the scores.
plot_accuracy(cv_scores, neighbors)

n = 3 is the optimal value

In [None]:
# Refit KNN with k = 3, the value chosen from the cross-validation chart.
model_knn = KNeighborsClassifier(n_neighbors = 3)
# fit the model on the training split
model_knn.fit(X_train,y_train)
# predict classes for the test split
predicted_classes_knn = model_knn.predict(X_test)

In [None]:
# Confusion matrix of the refitted KNN model on the test split,
# printed as an array and drawn as an annotated heat map.
print("Confusion matrix for KNN model:")
conf_mat_knn = confusion_matrix(y_test.tolist(), predicted_classes_knn)
print(conf_mat_knn)
sns.heatmap(conf_mat_knn, annot=True, fmt='g').set(
    xlabel="Predicted classes",
    ylabel="Actual classes",
)
plt.show()

In [None]:
# Fraction of test rows the refitted KNN model classified correctly.
accuracy_knn = accuracy_score(y_test, predicted_classes_knn)
print("accuracy score for KNN model::", accuracy_knn)

4) Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier


In [None]:
# Decision tree with the default split criterion. The seed is fixed so the
# fitted tree (and its scores) are the same on every run. The stray bare
# `DecisionTreeClassifier()` expression that followed the fit built an unused
# object and has been removed.
model_dt = DecisionTreeClassifier(random_state=42)
model_dt.fit(X_train,y_train)
predicted_classes_dt = model_dt.predict(X_test)

# Confusion matrix on the test split, printed and drawn as a heat map.
print("Confusion matrix for DT model:")
conf_mat_dt = confusion_matrix(y_test.tolist(), predicted_classes_dt)
print(conf_mat_dt)
sns.heatmap(conf_mat_dt, annot = True , fmt = 'g')
plt.xlabel("Predicted classes")
plt.ylabel("Actual classes")
plt.show()

# Fraction of test rows classified correctly.
accuracy_dt = accuracy_score(y_test, predicted_classes_dt)
print("accuracy score  for DT model::", accuracy_dt)

In [None]:
# Decision tree using the entropy split criterion. The seed is fixed so the
# fitted tree (and its scores) are the same on every run.
model_dt = DecisionTreeClassifier(criterion = 'entropy', random_state=42)
model_dt.fit(X_train,y_train)
predicted_classes_dt = model_dt.predict(X_test)

# Confusion matrix on the test split, printed and drawn as a heat map.
print("Confusion matrix for DT model:")
conf_mat_dt = confusion_matrix(y_test.tolist(), predicted_classes_dt)
print(conf_mat_dt)
sns.heatmap(conf_mat_dt, annot = True , fmt = 'g')
plt.xlabel("Predicted classes")
plt.ylabel("Actual classes")
plt.show()

# Fraction of test rows classified correctly.
accuracy_dt = accuracy_score(y_test, predicted_classes_dt)
print("accuracy score  for DT model::", accuracy_dt)

5) Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with the default split criterion. The seed is fixed so the
# bootstrap samples and feature draws, and therefore the scores, are the same
# on every run.
model_rf = RandomForestClassifier(random_state=42)
model_rf.fit(X_train,y_train)

predicted_classes_rf = model_rf.predict(X_test)
# Confusion matrix on the test split, printed and drawn as a heat map.
print("Confusion matrix for RF model:")
conf_mat_rf = confusion_matrix(y_test.tolist(), predicted_classes_rf)
print(conf_mat_rf)
sns.heatmap(conf_mat_rf, annot = True , fmt = 'g')
plt.xlabel("Predicted classes")
plt.ylabel("Actual classes")
plt.show()

# Fraction of test rows classified correctly.
accuracy_rf = accuracy_score(y_test, predicted_classes_rf)
print("accuracy score for RF model::", accuracy_rf)

In [None]:
# Random forest using the entropy split criterion. The seed is fixed so the
# bootstrap samples and feature draws, and therefore the scores, are the same
# on every run.
model_rf = RandomForestClassifier(criterion = 'entropy', random_state=42)
model_rf.fit(X_train,y_train)
predicted_classes_rf = model_rf.predict(X_test)

# Confusion matrix on the test split, printed and drawn as a heat map.
print("Confusion matrix for RF model:")
conf_mat_rf = confusion_matrix(y_test.tolist(), predicted_classes_rf)
print(conf_mat_rf)
sns.heatmap(conf_mat_rf, annot = True , fmt = 'g')
plt.xlabel("Predicted classes")
plt.ylabel("Actual classes")
plt.show()

# Fraction of test rows classified correctly.
accuracy_rf = accuracy_score(y_test, predicted_classes_rf)
print("accuracy score for RF model::", accuracy_rf)

6) Support Vector Machine

In [None]:
from sklearn.svm import SVC
# Support vector classifier with default settings, fitted on the training split.
model_svm = SVC()
model_svm.fit(X_train, y_train)
predicted_classes_svm = model_svm.predict(X_test)

# Confusion matrix on the test split, printed and drawn as a heat map.
print("Confusion matrix for SVM model:")
conf_mat_svm = confusion_matrix(y_test.tolist(), predicted_classes_svm)
print(conf_mat_svm)
sns.heatmap(conf_mat_svm, annot=True, fmt='g').set(
    xlabel="Predicted classes",
    ylabel="Actual classes",
)
plt.show()

# Fraction of test rows classified correctly.
accuracy_svm = accuracy_score(y_test, predicted_classes_svm)
print("accuracy score for SVM model::", accuracy_svm)

7) SGD Classifier

In [None]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import SGDClassifier
# Linear classifier trained with stochastic gradient descent, wrapped in a
# one-vs-rest meta-classifier. SGD shuffles the training data, so the seed is
# fixed to make the fitted model and its scores reproducible.
model_sgdc = OneVsRestClassifier(SGDClassifier(random_state=42))
model_sgdc.fit(X_train,y_train)

predicted_classes_sgdc = model_sgdc.predict(X_test)
# Confusion matrix on the test split, printed and drawn as a heat map.
print("Confusion matrix for SGDC model:")
conf_mat_sgdc = confusion_matrix(y_test.tolist(), predicted_classes_sgdc)
print(conf_mat_sgdc)
sns.heatmap(conf_mat_sgdc, annot = True , fmt = 'g')
plt.xlabel("Predicted classes")
plt.ylabel("Actual classes")
plt.show()


# Fraction of test rows classified correctly.
accuracy_sgdc = accuracy_score(y_test, predicted_classes_sgdc)
print("accuracy score for SGDC model::", accuracy_sgdc)

8) Linear Discriminant Analysis

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# Linear discriminant analysis with default settings, fitted on the training split.
model_lda = LinearDiscriminantAnalysis()
model_lda.fit(X_train, y_train)
predicted_classes_lda = model_lda.predict(X_test)

# Confusion matrix on the test split, printed and drawn as a heat map.
print("Confusion matrix for LDA model:")
conf_mat_lda = confusion_matrix(y_test.tolist(), predicted_classes_lda)
print(conf_mat_lda)
sns.heatmap(conf_mat_lda, annot=True, fmt='g').set(
    xlabel="Predicted classes",
    ylabel="Actual classes",
)
plt.show()

# Fraction of test rows classified correctly.
accuracy_lda = accuracy_score(y_test, predicted_classes_lda)
print("accuracy score for LDA model::", accuracy_lda)