In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory, then print one per line
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Let's read the data
data = pd.read_csv("/kaggle/input/credit-card-customers/BankChurners.csv", index_col = 0)
# Fix the seed so the split (and everything derived from it) is identical on
# every "Restart & Run All", and stratify on the attrition label so both parts
# keep the same class proportions.
data_train, data_test = train_test_split(
    data,
    test_size = 0.3,
    random_state = 42,
    stratify = data["Attrition_Flag"],
)

In [None]:
# Let's take a quick look at the data: first rows of the training part of the split
data_train.head()

In [None]:
# Let's get an overview of the income distribution: number of training rows per income category
income = data_train.groupby("Income_Category")["Income_Category"].agg(["count"])

fig, ax = plt.subplots()
ax.set_title("Number of customers in income categories")
ax.grid(which = "both", color = "lightgray", linestyle = "--")
ax.bar(x = income.index, height = income["count"])
plt.xticks(rotation = 45)
ax.set_xlabel("Income category")
ax.set_ylabel("Number of customers")

# Show the underlying counts below the chart
income

In [None]:
# Row counts per (income category, gender) pair, drawn as grouped bars with one bar per gender
gender_income = (
    data_train
    .groupby(["Income_Category", "Gender"])["Income_Category"]
    .agg(["count"])
)
gender_income.unstack().plot.bar()
plt.title("Income distribution based on income and gender")
plt.xticks(rotation = 45)
plt.show()

In [None]:
# Row counts per (income category, education level) pair, drawn as grouped bars
# with one bar per education level.
education_income = data_train.groupby(["Income_Category", "Education_Level"]).Income_Category.agg(["count"])
education_income.unstack().plot(kind = "bar")
# The chart is grouped by income and education; gender is not involved here.
plt.title("Income distribution based on income and education")
plt.xticks(rotation = 45)
plt.show()

In [None]:
# Only the leading columns are inspected here: everything except the last 10 columns of the frame
n = data_train.shape[1] - 10
# Lookup from column position to column name for those leading columns
categories = dict(zip(range(n), data_train.columns))
for i in range(2, n):
    # Count rows per (column value, value of the first column) and draw one grouped bar chart per column
    counts = data_train.groupby([categories[i], categories[0]])["Attrition_Flag"].agg(["count"])
    counts.unstack().plot(kind = "bar")
    plt.tight_layout()

Total relationship count and contacts count over 12 months look like indicators. However, it is hard to see which attributes to use, since the absolute numbers of attrited and existing customers are less informative than the ratio between them.

In [None]:
# Same charts as the raw counts, but every count is divided by the total for its
# column value, so the bars show shares instead of absolute numbers.
for i in range(2, n):
    feature, flag = categories[i], categories[0]
    # Total number of rows for each value of the inspected column
    base = data_train.groupby([feature])["Attrition_Flag"].agg(["count"])
    per_value_and_flag = data_train.groupby([feature, flag])["Attrition_Flag"].agg(["count"])
    temp_data = per_value_and_flag / base
    temp_data.unstack().plot(kind = "bar")

After normalizing and eyeballing the plots, it appears that age might provide some information; education, income and card category are mildly indicative; and months on book might be a good indicator. Total relationship count and months inactive appear to be strong indicators.

Let's create a pipeline and start transforming the data

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from pandas.api.types import CategoricalDtype

# make a custom function for the pipeline
class ToCat(BaseEstimator, TransformerMixin):
    """Cast the categorical columns of a frame to pandas ``category`` dtype.

    Columns are addressed by position, so the frame must have the layout this
    transformer was written for: plain categorical columns at positions
    0, 2, 3, 4, 5 and 7, and the income bracket at position 6.

    ``fit`` records the levels seen in every plain categorical column and
    ``transform`` re-applies exactly those levels, so every frame sent through
    the fitted transformer (train and test alike) carries identical categories
    and therefore yields identical dummy columns later on. Values that were not
    seen during ``fit`` become missing. When ``transform`` is called on an
    unfitted instance the levels are inferred from the frame itself.
    """

    # Positions of the plain (unordered) categorical columns
    CAT_COLS = (0, 2, 3, 4, 5, 7)
    # Position of the income bracket column, which gets an explicit ordering
    INCOME_COL = 6
    # Income brackets from lowest to highest; "Unknown" comes first so it gets code 0
    INCOME_LEVELS = ("Unknown", "Less than $40K", "$40K - $60K", "$60K - $80K", "$80K - $120K", "$120K +")

    def __init__(self):
        pass

    def fit(self, X, y = None):
        """Record the levels of every plain categorical column of ``X``.

        Returns
        -------
        self
        """
        self.categories_ = {
            X.columns[i]: X.iloc[:, i].astype("category").cat.categories
            for i in self.CAT_COLS
        }
        return self

    def transform(self, X, y = None):
        """Return a copy of ``X`` whose categorical columns have ``category`` dtype."""
        df = X.copy()
        names = df.columns

        # Assign by column name: `df.loc[:, col] = ...` / `df.iloc[:, i] = ...`
        # write into the existing column and keep its old dtype on pandas >= 2.0,
        # which would silently leave these columns as plain objects.
        income_name = names[self.INCOME_COL]
        income_cats = CategoricalDtype(categories=list(self.INCOME_LEVELS), ordered=True)
        # Anything that is not one of the known brackets is treated as "Unknown"
        df[income_name] = df[income_name].astype(income_cats).fillna("Unknown")

        fitted = getattr(self, "categories_", {})
        for i in self.CAT_COLS:
            name = names[i]
            if name in fitted:
                dtype = CategoricalDtype(categories=fitted[name])
            else:
                dtype = "category"
            df[name] = df[name].astype(dtype)
        return df
    
class CatToCodes(BaseEstimator, TransformerMixin):
    """Turn the ``category`` columns of a frame into numeric columns.

    * ``Income_Category`` is replaced by ``Income_Unknown`` (1 where the bracket
      is "Unknown", else 0) and ``Income_Cat`` (the integer code of the bracket).
    * Every remaining ``category`` column is one-hot encoded into
      ``<column>_<level>`` indicator columns, appended after the other columns.
    * ``Attrition_Flag_Attrited Customer`` is dropped, leaving
      ``Attrition_Flag_Existing Customer`` as the single attrition indicator.
    """

    def __init__(self):
        pass

    def fit(self, X, y = None):
        """Nothing is learned; returns ``self``."""
        return self

    def transform(self, X, y = None):
        """Return the numerically encoded copy of ``X``.

        Raises
        ------
        KeyError
            If ``Income_Category`` is missing, or if the encoding did not
            produce an ``Attrition_Flag_Attrited Customer`` column.
        """
        df = X.copy()
        income = df["Income_Category"]
        df["Income_Unknown"] = (income == "Unknown").astype(int)
        df["Income_Cat"] = income.cat.codes
        df = df.drop(columns = ["Income_Category"])

        cat_cols = list(df.select_dtypes(["category"]).columns)
        # One call encodes all columns and keeps the same column order as encoding
        # them one at a time. The dtype is pinned because the default changed from
        # uint8 to bool in pandas 2.0, which would change what the numeric
        # summaries and plots further down treat as numeric.
        df = pd.get_dummies(df, columns = cat_cols, dtype = np.uint8)
        return df.drop(columns = ["Attrition_Flag_Attrited Customer"])
    
class DropColumns(BaseEstimator, TransformerMixin):
    """Stateless transformer that discards the two right-most columns of a frame."""

    def __init__(self):
        pass

    def fit(self, X, y = None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, X, y = None):
        """Return ``X`` restricted, by position, to all but its last two columns."""
        n_keep = X.shape[1] - 2
        keep = [position for position in range(n_keep)]
        return X.iloc[:, keep]

# Preprocessing chain: trim the trailing columns, cast to categoricals, then encode numerically
preprocessing_steps = [
    ('drop_cols', DropColumns()),
    ('make_cat', ToCat()),
    ('cat_to_codes', CatToCodes()),
]
pipe = Pipeline(steps = preprocessing_steps)

# Fit on the training part only, then push both parts through the fitted pipeline.
# Note that both names are rebound to the transformed frames from here on.
data_train = pipe.fit_transform(data_train)
data_test  = pipe.transform(data_test)

data_train.head()

In [None]:
# Summary statistics of the training frame as returned by the preprocessing pipeline
data_train.describe()

Now that everything is changed to numbers we should inspect the data to check how it is related. We start with a correlation plot which checks for linear dependencies. 

In [None]:
# Pairwise Pearson correlation between all columns of the encoded training frame
corr_mat = data_train.corr(method = "pearson")

# Draw the matrix as a colour-coded grid
sns.heatmap(data = corr_mat)
plt.show()

From the correlation matrix it appears that the non-categorical features have the greatest linear relation with the attrition feature. Now let's check if all variables are approximately normally distributed. We also see that Credit_Limit and Avg_Open_To_Buy are almost perfectly correlated.

But first let's also list the most correlated variables.

In [None]:
# Absolute correlation of every column with the attrition indicator, strongest first.
# The indicator's correlation with itself is part of the listing.
corr_mat["Attrition_Flag_Existing Customer"].abs().sort_values(ascending = False)

It appears that Total_Trans_Ct, Total_Ct_Chng_Q4_Q1, Total_Revolving_Bal and Contacts_Count_12_mon are the features which have the greatest correlation with the target variable.

In [None]:
# The seaborn style sheets were renamed in matplotlib 3.6 and the old names were
# removed later; use whichever name this installation provides.
darkgrid_style = (
    "seaborn-v0_8-darkgrid"
    if "seaborn-v0_8-darkgrid" in plt.style.available
    else "seaborn-darkgrid"
)
with plt.style.context(darkgrid_style):
    fig = plt.figure(figsize = (12, 12))
    # radviz lists the classes in the order in which they first occur in the
    # frame, so renaming legend entries by position can swap the two names.
    # Put the readable name into the class column instead; the legend is then
    # correct whatever the row order is.
    radviz_frame = data_train.copy()
    is_existing = radviz_frame["Attrition_Flag_Existing Customer"].astype(int) == 1
    radviz_frame["Attrition_Flag_Existing Customer"] = np.where(
        is_existing, "Existing Customer", "Attrited Customer"
    )
    ax = pd.plotting.radviz(radviz_frame, 'Attrition_Flag_Existing Customer', alpha = 0.25)
    L = ax.legend(loc = "upper right")
    plt.tight_layout()
    plt.show()

It is clear from the radviz plot that the classes are not well separated.

In [None]:
# The seaborn style sheets were renamed in matplotlib 3.6 and the old names were
# removed later; use whichever name this installation provides.
darkgrid_style = (
    "seaborn-v0_8-darkgrid"
    if "seaborn-v0_8-darkgrid" in plt.style.available
    else "seaborn-darkgrid"
)
with plt.style.context(darkgrid_style):
    # scatter_matrix creates its own figure when no `ax` is passed, so opening
    # one with plt.figure() beforehand would only leave an empty figure behind.
    axes = pd.plotting.scatter_matrix(data_train.iloc[:, 0:13], alpha = 0.1, figsize = (18, 18), diagonal = 'hist')
    fig = axes[0, 0].figure
    # Rotate the axis labels so the long column names stay readable in the tight grid
    for ax in axes.flatten():
        ax.xaxis.label.set_rotation(90)
        ax.yaxis.label.set_rotation(0)
        ax.yaxis.label.set_ha('right')
    plt.tight_layout()
    fig.subplots_adjust(wspace = 0, hspace = 0)
    plt.show()

Again, Credit_Limit and Avg_Open_To_Buy seem to capture virtually the same information. They are almost perfectly linearly correlated and it may be prudent to drop one of them. Customer_Age and Months_on_book are also well correlated, so it may be wise to drop Customer_Age. Additionally, not all features are normally distributed. Some features follow a power-law distribution, so it is important that we rescale these. Finally, there are some artifacts, such as an exceptionally large number of people with 36 or so Months_on_book, and similar spikes in some other features. Let's do a rescaling and check again.

But first, let's separate the training target from the dataset.

In [None]:
# Separate the features from the target. No feature column is removed here:
# Customer_Age and Avg_Open_To_Buy both stay in the feature frames.
TARGET_COL = "Attrition_Flag_Existing Customer"

target_train, data_train = data_train[TARGET_COL], data_train.drop(columns = [TARGET_COL])
target_test, data_test = data_test[TARGET_COL], data_test.drop(columns = [TARGET_COL])

Let's rescale and see if we get better distributions. Let's also try and do some feature eliminations. 

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PowerTransformer
# Scalers; `pt` is instantiated here but is not part of the pipeline below
mms = MinMaxScaler(feature_range = (0, 1))
ss = StandardScaler()
pt = PowerTransformer(method = "box-cox")

# Min-max scaling followed by standardisation, fitted on the training features only
pipe_scale = Pipeline(steps = [('s', mms), ('ss', ss)])
pipe_scale.fit(data_train, target_train)


def _rescale(frame):
    """Run `frame` through the fitted `pipe_scale`, keeping its column labels and index."""
    return pd.DataFrame(pipe_scale.transform(frame), columns = frame.columns, index = frame.index)


data_train_t = _rescale(data_train)
data_test_t = _rescale(data_test)


In [None]:
# Summary statistics of the rescaled training features
data_train_t.describe()

In [None]:
# The seaborn style sheets were renamed in matplotlib 3.6 and the old names were
# removed later; use whichever name this installation provides.
darkgrid_style = (
    "seaborn-v0_8-darkgrid"
    if "seaborn-v0_8-darkgrid" in plt.style.available
    else "seaborn-darkgrid"
)
with plt.style.context(darkgrid_style):
    # scatter_matrix creates its own figure when no `ax` is passed, so opening
    # one with plt.figure() beforehand would only leave an empty figure behind.
    axes = pd.plotting.scatter_matrix(data_train_t.iloc[:, 0:13], alpha = 0.1, figsize = (18, 18), diagonal = 'hist')
    fig = axes[0, 0].figure
    # Rotate the axis labels so the long column names stay readable in the tight grid
    for ax in axes.flatten():
        ax.xaxis.label.set_rotation(90)
        ax.yaxis.label.set_rotation(0)
        ax.yaxis.label.set_ha('right')
    plt.tight_layout()
    fig.subplots_adjust(wspace = 0, hspace = 0)
    plt.show()

It is better than before, but not all power distributions have been scaled to normal distributions. Let's now reinspect the cluster plot generated by the radviz function.

In [None]:
# The seaborn style sheets were renamed in matplotlib 3.6 and the old names were
# removed later; use whichever name this installation provides.
darkgrid_style = (
    "seaborn-v0_8-darkgrid"
    if "seaborn-v0_8-darkgrid" in plt.style.available
    else "seaborn-darkgrid"
)
with plt.style.context(darkgrid_style):
    fig = plt.figure(figsize = (12, 12))
    temp = data_train_t.copy()
    # radviz lists the classes in the order in which they first occur in the
    # frame, so renaming legend entries by position can swap the two names.
    # Store the readable name as the class value instead; the legend is then
    # correct whatever the row order is.
    temp["Attrition_Flag"] = pd.Series(
        np.where(target_train.astype(int) == 1, "Existing Customer", "Attrited Customer"),
        index = target_train.index,
    )
    ax = pd.plotting.radviz(temp, 'Attrition_Flag', alpha = 0.25)
    L = ax.legend(loc = "upper right")
    plt.tight_layout()
    plt.show()

The data seem a bit more centered but no obvious change is visible. Still, with the scaling and so on we will test with a logistic regression and see if we can gain a good accuracy with the dataset as it currently is.

In [None]:
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC

# Recursive feature elimination with a linear SVM: drop one feature per step and
# score every feature count with 2-fold stratified cross-validation.
estimator = SVC(kernel = "linear")

min_features_to_select = 1  # Minimum number of features to consider
rfecv = RFECV(estimator=estimator, step=1, cv=StratifiedKFold(2),
              scoring='accuracy',
              min_features_to_select=min_features_to_select)
rfecv.fit(data_train_t, target_train)

# `support_` is the boolean mask of the retained features (those ranked 1)
selected_train = data_train_t.loc[:, rfecv.support_]
selected_test = data_test_t.loc[:, rfecv.support_]


print("Optimal number of features : %d" % rfecv.n_features_)

# `grid_scores_` was removed in scikit-learn 1.2; newer releases expose the mean
# cross-validation score per feature count through `cv_results_`. Support both.
if hasattr(rfecv, "cv_results_"):
    cv_scores = rfecv.cv_results_["mean_test_score"]
else:
    cv_scores = rfecv.grid_scores_

# Plot number of features VS. cross-validation scores
plt.figure()
plt.xlabel("Number of features selected")
# The scoring is accuracy, i.e. a fraction, not a number of correct classifications
plt.ylabel("Cross validation score (accuracy)")
plt.plot(range(min_features_to_select,
               len(cv_scores) + min_features_to_select),
         cv_scores)
plt.show()

In [None]:
# Peek at the training features kept by the recursive feature elimination
selected_train.head()

In [None]:
# Start with a simple Logistic regression

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Baseline: 5-fold cross-validated logistic regression, first on all rescaled
# features and then on the features kept by the recursive feature elimination.
for features in (data_train_t, selected_train):
    clf_t = LogisticRegression().fit(features, target_train)
    scores = cross_val_score(clf_t, features, target_train, cv = 5)
    print("Mean score: ", scores.mean(), " Std: ", scores.std())

With these transformed features, we get about 91% accuracy using logistic regression. Now let's reduce the features further with PCA and keep the transformed components with the greatest variance.

In [None]:
from sklearn.decomposition import PCA

# Keep as many principal components as are needed to explain 95% of the variance
pca = PCA(n_components = 0.95)

# Fit on the selected training features only, then project both parts.
# Both names are rebound to the projected arrays from here on.
pca.fit(selected_train)
selected_train, selected_test = (
    pca.transform(part) for part in (selected_train, selected_test)
)

In [None]:
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

# Search space for the support vector classifiers: three kernels and eleven
# values of C spaced logarithmically between 1 and 10.
parameters = {'kernel':('linear', 'rbf', 'poly'), 'C':np.logspace(0,1,11)}
svc = svm.SVC()
gnb = GaussianNB()
# Seed the forest: without it every run grows different trees, so the grid
# search winner and the reported scores change between runs.
rfc = RandomForestClassifier(random_state = 42)

In [None]:
# Grid search over the SVC settings using all rescaled training features
clf_t = GridSearchCV(estimator = svc, param_grid = parameters).fit(data_train_t, target_train)
clf_t.best_estimator_

In [None]:
# Same grid search, this time on the PCA-projected selected features
clf_selected = GridSearchCV(estimator = svc, param_grid = parameters).fit(selected_train, target_train)
clf_selected.best_estimator_

In [None]:
# Fit the Gaussian naive Bayes classifier on all rescaled training features
gnb.fit(data_train_t, target_train)

In [None]:
# Search space for the random forest: tree depth and minimum leaf size, both from 1 to 9
rf_parameters = dict(
    max_depth = range(1, 10),
    min_samples_leaf = range(1, 10),
)
clf_rf = GridSearchCV(estimator = rfc, param_grid = rf_parameters).fit(data_train_t, target_train)
clf_rf.best_estimator_

In [None]:
# 5-fold cross-validation of the SVC grid search on all rescaled training features
scores_t = cross_val_score(estimator = clf_t, X = data_train_t, y = target_train, cv = 5)
print("Mean score: ", scores_t.mean(), " Std: ", scores_t.std())

In [None]:
# 5-fold cross-validation of the SVC grid search on the PCA-projected selected features
scores_selected = cross_val_score(estimator = clf_selected, X = selected_train, y = target_train, cv = 5)
print("Mean score: ", scores_selected.mean(), " Std: ", scores_selected.std())

In [None]:
# 5-fold cross-validation of Gaussian naive Bayes on all rescaled training features
scores_gnb = cross_val_score(estimator = gnb, X = data_train_t, y = target_train, cv = 5)
print("Naive Bayes Mean score: ", scores_gnb.mean(), " Std: ", scores_gnb.std())

In [None]:
# 5-fold cross-validation of the random forest grid search on all rescaled training features
scores_rnc = cross_val_score(estimator = clf_rf, X = data_train_t, y = target_train, cv = 5)
print("Random Forest Mean score: ", scores_rnc.mean(), " Std: ", scores_rnc.std())

The random forest appears to give the best results at 94%. Let's wrap it up by testing against the test sets and performing an ANOVA test to verify that the best performer is indeed the best performer.

In [None]:
def _held_out_fold_scores(model, features, labels, n_folds = 10):
    """Score an already fitted `model` on disjoint slices of held-out data.

    The held-out rows are cut into `n_folds` stratified slices and the model is
    scored on each slice without being refitted, so the result is a sample of
    genuine out-of-sample scores.

    Parameters
    ----------
    model : fitted estimator exposing ``score(X, y)``
    features : DataFrame or array holding the held-out features
    labels : Series or array holding the held-out labels, one per feature row
    n_folds : number of slices, and therefore of returned scores

    Returns
    -------
    numpy.ndarray of shape ``(n_folds,)``
    """
    labels = np.asarray(labels)
    folds = StratifiedKFold(n_splits = n_folds)
    fold_scores = []
    # Only the "test" half of every split is used; it partitions the held-out rows
    for _, rows in folds.split(np.zeros(len(labels)), labels):
        # Keep DataFrames as DataFrames so the column names seen at fit time are preserved
        part = features.iloc[rows] if hasattr(features, "iloc") else features[rows]
        fold_scores.append(model.score(part, labels[rows]))
    return np.array(fold_scores)


# cross_val_score would clone each search and refit it on the test data, so the
# models trained on the training set would never be evaluated. Score the fitted
# models on the held-out data instead.
test_scores_t = _held_out_fold_scores(clf_t, data_test_t, target_test)
test_scores_selected = _held_out_fold_scores(clf_selected, selected_test, target_test)
test_scores_rnc = _held_out_fold_scores(clf_rf, data_test_t, target_test)

And now we will perform an ANOVA test with the null hypothesis $H_0$: all test scores are the same, $H_a$: at least one test score is different to the rest.

In [None]:
import statsmodels.api as sm
import statsmodels.stats.multicomp

from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
from scipy import stats

In [None]:
# Density of the held-out scores of the three tuned models, overlaid in one figure
fig = plt.figure()
# `shade` is deprecated in seaborn in favour of `fill`
for fold_scores in (test_scores_t, test_scores_selected, test_scores_rnc):
    sns.kdeplot(fold_scores, fill = True)
# Legend entries follow the plotting order above
plt.legend(["SVM", "SVM reduced features", "Random Forest"])
plt.show()

The distributions are approximately normal, and the random forest classifier performed the best.