In [None]:
# Usual imports
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick

# Print the full path of every file found under the Kaggle input directory
for root, _, files in os.walk('/kaggle/input'):
    for path in (os.path.join(root, fname) for fname in files):
        print(path)

In [None]:
# Load the raw customer table
df = pd.read_csv('../input/credit-card-customers/BankChurners.csv')

# The columns at positions 21 and 22 are the naive Bayes columns, which the
# dataset description recommends dropping
naive_bayes_cols = df.columns[[21, 22]]
df = df.drop(columns=naive_bayes_cols)
df.shape

In [None]:
# Summary statistics for the columns of the loaded table
df.describe()

Using `describe` on the dataset, we can see that the average time on the books is almost 3 years and the average utilization ratio is about 27%.

In [None]:
# Less text on income bins: one mapping instead of five separate replace calls.
# Labels that are not listed here are left untouched.
income_labels = {
    'Less than $40K': '< 40K',
    '$40K - $60K': '40K - 60K',
    '$60K - $80K': '60K - 80K',
    '$80K - $120K': '80K - 120K',
    '$120K +': '>120K',
}
df['Income_Category'] = df['Income_Category'].replace(income_labels)

# Setup target variable: 0 = existing customer, 1 = attrited customer.
# `map` yields an integer column directly; chaining `replace` from strings to
# ints depends on pandas silently downcasting an object column, which is
# deprecated in recent pandas releases.
churn_codes = {'Existing Customer': 0, 'Attrited Customer': 1}
df['churn'] = df['Attrition_Flag'].map(churn_codes)
df = df.drop(columns='Attrition_Flag')

# Check for Nulls (a True for `churn` would mean an unexpected Attrition_Flag value)
df.isna().any()

This data was pre-processed, so a lot of the cleaning has already been done, but this is the point at which you would normally do any cleaning (checking for nulls, data types, etc.).

In [None]:
# Column dtypes and non-null counts, to see which columns are still text
df.info()


In [None]:
# Import encoders to transform categorical data into numerical (for model)
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Label-encode every text column (the first column is skipped) that has
# at most 2 unique values
le = LabelEncoder()
for column in df.columns[1:]:
    is_text = df[column].dtype == 'object'
    if is_text and len(df[column].unique()) <= 2:
        df[column] = le.fit_transform(df[column])

In [None]:
# Explore numerical data
EDA_df = df[['Customer_Age',
             'Gender',
             'Dependent_count',
             'Months_on_book',
             'Total_Revolving_Bal',
             'Total_Amt_Chng_Q4_Q1',
             'Avg_Utilization_Ratio',
             'Contacts_Count_12_mon',
             'Total_Relationship_Count']]

# Upper limit on histogram bins: columns with many distinct values are
# capped so the bars stay readable
MAX_BINS = 100

fig = plt.figure(figsize=(10, 10))
plt.suptitle('Histograms of Numerical Columns\n',
             horizontalalignment="center",
             fontstyle = "normal",
             fontsize = 24,
             fontfamily = "sans-serif")

# One panel per column on a 3x3 grid, with one bin per distinct value
# (up to MAX_BINS)
for position, column in enumerate(EDA_df.columns, start=1):
    panel = fig.add_subplot(3, 3, position)
    panel.set_title(column)
    n_bins = min(EDA_df[column].nunique(dropna=False), MAX_BINS)
    panel.hist(EDA_df[column], bins=n_bins, color='#AEC3B0')

# Lay out the figure once after all panels exist (previously recomputed on
# every loop iteration); the rect leaves room for the suptitle
fig.tight_layout(rect=[0, 0.03, 1, 0.95])

* We can see a spike in ages at ~25, presumably you cannot get a credit card with this bank before then or perhaps they don't record lower than that. There is a second spike at ~50.

* For Gender, there is a fairly even distribution.

* Dependents are mostly 2/3 while the least are 0 or 5/greater than 5. 

* Months on book has a fairly even distribution but a huge spike at ~36. Presumably there might be some common contract for 3 years.

* Total revolving balance appears to be commonly low, with a lot of customers at 0 and at 2500 (maybe a limit?); otherwise it is an even distribution.

* Total amount changed between quarter 4 and quarter 1 is commonly between 0.5 and 1, with some outliers going past that.

* Average Utilization Ratio is (apart from the 0 counts) right-skewed with a subtle spike around 0.05/0.1

In [None]:
# Explore categorical data
categories = ['Education_Level',
              'Marital_Status',
              'Income_Category',
              'Card_Category']

fig, ax = plt.subplots(len(categories), figsize=(14, 7))

# Draw one bar per category from explicit counts. `hist` is meant for numeric
# data: on string categories it slices the axis into 10 equal-width bins that
# do not line up with the category positions, so bars end up unevenly spaced.
for axis, cat in zip(np.atleast_1d(ax), categories):
    counts = df[cat].value_counts()
    axis.bar(counts.index.astype(str), counts.values, color='#AEC3B0')
    axis.set_title(cat)
fig.tight_layout(rect=[0, 0.03, 1, 0.95])

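A note on oddly spaced bars in the Education_Level graph: `hist` splits the axis into 10 equal-width bins by default, and with 7 categories those bins do not line up with the category positions. Plotting `value_counts()` as a bar chart avoids this; if you know a better fix, I'd like to hear it!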

* Most credit card holders are Graduates
* Divorcees are least likely to own a credit card
* The most credit cards belong to people with an income under $40,000
* I'd imagine that Blue is the standard credit card category, in which case it doesn't appear that other categories are being utilised correctly. Maybe due to high turn-over at the 3 year mark?

In [None]:
# Visualise churn rate by categories
categories = ['Education_Level','Marital_Status','Income_Category','Card_Category']
colors  = ['#598392','#AEC3B0']

for cat in categories:
    # Customer counts per (category value, churn flag), churn flags as columns
    churn_counts = df.groupby([cat,'churn']).size().unstack()
    temp_churn = churn_counts.rename(columns={0:'No', 1:'Yes'})

    # Convert each row to percentages of that row's total
    row_totals = temp_churn.sum(axis=1)
    churn_pct = temp_churn.mul(100.0).div(row_totals, axis=0)

    ax = churn_pct.plot(
        kind='bar',
        figsize = (12,6),
        width = 0.5,
        stacked = True,
        color = colors)

    plt.ylabel('% of Customers')
    plt.xlabel(cat)
    plt.title(cat + ' Churn Rate')

    plt.legend(loc='right', fontsize = "medium")
    plt.xticks(rotation=0, horizontalalignment="center")
    plt.yticks(rotation=0, horizontalalignment="right")
    ax.yaxis.set_major_formatter(mtick.PercentFormatter())

    # Write each segment's percentage at the centre of its bar
    for bar in ax.patches:
        left, bottom = bar.get_xy()
        bar_height = bar.get_height()
        ax.text(left + bar.get_width()/2,
                bottom + bar_height/2,
                '{:.1f}%'.format(bar_height),
                horizontalalignment='center',
                verticalalignment='center')
    ax.autoscale(enable=False, axis='both', tight=False)

* The higher the customers education the more likely they are to churn
* Married customers are less likely to churn
* Those under 40K or over 120K have the highest churn rate
* Platinum card holders have a very high churn rate

In [None]:
# Create a bivariate correlation plot
import seaborn as sn

# Correlate numeric columns only: several categorical columns are still
# strings at this point, and recent pandas versions raise on them instead of
# silently skipping them.
corr = df.select_dtypes(include=['number', 'bool']).corr()

# Mask the upper triangle (including the diagonal), which mirrors the lower
# one. The builtin `bool` is used because the `np.bool` alias has been removed
# from NumPy.
mask = np.triu(np.ones_like(corr, dtype=bool))

f, ax = plt.subplots(figsize=(18, 15))
cmap = sn.diverging_palette(220, 10, as_cmap=True)
sn.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0, square=True, linewidths=.5, cbar_kws={"shrink": .5})

In [None]:
# One-hot encode the data
ID = df["CLIENTNUM"] # Extract as we do not want to encode this unique identifier
df = pd.get_dummies(df.drop(columns="CLIENTNUM"))
df = pd.concat([df, ID], axis = 1)

# Separate the target from the features
response = df["churn"]
df = df.drop(columns="churn")

from sklearn.model_selection import train_test_split
# Split data into training and testing data (X), and training/testing labels (y)
# using a 20% test size, 80% train size. The split is stratified on the target,
# and random_state is fixed so that re-running the notebook gives the same
# split (and therefore comparable scores).
X_train, X_test, y_train, y_test = train_test_split(
    df, response, stratify=response, test_size=0.2, random_state=0)

In [None]:
# Keep the customer identifiers aside, then remove them from the feature matrices
train_identity, test_identity = X_train['CLIENTNUM'], X_test['CLIENTNUM']
X_train, X_test = (features.drop(columns=['CLIENTNUM'])
                   for features in (X_train, X_test))

In [None]:
# Standardise the features (better for skewed data as seen in the graphs above).
# The scaler is fitted on the training data only and then applied to the test
# data; the original column names and row index are restored on both frames.
from sklearn.preprocessing import StandardScaler

sc_X = StandardScaler()
X_train2 = pd.DataFrame(sc_X.fit_transform(X_train),
                        columns=X_train.columns.values,
                        index=X_train.index.values)
X_test2 = pd.DataFrame(sc_X.transform(X_test),
                       columns=X_test.columns.values,
                       index=X_test.index.values)
X_train, X_test = X_train2, X_test2

# Model Selection

In [None]:
# Import a variety of models with mostly basic parameters
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import model_selection
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [None]:
# Models and parameters to test
models = [
    ('Logistic Regression', LogisticRegression(solver='liblinear', random_state = 0, class_weight='balanced')),
    ('SVC', SVC(kernel = 'linear', random_state = 0)),
    ('Kernel SVM', SVC(kernel = 'rbf', random_state = 0)),
    ('KNN', KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)),
    ('Gaussian NB', GaussianNB()),
    ('Decision Tree Classifier', DecisionTreeClassifier(criterion = 'entropy', random_state = 0)),
    ('Random Forest', RandomForestClassifier(n_estimators=100, criterion = 'entropy', random_state = 0)),
]

# Storing results of each model
acc_results = []
auc_results = []
precision = []
recall = []
names = []
col = ['Algorithm', 'ROC AUC Mean', 'ROC AUC STD', 'Accuracy Mean', 'Accuracy STD','Recall','Precision']
result_rows = []

# The folds do not depend on the model, so the splitter is built once
kfold = model_selection.KFold(n_splits=10)

# Evaluate each model using k-fold cross-validation:
for name, model in models:
    # One cross-validation pass scores both metrics, instead of re-fitting
    # every fold a second time for the second metric
    cv_scores = model_selection.cross_validate(
        model, X_train, y_train, cv=kfold, scoring=('accuracy', 'roc_auc'))
    cv_acc_results = cv_scores['test_accuracy']
    cv_auc_results = cv_scores['test_roc_auc']

    # Precision & Recall on the hold-out test set, after fitting on all training data
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    precision.append(precision_score(y_test, pred))
    recall.append(recall_score(y_test, pred))

    # Storing Results
    acc_results.append(cv_acc_results)
    auc_results.append(cv_auc_results)
    names.append(name)
    result_rows.append([name,
                        round(cv_auc_results.mean()*100, 2),
                        round(cv_auc_results.std()*100, 2),
                        round(cv_acc_results.mean()*100, 2),
                        round(cv_acc_results.std()*100, 2),
                        round(recall[-1]*100, 2), # The one we want to focus on
                        round(precision[-1]*100, 2)])

# Build the summary table once rather than growing it row by row
model_results = pd.DataFrame(result_rows, columns=col)

In [None]:
# Rank the models by recall on the hold-out test set, best first
model_results.sort_values(by=['Recall'], ascending=False)

In [None]:
# Fit a logistic regression and use the absolute size of its coefficients
# as a relative feature importance
clf = LogisticRegression(random_state = 0).fit(X_train, y_train)

coef_magnitude = np.abs(clf.coef_[0])
# Rescale so that the largest coefficient scores 100
feature_importance = 100.0 * (coef_magnitude / coef_magnitude.max())
sorted_idx = np.argsort(feature_importance)
pos = np.arange(sorted_idx.shape[0]) + .5

# Horizontal bars, sorted by importance
featfig, featax = plt.subplots(figsize=(15,5))
featax.barh(pos, feature_importance[sorted_idx], align='center')
featax.set_yticks(pos)
featax.set_yticklabels(np.array(X_train.columns)[sorted_idx], fontsize=8)
featax.set_xlabel('Relative Feature Importance')

plt.tight_layout()
plt.show()

In [None]:
# Subset of feature columns kept for the second round of model evaluation
# (despite its name, `test` is a list of column names, not test data)
test = ['Total_Trans_Ct',
        'Total_Trans_Amt',
        'Total_Revolving_Bal',
        'Total_Relationship_Count',
        'Total_Ct_Chng_Q4_Q1',
        'Contacts_Count_12_mon',
        'Gender',
        'Months_Inactive_12_mon']

In [None]:
# Restrict both feature matrices to the selected columns
X_train, X_test = X_train[test], X_test[test]

In [None]:
# Same models and parameters as before, now evaluated on the reduced feature set
models = [
    ('Logistic Regression', LogisticRegression(solver='liblinear', random_state = 0, class_weight='balanced')),
    ('SVC', SVC(kernel = 'linear', random_state = 0)),
    ('Kernel SVM', SVC(kernel = 'rbf', random_state = 0)),
    ('KNN', KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)),
    ('Gaussian NB', GaussianNB()),
    ('Decision Tree Classifier', DecisionTreeClassifier(criterion = 'entropy', random_state = 0)),
    ('Random Forest', RandomForestClassifier(n_estimators=100, criterion = 'entropy', random_state = 0)),
]

acc_results = []
auc_results = []
precision = []
recall = []
names = []
col = ['Algorithm', 'ROC AUC Mean', 'ROC AUC STD', 'Accuracy Mean', 'Accuracy STD','Recall','Precision']
result_rows = []

# The folds do not depend on the model, so the splitter is built once
kfold = model_selection.KFold(n_splits=10)

# Evaluate each model using k-fold cross-validation:
for name, model in models:
    # One cross-validation pass scores both metrics, instead of re-fitting
    # every fold a second time for the second metric
    cv_scores = model_selection.cross_validate(
        model, X_train, y_train, cv=kfold, scoring=('accuracy', 'roc_auc'))
    cv_acc_results = cv_scores['test_accuracy']
    cv_auc_results = cv_scores['test_roc_auc']

    # Precision & Recall on the hold-out test set, after fitting on all training data
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    precision.append(precision_score(y_test, pred))
    recall.append(recall_score(y_test, pred))

    # Storing Results
    acc_results.append(cv_acc_results)
    auc_results.append(cv_auc_results)
    names.append(name)
    result_rows.append([name,
                        round(cv_auc_results.mean()*100, 2),
                        round(cv_auc_results.std()*100, 2),
                        round(cv_acc_results.mean()*100, 2),
                        round(cv_acc_results.std()*100, 2),
                        round(recall[-1]*100, 2),
                        round(precision[-1]*100, 2)])

# Build the summary table once rather than growing it row by row
model_results2 = pd.DataFrame(result_rows, columns=col)

In [None]:
# Rank the models trained on the selected features by hold-out recall, best first
model_results2.sort_values(by=['Recall'], ascending=False)

# Results
Using the feature importance graph, I took a few of the best-performing features (this could probably be extended) and compared the models trained on all features against the models trained on only those top features.

In [None]:
# `model_results3` was never defined, so this cell raised a NameError.
# Build it as a side-by-side comparison of the two evaluation rounds
# (all features vs. the selected features), matched on algorithm name.
model_results3 = model_results.merge(
    model_results2,
    on='Algorithm',
    suffixes=(' (all features)', ' (selected features)'))
model_results3.sort_values(by=['Recall (selected features)'], ascending=False)