In [None]:
# Essentials
import numpy as np
import pandas as pd
import datetime
import random

# Plots
import seaborn as sns
import matplotlib.pyplot as plt

# Models
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, BaggingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.linear_model import ElasticNet, ElasticNetCV
from sklearn.svm import SVR
from mlxtend.regressor import StackingCVRegressor
import lightgbm as lgbm
from lightgbm import LGBMRegressor, LGBMClassifier
import xgboost as xgb

# Stats
from scipy.stats import skew, norm
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax

# Misc
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error, confusion_matrix
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import scale
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA
from sklearn import metrics

pd.set_option('display.max_columns', None)

# Ignore useless warnings
import warnings
warnings.filterwarnings(action="ignore")
pd.options.display.max_seq_items = 8000
pd.options.display.max_rows = 8000



import os
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

from collections import defaultdict


# Import Churn Data

In [None]:
# Read the raw churn CSV and discard its last two columns in one step,
# then preview the first rows.
df = pd.read_csv('../input/credit-card-customers/BankChurners.csv').iloc[:, :-2]
df.head()

Define target variable as churn.

In [None]:
# Frequency of each raw attrition label.
df['Attrition_Flag'].value_counts()

In [None]:
# Binary target: 1 where the label is 'Attrited Customer', 0 otherwise.
df['churn'] = df['Attrition_Flag'].eq('Attrited Customer') * 1

df.head()

In [None]:
# Share of customers flagged as churned (mean of the 0/1 target).
df['churn'].mean()

About 16% of customers churned, so the target is imbalanced but not severely so.

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Summary statistics of the numeric columns.
df.describe()

# EDA

We will define functions that will help us analyze features and their relationships with churn behavior. 

In [None]:
# Function takes a feature name as input and plots its value counts by churn group.
def plt_feature_by_churn(feature_name):
    """Draw a count plot of ``feature_name`` split by the ``churn`` flag.

    Reads the notebook-level ``df``. For numeric features, the Pearson
    correlation between the feature and ``churn`` is also printed.

    Parameters
    ----------
    feature_name : str
        Name of a column in ``df``.
    """
    fig, ax = plt.subplots(figsize=(16, 10))
    # seaborn >= 0.12 accepts only ``data`` positionally, so the plotted
    # column must be passed as ``x=``; a positional feature name raises a
    # TypeError there.
    sns.countplot(data=df, x=feature_name, hue='churn', ax=ax)
    ax.set_title(feature_name + ' by churn')
    # Correlation is only meaningful (and only computable) for numeric columns.
    if pd.api.types.is_numeric_dtype(df[feature_name]):
        print('correlation between ', feature_name, ' and churn:', df[[feature_name, 'churn']].corr().iloc[0,1])

In [None]:
# Function takes a feature name as input and plots the churn ratio for each value of the feature.
def plt_churn_ratio(feature_name):
    """Plot the share of churned customers for every value of ``feature_name``.

    Reads the notebook-level ``df``. Because ``churn`` is a 0/1 flag, its
    mean within each group equals churned / total for that group.

    Parameters
    ----------
    feature_name : str
        Name of a column in ``df`` to group by.
    """
    churn_ratio = df.groupby(feature_name)['churn'].mean().reset_index()
    # seaborn >= 0.12 requires ``x`` and ``y`` as keywords; passing the two
    # vectors positionally raises a TypeError there.
    ax = sns.lineplot(data=churn_ratio, x=feature_name, y='churn')
    ax.set(ylabel='churn ratio', title='Churn ratio by ' + feature_name)

In [None]:
# Function takes a feature name as input and applies one-hot encoding.
def one_hot(feature_name, data=None):
    """Return a copy of a frame with ``feature_name`` one-hot encoded.

    The first level of the feature is dropped (``drop_first=True``), so a
    feature with k levels yields k-1 indicator columns. The input frame is
    not modified.

    Parameters
    ----------
    feature_name : str
        Column to encode.
    data : pandas.DataFrame, optional
        Frame to encode. Defaults to the notebook-level ``df`` so existing
        calls keep working; pass it explicitly to avoid relying on global
        state.

    Returns
    -------
    pandas.DataFrame
        New frame in which ``feature_name`` is replaced by its indicator columns.
    """
    frame = df if data is None else data
    return pd.get_dummies(frame, columns=[feature_name], drop_first=True)

## Age 

In [None]:
# Count of customers at each age, split by churn flag.
plt_feature_by_churn('Customer_Age')

In [None]:
# Churn ratio at each age.
plt_churn_ratio('Customer_Age')

There appears to be no linear relationship between age and churn. We still keep age as a feature, but there is no need for binning.

## Gender 

In [None]:
# Count of customers per gender, split by churn flag.
plt_feature_by_churn('Gender')

In [None]:
# Churn ratio per gender.
plt_churn_ratio('Gender')

Females are slightly more likely to churn (17%) than males (15%). We'll convert this feature to a 0/1 indicator.

In [None]:
# Replace 'Gender' with its indicator column(s) and rebind df to the result.
# NOTE(review): after this cell 'Gender' is no longer a column, so re-running
# this cell or the Gender plots above presumably fails until the data is reloaded.
df = one_hot('Gender')
df.head()

## Dependents 

In [None]:
# Count of customers per dependent count, split by churn flag.
plt_feature_by_churn('Dependent_count')

In [None]:
# Churn ratio per dependent count.
plt_churn_ratio('Dependent_count')

As with age, there is no linear relationship between the number of dependents and churn, even though we see a jump at 3. We still keep it as a feature, but there is no need for binning.

## Education 

In [None]:
# Count of customers per education level, split by churn flag.
plt_feature_by_churn('Education_Level')

In [None]:
# Churn ratio per education level.
plt_churn_ratio('Education_Level')

Similar to gender, we will apply one-hot encoding to education level.

In [None]:
# Replace 'Education_Level' with its indicator columns and rebind df.
# NOTE(review): not re-runnable as-is, since the source column is gone afterwards.
df = one_hot('Education_Level')

## Marital Status 

In [None]:
# Count of customers per marital status, split by churn flag.
plt_feature_by_churn('Marital_Status')

In [None]:
# Churn ratio per marital status.
plt_churn_ratio('Marital_Status')

Married customers appear less likely to churn. We will apply one-hot encoding to this feature too.

In [None]:
# Replace 'Marital_Status' with its indicator columns and rebind df.
# NOTE(review): not re-runnable as-is, since the source column is gone afterwards.
df = one_hot('Marital_Status')

## Income 

In [None]:
# Count of customers per income category, split by churn flag.
plt_feature_by_churn('Income_Category')

In [None]:
# Churn ratio per income category.
plt_churn_ratio('Income_Category')

In [None]:
# Replace 'Income_Category' with its indicator columns and rebind df.
# NOTE(review): not re-runnable as-is, since the source column is gone afterwards.
df = one_hot('Income_Category')

## Card Category 

In [None]:
# Count of customers per card category, split by churn flag.
plt_feature_by_churn('Card_Category')

In [None]:
# Churn ratio per card category.
plt_churn_ratio('Card_Category')

In [None]:
# Replace 'Card_Category' with its indicator columns and rebind df.
# NOTE(review): not re-runnable as-is, since the source column is gone afterwards.
df = one_hot('Card_Category')

## Months on Book 

In [None]:
# Count of customers per Months_on_book value, split by churn flag.
plt_feature_by_churn('Months_on_book')

In [None]:
# Churn ratio per Months_on_book value.
plt_churn_ratio('Months_on_book')

## Total Relationship Count 

In [None]:
# Count of customers per Total_Relationship_Count value, split by churn flag.
plt_feature_by_churn('Total_Relationship_Count')

In [None]:
# Churn ratio per Total_Relationship_Count value.
plt_churn_ratio('Total_Relationship_Count')

## Months inactive 

In [None]:
# Count of customers per Months_Inactive_12_mon value, split by churn flag.
plt_feature_by_churn('Months_Inactive_12_mon')

In [None]:
# Churn ratio per Months_Inactive_12_mon value.
plt_churn_ratio('Months_Inactive_12_mon')

## Contacts

In [None]:
# Count of customers per Contacts_Count_12_mon value, split by churn flag.
plt_feature_by_churn('Contacts_Count_12_mon')


In [None]:
# Churn ratio per Contacts_Count_12_mon value.
plt_churn_ratio('Contacts_Count_12_mon')

## Credit Limit 

In [None]:
# Credit limit divided by 1000 and rounded to a whole number, so it can be plotted as discrete bins.
df['Credit_Limit_K'] = (df['Credit_Limit'] / 1000).round(0)

In [None]:
# Count of customers per rounded credit-limit bin, split by churn flag.
plt_feature_by_churn('Credit_Limit_K')


In [None]:
# Churn ratio per rounded credit-limit bin.
plt_churn_ratio('Credit_Limit_K')

## Revolving Balance 

In [None]:
# Revolving balance divided by 500 and rounded to a whole number, giving discrete bins for plotting.
df['Total_Revolving_Bal_500'] = (df['Total_Revolving_Bal'] / 500).round(0)

In [None]:
# Count of customers per rounded revolving-balance bin, split by churn flag.
plt_feature_by_churn('Total_Revolving_Bal_500')

In [None]:
# Churn ratio per rounded revolving-balance bin.
plt_churn_ratio('Total_Revolving_Bal_500')

## Open To Buy 

In [None]:
# Average open-to-buy divided by 1000 and rounded to a whole number, then plotted by churn group.
df['Avg_Open_To_Buy_K'] = (df['Avg_Open_To_Buy'] / 1000).round(0)
plt_feature_by_churn('Avg_Open_To_Buy_K')

In [None]:
# Churn ratio per rounded open-to-buy bin.
plt_churn_ratio('Avg_Open_To_Buy_K')

## Trends

In [None]:
# Q4/Q1 amount change multiplied by 10 and rounded to a whole number, then plotted by churn group.
df['Total_Amt_Chng_Q4_Q1_10P'] = (df['Total_Amt_Chng_Q4_Q1'] * 10).round(0)
plt_feature_by_churn('Total_Amt_Chng_Q4_Q1_10P')

In [None]:
# Churn ratio per rounded amount-change bin.
plt_churn_ratio('Total_Amt_Chng_Q4_Q1_10P')

In [None]:
# Q4/Q1 count change multiplied by 10 and rounded to a whole number, then plotted by churn group.
df['Total_Ct_Chng_Q4_Q1_10P'] = (df['Total_Ct_Chng_Q4_Q1'] * 10).round(0)
plt_feature_by_churn('Total_Ct_Chng_Q4_Q1_10P')

In [None]:
# Churn ratio per rounded count-change bin.
plt_churn_ratio('Total_Ct_Chng_Q4_Q1_10P')

## Total Transaction Amount 

In [None]:
# Total transaction amount divided by 1000 and rounded to a whole number, then plotted by churn group.
df['Total_Trans_Amt_K'] = (df['Total_Trans_Amt'] / 1000).round(0)
plt_feature_by_churn('Total_Trans_Amt_K')

In [None]:
# Churn ratio per rounded transaction-amount bin.
plt_churn_ratio('Total_Trans_Amt_K')

## Total Transaction Count 

In [None]:
# Total transaction count divided by 10 and rounded to a whole number, then plotted by churn group.
df['Total_Trans_Ct_10'] = (df['Total_Trans_Ct'] / 10).round(0)
plt_feature_by_churn('Total_Trans_Ct_10')

In [None]:
# Churn ratio per rounded transaction-count bin.
plt_churn_ratio('Total_Trans_Ct_10')

## Utilization 

In [None]:
# Average utilization ratio multiplied by 10 and rounded to a whole number, then plotted by churn group.
df['Avg_Utilization_Ratio_10P'] = (df['Avg_Utilization_Ratio'] * 10).round(0)
plt_feature_by_churn('Avg_Utilization_Ratio_10P')

In [None]:
# Churn ratio per rounded utilization bin.
plt_churn_ratio('Avg_Utilization_Ratio_10P')

In [None]:
# Preview the frame after encoding and the addition of the rounded helper columns.
df.head()

# LGB Model 

In [None]:
# Columns kept out of the model inputs: CLIENTNUM (presumably a customer
# identifier), the raw and encoded target, and the rounded helper columns
# that were created only for the EDA plots.
non_feature_cols = [
    'CLIENTNUM', 'Attrition_Flag', 'churn',
    'Credit_Limit_K', 'Total_Revolving_Bal_500', 'Avg_Open_To_Buy_K',
    'Total_Amt_Chng_Q4_Q1_10P', 'Total_Trans_Amt_K', 'Total_Trans_Ct_10',
    'Total_Ct_Chng_Q4_Q1_10P', 'Avg_Utilization_Ratio_10P',
]
X = df.drop(columns=non_feature_cols)
y = df['churn']

# Hold out 10% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=145
)

In [None]:
# Fit a LightGBM classifier on the training split, then predict hard labels
# and class probabilities for the held-out split.
lgb = LGBMClassifier(learning_rate=0.01, max_depth=5, min_child_weight=3,
                     n_estimators=1000, num_leaves=5).fit(X_train, y_train)
# Kept as DataFrames because later cells index the probabilities by column (y_predprob[1]).
y_pred = pd.DataFrame(lgb.predict(X_test))
y_predprob = pd.DataFrame(lgb.predict_proba(X_test))

# scikit-learn metrics take (y_true, y_pred) in that order.
print('The accuracy of the LGBM is',metrics.accuracy_score(y_test, y_pred))

In [None]:
# First rows of the predicted class probabilities (one column per class).
y_predprob.head()

In [None]:
# Test-set metrics: accuracy from the hard labels, AUC from the probability
# of the positive class (column 1 of y_predprob).
print ("\nModel Report")
print ("Accuracy : %.4g" % metrics.accuracy_score(y_test, y_pred))
print ("AUC Score (Test): %f" % metrics.roc_auc_score(y_test, y_predprob[1]))

fig, ax = plt.subplots()
# fmt='d' prints the cell counts as plain integers; the default '.2g'
# format shows three-digit counts in scientific notation.
sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt='d', ax=ax)
# confusion_matrix puts true labels on rows and predicted labels on columns.
ax.set(xlabel='Predicted label', ylabel='True label', title='Confusion matrix (test set)');

In [None]:
# Pair each importance value with its feature name.
feature_imp = pd.DataFrame(sorted(zip(lgb.feature_importances_,X.columns)), columns=['Value','Feature'])

fig, ax = plt.subplots(figsize=(20, 10))
sns.barplot(x="Value", y="Feature", data=feature_imp.sort_values(by="Value", ascending=False), ax=ax)
# The model is a single fit, not an average over CV folds, so the title says just that.
ax.set_title('LightGBM Feature Importances')
fig.tight_layout()
# Save before showing: once plt.show() has rendered the figure, a later
# plt.savefig() writes a new, empty figure instead.
fig.savefig('lgbm_importances-01.png')
plt.show()

Prediction accuracy is suspiciously high, which might be a result of the total transaction amount's measurement period overlapping with the churn period. This is still a good starting point for predicting churn, which is an important problem not just for banks but for any large company with a large customer base.