In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score, RepeatedStratifiedKFold, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, make_scorer, recall_score, roc_auc_score
from xgboost import XGBClassifier, plot_importance
from pdpbox import pdp, get_dataset, info_plots

import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

The dataset comes from Kaggle: [Bank Customers](https://www.kaggle.com/santoshd3/bank-customers).

In [None]:
# Read the churn CSV into a DataFrame and preview the first rows.
csv_path = '../input/bank-customers/Churn Modeling.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Print the frame summary (columns, non-null counts, dtypes) to check for
# missing values and unexpected types.
df.info()

There are no missing values, and every column has an appropriate data type.

In [None]:
# Show the column labels so the columns dropped by position in the next cell
# can be identified.
print(df.columns)

In [None]:
# Count the distinct values of every column *before* the identifier columns
# are removed, so the report also covers them (duplicated CustomerId values
# would show up here as a count lower than the number of rows).
unique_vals = {column: df[column].unique() for column in df.columns}
print('Unique values for each feature:\n')
for column, values in unique_vals.items():
    print(len(values), 'unique values of ', column)

# Drop the two leading columns (presumably the row number and CustomerId --
# confirm against the printed column list). The guard makes the cell safe to
# re-run: an unconditional positional drop would strip two further columns
# on every execution.
if 'CustomerId' in df.columns:
    df = df.drop(columns=df.columns[[0, 1]])

There are no duplicated CustomerId values. The numbers of unique values for Gender, HasCrCard, IsActiveMember and Exited are as expected.

In [None]:
# Stacked histograms of each feature, split by the churn label ("Exited").
hist_features = [
    'CreditScore', 'Age', 'Tenure',
    'Balance', 'NumOfProducts', 'EstimatedSalary',
    'Geography', 'Gender', 'HasCrCard',
    'IsActiveMember',
]
fig, axes = plt.subplots(4, 3, figsize=(15, 15))

# axes.flat walks the grid row by row, so each feature lands in the same
# panel it would get from explicit axes[row, col] indexing.
for panel, feature in zip(axes.flat, hist_features):
    sns.histplot(ax=panel, data=df, x=feature, hue="Exited", multiple="stack")

# The 4x3 grid has more panels than features; hide the unused ones instead
# of leaving empty frames in the figure.
for panel in axes.flat[len(hist_features):]:
    panel.set_visible(False)

fig.tight_layout()

We can see generally, customers from the following groups are more likely to exit: 
1. Over the age of 40.
2. From Germany.
3. Female.

Customers from the following groups are less likely to exit: 
1. Having 2 products.
2. Active members. 

In [None]:
# Pairwise 2-D histograms of the selected columns, coloured by the churn label.
cols = [
    'CreditScore', 'Age', 'Tenure', 'Balance',
    'NumOfProducts', 'EstimatedSalary', 'Exited',
]
pair_data = df[cols]
sns.pairplot(data=pair_data, hue='Exited', kind='hist', height=2)
plt.show()

In [None]:
# One-hot encode the categorical features. With drop='if_binary' a
# two-category feature is reduced to a single 0/1 column.
cat_features = ['Geography', 'Gender']
ohe_options = {'dtype': 'int64', 'drop': 'if_binary'}
try:
    # scikit-learn >= 1.2 names the dense-output switch `sparse_output`.
    ohe = OneHotEncoder(sparse_output=False, **ohe_options)
except TypeError:
    # Older releases only know the original `sparse` keyword.
    ohe = OneHotEncoder(sparse=False, **ohe_options)
cat_encoded = ohe.fit_transform(df[cat_features])

# `get_feature_names` was superseded by `get_feature_names_out` (added in
# scikit-learn 1.0, old name removed in 1.2); use whichever is available.
if hasattr(ohe, 'get_feature_names_out'):
    column_name = ohe.get_feature_names_out(cat_features)
else:
    column_name = ohe.get_feature_names(cat_features)

# Reuse df's index so the column-wise concat lines up row for row even when
# df does not carry a default 0..n-1 index (a mismatch would introduce NaNs).
ohe_frame = pd.DataFrame(cat_encoded, columns=column_name, index=df.index)

# Keep only the non-object columns and append the encoded ones.
df = pd.concat([df.select_dtypes(exclude='object'), ohe_frame], axis=1)

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
corrmatrix = df.corr()
f, ax = plt.subplots(figsize=(12, 9))
heatmap_options = dict(vmax=.8, square=True, annot=True, cmap="YlGnBu")
# Draw on the axes created just above (the current axes).
ax = sns.heatmap(corrmatrix, ax=ax, **heatmap_options)

An interesting observation: the account balance of customers varies considerably between countries.

In [None]:
# Separate the features from the churn label.
X = df.drop(columns='Exited')
y = df['Exited']

# Stratify on the label so the train and test sets keep the same share of the
# rarer positive class (Exited = 1); an unstratified split lets that share
# drift between the two sets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=4
)

# Baseline random forest: search tree depth and split criterion using
# GridSearchCV's default cross-validation and scoring (the estimator's own
# score, i.e. accuracy for a classifier).
param_grid = {'max_depth': range(3, 15), 'criterion': ['gini', 'entropy']}
rf = RandomForestClassifier(random_state=4)
# n_jobs=-1 runs the candidate fits in parallel; the selected model is unchanged.
model_rf = GridSearchCV(rf, param_grid=param_grid, n_jobs=-1)
model_rf.fit(X_train, y_train)

# Evaluate the refitted best estimator on the held-out test set.
pred_test = model_rf.predict(X_test)
print('Classification Report of RandomForestClassifier: \n', classification_report(y_test, pred_test))

The recall for the positive class (Exited = 1) is poor, which means the model produces many false negatives. Let's try to improve it, since we don't want to miss customers who are likely to exit.

In [None]:
# Feature importances of the best baseline forest found by the grid search,
# shown as a horizontal bar chart of the 12 largest values.
rf1 = model_rf.best_estimator_
importances1 = rf1.feature_importances_
feature_importances = pd.Series(data=importances1, index=X.columns)
feature_importances.nlargest(12).plot.barh()

In [None]:
# The sample is imbalanced: 'Exited'=1 is the rarer class, so give it five
# times the weight of 'Exited'=0 during training.
churn_weights = {0: 1, 1: 5}
rf2 = RandomForestClassifier(class_weight=churn_weights, random_state=4)

# Choose the hyper-parameters by recall rather than by the default score.
scorer = make_scorer(recall_score)
model_rf2 = GridSearchCV(estimator=rf2, param_grid=param_grid, scoring=scorer)
model_rf2.fit(X_train, y_train)

pred_test = model_rf2.predict(X_test)
rf2_report = classification_report(y_test, pred_test)
print('Classification Report of RandomForestClassifier: \n', rf2_report)

We now achieve good recall, although the accuracy is relatively low. The model should be tuned according to the business objective, such as intervening before a customer exits. In such cases, we may be willing to sacrifice accuracy for recall.

In [None]:
# Feature importances of the best class-weighted forest (selected by recall),
# shown as a horizontal bar chart of the 12 largest values.
best_rf = model_rf2.best_estimator_
importances = best_rf.feature_importances_
feature_importances = pd.Series(data=importances, index=X.columns)
top_features = feature_importances.nlargest(12)
top_features.plot.barh()

In [None]:
# Partial dependence plots of the class-weighted forest for three features.
pdp_features = ['Age', 'NumOfProducts', 'IsActiveMember']
for target_feature in pdp_features:
    pdp_i = pdp.pdp_isolate(
        model=best_rf,
        dataset=X,
        model_features=X.columns,
        feature=target_feature,
    )
    pdp.pdp_plot(pdp_i, target_feature, figsize=(8, 5))

Let's try another model:

In [None]:
# Baseline gradient-boosted model with XGBoost's default hyper-parameters.
# TODO: tune learning_rate / max_depth, e.g. with
# GridSearchCV(estimator=xgb, param_grid=..., scoring='roc_auc').
xgb = XGBClassifier()
xgb.fit(X_train, y_train)

# Evaluate on the held-out test set.
pred_test = xgb.predict(X_test)
print('Classification Report of XGBClassifier: \n', classification_report(y_test, pred_test))

In [None]:
# Refit XGBoost with the positive class up-weighted (scale_pos_weight=5).
xgb = XGBClassifier(scale_pos_weight=5)
xgb.fit(X=X_train, y=y_train)

pred_test = xgb.predict(X_test)
xgb_report = classification_report(y_test, pred_test)
print('Classification Report of XGBClassifier: \n', xgb_report)

In [None]:
# Plot the feature importances of the most recently fitted `xgb` model.
plot_importance(xgb)