In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import sklearn 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly_express as px

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Print the full path of every file found beneath the Kaggle input directory
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the churn dataset from the Kaggle input directory and preview the first rows
churn_csv = '../input/customer-churn-with-explainable-ai/Churn_Modelling.csv'
data = pd.read_csv(churn_csv)
data.head()

Let's explore the data! 

In [None]:
# Summarise column dtypes and non-null counts to spot text fields and missing values
data.info()

In [None]:
# Count distinct values per column; columns with few distinct values are candidate categoricals
data.nunique()

There are 10000 entries, with 14 columns.
There are 3 text fields, Surname, Geography & Gender.
The rest of the features have number entries, and there are no empty fields.

There are several columns with categorical data:
Geography, Gender, HasCrCard, IsActiveMember.
The column Exited has the label to be predicted. 

In [None]:
# Inspect the distribution of the target label
fig = px.histogram(
    data,
    x='Exited',
    title='Customers by Exited label',
    width=300,
    height=300,
)
# Treat the label as discrete categories rather than a continuous numeric axis
fig.update_xaxes(type='category')
fig.show()

There is a class imbalance: the number of customers who exited is roughly one quarter of the number who stayed!

In [None]:
# Compare the credit score distribution of the two Exited classes side by side
fig = px.histogram(
    data,
    x='CreditScore',
    facet_col="Exited",
    nbins=40,
    title='Credit score distribution by Exited',
    width=600,
    height=300,
)
fig.show()

Looks like credit score is a normal distribution, with the peak between 600-700 for both classes.

In [None]:
# Inspect how customers are spread across the Geography values
fig = px.histogram(
    data,
    x='Geography',
    title='Customers by Geography',
    width=300,
    height=300,
)
fig.show()

All the customers come from Europe: about 50% are from France, while Germany and Spain each account for roughly 25%.

In [None]:
# Inspect the age distribution of each Exited class with one box plot per class
fig = px.box(
    data,
    y='Age',
    facet_col="Exited",
    title='Age by Exited',
    width=500,
    height=300,
)
fig.show()

Looks like most who exited tend to be older than those who stayed!

In [None]:
# Compare the account balance distribution of the two Exited classes
fig = px.histogram(
    data,
    x='Balance',
    facet_col="Exited",
    nbins=30,
    title='Balance distribution by Exited',
    width=500,
    height=300,
)
fig.show()

A large number of customers (> 3600) have a balance of zero!

Start Data Processing

Start separating out the labels column

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the target column as integer class ids and keep the original class values
le = LabelEncoder()
labels = le.fit_transform(data['Exited'])
class_names = le.classes_

Drop the columns RowNumber, CustomerId, Surname as they are not predictive features, and remove the labels column

In [None]:
# Identifier columns are not predictive features; the label column was extracted earlier
non_feature_columns = ['RowNumber', 'CustomerId', 'Surname', 'Exited']
data = data.drop(columns=non_feature_columns)
data.head()

In [None]:
# Remaining column names, in column order, are the model's feature names
feature_names = list(data.columns)

Explicitly define the categorical features (as column indices); we need this for LIME.
Categorical data: Geography, Gender, HasCrCard, IsActiveMember.

In [None]:
# Categorical columns are looked up by name, so the positional indices stay
# correct even if the column order changes (currently resolves to [1, 2, 7, 8]).
categorical_columns = ['Geography', 'Gender', 'HasCrCard', 'IsActiveMember']
categorical_features = [data.columns.get_loc(name) for name in categorical_columns]

# Maps column position -> array of original category values (index = encoded id)
categorical_names = {}
for feature in categorical_features:
    column_name = data.columns[feature]
    le = LabelEncoder()
    # Assign by column label so the column is replaced with a new integer array.
    # `data.iloc[:, feature] = ...` writes in place on pandas >= 2.0 and would
    # leave the text columns with object dtype, which LightGBM rejects.
    data[column_name] = le.fit_transform(data[column_name].values)
    categorical_names[feature] = le.classes_

In [None]:
# Preview the first 10 rows after label-encoding the categorical columns
data.head(10)

In [None]:

# Re-check column dtypes after the categorical columns were encoded
data.info()

Ok..now the dataframe has all data as integers/floats, so we are ready to start training the LightGBM classifier!

In [None]:
seed = 101 # fix random seed for reproducibility
np.random.seed(seed)

# Split Train Test sets (stratified so both parts keep the same class ratio)
from sklearn.model_selection import train_test_split
train, test, labels_train, labels_test = train_test_split(data, labels,
                                                    test_size=0.2,
                                                    stratify=labels,
                                                    random_state=seed)

# Sanity check: every row ended up in exactly one of the two splits
assert len(train) + len(test) == len(data)
assert len(labels_train) + len(labels_test) == len(labels)

# Report the shape of each split (the last entry is the test labels, not the full label array)
print(train.shape, test.shape, labels_train.shape, labels_test.shape)

In [None]:
import lightgbm as lgb
from bayes_opt import BayesianOptimization
from sklearn.model_selection import train_test_split

# Hold out part of the training data for tuning, so the test set is never used
# for early stopping or hyper-parameter selection (that would leak test
# information into the model and inflate the reported test metrics).
tune_train, tune_valid, tune_labels_train, tune_labels_valid = train_test_split(
    train, labels_train,
    test_size=0.2,
    stratify=labels_train,
    random_state=seed)


def modelFitter(colsampleByTree, subsample, maxDepth, num_leaves, num_estimators, learn_rate):
    """Objective for the Bayesian optimiser: validation AUC of one LightGBM fit.

    Parameters
    ----------
    colsampleByTree : float
        Fraction of columns sampled per tree (``colsample_bytree``).
    subsample : float
        Fraction of rows sampled per bagging round (``subsample``).
    maxDepth, num_leaves, num_estimators : float
        Continuous values proposed by the optimiser; truncated to integers
        before being passed to LightGBM.
    learn_rate : float
        Boosting learning rate.

    Returns
    -------
    float
        Best AUC reached on the validation split (with early stopping).
    """
    model = lgb.LGBMClassifier(
        learning_rate=learn_rate,
        # int() truncates like the previous .astype("int32"), but also accepts plain Python floats
        n_estimators=int(num_estimators),
        max_depth=int(maxDepth),
        num_leaves=int(num_leaves),
        subsample=subsample,
        # LightGBM ignores `subsample` unless bagging is enabled via subsample_freq > 0
        subsample_freq=1,
        colsample_bytree=colsampleByTree,
        is_unbalance=True,
        random_state=seed)

    # Early stopping is passed as a callback: the `early_stopping_rounds` and
    # `verbose` arguments of fit() were removed in LightGBM 4.
    model.fit(
        tune_train, tune_labels_train,
        eval_metric="auc",
        eval_set=[(tune_valid, tune_labels_valid)],
        categorical_feature=categorical_features,
        callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)])

    # Only one eval set is passed, so take the first (and only) entry
    eval_name = next(iter(model.best_score_))
    return model.best_score_[eval_name]['auc']


# Bounded region of parameter space
pbounds = {
    'colsampleByTree': (0.2, 1.0),
    'subsample': (0.2, 1.0),
    'maxDepth': (2, 5),
    'num_leaves': (4, 40),
    'num_estimators': (100, 500),
    'learn_rate': (0.02, 0.3),
}

optimizer = BayesianOptimization(
    f=modelFitter,
    pbounds=pbounds,
    random_state=1)

optimizer.maximize(init_points=5, n_iter=5)  # init_points = random probes, n_iter = Bayesian steps

# Show the best score and the parameters that produced it
optimizer.max


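Best hyper-parameters found in the optimisation run (hard-coded in the final model below):

    colsampleByTree = 0.69, learn_rate = 0.10, maxDepth = 3, num_estimators = 116, num_leaves = 9, subsample = 0.74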

In [None]:
# Final model. The hyper-parameters are hard-coded from the Bayesian
# optimisation results listed above this cell.
final_params = dict(
    learning_rate=0.10,
    n_estimators=116,
    num_leaves=9,
    max_depth=3,
    # NOTE(review): LightGBM only applies `subsample` when subsample_freq > 0
    # (default 0), so this value currently has no effect — confirm whether
    # bagging is intended.
    subsample=0.74,
    colsample_bytree=0.69,
    objective='binary',
    is_unbalance=True,  # boolean flag rather than the string 'true'
    random_state=seed,
)
gbtree = lgb.LGBMClassifier(**final_params)
gbtree.fit(train, labels_train, categorical_feature=categorical_features)

In [None]:
# Class-label predictions of the fitted model for the held-out test set
y_preds = gbtree.predict(test)

In [None]:
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

def model_evaluate(y_true=None, y_pred=None):
    """Print accuracy and a classification report, then draw the confusion matrix.

    Parameters
    ----------
    y_true : array-like, optional
        Ground-truth labels. Defaults to the notebook-level ``labels_test``.
    y_pred : array-like, optional
        Predicted labels. Defaults to the notebook-level ``y_preds``.

    Returns
    -------
    None
        Results are printed and plotted.
    """
    # Fall back to the notebook globals so the existing `model_evaluate()` call keeps working
    if y_true is None:
        y_true = labels_test
    if y_pred is None:
        y_pred = y_preds

    print('Test Accuracy:\t{:0.1f}%'.format(accuracy_score(y_true, y_pred)*100))

    #classification report
    print('\n')
    print(classification_report(y_true, y_pred))

    #confusion matrix (rows = true label, columns = predicted label)
    confmat = confusion_matrix(y_true, y_pred)

    fig, ax = plt.subplots(figsize=(4, 4))
    ax.matshow(confmat, cmap=plt.cm.Blues, alpha=0.3)
    # Write the count into the centre of every cell
    for i in range(confmat.shape[0]):
        for j in range(confmat.shape[1]):
            ax.text(x=j, y=i, s=confmat[i, j], va='center', ha='center')
    ax.set_xlabel('Predicted label')
    ax.set_ylabel('True label')
    fig.tight_layout()
    plt.show()

In [None]:
# Evaluate the model on the test set: prints accuracy and the classification report, then plots the confusion matrix
model_evaluate()

In [None]:
# Plot the ROC curve of the fitted model on the test set
import sklearn.metrics as metrics

y_pred = gbtree.predict_proba(test)
# Column 1 of predict_proba is used as the score for the ROC computation
fpr, tpr, threshold = metrics.roc_curve(labels_test, y_pred[:,1])
roc_auc = metrics.auc(fpr, tpr)

# Explicit Axes interface instead of the pyplot state machine
roc_fig, roc_ax = plt.subplots()
roc_ax.set_title('Receiver Operating Characteristic')
roc_ax.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
roc_ax.legend(loc = 'lower right')
# Dashed diagonal: reference line where TPR equals FPR
roc_ax.plot([0, 1], [0, 1],'r--')
roc_ax.set_xlim([0, 1])
roc_ax.set_ylim([0, 1])
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_xlabel('False Positive Rate')
plt.show()

In [None]:
# Plot the precision-recall curve of the fitted model on the test set
from sklearn.metrics import precision_recall_curve
y_pred = gbtree.predict_proba(test)
precision, recall, thresholds = precision_recall_curve(labels_test, y_pred[:,1])

# precision/recall have one more entry than thresholds; pad so the arrays align
thresholds = np.append(thresholds, 1)

# F1 at each point of the curve. Where precision + recall == 0 the ratio is
# 0/0, so F1 is set to 0 there instead of producing NaN and a RuntimeWarning.
pr_sum = precision + recall
f1_scores = np.divide(2 * precision * recall, pr_sum,
                      out=np.zeros_like(pr_sum), where=pr_sum > 0)

plt.step(recall, precision, color='b', alpha=0.4, where='post')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('Precision-Recall curve')
plt.show()