In [None]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
import plotly.figure_factory as ff
import missingno as msno
import category_encoders as ce
import optuna
import warnings
warnings.filterwarnings('ignore')

from plotly.subplots import make_subplots
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, StratifiedKFold
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.pipeline import Pipeline

# Data Exploration

In [None]:
# Read the churn CSV into a DataFrame and preview its first rows.
df = pd.read_csv('../input/churn-modelling/Churn_Modelling.csv')
df.head()

In [None]:
# Dimensions of the frame as (rows, columns).
df.shape

In [None]:
# Per-column dtype and non-null count summary.
df.info()

In [None]:
# Plot missingno's matrix view of the frame to inspect missing values.
msno.matrix(df)

The data is clean: there are no missing values.

In [None]:
# Drop the two identifier columns. errors='ignore' keeps this cell safe to
# re-run: without it a second execution raises KeyError because the columns
# are already gone from the reassigned `df`.
df = df.drop(columns=['RowNumber', 'CustomerId'], errors='ignore')

We are removing the first 2 columns because we don't need them for prediction.

In [None]:
# Number of distinct values in the Surname column.
df['Surname'].nunique()

For the Surname column, we'll keep it for now because it might be useful. <br>
The assumption is that members of the same family have a similar churn probability.

## Statistic Summary

In [None]:
# Summary statistics with pandas' default column selection.
df.describe()

In [None]:
# Summary statistics restricted to the object-dtype columns.
df.describe(include='object')

## Visualization

### Target Feature

In [None]:
# Count of customers per target class. The x axis is switched to categorical
# so the two class values are drawn as discrete labels.
fig = px.histogram(
    df,
    x='Exited',
    title='Target Feature Distribution',
    width=500,
    height=400,
)
fig.update_xaxes(type='category')
fig.show()

In [None]:
# Share of each target class, expressed in percent.
df['Exited'].value_counts(normalize=True).mul(100)

The target distribution on the dataset is unbalanced. But this is normal because this is a customer churn dataset.

From a modeling perspective this is good, because there are enough positive-class examples for the model to learn to detect the positive class. <br>
However, from a business perspective, it is not good because the Churn rate is quite high.

Because of this, we will use the ROC AUC metric at the modeling stage, with a focus on higher recall for the positive class.

### Numerical Features

In [None]:
# One histogram per numeric feature on a 2x3 subplot grid.
# Each entry: (column, max number of bins, trace/axis label, grid row, grid col).
numeric_panels = [
    ('CreditScore', 50, 'Credit Score', 1, 1),
    ('Age', 30, 'Age', 1, 2),
    ('Balance', 20, 'Balance', 1, 3),
    ('Tenure', 20, 'Tenure', 2, 1),
    ('EstimatedSalary', 20, 'EstimatedSalary', 2, 2),
]

fig = make_subplots(rows=2, cols=3)

for feature, max_bins, label, row, col in numeric_panels:
    # add_trace(row=, col=) replaces the deprecated append_trace.
    fig.add_trace(
        go.Histogram(x=df[feature], name=label, nbinsx=max_bins),
        row=row, col=col,
    )
    fig.update_xaxes(title_text=label, row=row, col=col)

fig.update_layout(
    height=700, width=1200,
    title_text='Features Distribution with Histogram'
)
fig.show()

In [None]:
# Print how many rows have a Balance of exactly zero.
print('Customer dengan Balance 0:', len(df.loc[df['Balance'].eq(0)]))

* Credit Score has a fairly normal distribution but there is an anomaly in customers with a credit score of 840 to 859, which is quite high compared to the previous range.
* Age has a right-skewed distribution, with the largest number of customers in the 35 to 39 year age segment (2308 people)
* Balance has a normal distribution but there is an anomaly in the Balance with a value of 0 with a total of 3617 people.
* Tenure and EstimatedSalary have a uniform distribution

In [None]:
# Overlaid CreditScore histograms, one per target class, with a box plot
# per class in the top margin.
fig = px.histogram(
    df,
    x='CreditScore',
    color='Exited',
    nbins=50,
    barmode='overlay',
    marginal='box',
    color_discrete_map={0: '#636EFA', 1: '#EF553B'},
)
fig.update_layout(
    title_text='Credit Score Feature in Detail',
    width=800,
    height=500,
)
fig.show()

Customers with class 0 and 1 on the Credit Score feature both have a normal distribution with the anomaly on the right. <br>
For class 1 there are several outliers on the left.

There is a median difference between the two classes but not significant. The median for class 1 is slightly lower. In other words, customers with a low credit score have a higher (but not significant) churn rate.

In [None]:
# Overlaid Age histograms, one per target class, with a box plot per class
# in the top margin.
fig = px.histogram(
    df,
    x='Age',
    color='Exited',
    nbins=30,
    barmode='overlay',
    marginal='box',
    color_discrete_map={0: '#636EFA', 1: '#EF553B'},
)
fig.update_layout(
    title_text='Age Feature in Detail',
    width=800,
    height=500,
)
fig.show()

Customers with class 0 in the Age feature have a right-skew distribution while those for class 1 have a normal distribution. There are several outliers in class 1 and quite a number of outliers in class 0.

Judging by the medians, older customers have a higher tendency to churn.

In [None]:
# Overlaid Balance histograms, one per target class, with a box plot per
# class in the top margin.
fig = px.histogram(
    df,
    x='Balance',
    color='Exited',
    nbins=20,
    barmode='overlay',
    marginal='box',
    color_discrete_map={0: '#636EFA', 1: '#EF553B'},
)
fig.update_layout(
    title_text='Balance Feature in Detail',
    width=800,
    height=500,
)
fig.show()

Both classes have the same distribution with the anomaly at Balance 0. There are no outliers.

Regardless of the anomaly, the two distributions appear to have the same median. However, due to the anomaly at value 0, the median for class 0 is lower because at value 0 there are more class 0 compared to class 1.

In [None]:
# Overlaid Tenure histograms, one per target class, with a box plot per
# class in the top margin (bin count left to plotly).
fig = px.histogram(
    df,
    x='Tenure',
    color='Exited',
    barmode='overlay',
    marginal='box',
    color_discrete_map={0: '#636EFA', 1: '#EF553B'},
)
fig.update_layout(
    title_text='Tenure Feature in Detail',
    width=800,
    height=500,
)
fig.show()

Both classes in the Tenure feature have the same distribution and both have no outliers.

In [None]:
# Per-Tenure bars normalised to 100%, split by target class, so each bar
# shows the class mix for that tenure value.
fig = px.histogram(
    df,
    x='Tenure',
    color='Exited',
    barnorm='percent',
    category_orders={'Tenure': list(range(11))},
    color_discrete_map={0: '#636EFA', 1: '#EF553B'},
)
fig.update_layout(
    title_text='Tenure Feature in Detail',
    width=800,
    height=500,
)
fig.update_yaxes(title_text='Percentage of Churn', ticksuffix='%')
# Treat tenure values as discrete labels rather than a numeric axis.
fig.update_xaxes(type='category')
fig.show()

There is no significant difference in churn rate across Tenure values; the average level is about 20%. <br>
Customers with a 7 year Tenure had the lowest churn rate (17.2%).

In [None]:
# Overlaid EstimatedSalary histograms, one per target class, with a box plot
# per class in the top margin.
fig = px.histogram(
    df,
    x='EstimatedSalary',
    color='Exited',
    nbins=20,
    barmode='overlay',
    marginal='box',
    color_discrete_map={0: '#636EFA', 1: '#EF553B'},
)
fig.update_layout(
    title_text='EstimatedSalary Feature in Detail',
    width=800,
    height=500,
)
fig.show()

The two classes in the EstimatedSalary feature have a uniform distribution, with a slightly higher median value in class 1.

### Categorical Features

In [None]:
# One count plot per categorical feature on a 2x3 subplot grid.
# Each entry: (column, grid row, grid col).
categorical_panels = [
    ('NumOfProducts', 1, 1),
    ('HasCrCard', 1, 2),
    ('IsActiveMember', 1, 3),
    ('Geography', 2, 1),
    ('Gender', 2, 2),
]

fig = make_subplots(rows=2, cols=3)

for feature, row, col in categorical_panels:
    # add_trace(row=, col=) replaces the deprecated append_trace.
    fig.add_trace(
        go.Histogram(x=df[feature], name=feature),
        row=row, col=col,
    )
    fig.update_xaxes(title_text=feature, row=row, col=col)

# Applies to every x axis: discrete labels, sorted ascending.
fig.update_xaxes(type='category',
                 categoryorder='category ascending')
fig.update_layout(height=700, width=1200,
                  title_text='Categorical Features Distribution')
fig.show()

* Our dataset is dominated by customers who have 1 or 2 products. Only a few customers have 3 or 4 products.
* Customers who have a credit card outnumber those who do not by more than 2 to 1.
* There are quite a lot of inactive members, almost equal to active members.
* There are far more customers from France than customers from Germany and Spain.
* There are more male customers

In [None]:
# Per-NumOfProducts bars normalised to 100%, split by target class.
fig = px.histogram(
    df,
    x='NumOfProducts',
    color='Exited',
    barnorm='percent',
    color_discrete_map={0: '#636EFA', 1: '#EF553B'},
)
fig.update_layout(
    title_text='NumOfProducts Feature in Detail',
    width=800,
    height=500,
)
fig.update_yaxes(title_text='Percentage of Churn', ticksuffix='%')
# Discrete labels on the x axis, sorted ascending.
fig.update_xaxes(categoryorder='category ascending', type='category')
fig.show()

From categories 1 and 2 with the highest number of customers, we can see that customers who only have 1 product have a higher churn rate (27.7%).

Meanwhile, customers with 3 products had a churn rate of 82.7%, and the highest rate was among customers with 4 products, who had a churn rate of 100%.

In [None]:
# Per-HasCrCard bars normalised to 100%, split by target class.
fig = px.histogram(
    df,
    x='HasCrCard',
    color='Exited',
    barnorm='percent',
    color_discrete_map={0: '#636EFA', 1: '#EF553B'},
)
fig.update_layout(
    title_text='HasCrCard Feature in Detail',
    width=800,
    height=500,
)
fig.update_yaxes(title_text='Percentage of Churn', ticksuffix='%')
# Discrete labels on the x axis, sorted ascending.
fig.update_xaxes(categoryorder='category ascending', type='category')
fig.show()

There is no significant difference in this feature. Both categories have the same churn rate.

In [None]:
# Per-IsActiveMember bars normalised to 100%, split by target class.
fig = px.histogram(
    df,
    x='IsActiveMember',
    color='Exited',
    barnorm='percent',
    color_discrete_map={0: '#636EFA', 1: '#EF553B'},
)
fig.update_layout(
    title_text='IsActiveMember Feature in Detail',
    width=800,
    height=500,
)
fig.update_yaxes(title_text='Percentage of Churn', ticksuffix='%')
# Discrete labels on the x axis, sorted ascending.
fig.update_xaxes(categoryorder='category ascending', type='category')
fig.show()

Inactive customers have a higher churn rate with a portion of 26.8% compared to active customers (14.2%).

In [None]:
# Per-Geography bars normalised to 100%, split by target class.
fig = px.histogram(
    df,
    x='Geography',
    color='Exited',
    color_discrete_map={0: '#636EFA', 1: '#EF553B'},
    barnorm='percent',
)
fig.update_layout(
    title_text='Exited Percentage by Geography',
    yaxis_title='Percentage of Churn',
    yaxis_ticksuffix='%',
    width=800,
    height=500,
)
fig.show()

Customers from Germany have a churn rate of 32.4%, while customers from France are 16.2% and Spain 16.7%.

In [None]:
# Per-Gender bars normalised to 100%, split by target class.
fig = px.histogram(
    df,
    x='Gender',
    color='Exited',
    color_discrete_map={0: '#636EFA', 1: '#EF553B'},
    barnorm='percent',
)
fig.update_layout(
    title_text='Exited Percentage by Gender',
    yaxis_title='Percent',
    yaxis_ticksuffix='%',
    width=800,
    height=500,
)
fig.show()

Female customers have a higher churn rate (25%) than male customers (16.5%).

### Heatmap Correlation

In [None]:
# Target-encode the features with category_encoders' defaults, then correlate
# every encoded feature with the target and with each other.
# NOTE(review): the encoder is fit on the same rows and target that the
# correlation is computed on, so columns with many distinct values may look
# more correlated with Exited than they would on unseen data -- confirm
# before drawing conclusions.
encoder = ce.TargetEncoder()
df_temp = encoder.fit_transform(df.drop(columns=['Exited']), df['Exited'])
df_corr = df_temp.join(df['Exited']).corr()

# Annotated heatmap of the correlation matrix, values rounded to 2 decimals.
fig = ff.create_annotated_heatmap(
    z=df_corr.to_numpy(),
    x=df_corr.columns.tolist(),
    y=df_corr.index.tolist(),
    annotation_text=df_corr.round(2).to_numpy(),
    colorscale='Viridis',
    showscale=True,
)
fig.update_layout(title_text='Feature Correlation', width=800, height=600)
fig.update_xaxes(side='bottom')
fig.show()

The highest correlation to Target is the Surname feature (0.36), and the second is the Age feature with a value of 0.29.

The insights that can be obtained from this data are:
* Customers with higher Age have a higher churn rate
* There are family names (Surname) whose churn level is higher

## Data Preprocessing

### Feature Engineering

In [None]:
# New feature: account balance relative to estimated salary.
df['BalanceToSalaryRatio'] = df['Balance'].div(df['EstimatedSalary'])

In [None]:
from itertools import combinations

# Base categorical columns to cross pairwise. They are listed explicitly
# (instead of df.select_dtypes('object')) so that re-running this cell does
# not pick up the derived columns and build crosses of crosses. The order
# yields exactly the names the pipelines reference:
# Surname_Geography, Surname_Gender, Geography_Gender.
cat_cols = ['Surname', 'Geography', 'Gender']

for left, right in combinations(cat_cols, 2):
    # String concatenation of the two columns, joined by an underscore.
    df[left + '_' + right] = df[left] + "_" + df[right]

df.head()

In [None]:
# Summary of the object-dtype columns, now including the crossed features.
df.describe(include='object')

In [None]:
# Repeat the target-encoded correlation matrix on the frame that now
# contains the engineered features.
# NOTE(review): the encoder is fit on the same rows and target that the
# correlation is computed on, so columns with many distinct values may look
# more correlated with Exited than they would on unseen data -- confirm
# before drawing conclusions.
encoder = ce.TargetEncoder()
df_temp = encoder.fit_transform(df.drop(columns=['Exited']), df['Exited'])
df_corr = df_temp.join(df['Exited']).corr()

# Annotated heatmap of the correlation matrix, values rounded to 2 decimals.
fig = ff.create_annotated_heatmap(
    z=df_corr.to_numpy(),
    x=df_corr.columns.tolist(),
    y=df_corr.index.tolist(),
    annotation_text=df_corr.round(2).to_numpy(),
    colorscale='Viridis',
    showscale=True,
)
fig.update_layout(title_text='Feature Correlation', width=900, height=700)
fig.update_xaxes(side='bottom')
fig.show()

After performing feature engineering we get a new feature with a higher correlation with the target.

In [None]:
# Preview the first rows of the frame after feature engineering.
df.head()

In [None]:
# Summary statistics restricted to the object-dtype columns.
df.describe(include='object')

### Train Test Split

In [None]:
# Hold out 20% of the rows as a test set; random_state fixes the shuffle.
# NOTE(review): no stratify= is passed, so the class proportions of Exited
# are not guaranteed to match between the two sets -- confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['Exited']),
    df['Exited'],
    random_state=0,
    test_size=0.2,
)

### Building Pipeline

In [None]:
# Number of class-0 rows divided by number of class-1 rows in the training
# target. Passed to the boosters below as scale_pos_weight to get better
# recall on the imbalanced positive class.
ratio = float((y_train == 0).sum()) / (y_train == 1).sum()

In [None]:
# XGBoost pipeline: one-hot encode the Geography/Gender columns, CatBoost-encode
# the Surname-based columns, then fit the classifier with the class weight.
xgb_pipeline = Pipeline(steps=[
    (
        'one_hot',
        ce.OneHotEncoder(cols=['Geography', 'Gender', 'Geography_Gender']),
    ),
    (
        'catboost',
        ce.CatBoostEncoder(cols=['Surname', 'Surname_Geography', 'Surname_Gender']),
    ),
    (
        'xgb',
        XGBClassifier(scale_pos_weight=ratio),
    ),
])

In [None]:
# LightGBM pipeline: one-hot encode the Geography/Gender columns, CatBoost-encode
# the Surname-based columns, then fit the classifier with the class weight.
lgb_pipeline = Pipeline(steps=[
    (
        'one_hot',
        ce.OneHotEncoder(cols=['Geography', 'Gender', 'Geography_Gender']),
    ),
    (
        'catboost',
        ce.CatBoostEncoder(cols=['Surname', 'Surname_Geography', 'Surname_Gender']),
    ),
    (
        'lgb',
        LGBMClassifier(scale_pos_weight=ratio),
    ),
])

In [None]:
# CatBoost pipeline: one-hot encode the Geography/Gender columns, CatBoost-encode
# the Surname-based columns, then fit the classifier (silenced) with the
# class weight.
cat_pipeline = Pipeline(steps=[
    (
        'one_hot',
        ce.OneHotEncoder(cols=['Geography', 'Gender', 'Geography_Gender']),
    ),
    (
        'catboost',
        ce.CatBoostEncoder(cols=['Surname', 'Surname_Geography', 'Surname_Gender']),
    ),
    (
        'cat',
        CatBoostClassifier(scale_pos_weight=ratio, verbose=0),
    ),
])

## Modeling

In [None]:
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.model_selection import cross_val_score
from sklearn import metrics

def _safe_ratio(numerator, denominator):
    '''Return numerator / denominator as a float, or 0.0 when the denominator is zero.'''
    return float(numerator) / float(denominator) if denominator else 0.0


def make_confusion_matrix(cf,
                          group_names=None,
                          categories='auto',
                          count=True,
                          percent=True,
                          cbar=True,
                          xyticks=True,
                          xyplotlabels=True,
                          sum_stats=True,
                          figsize=None,
                          cmap='Blues',
                          title=None):
    '''
    Draw a confusion matrix as an annotated Seaborn heatmap.

    The heatmap is drawn into the LEFT panel of a new 1x2 matplotlib figure
    (plt.subplot(1, 2, 1)); the right panel is left free for the caller.
    Nothing is returned.

    Arguments
    ---------
    cf:            confusion matrix to be passed in (2-D array-like of counts).
    group_names:   List of strings that represent the labels row by row to be shown in each square.
                   Ignored unless its length equals the number of cells in cf.
    categories:    List of strings containing the categories to be displayed on the x,y axis. Default is 'auto'
    count:         If True, show the raw number in the confusion matrix. Default is True.
    percent:       If True, show each cell as a share of the grand total. Default is True.
    cbar:          If True, show the color bar. The cbar values are based off the values in the confusion matrix.
                   Default is True.
    xyticks:       If True, show x and y ticks. Default is True.
    xyplotlabels:  If True, show 'True Label' and 'Predicted Label' on the figure. Default is True.
    sum_stats:     If True, display summary statistics below the figure. Default is True.
                   For a 2-row matrix these are precision, recall, accuracy and F1 (positive class = index 1);
                   otherwise accuracy only. A ratio with a zero denominator is reported as 0.
    figsize:       Tuple representing the figure size. Default will be the matplotlib rcParams value.
    cmap:          Colormap of the values displayed from matplotlib.pyplot.cm. Default is 'Blues'
                   See http://matplotlib.org/examples/color/colormaps_reference.html

    title:         Title for the heatmap. Default is None.
    '''
    # Accept any array-like (e.g. nested lists) as well as ndarrays.
    cf = np.asarray(cf)

    # CODE TO GENERATE TEXT INSIDE EACH SQUARE
    blanks = [''] * cf.size

    # `is not None` rather than truthiness so an ndarray of names is accepted.
    if group_names is not None and len(group_names) == cf.size:
        group_labels = ["{}\n".format(value) for value in group_names]
    else:
        group_labels = blanks

    if count:
        group_counts = ["{0:0.0f}\n".format(value) for value in cf.flatten()]
    else:
        group_counts = blanks

    if percent:
        group_percentages = ["{0:.2%}".format(value) for value in cf.flatten() / np.sum(cf)]
    else:
        group_percentages = blanks

    box_labels = [f"{v1}{v2}{v3}".strip()
                  for v1, v2, v3 in zip(group_labels, group_counts, group_percentages)]
    box_labels = np.asarray(box_labels).reshape(cf.shape[0], cf.shape[1])

    # CODE TO GENERATE SUMMARY STATISTICS & TEXT FOR SUMMARY STATS
    if sum_stats:
        # Accuracy is sum of diagonal divided by total observations
        accuracy = _safe_ratio(np.trace(cf), np.sum(cf))

        # if it is a binary confusion matrix, show some more stats
        if len(cf) == 2:
            # Metrics for Binary Confusion Matrices. The guarded ratios avoid
            # nan when the positive class is never predicted or never present.
            true_positives = cf[1, 1]
            precision = _safe_ratio(true_positives, cf[:, 1].sum())
            recall = _safe_ratio(true_positives, cf[1, :].sum())
            f1_score = _safe_ratio(2 * precision * recall, precision + recall)
            stats_text = "\n\nPrecision={:0.3f} | Recall={:0.3f}\nAccuracy={:0.3f} | F1 Score={:0.3f}".format(
                precision, recall, accuracy, f1_score)
        else:
            stats_text = "\n\nAccuracy={:0.3f}".format(accuracy)
    else:
        stats_text = ""

    # SET FIGURE PARAMETERS ACCORDING TO OTHER ARGUMENTS
    if figsize is None:
        # Get default figure size if not set
        figsize = plt.rcParams.get('figure.figsize')

    if not xyticks:
        # Do not show categories if xyticks is False
        categories = False

    # MAKE THE HEATMAP VISUALIZATION
    plt.figure(figsize=figsize)
    # Left half of a 1x2 grid; callers may draw into plt.subplot(1, 2, 2) afterwards.
    plt.subplot(1, 2, 1)
    sns.heatmap(cf, annot=box_labels, fmt="", cmap=cmap, cbar=cbar,
                xticklabels=categories, yticklabels=categories)

    if xyplotlabels:
        plt.ylabel('True label')
        plt.xlabel('Predicted label' + stats_text)
    else:
        plt.xlabel(stats_text)

    if title:
        plt.title(title)


def model_eval(model, X_train, y_train, 
               scoring_='roc_auc', cv_=5):
    '''
    Fit `model` on the training data and print train and cross-validation scores.

    Arguments
    ---------
    model:     estimator exposing fit, predict and predict_proba.
    X_train:   training features.
    y_train:   training target.
    scoring_:  scoring name forwarded to cross_val_score. Default is 'roc_auc'.
               The printed CV line is labelled "AUC" whatever value is passed.
    cv_:       cv argument forwarded to cross_val_score. Default is 5.

    Prints the report; returns nothing.
    '''
    model.fit(X_train, y_train)

    # In-sample predictions: hard labels and the probability of column 1.
    labels_on_train = model.predict(X_train)
    proba_on_train = model.predict_proba(X_train)[:, 1]

    fold_scores = cross_val_score(model, X_train, y_train, cv=cv_, scoring=scoring_)

    train_accuracy = metrics.accuracy_score(y_train, labels_on_train)
    train_auc = metrics.roc_auc_score(y_train, proba_on_train)
    cv_summary = (np.mean(fold_scores), np.std(fold_scores),
                  np.min(fold_scores), np.max(fold_scores))

    print('Model Report on Train and CV Set:')
    print('--------')
    print('Train Accuracy: {:0.6f}'.format(train_accuracy))
    print('Train AUC Score: {:0.6f}'.format(train_auc))
    print('CV AUC Score: Mean - {:0.6f} | Std - {:0.6f} | Min - {:0.6f} | Max - {:0.6f} \n'.format(
        *cv_summary))



def test_eval(model, X_train, X_test, y_train, y_test):
    '''
    Fit `model` on the training data and report its performance on the test data.

    Prints the classification report, then draws one figure with the
    confusion matrix on the left (via make_confusion_matrix) and the ROC
    curve, titled with the AUC score, on the right.

    Arguments
    ---------
    model:    estimator exposing fit, predict and predict_proba. It is (re)fitted here.
    X_train:  training features.
    X_test:   test features.
    y_train:  training target.
    y_test:   test target.

    Returns nothing.
    '''
    model.fit(X_train, y_train)

    # Hard labels for the report/matrix, column-1 probabilities for the ROC curve.
    pred = model.predict(X_test)
    predprob = model.predict_proba(X_test)[:, 1]

    print('Model Report on Test Set:')
    print('--------')
    print('Classification Report \n', metrics.classification_report(y_test, pred))

    conf = metrics.confusion_matrix(y_test, pred)
    # Cell labels in row-major order of the 2x2 matrix.
    group_names = ['True Negative', 'False Positive', 'False Negative', 'True Positive']
    make_confusion_matrix(conf, percent=False, group_names=group_names,
                          figsize=(14, 5), title='Confusion Matrix')

    # Right-hand panel of the figure that make_confusion_matrix created.
    plt.subplot(1, 2, 2)
    fpr, tpr, _ = metrics.roc_curve(y_test, predprob)
    plt.plot(fpr, tpr, color='orange', label='ROC')
    # Diagonal reference line.
    plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve\nAUC Score: {:0.3f}'.format(metrics.roc_auc_score(y_test, predprob)))
    plt.legend()

Confusion Matrix function credit to [DTrimarchi10](https://github.com/DTrimarchi10/confusion_matrix)

#### XGBoost

In [None]:
# Fit the XGBoost pipeline on the training split and report on the test split.
test_eval(xgb_pipeline, X_train, X_test, y_train, y_test)

#### LightGBM

In [None]:
# Fit the LightGBM pipeline on the training split and report on the test split.
test_eval(lgb_pipeline, X_train, X_test, y_train, y_test)

#### CatBoost

In [None]:
# Fit the CatBoost pipeline on the training split and report on the test split.
test_eval(cat_pipeline, X_train, X_test, y_train, y_test)

#### CatBoost (built in categorical encoder)

In [None]:
# Names of all object-dtype columns, handed to CatBoost so it applies its
# built-in categorical handling instead of the external encoders.
# Converted to a plain list: that is the documented type for cat_features,
# whereas a pandas Index is not.
cat_features = df.select_dtypes('object').columns.tolist()

cat = CatBoostClassifier(scale_pos_weight=ratio,
                         verbose=0, cat_features=cat_features)
test_eval(cat, X_train, X_test, y_train, y_test)

The best model is CatBoost (built in categorical encoder) with ROC AUC Score 0.874, and Recall rate 0.76 on positive class.

I will update this notebook with a model after hyperparameter tuning.