In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# necessary package
from collections import Counter
from scipy import stats

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
import imblearn

import statsmodels.api as sm
import statsmodels.formula.api as smf

from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import matplotlib
import matplotlib.pyplot as plt
matplotlib.style.use('tableau-colorblind10')
import seaborn as sns
from plotnine import *
import plotly.graph_objects as go
import plotly.express as px

## Load Data

In [None]:
# Read the BankChurners CSV from the Kaggle input directory and preview the first rows.
df = pd.read_csv('/kaggle/input/credit-card-customers/BankChurners.csv')
df.head()

In [None]:
# Column names, dtypes and non-null counts of the raw frame.
df.info()

In [None]:
# Number of distinct CLIENTNUM values (missing values, if any, count as one value).
df['CLIENTNUM'].nunique(dropna=False)

## Dataprep

In [None]:
# Correcting Data Type
## string 



In [None]:
# Drop the customer identifier and the two Naive_Bayes_Classifier_* columns.
# errors='ignore' keeps the cell idempotent: re-running it after the columns are
# already gone is a no-op instead of a KeyError.
df = df.drop(columns = ['CLIENTNUM', 
              'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1', 
             'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2'],
             errors='ignore')
df.info()


## Exploration


In [None]:
# Data Distribution + Sample Selection
# Object-dtype columns are treated as categorical; every other column as numerical.
cat_col = [name for name, dtype in df.dtypes.items() if dtype == 'O']
num_col = [name for name in df.columns if name not in cat_col]

column_summary = 'cat_col : {} \nnum_col : {}'
print(column_summary.format(len(cat_col), len(num_col)))

In [None]:
## categorical data: one bar chart of category frequencies per categorical column
N_COLS = 3
n_rows = max(1, -(-len(cat_col) // N_COLS))  # ceil division so every column gets a panel

fig = plt.figure(figsize=(10, 8))
fig.subplots_adjust(hspace=0.8, wspace=0.8)
for i, var in enumerate(cat_col):
    # Use the Series returned by value_counts() directly (labels in .index, counts in
    # .values). The column names produced by .reset_index() differ between pandas
    # 1.x ('index', <var>) and pandas >= 2.0 (<var>, 'count').
    counts = df[var].value_counts()
    ax = fig.add_subplot(n_rows, N_COLS, i+1)
    ax.bar(counts.index.astype(str), counts.values, color='black')
    ax.set_title(var)
    # Rotate the existing tick labels instead of calling set_xticklabels() without
    # fixing the tick positions first.
    ax.tick_params(axis='x', labelrotation=90)


In [None]:
# Share of attrited customers, computed from the data instead of hard-coded counts
# (previously written out as 1627/(1627+8500)).
(df['Attrition_Flag'] == 'Attrited Customer').mean()

In [None]:
# Attrition_Flag counts among customers whose Education_Level is 'Unknown'.
query = df['Education_Level'].eq('Unknown')
df.loc[query, 'Attrition_Flag'].value_counts()

In [None]:
# Attrition_Flag counts among customers whose Marital_Status is 'Unknown'.
query = df['Marital_Status'].eq('Unknown')
df.loc[query, 'Attrition_Flag'].value_counts()

In [None]:
# Attrition_Flag counts among customers whose Income_Category is 'Unknown'.
query = df['Income_Category'].eq('Unknown')
df.loc[query, 'Attrition_Flag'].value_counts()

In [None]:
# Number of customers per Card_Category.
# df.groupby(['Card_Category'])['Attrition_Flag'].value_counts()
df['Card_Category'].value_counts()
# 9436/10127

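Note
* Imbalanced data: Attrited Customer is about 16% of the records (1,627 of 10,127)
* "Unknown" values appear in Education_Level, Marital_Status and Income_Category
* The Blue card accounts for almost the entire data set (9,436 of 10,127 customers)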

In [None]:
# numerical data: one histogram per numerical column, laid out on a 5 x 3 grid
fig = plt.figure(figsize=(10, 12))
fig.subplots_adjust(hspace=0.8, wspace=0.8)
for position, column in enumerate(num_col, start=1):
    panel = fig.add_subplot(5, 3, position)
    panel.hist(df[column], color='black')
    panel.set_title(column)

In [None]:
# fig.savefig('./fig2.png')

In [None]:
# Avg_Utilization_Ratio equal to 0: does that mean the card is never used? Then why open it -- or did the customer only just sign up?
# Show Months_on_book (sorted ascending) for the customers with zero utilization.
df[df['Avg_Utilization_Ratio'] == 0]['Months_on_book'].sort_values()

In [None]:
# Number of numerical columns.
len(num_col) 

In [None]:
## Relationship between 2 variables: annotated correlation heatmap of the numerical columns
corr_matrix = df[num_col].corr()

plt.figure(figsize=(12,10))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt='.2f',
    cmap='icefire',
    vmin=-1.0,
    vmax=1.0,
    linewidths=0.5,
)
# plt.savefig

In [None]:
# dtypes and non-null counts of the numerical columns only.
df[num_col].info()

In [None]:
# Sample Selection 
## Marital_Status == 'Unknown'
# df = df[df['Marital_Status'] != 'Unknown']

## Customer Profile

In [None]:
# Reload the raw file: the Exploration section dropped CLIENTNUM, which the
# customer-profile cells below select again.
df = pd.read_csv('/kaggle/input/credit-card-customers/BankChurners.csv')
df.head()

In [None]:
# define features types
# -- who the customer is
demog_features = [
    'Customer_Age',
    'Gender',
    'Dependent_count',
    'Education_Level',
    'Marital_Status',
    'Income_Category',
]

# -- the customer's relationship with the bank
relate_features = [
    'Card_Category',
    'Months_on_book',
    'Total_Relationship_Count',
    'Credit_Limit',
]

# -- how the customer uses the card
behavior_features = [
    'Months_Inactive_12_mon',
    'Contacts_Count_12_mon',
    'Total_Revolving_Bal',
    'Avg_Open_To_Buy',
    'Total_Amt_Chng_Q4_Q1',
    'Total_Trans_Amt',
    'Total_Trans_Ct',
    'Total_Ct_Chng_Q4_Q1',
    'Avg_Utilization_Ratio',
]



In [None]:
# segmentation : clustering customers by their Demographic and Relationship with Bank
# .copy() makes segment_frame an independent frame, so adding a column to it later
# does not write into a slice of df (SettingWithCopyWarning).
segment_frame = df[['CLIENTNUM'] + demog_features + relate_features].copy()

# retrieve data for clustering 
cat_var = ['Gender', 'Education_Level', 'Marital_Status', 'Income_Category', 
          'Card_Category']
num_var = [x for x in segment_frame.columns.tolist() if x not in cat_var + ['CLIENTNUM']]

# Numeric columns are taken as they are; categorical columns are one-hot encoded.
X_num = segment_frame[num_var].values
enc = OneHotEncoder()
X_cat = enc.fit_transform(segment_frame[cat_var]).toarray()
# Feature matrix: numeric columns first, then the one-hot columns.
X = np.concatenate((X_num, X_cat), axis=1)
# NOTE(review): X_num is used unscaled next to 0/1 one-hot columns, so distance-based
# clustering is likely dominated by the largest-valued numeric column -- consider
# standardising the numeric part; confirm intent.

print(X.shape)

In [None]:
# Inspect the feature matrix from column 4 onwards.
# NOTE(review): num_var holds 5 numeric columns, so the one-hot block starts at
# column 5; this slice therefore also includes the last numeric column -- confirm.
X[:, 4:]

In [None]:
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score

import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np

# Silhouette analysis of KMeans on the customer feature matrix X built earlier in
# this notebook. For every candidate k the left panel shows the per-sample
# silhouette values grouped by cluster, and the right panel shows the samples and
# cluster centres projected onto two columns of X.

range_n_clusters = [3, 4, 5]

# Columns of X used for the right-hand scatter plot. The same pair is used for the
# samples, the centre markers and the centre labels so that all three line up.
x_col, y_col = 0, 2

for n_clusters in range_n_clusters:
    # Create a subplot with 1 row and 2 columns
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(18, 7)

    # The 1st subplot is the silhouette plot.
    # The silhouette coefficient can range from -1 to 1; the x axis is restricted
    # to [-0.1, 1] here.
    ax1.set_xlim([-0.1, 1])
    # The (n_clusters+1)*10 is for inserting blank space between silhouette
    # plots of individual clusters, to demarcate them clearly.
    ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])

    # Initialize the clusterer with n_clusters value and a random generator
    # seed of 10 for reproducibility.
    clusterer = KMeans(n_clusters=n_clusters, random_state=10)
    cluster_labels = clusterer.fit_predict(X)

    # The silhouette_score gives the average value for all the samples.
    # This gives a perspective into the density and separation of the formed
    # clusters
    silhouette_avg = silhouette_score(X, cluster_labels)
    print("For n_clusters =", n_clusters,
          "The average silhouette_score is :", silhouette_avg)

    # Compute the silhouette scores for each sample
    sample_silhouette_values = silhouette_samples(X, cluster_labels)

    y_lower = 10
    for i in range(n_clusters):
        # Aggregate the silhouette scores for samples belonging to
        # cluster i, and sort them
        ith_cluster_silhouette_values = \
            sample_silhouette_values[cluster_labels == i]

        ith_cluster_silhouette_values.sort()

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = cm.nipy_spectral(float(i) / n_clusters)
        ax1.fill_betweenx(np.arange(y_lower, y_upper),
                          0, ith_cluster_silhouette_values,
                          facecolor=color, edgecolor=color, alpha=0.7)

        # Label the silhouette plots with their cluster numbers at the middle
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

        # Compute the new y_lower for next plot
        y_lower = y_upper + 10  # 10 for the 0 samples

    ax1.set_title("The silhouette plot for the various clusters.")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")

    # The vertical line for average silhouette score of all the values
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

    ax1.set_yticks([])  # Clear the yaxis labels / ticks
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

    # 2nd Plot showing the actual clusters formed
    colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
    ax2.scatter(X[:, x_col], X[:, y_col], marker='.', s=30, lw=0, alpha=0.7,
                c=colors, edgecolor='k')

    # Labeling the clusters
    centers = clusterer.cluster_centers_
    # Draw white circles at cluster centers
    ax2.scatter(centers[:, x_col], centers[:, y_col], marker='o',
                c="white", alpha=1, s=200, edgecolor='k')

    # Write each cluster number inside its white circle. The label is placed at the
    # same (x_col, y_col) coordinates as the circle; it previously used column 1 for
    # y and therefore landed away from the circle.
    for i, c in enumerate(centers):
        ax2.scatter(c[x_col], c[y_col], marker='$%d$' % i, alpha=1,
                    s=50, edgecolor='k')

    ax2.set_title("The visualization of the clustered data.")
    ax2.set_xlabel("Feature space for the 1st feature")
    ax2.set_ylabel("Feature space for the 3rd feature")

    plt.suptitle(("Silhouette analysis for KMeans clustering on sample data "
                  "with n_clusters = %d" % n_clusters),
                 fontsize=14, fontweight='bold')

plt.show()

In [None]:
# selected k (k=3)
# NOTE(review): this comment previously said k=4, but the code fits 3 clusters and
# the later radar chart draws three groups -- confirm which was intended.
clusterer = KMeans(n_clusters=3, random_state=10)
cluster_labels = clusterer.fit_predict(X)



In [None]:
# Attach the cluster label of every customer. assign() returns a new frame, so this
# never writes into a slice of df (no SettingWithCopyWarning) and is safe to re-run.
segment_frame = segment_frame.assign(customer_group=cluster_labels)
segment_frame

In [None]:
# demog + relate by group
# Number of customers in each cluster.
segment_frame['customer_group'].value_counts()

In [None]:
# Variables shown as row-normalised crosstabs against the customer group.
cat_var = [
    'Gender', 'Education_Level', 'Marital_Status', 'Income_Category',
    'Card_Category', 'Months_on_book', 'Total_Relationship_Count',
]

fig = plt.figure(figsize=(14, 12))
fig.subplots_adjust(hspace=1.0, wspace=0.4)
for position, column in enumerate(cat_var, start=1):
    panel = fig.add_subplot(3, 3, position)
    # Each row (customer group) sums to 1 because of normalize='index'.
    share_by_group = pd.crosstab(segment_frame['customer_group'],
                                 segment_frame[column],
                                 normalize='index')
    ax = sns.heatmap(share_by_group, cmap='Greens', linewidths=0.5, ax=panel)
    ax.set_title(column)
    

In [None]:
# Numeric variables compared across customer groups with box plots.
num_var = ['Customer_Age', 'Dependent_count', 'Credit_Limit']

fig = plt.figure(figsize=(14, 4))
fig.subplots_adjust(hspace=0.4, wspace=0.4)
for position, column in enumerate(num_var, start=1):
    panel = fig.add_subplot(1, 3, position)
    ax = sns.boxplot(x='customer_group', y=column, data=segment_frame, ax=panel)
    ax.set_title(column)
    

In [None]:
# Create customer profile table: behaviour features joined with the cluster label
behavior_frame = df[['CLIENTNUM'] + behavior_features]
group_frame = segment_frame[['CLIENTNUM', 'customer_group']]

customer_profile = (
    behavior_frame
    .merge(group_frame, on='CLIENTNUM')
    .drop(columns=['CLIENTNUM'])   # the key is only needed for the join
)
customer_profile.head()

In [None]:
# Display the list of behaviour feature names.
behavior_features

In [None]:
# visualize behavior with a parallel-coordinates plot, one line per customer,
# coloured by customer group (the radar chart is drawn in a later cell)
fig = px.parallel_coordinates(customer_profile, color='customer_group', 
                             dimensions=['Avg_Open_To_Buy', 'Avg_Utilization_Ratio', 
                                         'Total_Trans_Amt', 'Total_Trans_Ct'
                                        ])
fig.show()




In [None]:

# radar: mean behaviour feature per customer group, each feature divided by its
# largest group mean so that all axes share a comparable scale
tmp = customer_profile.groupby('customer_group').mean()
tmp = tmp.apply(lambda x: x/x.max(), axis=0)

# NOTE(review): customer_group is the index of tmp here, so [1:] skips the first
# behaviour feature rather than a group column -- confirm whether that feature is
# meant to be left out of the chart.
var = tmp.columns.tolist()[1:]

fig = go.Figure()

# One trace per customer group, so the chart follows whatever number of clusters
# was chosen instead of assuming exactly three.
for group_id, group_means in tmp.iterrows():
    fig.add_trace(go.Scatterpolar(r = group_means.iloc[1:].values.tolist(), 
                                 theta = var, 
                                 fill ='toself', 
                                 name = 'Group {}'.format(group_id)))

fig.update_layout(polar=dict(radialaxis=dict(visible=True)), showlegend=True)
fig.show()


## Feature Engineering
* Binning : Customer_Age, Months_on_book, Credit_Limit, Total_Revolving_Bal, <br>Avg_Open_To_Buy, Total_Amt_Chng_Q4_Q1, Total_Trans_Amt, Total_Trans_Ct, <br>Total_Ct_Chng_Q4_Q1, Avg_Utilization_Ratio
* Weight of Evidence


In [None]:
# Reload the raw file so that feature engineering starts from the original columns.
df = pd.read_csv('/kaggle/input/credit-card-customers/BankChurners.csv')
df.head()

In [None]:
# Drop the customer identifier and the two Naive_Bayes_Classifier_* columns.
# errors='ignore' keeps the cell idempotent: re-running it after the columns are
# already gone is a no-op instead of a KeyError.
df = df.drop(columns = ['CLIENTNUM', 
              'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1', 
             'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2'],
             errors='ignore')
df.info()


### Binning

In [None]:
# Customer_Age bins (upper edge inclusive): <=30, 31-40, 41-50, 51-60, >60
def converter(x):
    """Return the age-bin label for a single Customer_Age value."""
    if x <= 30:
        return '<30'
    if x <= 40:
        return '30-40'
    if x <= 50:
        return '40-50'
    if x <= 60:
        return '50-60'
    return '>60'


df['Customer_Age_Binning'] = df['Customer_Age'].apply(converter)

# Count customers per (age bin, Attrition_Flag). The count column is named
# explicitly: value_counts() calls its result 'count' on pandas >= 2.0, so renaming
# a column called 'Attrition_Flag' no longer has any effect there.
tmp2 = (df.groupby('Customer_Age_Binning')['Attrition_Flag']
          .value_counts()
          .rename('Frequency_of_Loans')
          .reset_index())

# Ordered categorical so the bars appear in age order rather than alphabetically.
tmp2['Customer_Age_Binning'] = pd.Categorical(tmp2['Customer_Age_Binning'], 
                                              categories=['<30','30-40','40-50','50-60','>60'], 
                                              ordered=True)
fig_age_bin = ggplot(tmp2) +\
                geom_col(aes(x='Customer_Age_Binning', y='Frequency_of_Loans', fill='Attrition_Flag'))
fig_age_bin

In [None]:
# Months_on_book: quartile bins
var = 'Months_on_book'
var_binning = var + '_Binning'
df[var_binning] = pd.qcut(df[var], q=4)

# Count customers per (bin, Attrition_Flag). The count column is named explicitly
# because value_counts() calls its result 'count' on pandas >= 2.0; observed=False
# spells out the current default for categorical group keys.
tmp2 = (df.groupby(var_binning, observed=False)['Attrition_Flag']
          .value_counts()
          .rename('Frequency_of_Loans')
          .reset_index())

fig_mob_bin = (ggplot(tmp2)
               + geom_col(aes(x=var_binning, y='Frequency_of_Loans',
                              fill='Attrition_Flag')))
fig_mob_bin


In [None]:
# ggsave(fig_mob_bin, './fig4_mob_binning.png')

In [None]:
# Credit_Limit: quartile bins
var = 'Credit_Limit'
var_binning = var + '_Binning'
df[var_binning] = pd.qcut(df[var], q=4)

# Count customers per (bin, Attrition_Flag). The count column is named explicitly
# because value_counts() calls its result 'count' on pandas >= 2.0; observed=False
# spells out the current default for categorical group keys.
tmp2 = (df.groupby(var_binning, observed=False)['Attrition_Flag']
          .value_counts()
          .rename('Frequency_of_Loans')
          .reset_index())

fig_creditlim_bin = (ggplot(tmp2)
                     + geom_col(aes(x=var_binning, y='Frequency_of_Loans',
                                    fill='Attrition_Flag')))
fig_creditlim_bin


In [None]:
# Total_Revolving_Bal: quartile bins
var = 'Total_Revolving_Bal'
var_binning = var + '_Binning'
df[var_binning] = pd.qcut(df[var], q=4)

# Count customers per (bin, Attrition_Flag). The count column is named explicitly
# because value_counts() calls its result 'count' on pandas >= 2.0; observed=False
# spells out the current default for categorical group keys.
tmp2 = (df.groupby(var_binning, observed=False)['Attrition_Flag']
          .value_counts()
          .rename('Frequency_of_Loans')
          .reset_index())

fig_revolve_bin = (ggplot(tmp2)
                   + geom_col(aes(x=var_binning, y='Frequency_of_Loans',
                                  fill='Attrition_Flag')))
fig_revolve_bin


In [None]:
# ggsave(fig_revolve_bin, './fig6_revolve_binning.png')

In [None]:
# Avg_Open_To_Buy: quartile bins
var = 'Avg_Open_To_Buy'
var_binning = var + '_Binning'
df[var_binning] = pd.qcut(df[var], q=4)

# Count customers per (bin, Attrition_Flag). The count column is named explicitly
# because value_counts() calls its result 'count' on pandas >= 2.0; observed=False
# spells out the current default for categorical group keys.
tmp2 = (df.groupby(var_binning, observed=False)['Attrition_Flag']
          .value_counts()
          .rename('Frequency_of_Loans')
          .reset_index())

fig_opentobuy_bin = (ggplot(tmp2)
                     + geom_col(aes(x=var_binning, y='Frequency_of_Loans',
                                    fill='Attrition_Flag')))
fig_opentobuy_bin


In [None]:
# Total_Amt_Chng_Q4_Q1: quartile bins
var = 'Total_Amt_Chng_Q4_Q1'
var_binning = var + '_Binning'
df[var_binning] = pd.qcut(df[var], q=4)

# Count customers per (bin, Attrition_Flag). The count column is named explicitly
# because value_counts() calls its result 'count' on pandas >= 2.0; observed=False
# spells out the current default for categorical group keys.
tmp2 = (df.groupby(var_binning, observed=False)['Attrition_Flag']
          .value_counts()
          .rename('Frequency_of_Loans')
          .reset_index())

fig_amtq4q1_bin = (ggplot(tmp2)
                   + geom_col(aes(x=var_binning, y='Frequency_of_Loans',
                                  fill='Attrition_Flag')))
fig_amtq4q1_bin

In [None]:
# ggsave(fig_amtq4q1_bin, './fig6_amtq4q1_binning.png')


In [None]:
# Total_Trans_Amt: quartile bins
var = 'Total_Trans_Amt'
var_binning = var + '_Binning'
df[var_binning] = pd.qcut(df[var], q=4)

# Count customers per (bin, Attrition_Flag). The count column is named explicitly
# because value_counts() calls its result 'count' on pandas >= 2.0; observed=False
# spells out the current default for categorical group keys.
tmp2 = (df.groupby(var_binning, observed=False)['Attrition_Flag']
          .value_counts()
          .rename('Frequency_of_Loans')
          .reset_index())

fig_transamt_bin = (ggplot(tmp2)
                    + geom_col(aes(x=var_binning, y='Frequency_of_Loans',
                                   fill='Attrition_Flag')))
fig_transamt_bin

In [None]:
# ggsave(fig_transamt_bin, './fig9_transamt_binning.png')


In [None]:
# Total_Trans_Ct: quartile bins
var = 'Total_Trans_Ct'
var_binning = var + '_Binning'
df[var_binning] = pd.qcut(df[var], q=4)

# Count customers per (bin, Attrition_Flag). The count column is named explicitly
# because value_counts() calls its result 'count' on pandas >= 2.0; observed=False
# spells out the current default for categorical group keys.
tmp2 = (df.groupby(var_binning, observed=False)['Attrition_Flag']
          .value_counts()
          .rename('Frequency_of_Loans')
          .reset_index())

fig_transct_bin = (ggplot(tmp2)
                   + geom_col(aes(x=var_binning, y='Frequency_of_Loans',
                                  fill='Attrition_Flag')))
fig_transct_bin

In [None]:
# ggsave(fig_transct_bin, './fig10_transct_binning.png')

In [None]:
# Total_Ct_Chng_Q4_Q1: quartile bins
var = 'Total_Ct_Chng_Q4_Q1'
var_binning = var + '_Binning'
df[var_binning] = pd.qcut(df[var], q=4)

# Count customers per (bin, Attrition_Flag). The count column is named explicitly
# because value_counts() calls its result 'count' on pandas >= 2.0; observed=False
# spells out the current default for categorical group keys.
tmp2 = (df.groupby(var_binning, observed=False)['Attrition_Flag']
          .value_counts()
          .rename('Frequency_of_Loans')
          .reset_index())

fig_ctq4q1_bin = (ggplot(tmp2)
                  + geom_col(aes(x=var_binning, y='Frequency_of_Loans',
                                 fill='Attrition_Flag')))
fig_ctq4q1_bin

In [None]:
# Avg_Utilization_Ratio: quartile bins
var = 'Avg_Utilization_Ratio'
var_binning = var + '_Binning'
df[var_binning] = pd.qcut(df[var], q=4)

# Count customers per (bin, Attrition_Flag). The count column is named explicitly
# because value_counts() calls its result 'count' on pandas >= 2.0; observed=False
# spells out the current default for categorical group keys.
tmp2 = (df.groupby(var_binning, observed=False)['Attrition_Flag']
          .value_counts()
          .rename('Frequency_of_Loans')
          .reset_index())

fig_utilize_bin = (ggplot(tmp2)
                   + geom_col(aes(x=var_binning, y='Frequency_of_Loans',
                                  fill='Attrition_Flag')))
fig_utilize_bin

### WOE

In [None]:
# Preview the frame including the *_Binning columns added above.
df.head()

In [None]:
# dtypes and non-null counts after binning.
df.info()

In [None]:
# Number of rows per Attrition_Flag value.
df['Attrition_Flag'].value_counts()

In [None]:
# Number of rows per Card_Category value.
df['Card_Category'].value_counts()

In [None]:
# Attrition_Flag counts among customers with Avg_Utilization_Ratio equal to 0.
df[df['Avg_Utilization_Ratio'] == 0]['Attrition_Flag'].value_counts()

In [None]:
# Target first, followed by the candidate predictors (raw categoricals / counts and
# the binned versions of the continuous variables).
selected_feature = ['Attrition_Flag', 
                   'Gender', 'Customer_Age_Binning', 'Dependent_count', 'Education_Level', 'Marital_Status', 'Income_Category', 
                   'Months_on_book_Binning', 'Total_Relationship_Count', 'Months_Inactive_12_mon', 'Contacts_Count_12_mon', 
                   'Credit_Limit_Binning', 'Total_Revolving_Bal_Binning', 'Avg_Open_To_Buy_Binning', 'Total_Amt_Chng_Q4_Q1_Binning', 
                   'Total_Trans_Amt_Binning', 'Total_Trans_Ct_Binning', 'Total_Ct_Chng_Q4_Q1_Binning', 'Avg_Utilization_Ratio_Binning']

# .copy() gives an independent frame, so columns assigned to df afterwards do not
# write into a slice of the previous frame (SettingWithCopyWarning).
df = df[selected_feature].copy()
df.info()

In [None]:
# Weight of Evidence (WOE) and Information Value (IV) for every candidate predictor.
# For each category c of a variable:
#   DB  = share of all attrited customers that fall in c
#   DG  = share of all existing customers that fall in c
#   woe = ln(DG / DB),   IV = sum over c of (DG - DB) * woe
iv_list = []
woe_columns = {}

# loop over the predictors (selected_feature[0] is the target itself)
for var in selected_feature[1:]:
    new_var = var + '_woe'

    # create woe table: one row per category of var, one column per Attrition_Flag
    # value. unstack() works directly on the counts, so it does not depend on the
    # name value_counts() gives its result (it changed to 'count' in pandas >= 2.0).
    woe_table = (df.groupby(var, observed=False)['Attrition_Flag']
                   .value_counts()
                   .unstack())

    # calculate WOE&IV
    num_attrited = woe_table['Attrited Customer'].sum()
    num_existing = woe_table['Existing Customer'].sum()

    woe_table['DB'] = woe_table['Attrited Customer']/num_attrited
    woe_table['DG'] = woe_table['Existing Customer']/num_existing
    woe_table['perc_diff'] = woe_table['DG']-woe_table['DB']
    woe_table['woe'] = np.log(woe_table['DG']/woe_table['DB'])

    # store IV value
    iv_data = {'var':var, 
               'iv':(woe_table['perc_diff']*woe_table['woe']).sum()}
    iv_list.append(iv_data)

    # transform categorical into woe: look up the WOE of each row's category
    woe_columns[new_var] = pd.merge(df[[var]], woe_table[['woe']], how='left',
                                    left_on=var, right_index=True)['woe']

# Add all *_woe columns at once; assign() returns a new frame, so nothing is written
# into a slice of an earlier frame.
df = df.assign(**woe_columns)



In [None]:
# Preview the frame including the new *_woe columns.
df.head()

## Feature Selection

In [None]:
# IV: one row per variable, sorted from lowest to highest information value
iv_table = (
    pd.DataFrame(iv_list)
    .sort_values('iv', ascending=True)
    .reset_index(drop=True)
)
# An ordered categorical keeps this sort order when the variables are plotted.
var_order = iv_table['var'].unique().tolist()
iv_table['var'] = pd.Categorical(iv_table['var'], categories=var_order, ordered=True)
iv_table

In [None]:
# Rename the 'iv' column to 'information value'; the plot and the feature selection
# below refer to the column by this name.
iv_table = iv_table.rename(columns={'iv':'information value'})
iv_table.head()

In [None]:
# Horizontal bar chart of information value per variable. The dashed red line marks
# 0.3, the same cut-off the feature selection below applies.
ggplot(iv_table) +\
    geom_col(aes(x='var', y='information value'), fill='black') +\
    geom_hline(yintercept=0.3, linetype='dashed', color='red', size=1.0) +\
    theme(axis_title_y = element_blank()) +\
    coord_flip()


## Churner Profile

In [None]:
#  
# Keep the variables whose information value is at least 0.3, highest IV first
# (iv_table is sorted ascending, hence the reversal).
high_iv = iv_table['information value'] >= 0.3
selected_features = iv_table.loc[high_iv, 'var'].unique().tolist()[::-1]
selected_features

In [None]:
# Reload the raw data and look at the target together with six of the original
# (un-binned) variables.
org_df = pd.read_csv('/kaggle/input/credit-card-customers/BankChurners.csv')

var = [
    'Attrition_Flag',
    'Total_Trans_Ct',
    'Total_Trans_Amt',
    'Total_Revolving_Bal',
    'Total_Ct_Chng_Q4_Q1',
    'Avg_Utilization_Ratio',
    'Months_Inactive_12_mon',
]

org_df[var]

In [None]:
# Reshape to long format (one row per customer and variable) so that each variable
# gets its own facet, then compare the two Attrition_Flag groups with box plots.
tmp = pd.melt(org_df[var], id_vars='Attrition_Flag')

# scales='free_y' gives every facet its own y axis.
ggplot(tmp, aes(x='Attrition_Flag', y='value', fill='Attrition_Flag')) +\
    geom_boxplot() +\
    facet_wrap('variable', scales='free_y') +\
    theme(subplots_adjust={'wspace':0.40, 'hspace':0.25}, 
          axis_text_x=element_blank(), 
         axis_title_x=element_blank(), 
         axis_title_y=element_blank(), 
         figure_size=(16,8))
    


In [None]:
# Mean and standard deviation of each profile variable per Attrition_Flag group.
# The grouping key is left out of the aggregated columns: it is a string column, so
# averaging it is meaningless and fails on recent pandas. The string aggregation
# names replace np.mean / np.std, which pandas maps to its own mean / std anyway.
profile_vars = [c for c in var if c != 'Attrition_Flag']
org_df.groupby('Attrition_Flag')[profile_vars].agg(['mean', 'std'])

## Churn Prediction Model
* Dealing with imbalance data using SMOTE
* Logistic Regression
* Diagnosis
* Evaluation


In [None]:
# Modelling frame: the target plus the WOE-encoded version of every selected feature
selected_features_woe = ['{}_woe'.format(name) for name in selected_features]
model_columns = ['Attrition_Flag'] + selected_features_woe
df = df[model_columns]
df.info()

In [None]:
# Class counts of the target before splitting.
df['Attrition_Flag'].value_counts()

In [None]:
# split train/test 80:20, stratified on the target so both parts keep the class ratio
X, y = df[selected_features_woe], df['Attrition_Flag']

split = train_test_split(X, y, test_size=0.20, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = split

for features, target in ((X_train, y_train), (X_test, y_test)):
    print(features.shape, target.shape)


In [None]:
# Class counts in the training and test targets.
train_counts, test_counts = Counter(y_train), Counter(y_test)
print('train : {}\ntest : {}'.format(train_counts, test_counts))

### SMOTE on training data
* use typical k (k=5)

In [None]:
# oversampling: SMOTE is fitted on the training split only
smote = imblearn.over_sampling.SMOTE(
    random_state=42,
    k_neighbors=5,
    sampling_strategy='auto',
)
X_res, y_res = smote.fit_resample(X_train, y_train)

resampled_counts = Counter(y_res)
print('Resampled dataset shape {}'.format(resampled_counts))


In [None]:
# Training frame for the formula interface: resampled features plus the target column
target_column = pd.Series(y_res, name='Attrition_Flag')
df_train = pd.concat([X_res, target_column], axis=1)
df_train.head()


In [None]:
# dtypes and non-null counts of the resampled training frame.
df_train.info()

### Logistic Regression
* GLM 
* Diagnosis : Goodness of Fit, Overdispersion, Influence
* Evaluation

ref.<br>
https://www.statsmodels.org/stable/examples/notebooks/generated/glm_formula.html <br>
https://www.statsmodels.org/stable/examples/notebooks/generated/influence_glm_logit.html <br>


In [None]:
# Column names available for the model formula.
df_train.columns

In [None]:
# model1: binomial GLM (logistic regression) on the resampled training frame, using
# six WOE-encoded predictors. The leading "0 +" removes the intercept term.
# NOTE(review): Attrition_Flag is a string column here; which level the fitted
# probability refers to depends on how the formula interface encodes it. The H-L
# cells further down compare model1.mu against the 'Attrited Customer' counts --
# confirm that this matches the encoding.
formula = 'Attrition_Flag ~ 0 + Total_Trans_Ct_Binning_woe +\
                            Total_Trans_Amt_Binning_woe +\
                            Total_Revolving_Bal_Binning_woe +\
                            Total_Ct_Chng_Q4_Q1_Binning_woe +\
                            Avg_Utilization_Ratio_Binning_woe +\
                            Months_Inactive_12_mon_woe'


model1 = smf.glm(data = df_train, formula=formula, family=sm.families.Binomial()).fit()
print(model1.summary())


Based on the results of Wald's test, we omit the variable <br>Avg_Utilization_Ratio_Binning_woe because p-value > 0.05.

In [None]:
# prediction
# model1.mu -- presumably the fitted mean (predicted probability) for each training
# row; see the statsmodels GLMResults documentation to confirm.
model1.mu

#### Diagnosis

In [None]:
# Goodness of Fit : I use Hosmer-Lemeshow test instead Deviance due to our target variable was binary response 
## https://en.wikipedia.org/wiki/Hosmer%E2%80%93Lemeshow_test

# H-L component: observed class indicators, fitted probability and its decile group.
# prob is given the index of df_train so that concat lines the rows up by label; a
# default RangeIndex would only match while df_train itself has one.
obs = pd.get_dummies(df_train['Attrition_Flag'])
prob =  pd.DataFrame({'prob':model1.mu}, index=df_train.index)
q_prob = pd.DataFrame({'q_prob':pd.qcut(prob['prob'], q=10)})
hl_frame = pd.concat([obs, prob, q_prob], axis=1)

# H-L statframe: per decile, observed counts of each class and the mean fitted probability
hl_pivotframe = hl_frame.groupby('q_prob', observed=False).agg({'Attrited Customer':'sum', 
                                                                'Existing Customer':'sum', 
                                                                'prob':'mean'})

# calculate H-L: expected counts per decile and each decile's chi-square contribution
hl_pivotframe['obs_all'] = hl_pivotframe['Attrited Customer'] + hl_pivotframe['Existing Customer']
hl_pivotframe['expect_1'] = hl_pivotframe['prob'] * hl_pivotframe['obs_all']
hl_pivotframe['expect_0'] = hl_pivotframe['obs_all'] - hl_pivotframe['expect_1']

hl_pivotframe['HL'] = ((hl_pivotframe['Attrited Customer'] - hl_pivotframe['expect_1'])**2/hl_pivotframe['expect_1']) +\
                        ((hl_pivotframe['Existing Customer'] - hl_pivotframe['expect_0'])**2/hl_pivotframe['expect_0'])
hl_pivotframe



In [None]:
# calculate H-L stat and p-value
hl_stat = hl_pivotframe['HL'].sum()
# Degrees of freedom = number of probability groups - 2 (8 for the ten deciles).
hl_dof = len(hl_pivotframe) - 2
# sf() is the upper-tail probability 1 - cdf, computed without the cancellation
# error of the explicit subtraction when the p-value is very small.
p_value = stats.chi2.sf(hl_stat, hl_dof)
print('H-L = {}\np-value = {}'.format(hl_stat, p_value))


Based on the H-L test, the overall model shows adequate goodness of fit.

In [None]:
# One point per probability-decile group: observed number of attrited customers (x)
# against the number expected under model1 (y).
ggplot(hl_pivotframe) +\
    geom_point(aes(x='Attrited Customer', y='expect_1')) +\
    labs(title='Logistic Regression (model1) for training dataset:\nobserved vs expected prob.')
    

In [None]:
# Checking Overdispersion: Pearson chi-square divided by the residual degrees of freedom
phi_hat = model1.pearson_chi2 / model1.df_resid
print('estimated phi_hat : {}'.format(phi_hat))


No overdispersion was found.

In [None]:
# Filter Influencer: flag training rows whose Cook's distance exceeds 4 / (n - p)
infl = model1.get_influence(observed=False)
summ_df = infl.summary_frame()

n_obs, n_params = len(df_train), len(model1.params)
threshold = 4 / (n_obs - n_params)
infl_index = summ_df['cooks_d'].gt(threshold)


In [None]:
# Cook's distance of every training row; the dashed red line is the influence
# threshold computed in the previous cell.
ggplot(summ_df) +\
    geom_point(aes(x=summ_df.index, y=summ_df['cooks_d']), alpha=0.5) +\
    geom_hline(yintercept=threshold, linetype='dashed', color='red', size=1.0)

Note: based on the diagnostics above, I will
1. remove Avg_Utilization_Ratio_Binning_woe from the model,
2. filter out the influential observations,

and then fit a new model.

In [None]:
# remove Avg_Utilization_Ratio_Binning_woe
# errors='ignore' makes the cell safe to re-run once the column is already gone.
df_train = df_train.drop(columns='Avg_Utilization_Ratio_Binning_woe', errors='ignore')

In [None]:
# Class counts that remain after excluding the rows flagged as influential.
df_train[~infl_index]['Attrition_Flag'].value_counts()

In [None]:
# Filter influencer: keep only the training rows that were not flagged
keep_rows = ~infl_index
newdf_train = df_train.loc[keep_rows]
newdf_train.head()

In [None]:
# dtypes and non-null counts of the filtered training frame.
newdf_train.info()

In [None]:
# fitting new model (model2)
# Same binomial GLM as model1, without Avg_Utilization_Ratio_Binning_woe and fitted
# on the training rows that were not flagged as influential. The leading "0 +"
# removes the intercept term.
formula = 'Attrition_Flag ~ 0 + Total_Trans_Ct_Binning_woe +\
                            Total_Trans_Amt_Binning_woe +\
                            Total_Revolving_Bal_Binning_woe +\
                            Total_Ct_Chng_Q4_Q1_Binning_woe +\
                            Months_Inactive_12_mon_woe'


model2 = smf.glm(data = newdf_train, formula=formula, family=sm.families.Binomial()).fit()
print(model2.summary())


In [None]:
# Goodness of Fit : I use Hosmer-Lemeshow test instead Deviance due to our target variable was binary response 
## https://en.wikipedia.org/wiki/Hosmer%E2%80%93Lemeshow_test

# H-L component: observed class indicators, fitted probability and its decile group.
# newdf_train keeps the original row labels (with gaps where influential rows were
# removed), so prob must carry the same index. With a fresh 0..n-1 index, concat
# would pair probabilities with the wrong rows and introduce NaN rows.
obs = pd.get_dummies(newdf_train['Attrition_Flag'])
prob =  pd.DataFrame({'prob':model2.mu}, index=newdf_train.index)
q_prob = pd.DataFrame({'q_prob':pd.qcut(prob['prob'], q=10)})
hl_frame = pd.concat([obs, prob, q_prob], axis=1)

# H-L statframe: per decile, observed counts of each class and the mean fitted probability
hl_pivotframe = hl_frame.groupby('q_prob', observed=False).agg({'Attrited Customer':'sum', 
                                                                'Existing Customer':'sum', 
                                                                'prob':'mean'})

# calculate H-L: expected counts per decile and each decile's chi-square contribution
hl_pivotframe['obs_all'] = hl_pivotframe['Attrited Customer'] + hl_pivotframe['Existing Customer']
hl_pivotframe['expect_1'] = hl_pivotframe['prob'] * hl_pivotframe['obs_all']
hl_pivotframe['expect_0'] = hl_pivotframe['obs_all'] - hl_pivotframe['expect_1']

hl_pivotframe['HL'] = ((hl_pivotframe['Attrited Customer'] - hl_pivotframe['expect_1'])**2/hl_pivotframe['expect_1']) +\
                        ((hl_pivotframe['Existing Customer'] - hl_pivotframe['expect_0'])**2/hl_pivotframe['expect_0'])
hl_pivotframe


In [None]:
# calculate H-L stat and p-value
hl_stat = hl_pivotframe['HL'].sum()
# Degrees of freedom = number of probability groups - 2 (8 for the ten deciles).
hl_dof = len(hl_pivotframe) - 2
# sf() is the upper-tail probability 1 - cdf, computed without the cancellation
# error of the explicit subtraction when the p-value is very small.
p_value = stats.chi2.sf(hl_stat, hl_dof)
print('H-L = {}\np-value = {}'.format(hl_stat, p_value))


# One point per probability-decile group: observed number of attrited customers (x)
# against the expected number (y).
ggplot(hl_pivotframe) +\
    geom_point(aes(x='Attrited Customer', y='expect_1')) +\
    labs(title='Logistic model for train dataset:\nobserved vs expected prob.')
    

In [None]:
# Checking Overdispersion of the refitted model: Pearson chi-square divided by the
# residual degrees of freedom. This cell belongs to the model2 diagnostics, so it
# reads model2 (it previously repeated the model1 figure).
print('estimated phi_hat : {}'.format(model2.pearson_chi2/model2.df_resid))


In [None]:
# Finalize Model: model2 (fitted without Avg_Utilization_Ratio_Binning_woe, on the
# rows not flagged as influential) is the model evaluated below.
lr_model = model2

In [None]:
# Summary table of the final model.
lr_model.summary()

### Evaluation

In [None]:
# Encode the test target as integers: 1 = 'Attrited Customer', 0 = 'Existing Customer'.
# astype(int) guarantees an integer dtype for the sklearn metrics even on pandas
# versions where replace() no longer converts the object column itself. Re-running
# the cell is harmless because replace() finds nothing left to replace.
y_test = y_test.replace({'Attrited Customer':1, 'Existing Customer':0}).astype(int)

In [None]:
# remove Avg_Utilization_Ratio_Binning_woe
# errors='ignore' makes the cell safe to re-run once the column is already gone.
X_test = X_test.drop(columns='Avg_Utilization_Ratio_Binning_woe', errors='ignore')
X_test.head()

In [None]:
# predict
# Model output for every test row; the cells below threshold it and score it
# against y_test (1 = 'Attrited Customer').
y_pred = lr_model.predict(X_test)


In [None]:
# Classify as churn (1) when the predicted probability exceeds the decision cut-off.
# A dedicated name is used so that `threshold`, the Cook's-distance cut-off read by
# the influence plot, is not overwritten with 0.5.
decision_threshold = 0.5
prediction = (y_pred > decision_threshold).values.astype('int')

In [None]:
# confusion matrix
# Stored as conf_matrix rather than cm, so the matplotlib.cm module imported under
# that name earlier in the notebook stays usable.
conf_matrix = confusion_matrix(y_test, prediction)
# disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix)
# disp.plot(cmap='Greens') 

fig = plt.figure(figsize=(7, 7))

# Rows are the true classes and columns the predicted classes.
ax = sns.heatmap(conf_matrix, annot=True, fmt='d', 
                 cbar=False, cmap='Greens', linewidth=0.5)
ax.set_title('Confusion Matrix using LR\n\
                F1 : {}\n\
                Precision : {}\n\
                Recall : {}'.format(np.round(f1_score(y_test, prediction),2), 
                                   np.round(precision_score(y_test, prediction),2), 
                                   np.round(recall_score(y_test, prediction),2)))




In [None]:
# Headline metrics of lr_model on the test split at the chosen threshold.
metric_functions = (
    ('Accuracy', accuracy_score),
    ('F1 Score', f1_score),
    ('Precision', precision_score),
    ('Recall', recall_score),
)
for metric_label, metric_function in metric_functions:
    print('{} of lr_model : {}'.format(metric_label, metric_function(y_test, prediction)))

In [None]:
# Precision-Recall Curve of the predicted probabilities on the test split
precision, recall, thresholds = precision_recall_curve(y_test, y_pred)

pr_fig, pr_ax = plt.subplots()
pr_ax.plot(recall, precision, marker='.', c='black', label='Logistic Regression')
pr_ax.set_xlabel('Recall')
pr_ax.set_ylabel('Precision')
pr_ax.legend()
plt.show()


In [None]:
# roc-auc: ROC curve of the predicted probabilities, with the AUC shown in the legend
lr_fpr, lr_tpr, _ = roc_curve(y_test, y_pred)
lr_auc = np.round(roc_auc_score(y_test, y_pred), 2)

roc_fig, roc_ax = plt.subplots(figsize=(7, 7))
roc_ax.plot(lr_fpr, lr_tpr, marker='.', c='black', label='LR auc = {}'.format(lr_auc))

roc_ax.set_title('ROC ')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.legend()
plt.show()


In [None]:
# Test customers ordered by predicted probability, one bar per customer, coloured by
# the true class.
tmp = (
    pd.DataFrame({'Attrition_Flag': y_test, 'y_pred': y_pred})
    .sort_values('y_pred')
    .reset_index(drop=True)
)
tmp['id'] = np.arange(1, len(tmp) + 1)                         # rank after sorting
tmp['Attrition_Flag'] = tmp['Attrition_Flag'].astype('str')   # discrete fill colours
# tmp.head()

(ggplot(tmp)
 + geom_col(aes(x='id', y='y_pred', fill='Attrition_Flag'))
 + labs(title='Probability of Customer Churn')
 + coord_flip()
 + scale_fill_manual(values = ("black", "red"))
 + theme_light())
    
# theme(axis_title_y = element_blank()) +\

In [None]:
# Threshold Comparison : 0.5, 0.6, 0.7, 0.8
# One confusion matrix per decision threshold, with F1 / precision / recall in the title.

thresholds = [0.5, 0.6, 0.7, 0.8]

fig = plt.figure(figsize=(10,10))
fig.subplots_adjust(hspace=0.5, wspace=0.4)
for i,thr in enumerate(thresholds):
    # Loop-local names: the notebook-level `prediction` (threshold 0.5) and the
    # matplotlib.cm module imported as `cm` are left untouched.
    thr_prediction = (y_pred > thr).values.astype('int')
    thr_conf_matrix = confusion_matrix(y_test, thr_prediction)
    
    ax = fig.add_subplot(2, 2, i+1)
    ax = sns.heatmap(thr_conf_matrix, annot=True, fmt='d', 
                    cbar=False, cmap='Greens', linewidth=0.5)
    ax.set_title('Confusion Matrix using LR with threshold {}\n\
                    F1 Score : {}\n\
                    Precision : {}\n\
                    Recall : {}'.format(thr, 
                                       np.round(f1_score(y_test, thr_prediction),2), 
                                       np.round(precision_score(y_test, thr_prediction),2), 
                                       np.round(recall_score(y_test, thr_prediction),2)))
    