In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,precision_score, recall_score, f1_score, confusion_matrix

# Show floats with a thousands separator and one decimal in every rendered table.
pd.set_option('display.float_format', '{:,.1f}'.format)

# 1. Load data

In [None]:
import os

# Print the full path of every file found below the Kaggle input directory.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

In [None]:
# Read the raw customer table into memory.
csv_path = '/kaggle/input/credit-card-customers/BankChurners.csv'
df = pd.read_csv(csv_path)

As suggested in the dataset's description on Kaggle, we delete the last two columns, which are not needed for churn prediction: https://www.kaggle.com/sakshigoyal7/credit-card-customers

In [None]:
# Keep every column except the last two.
# NOTE: this cell is not idempotent - running it a second time removes two more columns.
n_columns = df.shape[1]
df = df.iloc[:, :n_columns - 2]

# 2. Explore and prepare data

### 2.1 First look

We start by looking at a few random rows of our table to see what the data looks like.

In [None]:
# Unseeded draw: a different set of five rows is shown on every run.
df.sample(n=5)

In [None]:
rows_and_columns = df.shape  # (number of rows, number of columns)
print('Dataset size : ' + str(rows_and_columns))

We will work with 10 127 observations and 21 variables if we include the label column.

In [None]:
# Number of distinct client identifiers; compare it with the row count printed above.
df['CLIENTNUM'].nunique()

As expected, every client has a unique identification code. We won't be able to do much with the **CLIENTNUM** column besides identifying specific clients.

### 2.2 Focus on the label column

The label column called **Attrition_Flag** contains the following values: *Existing Customer* and *Attrited Customer*

In [None]:
# Class counts of the label, first as text and then as a small bar chart.
label_counts = df['Attrition_Flag'].value_counts()
print('Label column distribution in our dataset:')
print(label_counts)

sns.set(rc={'figure.figsize': (4, 3)})
sns.countplot(x='Attrition_Flag', data=df)

We can observe that these two classes are quite imbalanced, as we have around 5 times as many existing customers as attrited ones.

For easier manipulation of the label column, we will assign the value **0** to existing customers and **1** to the customers who left the bank.

In [None]:
# Encode the label as integers: 0 = existing customer, 1 = attrited customer.
# The explicit cast guarantees an int64 label instead of relying on replace()
# implicitly downcasting the object column, a behaviour pandas has deprecated.
# Re-running the cell is harmless: an already encoded column is left unchanged.
label_encoding = {'Existing Customer': 0, 'Attrited Customer': 1}
df = df.assign(Attrition_Flag=df['Attrition_Flag'].replace(label_encoding).astype('int64'))

### 2.3 Missing values

In [None]:
# Count the NaN entries of each column.
missing_per_column = df.isna().sum()
print('Number of missing values :')
print('')
print(missing_per_column)

We don't have any missing values.

### 2.4 Attribute types

In [None]:
# How many columns there are of each dtype.
print(df.dtypes.value_counts())
print("====================================================")
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() would add a stray "None" line to the output.
df.info()

We count 16 variables that are numerical (label column included), the other 5 are categorical.

In [None]:
# Separate the columns by storage type: integer/float columns on one side,
# object (text) columns on the other.
numeric_dtypes = ['int64', 'float64']
df_num = df.select_dtypes(include=numeric_dtypes)
df_cat = df.select_dtypes(include='object')

### 2.5 Analysis of the numerical columns

In [None]:
# Summary statistics of the numerical columns.
df_num.describe()

#### A few of many interesting facts we can extract from the numerical attributes:

* **Customer_Age**: Most of our customers are middle aged people. 75% of them are older than 41yo. Standard deviation and range (max-min) are also relatively small. 

* **Dependent_count**: The number of dependents is on average 2.3 and never more than 5 people. The metrics indicate that most of our customers are part of small or average-sized households/families.

* **Months_on_book**: Most of the customers have been with the bank for about 3 years. The most recent customer has been on the books for more than a year, while the person who has been with the bank the longest will be able to celebrate a 5-year anniversary as a customer of the bank (unless this person is an attrited customer).

* **Total_Relationship_Count**: The number of products held by the customer is usually at least 2 and on average close to 4 products. This is not very surprising, as a product can be one of several kinds of accounts, but also an insurance contract or a loan.

* **Months_Inactive_12_mon**: In the last 12 months, 75% of the customers were inactive for at least two months. Also, no one was inactive for more than 6 months.

* **Contacts_Count_12_mon**: The number of times the customers were in contact with the bank over 12 months is rather low, 2.5 times on average. The range goes from 0 to 6.

### 2.6 Correlation matrix 

In [None]:
# Pairwise correlations of the numerical columns.
corr_matrix = df_num.corr()

# Hide the upper triangle (diagonal included): it only mirrors the lower one.
n_vars = df_num.shape[1]
upper_mask = np.zeros((n_vars, n_vars))
upper_mask[np.triu_indices_from(upper_mask)] = True

plt.figure(figsize=(20, 10))
sns.heatmap(corr_matrix, mask=upper_mask, annot=True, vmin=-1, vmax=1, linewidths=.5)

### 2.7 Analysis of the categorical columns

In [None]:
# Summary of the categorical (object) columns.
df_cat.describe()

In [None]:
# Attach the label, converted to text, to the categorical frame.
# Any previous copy of the column is dropped first so the cell can be re-run:
# a second plain join would fail on the overlapping 'Attrition_Flag' column.
label_as_text = df['Attrition_Flag'].astype(str)
df_cat = df_cat.drop(columns='Attrition_Flag', errors='ignore').join(label_as_text)

In [None]:
def stats_on_categ(var1, var2=None):
    """Cross-tabulate a categorical variable against the churn label.

    Parameters
    ----------
    var1 : pandas.Series
        Variable whose values become the rows of the table.
    var2 : pandas.Series, optional
        Variable whose values become the columns. When omitted, the current
        ``df_cat.Attrition_Flag`` is used.

    Returns
    -------
    pandas.DataFrame
        Table of counts, including the 'All' margins (row and column totals).

    Notes
    -----
    The function has no side effects; in particular it leaves the global
    pandas display options untouched.
    """
    if var2 is None:
        # Looked up at call time, so the function always sees the current
        # df_cat instead of the one that existed when the function was defined.
        var2 = df_cat.Attrition_Flag
    return pd.crosstab(var1, var2, margins=True)

def stats_on_categ_visual(var1):
    """Draw a stacked bar chart of the label counts for each value of ``var1``.

    Parameters
    ----------
    var1 : pandas.Series
        Categorical variable; one bar is drawn per distinct value.

    Returns
    -------
    The object returned by ``DataFrame.plot(kind='bar', stacked=True)``.
    """
    sns.set(rc={'figure.figsize': (8, 6)})
    tab = stats_on_categ(var1)
    # Remove both margins before plotting: the 'All' column would double the
    # height of every bar, and the 'All' row would add a grand-total bar that
    # dwarfs the real categories.
    tab = tab.drop(index='All', columns='All')
    return tab.plot(kind='bar', stacked=True)

In [None]:
# Counts table, then stacked bars, for the customer's gender.
gender = df_cat['Gender']
print(stats_on_categ(gender))
stats_on_categ_visual(gender)

In [None]:
# Counts table, then stacked bars, for the education level.
education_level = df_cat['Education_Level']
print(stats_on_categ(education_level))
stats_on_categ_visual(education_level)

In [None]:
# Counts table, then stacked bars, for the marital status.
marital_status = df_cat['Marital_Status']
print(stats_on_categ(marital_status))
stats_on_categ_visual(marital_status)

In [None]:
# Counts table, then stacked bars, for the income category.
income_category = df_cat['Income_Category']
print(stats_on_categ(income_category))
stats_on_categ_visual(income_category)

In [None]:
# Counts table, then stacked bars, for the card category.
card_category = df_cat['Card_Category']
print(stats_on_categ(card_category))
stats_on_categ_visual(card_category)

# 3. Creating a validation and a train/test dataset

Before feeding our data into the model, we set apart around 10% of it that we won't use during training. This will allow us to test the performance of our model on completely unseen data.

The data that will be used for the model training will be split up into train and test.

Before these steps, we randomly shuffle our records to avoid selection bias.

In [None]:
# Shuffle all rows with a fixed seed, then renumber them from 0.
shuffled = df.sample(frac=1, random_state=1)
df = shuffled.reset_index(drop=True)

In [None]:
# customer ID won't be used for churn prediction: moving it to the index keeps
# it available for lookups while excluding it from the feature columns.
# The guard makes the cell safe to re-run; set_index raises a KeyError once the
# column has already been moved to the index.
if 'CLIENTNUM' in df.columns:
    df = df.set_index('CLIENTNUM')

In [None]:
# The first 9000 shuffled rows are used to build the model; the rest is held out.
n_model_rows = 9000
df_train = df.iloc[:n_model_rows]
df_valid = df.iloc[n_model_rows:]

In [None]:
# Separate the label from the feature columns in both subsets.
label_column = 'Attrition_Flag'

target_train = df_train[label_column]
target_valid = df_valid[label_column]

data_train = df_train.drop(columns=[label_column])
data_valid = df_valid.drop(columns=[label_column])

In [None]:
# Keep 20% of the modelling data aside as a test split.
# The label is imbalanced, so the split is stratified on it: train and test
# then contain the same proportion of attrited customers.
X_train, X_test, y_train, y_test = train_test_split(
    data_train,
    target_train,
    test_size=0.20,
    random_state=7,
    stratify=target_train,
)

# 4. Transforming categorical attributes into dummies

In [None]:
X_train_dummy = pd.get_dummies(X_train)
# Encode the test split, then align it on the training columns: a category that
# is absent from the test split gets an all-zero column, a category unseen
# during training is discarded, and the column order matches the training frame.
X_test_dummy = pd.get_dummies(X_test).reindex(columns=X_train_dummy.columns, fill_value=0)

# verify if we have the same columns in train and test dataset
assert X_train_dummy.columns.equals(X_test_dummy.columns)

# 5. Standard scaling our data

In [None]:
# Fit the scaler on the training split only, then apply it to both splits.
scaler = StandardScaler()
scaler.fit(X_train_dummy)

# transform() returns a bare array; the original index is passed back in so the
# scaled frames stay aligned, row label by row label, with y_train and y_test.
X_train_sc = pd.DataFrame(
    scaler.transform(X_train_dummy),
    columns=X_train_dummy.columns,
    index=X_train_dummy.index,
)
X_test_sc = pd.DataFrame(
    scaler.transform(X_test_dummy),
    columns=X_train_dummy.columns,
    index=X_test_dummy.index,
)

# 6. Training a Random Forest model

In [None]:
%%time 

param_rf = {
    'bootstrap': [True],
    'max_depth': [10,11,12,13],
    'max_features': [10,11,12,13,14],
    'n_estimators': [400,500,600,700]
}


clf_rf = RandomForestClassifier()

grid_rf = GridSearchCV(clf_rf, param_rf, cv = 5, n_jobs = -1)
grid_rf.fit(X_train_sc,y_train)
print(grid_rf.best_estimator_)

# 7. Evaluate classification performance

In [None]:
def evaluate_perf(y_test, y_pred):
    """Print a confusion table followed by four classification scores.

    Parameters
    ----------
    y_test : true labels.
    y_pred : predicted labels, in the same order as ``y_test``.

    Prints the actual-vs-predicted cross-tabulation, a separator line, then
    accuracy, precision, recall and F1, each rounded to two decimals.
    Returns nothing.
    """
    contingency = pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'])
    print(contingency)
    print('====================')

    # (label to print, scoring function) pairs, reported in this order.
    report = (
        ("Accuracy : ", accuracy_score),
        ("Precision : ", precision_score),
        ("Recall : ", recall_score),
        ("F1 :", f1_score),
    )
    for label, scorer in report:
        print(label, "{:.2f}".format(scorer(y_test, y_pred)))

In [None]:
# Predict the test split with the refitted best estimator of the grid search
# and report its scores.
y_pred_rf = grid_rf.predict(X_test_sc)
evaluate_perf(y_test,y_pred_rf)

# 8. Validation of the model performance

In [None]:
# Encode the held-out data and align it on the training columns: categories
# missing from the held-out rows get an all-zero column, unseen ones are
# discarded, and the column order matches what the scaler and model were fit on.
data_valid_dummy = pd.get_dummies(data_valid).reindex(columns=X_train_dummy.columns, fill_value=0)
assert X_train_dummy.columns.equals(data_valid_dummy.columns)

In [None]:
# Apply the scaler fitted on the training split; the index is passed back in so
# the scaled rows stay aligned with target_valid by row label.
data_valid_sc = pd.DataFrame(
    scaler.transform(data_valid_dummy),
    columns=data_valid_dummy.columns,
    index=data_valid_dummy.index,
)

In [None]:
# Predict the held-out rows, which were not part of the grid search, and report the scores.
valid_pred_rf = grid_rf.predict(data_valid_sc)
evaluate_perf(target_valid,valid_pred_rf)

# 9. Explainability and feature selection

In [None]:
# Importances are small fractions, so show three decimals from here on.
pd.options.display.float_format = '{:,.3f}'.format

# Building the frame from a dict keeps 'importance' as a float column; the
# list-of-lists + transpose form stored both columns with object dtype.
feature_imp = pd.DataFrame({
    'feature': X_train_sc.columns.tolist(),
    'importance': grid_rf.best_estimator_.feature_importances_,
})
feature_imp.sort_values('importance', ascending=False)