In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
# Notebook-wide pandas display settings: show every column, never wrap wide
# frames across several lines, and never truncate long cell values.
# Full option keys are used throughout; an abbreviated pattern such as
# 'max_colwidth' only works while it matches exactly one registered option.
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.max_colwidth', None)

import pandas_profiling
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import RobustScaler
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

from xgboost import XGBClassifier
from catboost import CatBoostClassifier

import os
# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = [os.path.join(dirname, filename) for filename in filenames]
    for full_path in full_paths:
        print(full_path)

In [None]:
# The CSV ships with two Naive_Bayes_Classifier_* columns that are removed
# right after loading.
naive_bayes_columns = [
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2',
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1',
]
df = pd.read_csv('/kaggle/input/credit-card-customers/BankChurners.csv').drop(columns=naive_bayes_columns)
df

In [None]:
# Let's explore our features: build a pandas-profiling report for the whole frame.

profile = pandas_profiling.ProfileReport(df)
# Bare last expression so the notebook renders the report as the cell output.
profile

We have 6 categorical and 15 numerical columns and no missing values. The dataset is imbalanced: there are 8500 existing and 1627 attrited clients. Customer age is approximately normally distributed, and the numbers of female and male clients are roughly equal. The numbers of married and single clients are also quite close. 
The 'less than $40k' category includes the largest number of clients in this dataset. Most of the clients have a Blue card, 555 have a Silver one, 116 have Gold and only 20 clients have a Platinum card. Let's look at these and other features with respect to Attrition Flag to see whether we can find some interesting patterns there.

In [None]:
# Same class-coloured histogram for each demographic feature; looping replaces
# the copy-pasted calls and avoids a stray FacetGrid repr under the figures.
for feature in ("Customer_Age", "Dependent_count"):
    sns.displot(data=df, x=feature, hue="Attrition_Flag")

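Here the attrited customers follow roughly the same bell-shaped distribution as the existing ones, so these features are unlikely to separate the two classes; we may consider excluding them after looking at the PPS matrix later.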

In [None]:
# Histogram of relationship length (in months), coloured by attrition class.
sns.displot(data=df, x="Months_on_book", hue="Attrition_Flag")

Months on book looks very unusual: there are a lot of clients (about 2000) who have been with the bank for exactly 36 months or so, and almost 500 of them are attrited.

In [None]:
# Histogram of Total_Relationship_Count, coloured by attrition class.
sns.displot(data=df, x="Total_Relationship_Count", hue="Attrition_Flag")

The distribution here is not normal (beyond 3, the number of attrited customers is roughly the same for 4, 5 and 6), so this column may be important for our model. 

In [None]:
# Histogram of months inactive over the last 12 months, coloured by attrition class.
sns.displot(data=df, x="Months_Inactive_12_mon", hue="Attrition_Flag")

Clients are mostly active; periods of inactivity usually last 1-3 months. There are a few clients who have been inactive for 5 or 6 months and, surprisingly, they are still existing customers.
In the graph below it is clear that the more contacts a client has within 12 months, the higher the probability that the client will attrite. 

In [None]:
# Histogram of contact counts over the last 12 months, coloured by attrition class.
sns.displot(data=df, x="Contacts_Count_12_mon", hue="Attrition_Flag")

In [None]:
# Histogram of Credit_Limit, coloured by attrition class.
sns.displot(data=df, x="Credit_Limit", hue="Attrition_Flag")

Generally, the lower the credit limit, the higher the probability of attrition. There is one outlier, the maximum available credit limit of 35000: roughly 1/4 of the clients with this limit are attrited.

In [None]:
# Same class-coloured histogram for each balance-related feature; looping
# replaces the copy-pasted calls and avoids a stray FacetGrid repr under the figures.
for feature in ("Total_Revolving_Bal", "Avg_Open_To_Buy", "Total_Amt_Chng_Q4_Q1"):
    sns.displot(data=df, x=feature, hue="Attrition_Flag")

In [None]:
# Same class-coloured histogram for each transaction feature; looping replaces
# the copy-pasted calls and avoids a stray FacetGrid repr under the figures.
for feature in ("Total_Trans_Amt", "Total_Trans_Ct", "Total_Ct_Chng_Q4_Q1"):
    sns.displot(data=df, x=feature, hue="Attrition_Flag")

All the previous graphs show that the more money customers spend, the less likely they are to churn, again with some outliers. It seems those people took credit once to buy something expensive, such as a house or a flat.
Average Utilization Ratio shows that the chance of churn is highest when this ratio equals 0.

In [None]:
# Histogram of Avg_Utilization_Ratio, coloured by attrition class.
sns.displot(data=df, x="Avg_Utilization_Ratio", hue="Attrition_Flag")

In [None]:
# Client counts per gender, split by attrition class.
sns.countplot(data=df, x='Gender', hue='Attrition_Flag')

In [None]:
# Client counts per education level, split by attrition class.
sns.countplot(data=df, x='Education_Level', hue='Attrition_Flag')

Nothing unusual in Gender, Education Level and Marital Status distribution of Churned clients.

In [None]:
# Client counts per marital status, split by attrition class.
sns.countplot(data=df, x='Marital_Status', hue='Attrition_Flag')

In [None]:
# Client counts per income category, split by attrition class.
sns.countplot(data=df, x='Income_Category', hue='Attrition_Flag')

In [None]:
# Client counts per card category, split by attrition class.
sns.countplot(data=df, x='Card_Category', hue='Attrition_Flag')

Card Category and Income Category have roughly the same category proportions for churned and existing customers.
Let's have a look at the Predictive Power Score (PPS) matrix to see whether there are any non-linear relationships between the features and the target variable.

In [None]:
pip install ppscore

In [None]:
def heatmap(df):
    """Draw a long-format PPS table as an annotated heatmap and return the Axes.

    `df` must contain the columns 'x', 'y' and 'ppscore'. It is pivoted so
    that 'x' values become the columns and 'y' values the rows; the colour
    scale is fixed to the range 0..1.
    """
    long_scores = df[['x', 'y', 'ppscore']]
    score_grid = long_scores.pivot(columns='x', index='y', values='ppscore')

    # A large figure is created first; sns.heatmap then draws on the current axes.
    plt.subplots(figsize=(20, 20))
    axes = sns.heatmap(score_grid, vmin=0, vmax=1, cmap="Blues", linewidths=0.5, annot=True)

    axes.set_title("PPS matrix")
    axes.set_xlabel("feature")
    axes.set_ylabel("target")
    return axes

# Imported here rather than in the first cell because ppscore is only
# installed by the pip cell just above.
import ppscore as pps
# Compute the PPS table for the frame and draw it with heatmap(),
# which reads its 'x', 'y' and 'ppscore' columns.
matrix = pps.matrix(df)
heatmap(matrix)

In [None]:
# Encode the target as integers: 1 = attrited customer, 0 = existing customer.
target_codes = {'Attrited Customer': 1, 'Existing Customer': 0}
df['Attrition_Flag'] = df['Attrition_Flag'].replace(target_codes)

In [None]:
# Label-encode the categorical (non-numeric) features in case we use them for a model.
#
# Only non-numeric columns are selected. The previous check,
# `df[column].dtype == np.number`, is not a reliable numeric test: integer
# dtypes never compare equal to it, so numeric columns were being
# label-encoded as well (replacing their values with rank codes).
categorical_columns = df.select_dtypes(exclude='number').columns

for column in categorical_columns:
    df[column] = LabelEncoder().fit_transform(df[column])
df.head(10)

In [None]:
# Helper for judging model results with confusion_matrix and classification_report.

def evaluate(y_actual, y_hat):
    """Plot the confusion matrix as a heatmap and print the classification report.

    y_actual -- true class labels
    y_hat    -- predicted class labels

    The two classes are named 'Existing' and 'Churn' in the printed report.
    Nothing is returned.
    """
    class_names = ['Existing', 'Churn']
    counts = pd.DataFrame(confusion_matrix(y_actual, y_hat))

    # Rows of the confusion matrix are the actual classes, columns the predicted ones.
    sns.heatmap(counts, annot=True, cmap='PuBu', fmt='g')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')

    report = classification_report(y_actual, y_hat, target_names=class_names)
    print(report)

In [None]:
# Drop the target, the client id, the categorical features and three numeric
# features (Credit_Limit, Customer_Age, Dependent_count) from the model inputs.
excluded_columns = ['Attrition_Flag', 'CLIENTNUM', 'Education_Level', 'Credit_Limit', "Customer_Age",
                    'Dependent_count', 'Marital_Status', 'Card_Category', 'Gender', 'Income_Category']
features = df.drop(excluded_columns, axis=1)
y = df['Attrition_Flag']

# Split BEFORE scaling so that the scaler's statistics are learned from the
# training rows only; fitting it on the full data leaks test-set information.
# The split depends only on the row count and random_state, so the same rows
# land in train/test as when the already-scaled matrix was split.
features_train, features_test, y_train, y_test = train_test_split(
    features, y, test_size=0.30, random_state=42)

scaler = RobustScaler().fit(features_train)
X_train = scaler.transform(features_train)
X_test = scaler.transform(features_test)

# Later cells re-split the full scaled matrix `X`, so keep it available,
# transformed with the same train-fitted scaler.
X = scaler.transform(features)

In [None]:
# Ratio of negative (label 0) to positive (label 1) training samples, used
# below as XGBoost's scale_pos_weight.
# The classes are counted by label instead of by position in value_counts(),
# which only works while the negative class happens to be the most frequent,
# and the ratio is kept as a float rather than truncated to an integer.
negative_count = (y_train == 0).sum()
positive_count = (y_train == 1).sum()
scale_num = float(negative_count / positive_count)
scale_num

In [None]:
# XGBoost model that handles the class imbalance through scale_pos_weight.

xgb_params = dict(n_estimators=70, verbosity=1, use_label_encoder=False, scale_pos_weight=scale_num)
xgb = XGBClassifier(**xgb_params)
xgb.fit(X_train, y_train)

# Hard class predictions on the held-out split, then confusion matrix + report.
predictions_xgb = xgb.predict(X_test)
evaluate(y_actual=y_test, y_hat=predictions_xgb)

In [None]:
# Checking the AUC score.
# ROC AUC has to be computed from continuous scores: with hard 0/1 predictions
# the ROC curve collapses to a single operating point and the resulting area
# understates how well the model ranks customers. Use the predicted
# probability of the positive class (column 1) instead.

scores_xgb = xgb.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_test, scores_xgb)
auc = metrics.auc(fpr, tpr)
print(auc)

Now let's try SMOTE, which oversamples the positive class so that the training set contains equal numbers of positive and negative examples.

In [None]:
from imblearn.over_sampling import SMOTE

oversampler = SMOTE(random_state=66)
x_train_smote, x_test_smote, y_train_smote, y_test_smote = train_test_split(X, y, test_size=0.3,
                                                                            random_state=66)
# Only the training split is oversampled; the test split keeps its original
# class balance. `fit_resample` is the supported method name: the old
# `fit_sample` alias has been removed from imbalanced-learn.
x_oversample, y_oversample = oversampler.fit_resample(x_train_smote, y_train_smote)
# Show the class counts after oversampling.
y_oversample.value_counts()

In [None]:
# XGBoost with default parameters, trained on the SMOTE-oversampled training
# data and evaluated on the test split that was left untouched.
xgb_smote = XGBClassifier(use_label_encoder=False)
xgb_smote.fit(X=x_oversample, y=y_oversample)

predictions_smote = xgb_smote.predict(x_test_smote)
evaluate(y_actual=y_test_smote, y_hat=predictions_smote)

It seems SMOTE didn't improve the results with XGBoost, recall became even lower than it was before.
Time to try another classification algorithm - CatBoostClassifier.

In [None]:
# CatBoost settings gathered in one place; class imbalance is handled by
# auto_class_weights='Balanced'.
catboost_params = {
    'random_state': 42,
    'border_count': 100,
    'depth': 6,
    'iterations': 100,
    'l2_leaf_reg': 100,
    'learning_rate': 0.1,
    'auto_class_weights': 'Balanced',
    'verbose': False,
}
classifier = CatBoostClassifier(**catboost_params)

classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
evaluate(y_actual=y_test, y_hat=y_pred)

In [None]:
# ROC AUC for CatBoost, computed from the predicted probability of the
# positive class (column 1) rather than from hard 0/1 predictions, which
# would reduce the ROC curve to a single operating point.
scores_cat = classifier.predict_proba(X_test)[:, 1]
fpr_cat, tpr_cat, thresholds_cat = metrics.roc_curve(y_test, scores_cat)
auc = metrics.auc(fpr_cat, tpr_cat)
print(auc)

**In the end, CatBoost did the best job. A recall of 95% is a pretty good result for the business in this case. Precision is considerably lower, at only 72%, but it is less important here. Thank you for reading this notebook.**