In [None]:
import warnings
warnings.filterwarnings('ignore')

# Libraries to help with reading and manipulating data
import numpy as np
import pandas as pd

# Libraries to help with data visualization
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn import metrics
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score,roc_auc_score, roc_curve, confusion_matrix
from sklearn.model_selection import train_test_split ,GridSearchCV, RandomizedSearchCV,StratifiedKFold, cross_val_score

#libraries for UP/Down sampling, Imputation and Pipelines
from sklearn.pipeline import Pipeline, make_pipeline
#libraries to help with model building
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    AdaBoostClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,BaggingClassifier)
from xgboost import XGBClassifier

In [None]:
# Location of the BankChurners CSV inside the Kaggle input directory
path = '/kaggle/input/credit-card-customers/BankChurners.csv'
# Read the raw dataset into a DataFrame
data = pd.read_csv(filepath_or_buffer=path)

In [None]:
# Work on a copy so the raw frame held in `data` stays untouched
df = data.copy()
n_rows, n_cols = df.shape
print(f'There is {n_rows} rows and {n_cols} columns in the dataset')
# Seed NumPy's global RNG before drawing the 10-row preview
np.random.seed(4)
df.sample(n=10)

In [None]:
# Summary of the frame: column names, dtypes and non-null counts
df.info()


In [None]:
# Number of distinct values in each column
df.nunique()

We can drop the CLIENTNUM column as it is unique to each customer and not useful for the model. The two `Naive_Bayes_Classifier_*` columns are dropped as well.
We will also convert all object-dtype columns to category for further processing.
The Dependent_count variable has only 6 unique values, from 0 to 5. As dependents are people (a discrete count), we will convert this variable to category too.

In [None]:
# Identifier column plus the two pre-computed Naive Bayes output columns
cols_to_drop = [
    'CLIENTNUM',
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1',
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2',
]
# errors='ignore' keeps the cell re-runnable: on a second run the columns are
# already gone and an in-place drop would raise KeyError.
df = df.drop(columns=cols_to_drop, errors='ignore')

# Dependent_count plus every object-dtype column become categoricals
object_cols = df.select_dtypes(include=['object']).columns
df = df.astype({col: 'category' for col in ['Dependent_count', *object_cols]})

In [None]:
# Descriptive statistics, transposed so each row is one column of df
df.describe().T

In [None]:
# Frequency table for every categorical column, separated by a dashed rule
cat_cols = df.select_dtypes(include=['category'])
separator = '-' * 50
for _, col_values in cat_cols.items():
    print(col_values.value_counts())
    print(separator)

## EDA - Relationship of variables with the target 

In [None]:
# Stacked bar plot of a categorical variable against the target (Attrition_Flag)
def stacked_plot(x):
    """Print the crosstab of `x` vs. Attrition_Flag and plot its row-normalised split.

    x : Series whose values are cross-tabulated against df['Attrition_Flag'];
        it must share its index with the module-level `df`.

    Prints the count table (with margins) and a separator line, then shows a
    stacked bar chart of the per-category shares of each Attrition_Flag value.
    """
    sns.set(palette='Set1')
    counts = pd.crosstab(x, df['Attrition_Flag'], margins=True)
    print(counts)
    print('-'*120)
    shares = pd.crosstab(x, df['Attrition_Flag'], normalize='index')
    ax = shares.plot(kind='bar', stacked=True, figsize=(10,5))
    # Single legend, anchored outside the axes so it never covers the bars
    # (an earlier 'lower left' legend call was immediately overridden by this one).
    ax.legend(loc="upper left", bbox_to_anchor=(1,1))
    ax.set_ylabel('Percentage')
    plt.show()

In [None]:
# One stacked plot per categorical feature, in the same order as the former
# six copy-pasted cells.
for cat_feature in ['Gender', 'Dependent_count', 'Education_Level',
                    'Marital_Status', 'Income_Category', 'Card_Category']:
    stacked_plot(df[cat_feature])

Insights from the Analysis

Interestingly, among the categorical variables, the percentage of attrited customers is fairly similar across all categories of every variable.
Despite the large imbalance in the number of customers across categories, the attrition rate is quite similar.
No categorical variable stands out as a strong indicator of attrition.

## Correlation Matrix

In [None]:
# Correlate the numeric columns only: the categorical columns cannot be turned
# into floats, and recent pandas raises on them instead of skipping them.
corr = df.select_dtypes(include='number').corr()
plt.figure(figsize=(15,10))
sns.heatmap(corr, annot=True, vmin=-0.5, vmax=1, cmap='coolwarm', linewidths=0.75)
plt.show()

In [None]:
# Group the numerical variables for a comparison study with the target variable.
# Every group is a plain list of column names (at most four, one per bi_plot panel).
cols1 = ['Total_Relationship_Count',
         'Months_Inactive_12_mon',
         'Contacts_Count_12_mon']
cols2 = ['Credit_Limit',
         'Total_Revolving_Bal',
         'Avg_Open_To_Buy',
         'Avg_Utilization_Ratio']
cols3 = ['Total_Trans_Amt',
         'Total_Trans_Ct',
         'Total_Ct_Chng_Q4_Q1',
         'Total_Amt_Chng_Q4_Q1']
# Previously a DataFrame slice; now a list like the other three groups
cols4 = ['Customer_Age', 'Months_on_book']

In [None]:
def bi_plot(x):
    """Draw one box plot per column named in `x`, split by Attrition_Flag.

    x : iterable of column names of the module-level `df`. The panels are laid
        out on a 2x2 grid, so at most four names fit in one figure.
    """
    plt.figure(figsize=(9,7))
    for i, column in enumerate(x):
        plt.subplot(2,2,i+1)
        # x and y must be passed by keyword: recent seaborn releases reject
        # positional data vectors with a TypeError.
        sns.boxplot(x=df['Attrition_Flag'], y=df[column], palette="YlOrBr_r", showmeans=True)
        plt.title('Attrition_Flag Vs '+column, fontsize=12, fontweight='bold')
    # Lay the panels out once, after all of them exist
    plt.tight_layout()

In [None]:
# One figure per group of numeric columns, in the same order as the former
# four copy-pasted cells.
for column_group in (cols1, cols2, cols3, cols4):
    bi_plot(column_group)
    plt.show()

**Insights on above Analysis**

In [None]:
# Total_Trans_Ct vs Total_Trans_Amt, one line per Attrition_Flag value
plt.figure(figsize=(15,7))
# Keyword arguments are required: recent seaborn rejects positional x/y vectors
sns.lineplot(data=df, x='Total_Trans_Ct', y='Total_Trans_Amt', hue='Attrition_Flag')
plt.show()


In [None]:
# Quarter-over-quarter change in transaction count vs amount, coloured by target
plt.figure(figsize=(15, 7))
sns.scatterplot(
    data=df,
    x='Total_Ct_Chng_Q4_Q1',
    y='Total_Amt_Chng_Q4_Q1',
    hue='Attrition_Flag',
)

In [None]:
# jointplot creates its own figure, so no plt.figure() call is made here
# (a preceding one only leaves an empty figure behind).
# x/y/hue are given by keyword because recent seaborn rejects positional vectors.
sns.jointplot(data=df, x='Avg_Utilization_Ratio', y='Total_Revolving_Bal', hue='Attrition_Flag')
plt.show()

## Split Data into Train and Test set

In [None]:
# Features: every column except the target
X = df.drop(columns=['Attrition_Flag'])
# Target: 1 for 'Attrited Customer', 0 for everything else
Y = (df['Attrition_Flag'] == 'Attrited Customer').astype('int')
# 70/30 split, stratified on the target
X_train, X_test, y_train, y_test = train_test_split(
    X, Y,
    test_size=0.3,
    random_state=23,
    stratify=Y,
)
print(X_train.shape, X_test.shape)

In [None]:
# One-hot encode the categorical columns of each split
X_train=pd.get_dummies(X_train,drop_first=True)
X_test=pd.get_dummies(X_test,drop_first=True)
# The two splits are encoded independently, so force the test matrix onto the
# training columns (same set, same order); a dummy column absent from the test
# split is filled with 0.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)
print(X_train.shape, X_test.shape)

Model Evaluation Criterion:
The model can make two kinds of wrong predictions:

1. Predicting that a customer will cancel their credit card services when they don't: False Positive
2. Predicting that a customer won't cancel their credit card services when they do: False Negative

The bank's objective is to identify all potential customers who wish to close their credit card services.
Predicting that a customer won't cancel their card services when they do end up attriting will lead to a loss.
Hence the number of False Negatives must be reduced.
Metric for Optimization

Recall must be maximized to reduce the chance of False Negatives.

In [None]:
def make_confusion_matrix(model,y_actual,X=None):
    '''
    Plot an annotated confusion-matrix heatmap for a fitted binary classifier.

    model    : fitted classifier exposing .predict
    y_actual : ground-truth 0/1 labels for the rows being predicted
    X        : features to predict on; when omitted, the module-level X_test is used
    '''
    features = X_test if X is None else X
    y_predict = model.predict(features)
    # Pin the label order so the matrix is always 2x2; if a class were missing
    # the matrix would shrink and the reshape of the annotations would fail.
    cm = confusion_matrix(y_actual, y_predict, labels=[0, 1])
    # Names follow cm.flatten(): first row is actual 0, second row is actual 1
    group_names = ['True -ve','False +ve','False -ve','True +ve']
    flat_counts = cm.flatten()
    group_counts = [f"{value:0.0f}" for value in flat_counts]
    group_percentages = [f"{value:.2%}" for value in flat_counts/np.sum(cm)]
    labels = [f"{v1}\n{v2}\n{v3}" for v1, v2, v3 in
              zip(group_names, group_counts, group_percentages)]
    labels = np.asarray(labels).reshape(2,2)
    sns.set(font_scale=2.0) # font size for the matrix (this also changes the global seaborn settings)
    plt.figure(figsize = (10,7))
    sns.heatmap(cm, annot=labels,fmt='',cmap='Blues')
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:
def scores(model,train_x,train_y,flag=True,test_x=None,test_y=None):
    """Compute accuracy, recall and precision of `model` on train and test data.

    model   : fitted classifier exposing .predict
    train_x : training features
    train_y : training labels
    flag    : when truthy, print the six metrics and return None;
              when falsy, return them as a list without printing
    test_x  : test features; defaults to the module-level X_test
    test_y  : test labels; defaults to the module-level y_test

    Returned list order: [train_acc, test_acc, train_recall, test_recall,
    train_precision, test_precision].
    """
    test_x = X_test if test_x is None else test_x
    test_y = y_test if test_y is None else test_y

    y_pred_train = model.predict(train_x)
    y_pred_test = model.predict(test_x)

    train_acc = metrics.accuracy_score(train_y,y_pred_train)
    test_acc = metrics.accuracy_score(test_y,y_pred_test)

    train_recall = metrics.recall_score(train_y,y_pred_train)
    test_recall = metrics.recall_score(test_y,y_pred_test)

    train_precision = metrics.precision_score(train_y,y_pred_train)
    test_precision = metrics.precision_score(test_y,y_pred_test)

    score_list = [train_acc,test_acc,train_recall,test_recall,train_precision,test_precision]

    if flag:
        # Reuse the values computed above instead of scoring a second time
        print("Accuracy on training set : ",train_acc)
        print("Accuracy on test set : ",test_acc)

        print("\nRecall on training set : ",train_recall)
        print("Recall on test set : ",test_recall)

        print("\nPrecision on training set : ",train_precision)
        print("Precision on test set : ",test_precision)
        return None

    return score_list

### Model Building: Decision Tree, Random Forest, Bagging Classifier,Ada, Gradient and XG Boost

We will use the **make_pipeline** function to create pipelines for all the models.
This function does not require naming the estimators; it names each step automatically with the lowercase name of its type.

In [None]:
# Baseline boosting models: (short label, single-step pipeline) pairs
all_models = [
    ('ADA', make_pipeline(AdaBoostClassifier(random_state=23))),
    ('GRB', make_pipeline(GradientBoostingClassifier(random_state=23))),
    ('XGB', make_pipeline(XGBClassifier(random_state=23, eval_metric='logloss'))),
]

In [None]:
# The scorer and the CV splitter are identical for every model: build them once
scoring='recall'
kfold=StratifiedKFold(n_splits=5,shuffle=True,random_state=23)#Setting number of splits equal to 5

for name, pipeline in all_models:
    # Fit on the whole training split so the pipeline can be evaluated in later cells
    pipeline.fit(X_train,y_train)
    # 5-fold stratified cross-validated recall on the training split
    # (the data is used as-is; no resampling is applied)
    cv_result=cross_val_score(estimator=pipeline, X=X_train, y=y_train, scoring=scoring, cv=kfold)
    print(f'{name}: {cv_result.mean()}')

In [None]:
# Train/test metrics for each fitted baseline, separated by a dashed rule
divider = '-' * 35
for label, fitted_pipeline in all_models:
    print(label)
    scores(fitted_pipeline, X_train, y_train)
    print(divider)

**Building hyperparameter-tuned Boosting models using RandomizedSearchCV**

In [None]:
%%time
#Creating pipeline
pipe_ADA2 = make_pipeline(AdaBoostClassifier(random_state=23))

# Parameter Grid
parameters = {
    "adaboostclassifier__base_estimator":[DecisionTreeClassifier(max_depth=1,random_state=23),
                                         DecisionTreeClassifier(max_depth=2,random_state=23),
                                         DecisionTreeClassifier(max_depth=3,random_state=23)],
    "adaboostclassifier__n_estimators": np.arange(10,60,5),
    "adaboostclassifier__learning_rate": [0.05,0.15,0.45,0.75]} 
                                                   

#scoring metric
scoring = metrics.make_scorer(metrics.recall_score)

# GridSearch CV
RS_cv = RandomizedSearchCV(estimator=pipe_ADA2,scoring=scoring,param_distributions=parameters,random_state=23,n_iter=50,n_jobs=-1,cv=5)
RS_cv.fit(X_train,y_train)

print(RS_cv.best_params_)
print(RS_cv.best_score_)

In [None]:
#creating new pipeline with the best parameters
# NOTE(review): these values are hard-coded; confirm they still match the
# best_params_ printed by the AdaBoost search.
# The weak learner is passed positionally because its keyword was renamed from
# `base_estimator` to `estimator` across scikit-learn releases, while it has
# remained the first argument.
ada_tuned2 = make_pipeline(AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=3, random_state=23),
    learning_rate=0.75,
    n_estimators=35,
    random_state=23))
#fitting model on train data
ada_tuned2.fit(X_train,y_train)

In [None]:
# Train/test metrics, then the test-set confusion matrix, for the tuned AdaBoost pipeline
scores(model=ada_tuned2, train_x=X_train, train_y=y_train)
make_confusion_matrix(model=ada_tuned2, y_actual=y_test)

In [None]:
%%time
#Creating pipeline
pipe_GRB2 = make_pipeline(GradientBoostingClassifier(random_state=23))
# Parameter Grid
parameters = {
    "gradientboostingclassifier__n_estimators": np.arange(20,100,20),
    "gradientboostingclassifier__max_features":[0.6,0.7,0.8,0.9],
    'gradientboostingclassifier__learning_rate': [0.01,0.05,0.35,0.5],
    'gradientboostingclassifier__subsample':[0.6,0.7,0.8,0.9]
    }

#scoring metric
scoring = metrics.make_scorer(metrics.recall_score)

RS_cv = RandomizedSearchCV(estimator=pipe_GRB2,
                           scoring=scoring,
                           param_distributions=parameters,
                           random_state=23,n_iter=50,n_jobs=-1,cv=5)
RS_cv.fit(X_train,y_train)

print(RS_cv.best_params_)
print(RS_cv.best_score_)

In [None]:
# Gradient boosting pipeline rebuilt with hard-coded hyperparameters
grb_params = dict(
    learning_rate=0.35,
    max_features=0.6,
    n_estimators=80,
    subsample=0.7,
    random_state=23,
)
grb_tuned2 = make_pipeline(GradientBoostingClassifier(**grb_params))

# Fit on the training split
grb_tuned2.fit(X_train,y_train)

In [None]:
# Train/test metrics, then the test-set confusion matrix, for the tuned gradient boosting pipeline
scores(model=grb_tuned2, train_x=X_train, train_y=y_train)
make_confusion_matrix(model=grb_tuned2, y_actual=y_test)

In [None]:
%%time
#Creating pipeline
pipe_XBG2 = make_pipeline(
                   (XGBClassifier
                    (random_state=23,eval_metric='logloss')
                   ))

# Parameter Grid
parameters = {
    "xgbclassifier__n_estimators": np.arange(30,100,20),
     "xgbclassifier__subsample":[0.6,0.7,0.8],
    "xgbclassifier__learning_rate":[0.05,0.15,0.2,0.3],
    "xgbclassifier__gamma":[0,1,2,3],
    }
#scoring metric
scoring = metrics.make_scorer(metrics.recall_score)

#RandomizedSearch CV
RS_cv = RandomizedSearchCV(estimator=pipe_XBG2,
                           scoring=scoring,
                           param_distributions=parameters,
                           random_state=23,n_iter=50,n_jobs=-1,cv=5)
RS_cv.fit(X_train,y_train)

print(RS_cv.best_params_)
print(RS_cv.best_score_)

In [None]:
# XGBoost pipeline rebuilt with hard-coded hyperparameters
xgb_params = dict(
    random_state=23,
    eval_metric='logloss',
    learning_rate=0.2,
    n_estimators=90,
    subsample=0.7,
    gamma=3,
)
xgb_tuned2 = make_pipeline(XGBClassifier(**xgb_params))

# Fit on the training split
xgb_tuned2.fit(X_train,y_train)

In [None]:
# Train/test metrics, then the test-set confusion matrix, for the tuned XGBoost pipeline
scores(model=xgb_tuned2, train_x=X_train, train_y=y_train)
make_confusion_matrix(model=xgb_tuned2, y_actual=y_test)

### Comparing all models

In [None]:
# Tuned pipelines, in the order they appear in the comparison table
models = [ada_tuned2, grb_tuned2, xgb_tuned2]

# One six-element metric list per model, without printing (flag=False)
per_model_scores = [scores(tuned, X_train, y_train, False) for tuned in models]

# Transpose: one list per metric, holding that metric for every model.
# Order matches scores(): accuracy, recall, precision - train then test.
(acc_train, acc_test,
 recall_train, recall_test,
 precision_train, precision_test) = (list(metric) for metric in zip(*per_model_scores))

In [None]:
# Row labels, in the same order as the `models` list
model_names = [
    'ADA_Boost-RandomizedSearchCV',
    'Gradient_Boost-RandomizedSearchCV',
    'XG_Boost-RandomizedSearchCV',
]
# Column name -> per-model values (insertion order fixes the column order)
metric_columns = {
    'Train_Accuracy': acc_train,
    'Test_Accuracy': acc_test,
    'Train_Recall': recall_train,
    'Test_Recall': recall_test,
    'Train_Precision': precision_train,
    'Test_Precision': precision_test,
}
comparison_frame = pd.DataFrame({'Model': model_names, **metric_columns})

# Best test recall first; the previous row position is kept as an 'index' column
comparison_frame.sort_values('Test_Recall', ascending=False).reset_index()


### Thus AdaBoost has the best recall and is the best model for this dataset