# 1: IMPORT LIBRARIES AND DATASETS

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Read the credit-card default data set from the Kaggle input directory
# and display the resulting frame.
csv_path = '../input/default-of-credit-card-clients-dataset/UCI_Credit_Card.csv'
credit_df = pd.read_csv(csv_path)
credit_df

In [None]:
# Print column names, non-null counts and dtypes to check the schema.
credit_df.info()

**DATASET DESCRIPTION :** There are 25 variables

* ID: ID of each client
* LIMIT_BAL: Amount of given credit in NT dollars (includes individual and family/supplementary credit)
* SEX: Gender (1=male, 2=female)
* EDUCATION: (1=graduate school, 2=university, 3=high school, 4=others, 5=unknown, 6=unknown)
* MARRIAGE: Marital status (1=married, 2=single, 3=others)
* AGE: Age in years
* PAY_0: Repayment status in September, 2005 (-1=pay duly, 1=payment delay for one month, 2=payment delay for two months, … 8=payment delay for eight months, 9=payment delay for nine months and above)
* PAY_2: Repayment status in August, 2005 (scale same as above)
* PAY_3: Repayment status in July, 2005 (scale same as above)
* PAY_4: Repayment status in June, 2005 (scale same as above)
* PAY_5: Repayment status in May, 2005 (scale same as above)
* PAY_6: Repayment status in April, 2005 (scale same as above)
* BILL_AMT1: Amount of bill statement in September, 2005 (NT dollar)
* BILL_AMT2: Amount of bill statement in August, 2005 (NT dollar)
* BILL_AMT3: Amount of bill statement in July, 2005 (NT dollar)
* BILL_AMT4: Amount of bill statement in June, 2005 (NT dollar)
* BILL_AMT5: Amount of bill statement in May, 2005 (NT dollar)
* BILL_AMT6: Amount of bill statement in April, 2005 (NT dollar)
* PAY_AMT1: Amount of previous payment in September, 2005 (NT dollar)
* PAY_AMT2: Amount of previous payment in August, 2005 (NT dollar)
* PAY_AMT3: Amount of previous payment in July, 2005 (NT dollar)
* PAY_AMT4: Amount of previous payment in June, 2005 (NT dollar)
* PAY_AMT5: Amount of previous payment in May, 2005 (NT dollar)
* PAY_AMT6: Amount of previous payment in April, 2005 (NT dollar)
* default.payment.next.month: Default payment (1=yes, 0=no)

In [None]:
# Total number of missing values across the whole frame
# (per-column null counts, summed again over all columns).
credit_df.isnull().sum().sum()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric column.
credit_df.describe()

# 2: VISUALIZE DATASET

In [None]:
# One 20-bin histogram per numeric column; the trailing ';' hides the Axes array repr.
credit_df.hist(figsize=(20,20), bins=20);

In [None]:
# ID is only a per-client row identifier, so it is removed before analysis.
# Re-assigning the result (rather than inplace=True) together with
# errors='ignore' keeps this cell safe to re-run: a second execution is a
# no-op instead of raising KeyError because 'ID' is already gone.
credit_df = credit_df.drop(columns=['ID'], errors='ignore')

In [None]:
# Split the customers by the target label to see how many could potentially
# default on their credit card payment.
cc_default = credit_df[credit_df['default.payment.next.month']==1]
cc_notdefault = credit_df[credit_df['default.payment.next.month']==0]

print('Total Customers = ', len(credit_df))
print('Number of customers who defaulted on their credit card payments = ', len(cc_default))
# Both percentages use the same two-decimal format so the two class shares
# read consistently (the default share used to be printed as a raw float).
print('Percentage of customers who defaulted on their credit card payments = {:.2f}%'.format(len(cc_default) / len(credit_df)*100))

print('Number of customers who did not default on their credit card payments (paid their balance)= ', len(cc_notdefault))
print('Percentage of customers who did not default on their credit card payments (paid their balance)= {:.2f}%'.format(len(cc_notdefault) / len(credit_df)*100))

In [None]:
# Summary statistics for the customers who defaulted; compare the mean and std
# with the same statistics for the customers who did not default.
cc_default.describe()

In [None]:
# Summary statistics for the customers who did not default.
cc_notdefault.describe()

In [None]:
# Pairwise correlation of all columns, drawn as an annotated heatmap.
corr_matrix = credit_df.corr()
plt.figure(figsize=(20, 20))
sns.heatmap(corr_matrix, annot=True);

**OBSERVATIONS**
* BILL_AMT1, BILL_AMT2, BILL_AMT3, BILL_AMT4, BILL_AMT5, & BILL_AMT6 are strongly correlated
* PAY_0, PAY_2, PAY_3, PAY_4, PAY_5 & PAY_6 are highly correlated
* There is a weak correlation between the PAY_* and BILL_AMT* columns
* There is a slight negative correlation between the PAY_* and PAY_AMT* columns
* LIMIT_BAL is negatively correlated with PAY_0, PAY_2, PAY_3, PAY_4, PAY_5 & PAY_6

In [None]:
# Number of customers at each age, split by whether they defaulted.
plt.figure(figsize=(25, 12))
sns.countplot(
    x='AGE',
    hue='default.payment.next.month',
    data=credit_df,
);

In [None]:
# Three stacked panels: customer counts per category, split by default status.
plt.figure(figsize=(20, 20))
for position, column in enumerate(['EDUCATION', 'MARRIAGE', 'SEX'], start=1):
    plt.subplot(3, 1, position)
    sns.countplot(data=credit_df, x=column, hue='default.payment.next.month')

**OBSERVATIONS**
* Most defaulters are in EDUCATION classes 1 (graduate school) and 2 (university)
* Most defaulters in MARRIAGE are in classes 1 (married) and 2 (single)
* There are more female defaulters than male defaulters, but there are also more female customers paying their bills on time
* Most defaulters are in the 23 to 30 age group, and there is another increase in the 34 to 38 age band

In [None]:
# Compare three monetary features between customers who did not default (red)
# and customers who defaulted (blue).
# KDE (Kernel Density Estimate) visualizes the probability density of a
# continuous variable, i.e. the density at different values of that variable.
# NOTE(review): distplot and the `shade=` argument are deprecated in recent
# seaborn releases; migrate to histplot(kde=True) / fill=True once the
# seaborn version in the target environment is confirmed.

plt.figure(figsize=(20,21));

plt.subplot(311);
sns.distplot(cc_notdefault['LIMIT_BAL'], bins = 250, color = 'r', label = 'Customers who did not default (paid balance)');
sns.distplot(cc_default['LIMIT_BAL'], bins = 250, color = 'b', label = 'Customers who defaulted (did not pay balance)');

# This panel plots the credit limit, so it is labelled as LIMIT_BAL rather
# than as the September bill statement.
plt.xlabel('LIMIT_BAL: Amount of given credit (NT dollar)');
plt.legend();  # render the labels so the two colours can be told apart

plt.subplot(312);
sns.kdeplot(cc_notdefault['BILL_AMT1'], label = 'Customers who did not default (paid balance)', shade = True, color = 'r');
sns.kdeplot(cc_default['BILL_AMT1'], label = 'Customers who defaulted (did not pay balance)', shade = True, color = 'b');

plt.xlabel('Amount of bill statement in September, 2005 (NT dollar)');
plt.legend();

plt.subplot(313);
sns.kdeplot(cc_notdefault['PAY_AMT1'], label = 'Customers who did not default (paid balance)', shade = True, color = 'r');
sns.kdeplot(cc_default['PAY_AMT1'], label = 'Customers who defaulted (did not pay balance)', shade = True, color = 'b');

plt.xlabel('PAY_AMT1: Amount of previous payment in September, 2005 (NT dollar)');
plt.legend();

In [None]:
# Credit limit by category, one row per categorical feature: the left panel
# shows the box plot with outliers, the right panel hides them.
# The figure is created once, outside the loop, so that all six panels share
# a single 3x2 grid; creating a new figure per iteration would leave each
# figure with only one populated row.
fig, axs = plt.subplots(3, 2, figsize=(25, 25))
for row, i in enumerate(['MARRIAGE', 'SEX', 'EDUCATION']):
    sns.boxplot(data=credit_df, y='LIMIT_BAL', x=i, ax=axs[row, 0])
    axs[row, 0].set_title(i + ' vs LIMIT_BAL (with Outlaiers)')

    sns.boxplot(data=credit_df, y='LIMIT_BAL', x=i, showfliers=False, ax=axs[row, 1])
    axs[row, 1].set_title(i + ' vs LIMIT_BAL (with Outlaiers disabled)')

plt.show()

**OBSERVATIONS**
* MARRIAGE: married customers have higher credit limits
* Gender doesn't seem to have much impact on the credit limit
* Customers in EDUCATION classes 1 (graduate school) and 4 (others) have higher credit limits

# 3. DATA CLEANING, STANDARDIZING AND TRAIN & TEST SPLIT

In [None]:
# Target vector: the default flag for next month (1 = default, 0 = no default).
y = credit_df['default.payment.next.month']
y

In [None]:
# Build the feature matrix: one-hot encode the categorical columns
# ('SEX', 'MARRIAGE' & 'EDUCATION'), then remove the target column.
categorical_cols = ['SEX', 'MARRIAGE', 'EDUCATION']
X = (
    pd.get_dummies(data=credit_df, columns=categorical_cols)
    .drop(columns=['default.payment.next.month'])
)
X

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing.
# random_state pins the shuffle so the split, and every metric computed from
# it, is reproducible after a kernel restart; stratify=y keeps the share of
# defaulters the same in both subsets, which protects the evaluation if the
# classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)
# len() of a DataFrame is its number of rows, so these are sample counts.
print("Samples in Train Set : {} & Test Set : {}".format(len(X_train), len(X_test)))

# 4: TRAIN AND EVALUATE AN XGBOOST CLASSIFIER

In [None]:
import xgboost as xgb

# Baseline classifier with hand-picked hyperparameters.
# The target is a 0/1 label, so the binary logistic objective is used (the
# same one the grid search uses); 'reg:squarederror' is a regression loss and
# is not appropriate for a classifier.
model = xgb.XGBClassifier(objective='binary:logistic', learning_rate = 0.1, max_depth = 5, n_estimators = 100)
model.fit(X_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Predict on the held-out test set and report the accuracy.
# Arguments follow the scikit-learn (y_true, y_pred) order; accuracy is
# symmetric so the value is the same, but the order now matches every other
# metric call in this notebook.
y_pred = model.predict(X_test)
print("Accuracy {:.2f} %".format( 100 * accuracy_score(y_test, y_pred)))

In [None]:
# Testing Set Performance
# confusion_matrix expects (y_true, y_pred): rows are the true labels and
# columns the predicted ones. Passing them the other way round transposes the
# matrix and swaps false positives with false negatives.
cm = confusion_matrix(y_test, y_pred)
# fmt='d' prints the integer counts in full instead of scientific notation.
ax = sns.heatmap(cm, annot=True, fmt='d')
ax.set(xlabel='Predicted label', ylabel='True label');
print("Classification Report :- \n",classification_report(y_test, y_pred))

# 5: OPTIMIZE XGBOOST HYPERPARAMETERS BY PERFORMING GRID SEARCH

In [None]:
from sklearn.model_selection import GridSearchCV

# Search space: every combination of the values below is cross-validated.
param_grid = dict(
    gamma=[1, 5],                 # regularization parameter
    subsample=[0.8, 1.0],         # fraction of rows sampled to build each tree
    colsample_bytree=[0.8, 1.0],  # fraction of columns sampled for each tree
    max_depth=[7, 5],             # depth of each tree
    n_estimators=[100, 300],      # number of trees
)

# Base estimator with a small learning rate; refit=True retrains the best
# combination on the full training set once the search has finished.
xgb_model = xgb.XGBClassifier(objective='binary:logistic', learning_rate=0.01)
grid = GridSearchCV(estimator=xgb_model, param_grid=param_grid, refit=True, verbose=4)
grid.fit(X_train, y_train)

In [None]:
# Report the best cross-validated score and the parameters that produced it.
print("Best Score: {:.2f} %".format(100 * grid.best_score_))
print("Best Parameters:", grid.best_params_)
# From here on `model` is the tuned estimator, replacing the baseline model.
model = grid.best_estimator_

In [None]:
# Evaluate the tuned model on the held-out test set.
pred = model.predict(X_test)

print("Accuracy {:.2f} %".format( 100 * accuracy_score(y_test, pred)))
# Testing Set Performance
# Rows of the confusion matrix are the true labels, columns the predictions.
cm = confusion_matrix(y_test, pred)
# fmt='d' prints the integer counts in full instead of scientific notation.
ax = sns.heatmap(cm, annot=True, fmt='d')
ax.set(xlabel='Predicted label', ylabel='True label');
print("Classification Report :- \n",classification_report(y_test, pred))

The scores improved slightly after tuning, but there is not much difference.

In [None]:
# Bar chart of the feature importances reported by the current model.
fig, ax = plt.subplots(figsize=(15, 15))
xgb.plot_importance(
    model,
    ax=ax,
    height=0.8,
    color="green",
    title="Features importance (XGBoost)",
)
plt.show()

* LIMIT_BAL & BILL_AMT are the most important features
* Your feedback in the comments is much appreciated; please comment if you have any doubts or suggestions for improvement
* Please **UPVOTE** if you like this notebook; it will keep me motivated