In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# List the top-level entries of the Kaggle input directory via a shell command.
!ls /kaggle/input

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Read the credit-card default CSV from the Kaggle input directory into cc_df.
cc_df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/default-of-credit-card-clients-dataset/UCI_Credit_Card.csv'
)

In [None]:
# Show the summary statistics of cc_df produced by DataFrame.describe().
cc_df.describe()

In [None]:
# Preview the first rows of cc_df.
cc_df.head()

In [None]:
# Print column dtypes and non-null counts. In the author's run every column
# reported 30000 non-null entries, i.e. there is no missing data.
cc_df.info()

In [None]:
# Bar chart of every column's correlation with the default flag, sorted ascending.
target_corr = cc_df.corr()['default.payment.next.month']
fig, ax = plt.subplots(figsize=(12, 8))
target_corr.sort_values().plot(kind='bar', ax=ax)

In [None]:
# Draw a 30-bin histogram per column of cc_df on a single 20x20-inch figure.
cc_df.hist(bins = 30, figsize = (20,20), color = 'b')


In [None]:
# Count how many samples fall into the default and no-default classes.

cc_df['default.payment.next.month'].value_counts()

**The dataset is imbalanced: 23364 samples have label 0 (no default) and 6636 have label 1 (default).**

In [None]:
# Pairwise correlation matrix of all columns, rendered as an annotated heatmap.

fig, ax = plt.subplots(figsize=(20, 20))
correlations = cc_df.corr()
sns.heatmap(data=correlations, annot=True, ax=ax)

In [None]:
# Customer counts per age, split by whether the customer defaulted.
fig, ax = plt.subplots(figsize=(20, 10))
sns.countplot(data=cc_df, x='AGE', hue='default.payment.next.month', ax=ax)

In [None]:
# Compare default / no-default counts across each categorical variable,
# one stacked panel per variable.

fig, (ax_education, ax_sex, ax_marriage) = plt.subplots(3, 1, figsize=(20, 20))
sns.countplot(data=cc_df, x='EDUCATION', hue='default.payment.next.month', ax=ax_education)
sns.countplot(data=cc_df, x='SEX', hue='default.payment.next.month', ax=ax_sex)
sns.countplot(data=cc_df, x='MARRIAGE', hue='default.payment.next.month', ax=ax_marriage)


In [None]:

# Split the customers by outcome, then overlay their credit-limit histograms
# (blue: did not default, red: defaulted).
default_flag = cc_df['default.payment.next.month']
cc_default_df = cc_df.loc[default_flag == 1]
cc_nodefault_df = cc_df.loc[default_flag == 0]

fig, ax = plt.subplots(figsize=(12, 7))
sns.histplot(cc_nodefault_df['LIMIT_BAL'], bins=250, color='b', ax=ax)
sns.histplot(cc_default_df['LIMIT_BAL'], bins=250, color='r', ax=ax)


In [None]:
# Overlay the BILL_AMT1 density of non-defaulting (blue) and defaulting (red) customers.
plt.figure(figsize=(12, 7))

# `fill=True` replaces the deprecated `shade=True` keyword of sns.kdeplot.
sns.kdeplot(cc_nodefault_df['BILL_AMT1'], label='Customers who did not default', fill=True, color='b')
sns.kdeplot(cc_default_df['BILL_AMT1'], label='Customers who defaulted', fill=True, color='r')
# The labels passed above are only rendered once a legend is requested.
plt.legend()


In [None]:
# Distribution of the credit limit within each category of the three
# categorical variables (outliers hidden), one stacked panel per variable.

fig, (ax_sex, ax_marriage, ax_education) = plt.subplots(3, 1, figsize=(10, 20))
sns.boxplot(data=cc_df, x='SEX', y='LIMIT_BAL', showfliers=False, ax=ax_sex)
sns.boxplot(data=cc_df, x='MARRIAGE', y='LIMIT_BAL', showfliers=False, ax=ax_marriage)
sns.boxplot(data=cc_df, x='EDUCATION', y='LIMIT_BAL', showfliers=False, ax=ax_education)

In [None]:
# Pull out the three categorical columns that are to be one-hot encoded.
categorical_columns = ['SEX', 'EDUCATION', 'MARRIAGE']
cats = cc_df[categorical_columns]
cats

In [None]:
# One-hot encode the categorical columns into a dense array.

from sklearn.preprocessing import OneHotEncoder

onehotencoder = OneHotEncoder()
# Encode straight from cc_df rather than from `cats`: this cell rebinds `cats`
# to its own output, so reading `cats` as the input would re-encode the
# already-encoded array whenever the cell is run a second time.
cats = onehotencoder.fit_transform(cc_df[['SEX', 'EDUCATION', 'MARRIAGE']]).toarray()

In [None]:
# Inspect the dense one-hot array produced by the encoder.
cats

In [None]:
# Wrap the one-hot array in a DataFrame with descriptive string column names of
# the form "<column>_<category>", built from the fitted encoder's categories.
# The default integer labels would be anonymous and would leave the feature
# matrix with mixed int/str column names once the remaining features are added.
onehot_columns = [
    f'{column}_{category}'
    for column, categories in zip(['SEX', 'EDUCATION', 'MARRIAGE'], onehotencoder.categories_)
    for category in categories
]
# Reuse cc_df's index so the rows line up with the remaining features.
cats = pd.DataFrame(cats, columns=onehot_columns, index=cc_df.index)
cats

In [None]:
# Feature matrix: every column except the row ID, the target and the raw
# categorical columns.
non_feature_columns = ['ID', 'default.payment.next.month', 'SEX', 'EDUCATION', 'MARRIAGE']
X = cc_df.drop(columns=non_feature_columns)
X

In [None]:
# Prepend the one-hot columns to the remaining features. One-hot columns that
# are already present in X (left over from an earlier run of this cell) are
# dropped first, so re-running the cell does not duplicate them.
X = pd.concat([cats, X.drop(columns=cats.columns, errors='ignore')], axis=1)
X

In [None]:
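# NOTE(review): feature scaling is left disabled here, presumably because the
# tree-based XGBoost model used below does not need it. Re-enable or delete this cell.
#from sklearn.preprocessing import MinMaxScaler
#scaler = MinMaxScaler()
#X = scaler.fit_transform(X)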

In [None]:
# Target vector: the default flag column of cc_df.
target_column = 'default.payment.next.month'
y = cc_df[target_column]
y

**Model with XGBoost and use grid search to optimize the hyperparameters**

In [None]:
# Train an XGBoost classifier, tuning its hyperparameters with a grid search.
from sklearn.model_selection import GridSearchCV, train_test_split
import xgboost as xgb

# Fixed seed so the hold-out split, and therefore every reported score, is
# reproducible across kernel restarts.
RANDOM_STATE = 42

# Stratify on y: the classes are imbalanced, so an unstratified split can shift
# the default rate between the training and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=RANDOM_STATE
)

param_grid = {
    'gamma': [0.5, 1, 5],                 # minimum loss reduction required to make a further split
    'subsample': [0.3, 0.7, 1.0],         # fraction of rows sampled to build each tree
    'colsample_bytree': [0.6, 0.8, 1.0],  # fraction of columns sampled for each tree
    'max_depth': [5, 6],                  # maximum depth of each tree
}

xgb_model = xgb.XGBClassifier(learning_rate=0.01, n_estimators=100, eval_metric='logloss', use_label_encoder=False)
# refit=True retrains the best parameter combination on the whole training set,
# so `grid` can be used directly for prediction afterwards.
grid = GridSearchCV(xgb_model, param_grid, refit=True, verbose=4)
grid.fit(X_train, y_train)

In [None]:
# Evaluate the tuned model on the held-out test set.
from sklearn.metrics import (
    accuracy_score,
    average_precision_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_recall_curve,
    roc_auc_score,
    roc_curve,
)

y_prob = grid.predict_proba(X_test)
y_pred = grid.predict(X_test)

# Call the directly imported accuracy_score: no `metrics` module is imported in
# this notebook, so `metrics.accuracy_score` would raise a NameError.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

# ROC-AUC is computed from the predicted probability of the positive class (column 1).
roc_xgb = roc_auc_score(y_test, y_prob[:, 1])
print('ROC-AUC', roc_xgb)
print('='*20)
print('Confusion Matrix')
cm_xgb = confusion_matrix(y_test, y_pred)
print(cm_xgb)
# fmt='d' annotates the cells with full integer counts instead of the default
# two-significant-digit format.
sns.heatmap(cm_xgb, annot=True, fmt='d', cmap='viridis')
cl_xgb = classification_report(y_test, y_pred)
print(cl_xgb)