In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Import libraries

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

### Preprocessing

In [None]:
# Read the credit-card CSV into a DataFrame and preview its first rows.
DATA_PATH = '../input/default-of-credit-card-clients-dataset/UCI_Credit_Card.csv'
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Call info() (the bare attribute only displays the bound-method repr) to print
# the column names, non-null counts, dtypes and memory usage of the frame.
df.info()

In [None]:
# Summary statistics for the DataFrame's columns.
df.describe()

In [None]:
# Data type of each column.
df.dtypes

In [None]:
# (rows, columns) of the frame as loaded, before any column is dropped.
df.shape

In [None]:
# Fraction of missing values per column (mean of the boolean NaN mask).
df.isna().mean()

In [None]:
# Draw 30-bin histograms of the DataFrame's columns on a 20x20 figure;
# the trailing ';' keeps the returned axes array out of the cell output.
df.hist(bins=30, figsize=(20,20), color='b');

### Drop features

In [None]:
# Remove the 'ID' column from the working frame.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it after 'ID' is already gone no longer raises KeyError.
df = df.drop(columns=['ID'], errors='ignore')

In [None]:
# (rows, columns) again, to confirm the effect of dropping 'ID' above.
df.shape

In [None]:
# Partition the rows by the target column: value 1 goes to default_df,
# value 0 goes to nodefault_df.
default_df = df.loc[df['default.payment.next.month'].eq(1)]
nodefault_df = df.loc[df['default.payment.next.month'].eq(0)]


In [None]:
# Number of rows whose target equals 1.
print('Total default:', len(default_df))

In [None]:
# Number of rows whose target equals 0.
print('Total nodefault:', len(nodefault_df))

In [None]:
# Share of target==1 rows among all rows, expressed as a percentage.
default_share = len(default_df) / len(df)
print(default_share * 100, '%')

### Visualize

In [None]:
# Pairwise correlation matrix of the DataFrame's columns; plotted as a heatmap below.
correlations = df.corr()

In [None]:
# Annotated heatmap of the correlation matrix on a 20x20 canvas.
# The target axes are passed explicitly rather than relying on pyplot's current axes.
f, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(data=correlations, annot=True, ax=ax)

In [None]:
# Row counts per AGE value, split by the target column.
plt.figure(figsize=(25, 12))
sns.countplot(data=df, x='AGE', hue='default.payment.next.month');

In [None]:
# Three stacked count plots (EDUCATION, SEX, MARRIAGE), each split by the target column.
plt.figure(figsize=(20, 20))
for position, column in enumerate(['EDUCATION', 'SEX', 'MARRIAGE'], start=311):
    # 311, 312, 313 -> rows 1..3 of a 3x1 subplot grid.
    plt.subplot(position)
    sns.countplot(x=column, hue='default.payment.next.month', data=df)

### Prepare data

In [None]:
# Columns that get one-hot encoded in the following cell.
categorical_cols = ['SEX', 'EDUCATION', 'MARRIAGE']
X_cat = df.loc[:, categorical_cols]
X_cat.head()

In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical columns into a dense array.
# The encoder reads straight from df rather than from X_cat: this cell rebinds
# X_cat to its own output, so encoding X_cat would make a second run of the cell
# re-encode the already one-hot matrix and silently produce different features.
onehotencoder = OneHotEncoder()
X_cat = onehotencoder.fit_transform(df[['SEX', 'EDUCATION', 'MARRIAGE']]).toarray()

In [None]:
# Wrap the encoded matrix in a DataFrame with string column names and df's index.
# A bare pd.DataFrame(X_cat) labels the columns 0..k-1 (ints); once concatenated
# with the string-named numeric columns, recent scikit-learn versions raise
# TypeError on mixed int/str feature names when the scaler is fitted.
# Reusing df.index keeps the later column-wise concat row-aligned.
X_cat = pd.DataFrame(
    X_cat,
    columns=onehotencoder.get_feature_names_out(),
    index=df.index,
)

In [None]:
# Preview the one-hot encoded categorical features.
X_cat.head()

In [None]:
# Columns used as-is (not one-hot encoded). Note the list contains PAY_0 and
# PAY_2..PAY_6; there is no PAY_1 entry.
numeric_cols = [
    'LIMIT_BAL', 'AGE',
    'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6',
    'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6',
    'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6',
]
X_num = df.loc[:, numeric_cols]
X_num.head()

In [None]:
# Place the encoded categorical block and the numeric block side by side.
X_con = pd.concat((X_cat, X_num), axis='columns')
X_con.head()

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Scale the combined feature matrix with MinMaxScaler.
# NOTE(review): the scaler is fitted on every row, before the train/test split
# performed further down, so held-out rows influence the fitted scaling.
scaler = MinMaxScaler()
scaler.fit(X_con)
X = scaler.transform(X_con)

### Target 

In [None]:
# Target vector: the 'default.payment.next.month' column.
y = df.loc[:, 'default.payment.next.month']
y.head()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation.
# random_state fixes the shuffle so every Restart & Run All reproduces the same
# split (and therefore the same metrics); stratify=y keeps the proportion of
# each target class equal in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=y
)

In [None]:
# Row counts of the training and test partitions.
len(X_train), len(X_test)

### Model building

In [None]:
!pip install xgboost

In [None]:
import xgboost as xgb

In [None]:
# Gradient-boosted tree classifier for the binary target.
# The objective is 'binary:logistic' so the model optimises log-loss and emits
# probabilities; 'reg:squarederror' is a regression objective and makes the
# classifier threshold unbounded regression outputs instead.
model = xgb.XGBClassifier(
    objective='binary:logistic',
    learning_rate=0.01,
    max_depth=10,
    n_estimators=100,
)

In [None]:
# Train the classifier on the training partition.
model.fit(X_train,y_train)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Predictions for the held-out rows; the bare name displays the array.
y_pred = model.predict(X_test)
y_pred

In [None]:
from sklearn.metrics import classification_report, roc_auc_score

# Text report comparing the test labels with the model's predictions.
report = classification_report(y_test, y_pred)
print(report)

In [None]:
# Accuracy score of y_pred against y_test; the bare name displays the value.
acc = accuracy_score(y_test, y_pred)
acc

In [None]:
# ROC AUC measures how well a continuous score ranks positives above negatives.
# Feeding it hard 0/1 predictions collapses the curve to a single operating
# point, so score the positive class with predict_proba instead.
y_score = model.predict_proba(X_test)[:, 1]
roc = roc_auc_score(y_test, y_score)
roc