### Опис даних

Даний набір даних містить інформацію про платежі, демографічні фактори, кредитні дані, історію платежів та виписки з рахунків клієнтів кредитних карт у Тайвані з квітня по вересень 2005р.

# Acknowledgements
This kernel builds on the following excellent kernels:

[P1 : sklearn SVM Model](https://www.kaggle.com/funxexcel/p1-sklearn-svm-model)

[Credit Card Fraud Detection](https://www.kaggle.com/xuanzhihuang/credit-card-fraud-detection)

[Credits Card Clients | XGBoosting (score 82.6%)](https://www.kaggle.com/anantgupt/credits-card-clients-xgboosting-score-82-6)

[Predictive Credit Default Customer by ANN](https://www.kaggle.com/reynoldms/predictive-credit-default-customer-by-ann)

[Predicting Credit Card Default (auc: 0.793)](https://www.kaggle.com/yuankunsong/predicting-credit-card-default-auc-0-793)

[P1 : sklearn SVM Model](https://www.kaggle.com/funxexcel/p1-sklearn-svm-model)

<style type="text/css">
.tg  {border-collapse:collapse;border-spacing:0;}
.tg td{border-color:black;border-style:solid;border-width:1px;font-family:Arial, sans-serif;font-size:14px;
  overflow:hidden;padding:10px 5px;word-break:normal;}
.tg th{border-color:black;border-style:solid;border-width:1px;font-family:Arial, sans-serif;font-size:14px;
  font-weight:normal;overflow:hidden;padding:10px 5px;word-break:normal;}
.tg .tg-c3ow{border-color:inherit;text-align:center;vertical-align:top}
.tg .tg-0pky{border-color:inherit;text-align:left;vertical-align:top}
</style>
<table class="tg">
<thead>
  <tr>
    <th class="tg-c3ow">Назва ознаки (Features)</th>
    <th class="tg-c3ow">Опис ознаки (Features Description)</th>
  </tr>
</thead>
<tbody>
  <tr>
    <td class="tg-c3ow">ID</td>
    <td class="tg-0pky">Ідентифікатор клієнта (унікальний)</td>
  </tr>
  <tr>
    <td class="tg-c3ow">LIMIT_BAL<br></td>
    <td class="tg-0pky">Сума наданого кредиту в нових тайванських доларах (NT$) (включає індивідуальний та сімейний / додатковий кредит)</td>
  </tr>
  <tr>
    <td class="tg-c3ow">SEX</td>
    <td class="tg-0pky">Стать</td>
  </tr>
  <tr>
    <td class="tg-c3ow">EDUCATION</td>
    <td class="tg-0pky">Освіта (1=аспірантура, 2=університет, 3=середня школа, 4=інше, 0,5,6=невідомо)</td>
  </tr>
  <tr>
    <td class="tg-c3ow">MARRIAGE</td>
    <td class="tg-0pky">Сімейний стан (1=одружений, 2=неодружений, 0, 3=інше)</td>
  </tr>
  <tr>
    <td class="tg-c3ow">AGE</td>
    <td class="tg-0pky">Вік</td>
  </tr>
  <tr>
    <td class="tg-c3ow"><span style="font-weight:500;font-style:normal">PAY_0 - PAY_6</span><br></td>
    <td class="tg-0pky">Статус погашення з квітня по вересень 2005 року<br>(-2,-1,0=оплата належним чином, 1=затримка платежу на один місяць, 2=затримка платежу на два місяці, ... 9=затримка платежу на дев'ять місяців)</td>
  </tr>
  <tr>
    <td class="tg-c3ow">BILL_AMT1 - <span style="font-weight:500;font-style:normal">BILL_AMT6</span><br></td>
    <td class="tg-0pky">Сума за випискою з рахунку з <span style="font-weight:400;font-style:normal">квітня по вересень 2005 року</span><br></td>
  </tr>
  <tr>
    <td class="tg-c3ow">PAY_AMT1 - PAY_AMT6</td>
    <td class="tg-0pky">Сума попереднього платежу з <span style="font-weight:400;font-style:normal">квітня по вересень 2005 року</span><br></td>
  </tr>
  <tr>
    <td class="tg-c3ow">default.payment.next.month</td>
    <td class="tg-0pky">Дефолт (невиконання платежу) у наступному місяці (1=так, 0=ні)</td>
  </tr>
</tbody>
</table>

### Import and configs

In [None]:
import time
import lightgbm
import numpy as np
import pandas as pd

import matplotlib

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.model_selection import GridSearchCV

from sklearn.metrics import f1_score
from sklearn.manifold import TSNE

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier


from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

from sklearn.metrics import classification_report

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

from sklearn.metrics import roc_auc_score
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler

In [None]:
# matplotlib
plt.style.use('seaborn-whitegrid')
%config InlineBackend.figure_format = 'retina'
matplotlib.rcParams.update({'font.size': 14})

# pandas
pd.set_option('float_format', '{:f}'.format)

#### Load data

In [None]:
# Read the raw CSV from the Kaggle input directory into a DataFrame
# and preview the first 10 rows.
data = pd.read_csv('/kaggle/input/default-of-credit-card-clients-dataset/UCI_Credit_Card.csv')
data.head(10)

In [None]:
# Summary statistics, transposed so that each feature is a row.
data.describe().T

In [None]:
# Report the number of missing values in every column.
print('\033[1m Check none values \033[0m')
data.isnull().sum()

In [None]:
# Give the label column a short name that is easier to reference.
data = data.rename(columns={'default.payment.next.month': 'Target'})

#### Visualization of categorical features

In [None]:
# One count plot per categorical feature.
# The column is passed by keyword: seaborn >= 0.12 interprets the first
# positional argument as `data`, not `x`, which would draw the wrong plot.
fig, axis = plt.subplots(1, 2, figsize=(18, 8))
for ax, column in zip(axis, ['MARRIAGE', 'EDUCATION']):
    sns.countplot(x=column, data=data, ax=ax)

In [None]:
# Count plots of the six repayment-status columns on a 2x3 grid.
# `x` must be a keyword argument: with seaborn >= 0.12 a positional column
# name collides with `data=` and raises TypeError.
fig, axis = plt.subplots(2, 3, figsize=(18, 9))
pay_columns = ['PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']
# axis.flat walks the grid row by row, matching the original panel order
for ax, column in zip(axis.flat, pay_columns):
    sns.countplot(x=column, data=data, ax=ax)

#### Visualization of Age and Sex distributions

In [None]:
# Age histogram (raw counts) next to the sex count plot.
fig, axis = plt.subplots(1, 2, figsize=(18, 8))
# histplot replaces the deprecated distplot(kde=False, norm_hist=False);
# bins=50 mirrors the upper bound distplot applied to its automatic bin count.
sns.histplot(x='AGE', data=data, bins=50, ax=axis[0])
# keyword `x`: seaborn >= 0.12 reads the first positional argument as `data`
sns.countplot(x='SEX', data=data, ax=axis[1])

#### Fixing issues of data

In [None]:
# Merge category codes into the codes used for the rest of the analysis.

# MARRIAGE: code 0 is folded into class 3
data['MARRIAGE'] = data['MARRIAGE'].replace(0, 3)

# EDUCATION: codes 0 and 6 are folded into class 5
data['EDUCATION'] = data['EDUCATION'].replace([6, 0], 5)

# PAY_*: codes -1 and -2 are folded into class 0
pay_status_columns = ['PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']
for column in pay_status_columns:
    data[column] = data[column].replace([-1, -2], 0)

#### Visualization of noncategorical features
#### Distribution visualization BILL_AMT 

In [None]:
# Top row: histograms (raw counts, 20 bins); bottom row: box plots.
fig, axis = plt.subplots(2, 3, figsize=(24, 15))

for position, column in enumerate(['BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3']):
    # histplot replaces the deprecated distplot(kde=False, norm_hist=False)
    sns.histplot(x=column, data=data, bins=20, ax=axis[0, position])
    # keyword `x` keeps the box horizontal on every seaborn version; a
    # positional Series is read as `data` by seaborn >= 0.12
    sns.boxplot(x=column, data=data, ax=axis[1, position])

In [None]:
# Top row: histograms (raw counts, 20 bins); bottom row: box plots.
fig, axis = plt.subplots(2, 3, figsize=(24, 15))

for position, column in enumerate(['BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6']):
    # histplot replaces the deprecated distplot(kde=False, norm_hist=False)
    sns.histplot(x=column, data=data, bins=20, ax=axis[0, position])
    # keyword `x` keeps the box horizontal on every seaborn version; a
    # positional Series is read as `data` by seaborn >= 0.12
    sns.boxplot(x=column, data=data, ax=axis[1, position])

#### Distribution visualization PAY_AMT 

In [None]:
# Top row: histograms (raw counts); bottom row: box plots.
fig, axis = plt.subplots(2, 3, figsize=(24, 15))
print("\033[1m Distribution of PAY_AMT 1-3 \033[0m")

for position, column in enumerate(['PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3']):
    # histplot replaces the deprecated distplot(kde=False, norm_hist=False).
    # bins=50 mirrors the cap distplot put on its automatic bin count and
    # avoids a huge number of bins on this long-tailed data.
    sns.histplot(x=column, data=data, bins=50, ax=axis[0, position])
    # keyword `x`: a positional Series is read as `data` by seaborn >= 0.12
    sns.boxplot(x=column, data=data, ax=axis[1, position])

In [None]:
# Top row: histograms (raw counts); bottom row: box plots.
fig, axis = plt.subplots(2, 3, figsize=(24, 15))
print("\033[1m Distribution of PAY_AMT 4-6 \033[0m")

for position, column in enumerate(['PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6']):
    # histplot replaces the deprecated distplot(kde=False, norm_hist=False).
    # bins=50 mirrors the cap distplot put on its automatic bin count and
    # avoids a huge number of bins on this long-tailed data.
    sns.histplot(x=column, data=data, bins=50, ax=axis[0, position])
    # keyword `x`: a positional Series is read as `data` by seaborn >= 0.12
    sns.boxplot(x=column, data=data, ax=axis[1, position])

#### Use the 3rd quartile (75th percentile) as a threshold for filtering PAY_AMT

In [None]:
# For every PAY_AMT column keep only the values at or below that column's
# 75th percentile. Each column is filtered independently, so the resulting
# Series may have different lengths.
filtered_data = [
    data.loc[data[column] <= data[column].quantile(0.75), column]
    for column in ['PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6']
]

# Top row: histograms (raw counts); bottom row: box plots.
fig, axis = plt.subplots(2, 3, figsize=(24, 15))
print("\033[1m Розподіл значень PAY_AMT 1-3 \033[0m")

for position, values in enumerate(filtered_data[:3]):
    # histplot replaces the deprecated distplot(kde=False, norm_hist=False)
    sns.histplot(x=values, ax=axis[0, position])
    # keyword `x`: a positional Series is read as `data` by seaborn >= 0.12
    sns.boxplot(x=values, ax=axis[1, position])

In [None]:
# Top row: histograms (raw counts); bottom row: box plots, for the last three
# entries of `filtered_data`.
fig, axis = plt.subplots(2, 3, figsize=(24, 15))
print("\033[1m Розподіл значень PAY_AMT 4-6 \033[0m")

for position, values in enumerate(filtered_data[3:6]):
    # histplot replaces the deprecated distplot(kde=False, norm_hist=False)
    sns.histplot(x=values, ax=axis[0, position])
    # keyword `x`: a positional Series is read as `data` by seaborn >= 0.12
    sns.boxplot(x=values, ax=axis[1, position])

#### Matrix of correlation

In [None]:
# Pairwise correlation of all columns.
correlation = data.corr()

# Boolean mask that hides the upper triangle (diagonal included), so every
# pair is shown once. The builtin `bool` is used because the `np.bool` alias
# was removed in NumPy 1.24.
mask = np.triu(np.ones_like(correlation, dtype=bool))

f, ax = plt.subplots(figsize=(25, 25))

sns.heatmap(
    correlation,
    xticklabels=correlation.columns,
    yticklabels=correlation.columns,
    linewidths=.1,
    vmin=-1,
    vmax=1,
    annot=True,
    mask=mask
)

#### Encoding of categorical features (One Hot)

In [None]:
# One-hot encode the three categorical columns; the originals are replaced by
# one indicator column per category value (named like 'SEX_2').
data = pd.get_dummies(data, columns=['SEX', 'MARRIAGE', 'EDUCATION'])

In [None]:
# Drop columns that are not used as features: the 'SEX_2' indicator
# (presumably redundant with the remaining SEX indicator — verify against the
# SEX codes in the data) and the row identifier 'ID'.
data = data.drop(columns=['SEX_2', 'ID'])

In [None]:
# Show up to 50 columns so the widened, encoded frame is not truncated,
# then preview the first 10 rows.
pd.set_option('display.max_columns', 50)
data.head(10)

#### Visualization of linear dependencies

In [None]:
# Two column subsets for the pair plots: credit limit with the repayment
# statuses, and the bill amounts with age.
limit_and_status = ['LIMIT_BAL', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']
bills_and_age = ['BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'AGE']

data_tmp_1 = data.loc[:, limit_and_status]
data_tmp_2 = data.loc[:, bills_and_age]

In [None]:
# Scatter-plot matrix of the first column subset (data_tmp_1).
sns.pairplot(data_tmp_1)

In [None]:
# Scatter-plot matrix of the second column subset (data_tmp_2).
sns.pairplot(data_tmp_2)

#### Target visualization

In [None]:
# Class balance of the target column.
plt.figure(figsize=(18, 8))
# keyword `x`: seaborn >= 0.12 reads the first positional argument as `data`
sns.countplot(x='Target', data=data)

#### Features normalization

In [None]:
# Target vector.
Y = data.loc[:, 'Target']
# Feature matrix: every column except the target.
# Index.difference returns the remaining column names in sorted order.
feature_columns = data.columns.difference(['Target'])
X = data.loc[:, feature_columns]

In [None]:
# Rescale every feature to the [0, 1] range.
# MinMaxScaler works column by column, so one call over the whole frame gives
# the same values as scaling each column separately. Building a fresh
# DataFrame avoids assigning into a slice of `data` (SettingWithCopyWarning)
# and leaves `scaler` fitted on all features rather than only the last one.
scaler = MinMaxScaler()

X = pd.DataFrame(
    # cast first so indicator (bool) and integer columns are plain floats
    scaler.fit_transform(X.astype(float)),
    columns=X.columns,
    index=X.index,
)

In [None]:
# Preview the first 10 rows of the scaled feature matrix.
X.head(10)

#### Features selection

In [None]:
# Score every feature with the chi-squared test and keep the 10 best.
selector_chi = SelectKBest(chi2, k=10)
selector_chi.fit(X, Y)
features_1_scores = selector_chi.scores_
features_1 = list(X.columns[selector_chi.get_support(indices=True)])

# Bar chart of all scores, sorted in ascending order.
plt.figure(figsize=(18, 8))
features_scores = pd.DataFrame({'features': X.columns.tolist(), 'scores': features_1_scores})
sns.barplot(
    # x/y must be keywords with seaborn >= 0.12
    x='features',
    y='scores',
    data=features_scores,
    order=features_scores.sort_values('scores')['features'],
)
plt.title('Best features - Chi2')
plt.ylabel('Score')
# Only rotate the labels. Replacing them with X.columns would put the
# unsorted column names under bars that are sorted by score.
plt.xticks(rotation=45)
plt.show()

In [None]:
# Score every feature with the ANOVA F-test and keep the 10 best.
selector = SelectKBest(f_classif, k=10)
selector.fit(X, Y)
features_2_scores = selector.scores_
features_2 = list(X.columns[selector.get_support(indices=True)])

# Bar chart of all scores, sorted in ascending order.
plt.figure(figsize=(18, 8))
features_scores = pd.DataFrame({'features': X.columns.tolist(), 'scores': features_2_scores})
sns.barplot(
    # x/y must be keywords with seaborn >= 0.12
    x='features',
    y='scores',
    data=features_scores,
    order=features_scores.sort_values('scores')['features'],
)
plt.title('Best features - ANOVA F-value')
plt.ylabel('Score')
# Only rotate the labels. Replacing them with X.columns would put the
# unsorted column names under bars that are sorted by score.
plt.xticks(rotation=45)
plt.show()

#### Get a set of these best features

In [None]:
# Union of the features picked by the two selectors (duplicates collapse).
features = set(features_1).union(features_2)
features

In [None]:
# Keep only the selected features.
# A set cannot be used as a DataFrame indexer (TypeError on pandas >= 2.0) and
# has no stable order, so index with a sorted list for a deterministic layout.
X = X[sorted(features)]

#### Train Dev Test split

In [None]:
# 70 % train, 15 % dev, 15 % test, stratified on the target at both steps.
# A fixed random_state makes the split, and therefore every reported metric,
# reproducible between runs.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, Y, test_size=0.3, stratify=Y, random_state=42
)
# split the 30 % hold-out part evenly into dev and test
X_dev, X_test, y_dev, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, stratify=y_holdout, random_state=42
)

In [None]:
# Class balance of the target in each of the three splits.
fig, axis = plt.subplots(1, 3, figsize=(25, 8))

splits = [('train', y_train), ('dev', y_dev), ('test', y_test)]
for ax, (title, target) in zip(axis, splits):
    # keyword `x`: seaborn >= 0.12 reads the first positional argument as
    # `data`. The panel title identifies the split, so no `label` is passed.
    sns.countplot(x=target, ax=ax)
    ax.set_title(title)
plt.show()

#### Modeling

<style type="text/css">
.tg  {border-collapse:collapse;border-spacing:0;}
.tg td{border-color:black;border-style:solid;border-width:1px;font-family:Arial, sans-serif;font-size:14px;
  overflow:hidden;padding:10px 5px;word-break:normal;}
.tg th{border-color:black;border-style:solid;border-width:1px;font-family:Arial, sans-serif;font-size:14px;
  font-weight:normal;overflow:hidden;padding:10px 5px;word-break:normal;}
.tg .tg-c3ow{border-color:inherit;text-align:center;vertical-align:top}
.tg .tg-feq9{background-color:#9698ed;border-color:inherit;font-weight:bold;text-align:center;vertical-align:top}
.tg .tg-0pky{border-color:inherit;text-align:left;vertical-align:top}
</style>
<table class="tg">
<thead>
  <tr>
    <th class="tg-feq9">Model</th>
    <th class="tg-feq9">Link</th>
  </tr>
</thead>
<tbody>
  <tr>
    <td class="tg-c3ow">Logistic Regression</td>
    <td class="tg-0pky">https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html</td>
  </tr>
  <tr>
    <td class="tg-c3ow">Decision Tree</td>
    <td class="tg-0pky">https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html</td>
  </tr>
  <tr>
    <td class="tg-c3ow">Random Forest</td>
    <td class="tg-0pky">https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html</td>
  </tr>
  <tr>
    <td class="tg-c3ow">XGBoost</td>
    <td class="tg-0pky">https://xgboost.readthedocs.io/en/latest/</td>
  </tr>
  <tr>
    <td class="tg-c3ow">Gradient Boosting</td>
    <td class="tg-0pky">https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html</td>
  </tr>
  <tr>
    <td class="tg-c3ow">LightGBM</td>
    <td class="tg-0pky">https://lightgbm.readthedocs.io/en/latest/</td>
  </tr>
</tbody>
</table>

In [None]:
# Candidate classifiers compared on the dev split.
# All use library defaults except the explicit solver for logistic regression
# and the number of trees for the random forest.
models = [
    LogisticRegression(solver="liblinear"),
    DecisionTreeClassifier(),
    RandomForestClassifier(n_estimators=10),
    XGBClassifier(),
    GradientBoostingClassifier(),
    LGBMClassifier(),
]

In [None]:
# Per-model metrics, appended in the same order as `models`.
auc_scores = []
cv_scores = []
acc_scores = []

# Fit every candidate on the train split and evaluate it on the dev split.
for model in models:

    # fit model
    model.fit(X_train, y_train)

    # hard labels and class probabilities on dev
    prediction = model.predict(X_dev)
    probability = model.predict_proba(X_dev)

    # AUC from the probability of the positive class (second column)
    auc = roc_auc_score(y_dev, probability[:, 1])

    # mean 10-fold cross-validation score on train (estimator's default scorer)
    cv_score = cross_val_score(model, X_train, y_train, cv=10).mean()

    # accuracy on dev
    score = accuracy_score(y_dev, prediction)

    # per-class precision / recall / F1
    report = classification_report(y_dev, prediction, zero_division=1)

    # Use the class name directly. Slicing str(model) at the first "(" breaks
    # for estimators whose repr contains no parenthesis.
    print(type(model).__name__)

    print("Accuracy :", score)
    print("CV Score :", cv_score)
    print("AUC Score : ", auc)
    print(report)
    print(confusion_matrix(y_dev, prediction))
    print(" \033[1m ------------------------------------------------------------ \033[0m ")

    auc_scores.append(auc)
    cv_scores.append(cv_score)
    acc_scores.append(score)

#### Metrics visualization


In [None]:
# One row per evaluated model with its dev-split metrics.
# The names come from `models` itself, so they cannot drift out of sync with
# the score lists when the list of candidates is edited or reordered.
metrics = pd.DataFrame({
    'AUC': auc_scores,
    'CV Score': cv_scores,
    'Accuracy': acc_scores,
    'name': [type(model).__name__ for model in models],
})

In [None]:
# One bar chart per metric, with models sorted by that metric (ascending).
fig, axis = plt.subplots(3, 1, figsize=(25, 25))

for ax, metric in zip(axis, ['AUC', 'CV Score', 'Accuracy']):
    ranking = metrics.sort_values(metric)['name']
    sns.barplot(x='name', y=metric, data=metrics, order=ranking, ax=ax)
plt.show()

### LightGBM

In [None]:
# Hyper-parameter grid for the LightGBM grid search.
lgbm_params = {"n_estimators" : [100, 500, 1000],
               # Use the wrapper's own parameter name. The `num_leaf` alias
               # competes with the `num_leaves` value that LGBMClassifier
               # always passes, so the searched values may be ignored.
               "num_leaves": [5, 15, 25, 30],
               "subsample" : [0.6, 0.8, 1.0],
               # LightGBM applies row subsampling only when the bagging
               # frequency is non-zero; with the default of 0 the three
               # `subsample` values would train identical models.
               "subsample_freq": [1],
               "learning_rate" : [0.1, 0.01, 0.02],
               "min_child_samples" : [5, 10, 20]}

In [None]:
# Exhaustive 5-fold grid search over `lgbm_params`, using all CPU cores.
lgbm_model = LGBMClassifier()

lgbm_cv_model = GridSearchCV(
    estimator=lgbm_model,
    param_grid=lgbm_params,
    cv=5,
    verbose=1,
    n_jobs=-1,
)

In [None]:
# Run the grid search on the training split only.
lgbm_cv_model.fit(X_train, y_train)

In [None]:
# Parameter combination with the best cross-validation score.
print('Best params LightGBM')
best_params = lgbm_cv_model.best_params_
best_params

In [None]:
# Start fresh metric lists for the test-split evaluation of the tuned models.
auc_scores, f1_scores, cv_scores, acc_scores = [], [], [], []

In [None]:
# Refit LightGBM with the best grid-search parameters and evaluate it on the
# held-out test split.
model = LGBMClassifier(**best_params)
model.fit(X_train, y_train)

# hard labels and class probabilities on test
prediction = model.predict(X_test)
probability = model.predict_proba(X_test)

# AUC from the probability of the positive class (second column)
auc = roc_auc_score(y_test, probability[:, 1])

# mean 10-fold cross-validation score on train (estimator's default scorer)
cv_score = cross_val_score(model, X_train, y_train, cv=10).mean()

# accuracy and per-class report on test
score = accuracy_score(y_test, prediction)
report = classification_report(y_test, prediction, zero_division=1)

print("Accuracy :", score)
print("CV Score :", cv_score)
print("AUC Score : ", auc)
print(report)
print(confusion_matrix(y_test, prediction))
print(" \033[1m ------------------------------------------------------------ \033[0m ")

# record the metrics for the final comparison
auc_scores.append(auc)
acc_scores.append(score)
cv_scores.append(cv_score)
f1_scores.append(f1_score(y_test, prediction))

### Gradient Boosting 

In [None]:
# Hyper-parameter grid for the gradient boosting grid search.
# NOTE(review): the same dict is also bound to `parameters`; that name is not
# used in the visible code — confirm before removing it.
gb_params = parameters = {
    "learning_rate": [0.01, 0.05, 0.075, 0.1],
    # four evenly spaced values from 0.1 to 0.5 (floats, not sample counts)
    "min_samples_leaf": np.linspace(0.1, 0.5, 4),
    "max_depth":[3,5,8],
    "max_features":["log2","sqrt"],
    "subsample":[0.5, 0.6, 0.8],
    "n_estimators":[10, 30, 60, 90]
    }

In [None]:
# Exhaustive 5-fold grid search over `gb_params`, using all CPU cores.
gb_model = GradientBoostingClassifier()

gb_cv_model = GridSearchCV(
    estimator=gb_model,
    param_grid=gb_params,
    cv=5,
    verbose=1,
    n_jobs=-1,
)

In [None]:
# Run the grid search on the training split only.
gb_cv_model.fit(X_train, y_train)
# Printing last keeps the fitted object from being echoed as the cell's output.
print(' ')

In [None]:
# Parameter combination with the best cross-validation score.
# This rebinds `best_params`, which was previously bound for LightGBM.
print('Best params Gradient Boosting')
best_params = gb_cv_model.best_params_
best_params

In [None]:
# Refit gradient boosting with the best grid-search parameters and evaluate
# it on the held-out test split.
model = GradientBoostingClassifier(**best_params)
model.fit(X_train, y_train)

# hard labels and class probabilities on test
prediction = model.predict(X_test)
probability = model.predict_proba(X_test)

# AUC from the probability of the positive class (second column)
auc = roc_auc_score(y_test, probability[:, 1])

# mean 10-fold cross-validation score on train (estimator's default scorer)
cv_score = cross_val_score(model, X_train, y_train, cv=10).mean()

# accuracy and per-class report on test
score = accuracy_score(y_test, prediction)
report = classification_report(y_test, prediction, zero_division=1)

print("Accuracy :", score)
print("CV Score :", cv_score)
print("AUC Score : ", auc)
print(report)
print(confusion_matrix(y_test, prediction))
print(" \033[1m ------------------------------------------------------------ \033[0m ")

# record the metrics for the final comparison
auc_scores.append(auc)
acc_scores.append(score)
cv_scores.append(cv_score)
f1_scores.append(f1_score(y_test, prediction))

#### Metrics visualization (test data)

In [None]:
# One row per tuned model with its metrics.
# The names must be listed in the order the models were evaluated above
# (LightGBM first, then gradient boosting) to line up with the score lists.
metrics = pd.DataFrame({
    'AUC': auc_scores,
    'Accuracy': acc_scores,
    'F1': f1_scores,
    'CV Score': cv_scores,
    'name': ['LGBMClassifier',
             'GradientBoostingClassifier']
})

In [None]:
# One bar chart per metric, with models sorted by that same metric (ascending).
fig, axis = plt.subplots(4, 1, figsize=(25, 25))

# Deriving the sort key from the plotted metric keeps the F1 panel from being
# ordered by Accuracy, as it was in the copy-pasted version.
for ax, metric in zip(axis, ['AUC', 'CV Score', 'Accuracy', 'F1']):
    sns.barplot(
        x='name',
        y=metric,
        data=metrics,
        order=metrics.sort_values(metric)['name'],
        ax=ax,
    )
plt.show()