In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split,RandomizedSearchCV, GridSearchCV
# machine learning
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn import tree
import pickle
from scipy.stats import uniform, randint
from sklearn.metrics import f1_score

# Load the Drive helper and mount
from google.colab import drive
from google.colab import files

# This will prompt for authorization.
drive.mount('/content/drive')

from tqdm import tqdm
tqdm.pandas()
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Load Data Preprocessing and Data Unpreprocessing

In [0]:
data_preprocessing = pd.read_excel ('drive/My Drive/Colab Notebooks/credit_scoring/data_preprocessing.xlsx')
data_unpreprocessing = pd.read_excel ('drive/My Drive/Colab Notebooks/credit_scoring/data_unpreprocessing.xlsx')

In [0]:
# Pisahkan antara feature data dengan target data
X_pre = data_preprocessing.drop(['CREDIT_SCORE'],axis=1).values
y_pre = data_preprocessing['CREDIT_SCORE'].values
##################################################
X_unpre = data_unpreprocessing.drop(['CREDIT_SCORE'],axis=1).values
y_unpre = data_unpreprocessing['CREDIT_SCORE'].values

## Split data for training and testing

In [0]:
# Split data for training and testing
X_train, X_test, y_train, y_test = train_test_split(X_pre, y_pre, test_size=0.2, shuffle=True)
X_train_un, X_test_un, y_train_un, y_test_un = train_test_split(X_unpre, y_unpre, test_size=0.2, shuffle=True)

# Insights about Dataset
- Dataset is imbalanced
- Data is non-linear
- Dataset have outlier which is make-sense data (we not drop it)
- Feature of dataset is almost everything was categorical type
- Small Dataset

# Reason behind choosen algorithm
1. Because we have small dataset so we prefer use machine learning algorithm rather than neural network / deep learning
2. Credit Scoring is classification case (predict class)
3. Because of dataset is non-linear we use algorithm which can handle that and have good performance
4. We try use CatBoost algorithm because feature of dataset is almost everything was categorical type

Option : if you want to balance class in dataset, you can use SMOTE for over-sampling

# Metric to evaluate performance of models
I'm use F1-score which can give more insight into the accuracy of the model than traditional classification accuracy

# Create Models

Model :
- Support Vector Machine a.k.a SVM
- Random Forest a.k.a RF
- Extreme Gradient Boosting a.k.a XGBOOST
- CatBoost

In [0]:
models = {
    "SVC" : {"algo" : SVC(random_state=42),
             "parameter" : {'C': [0.001, 0.10, 0.1, 10, 25, 50, 100, 1000],
                            'gamma':[1e-2, 1e-3, 1e-4, 1e-5, 'auto'],
                            'kernel':['poly','rbf','sigmoid']},
                            'degree': [2, 3, 4]},

    "Random Forest" : {"algo" : RandomForestClassifier(random_state=42),
                       "parameter" : {"n_estimators" : [10, 20, 40, 60, 80, 100, 150],
                                      "criterion" : ['gini', 'entropy'],
                                      "max_features" : ['auto', 'sqrt', 'log2']} },
    
    "XGBoost" : {"algo" : XGBClassifier(random_state=42),
                 "parameter" : {"learning_rate": np.arange(0.01, 0.2, 0.02),  
                                "colsample_bytree": np.arange(0.7, 1, 0.05),
                                "max_depth" : np.arange(4, 10, 1),
                                "n_estimators": np.arange(100, 1000, 50),
                                "subsample": np.arange(0.7, 1, 0.05),
                                "objective": ["binary:logistic", "binary:hinge"],
                                "max_delta_step": np.arange(0, 4, 1)} }
}

In [0]:
def fine_tuning(model, model_param: dict, x_train, y_train, n_iteration, score, split):
  search = RandomizedSearchCV(model, model_param, n_iter=n_iteration, scoring=score, cv=split, verbose=1, n_jobs=-1)
  search.fit(x_train, y_train)
  print("Model={} \nScore={} ".format(model, search.best_score_))
  metric = search.best_score_
  best_param = search.best_estimator_
  return best_param, metric

# Fine-Tuning models

In [21]:
list_model=[]

for model, arg in models.items():
    print(model.upper())
    best_param, metric = fine_tuning(arg["algo"], arg["parameter"], X_train, y_train, 250, "f1", 4)
    list_model.append([model, metric, best_param])
    print("#################################################################################")
    print()

SVC
Fitting 4 folds for each of 120 candidates, totalling 480 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 480 out of 480 | elapsed:    6.6s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


Model=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=42,
    shrinking=True, tol=0.001, verbose=False) 
Score=0.8210202761000862 
#################################################################################

RANDOM FOREST
Fitting 4 folds for each of 42 candidates, totalling 168 fits


[Parallel(n_jobs=-1)]: Done 168 out of 168 | elapsed:    8.9s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


Model=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators='warn',
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False) 
Score=0.8433875858673022 
#################################################################################

XGBOOST
Fitting 4 folds for each of 250 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   15.6s
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:   59.3s
[Parallel(n_jobs=-1)]: Done 446 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 796 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:  5.2min finished


Model=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=42,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1) 
Score=0.8325409695826282 
#################################################################################



# Create CatBoost model and Fine-Tuning

## Install catboost

In [0]:
!pip install catboost -q

## Change target label value

In [0]:
###### BEFORE
# '1' = LAYAK , '2' = TIDAK LAYAK
###### AFTER
# '1' = LAYAK , '0' = TIDAK LAYAK
y_train_un = np.where(y_train_un==2, 0, y_train_un)
y_test_un = np.where(y_test_un==2, 0, y_test_un)

## Fine-Tuning CatBoost

In [24]:
import catboost
import scipy
from catboost import CatBoostClassifier

model = CatBoostClassifier(eval_metric='F1', task_type='GPU')
train_pool = catboost.Pool(X_train_un, y_train_un, cat_features = [2,3,4,5,6,7,8,9,10,11,12])
test_pool = catboost.Pool(X_test_un, cat_features = [2,3,4,5,6,7,8,9,10,11,12])

parameter = {
    'learning_rate': scipy.stats.uniform(0.1, 0.3),
    'depth': scipy.stats.randint(8, 16),
    'l2_leaf_reg':scipy.stats.uniform(1, 10),
    'one_hot_max_size': [2, 5, 10],
    'iterations' : [50, 75, 100, 150] #number of trees
}

randomized_search_results = model.randomized_search(
    parameter,
    train_pool,
    n_iter=30,
    shuffle=True,
    cv=4
)

0:	loss: 0.8557692	best: 0.8557692 (0)	total: 15.7s	remaining: 7m 34s
1:	loss: 0.8341232	best: 0.8557692 (0)	total: 1m 3s	remaining: 14m 49s
2:	loss: 0.8458150	best: 0.8557692 (0)	total: 3m 15s	remaining: 29m 19s
3:	loss: 0.8584906	best: 0.8584906 (3)	total: 3m 27s	remaining: 22m 28s
4:	loss: 0.8682927	best: 0.8682927 (4)	total: 3m 34s	remaining: 17m 50s
5:	loss: 0.8316832	best: 0.8682927 (4)	total: 3m 40s	remaining: 14m 42s
6:	loss: 0.8487805	best: 0.8682927 (4)	total: 4m 13s	remaining: 13m 53s
7:	loss: 0.8405797	best: 0.8682927 (4)	total: 4m 29s	remaining: 12m 19s
8:	loss: 0.8487805	best: 0.8682927 (4)	total: 6m 21s	remaining: 14m 50s
9:	loss: 0.8173077	best: 0.8682927 (4)	total: 10m 37s	remaining: 21m 15s
10:	loss: 0.8430493	best: 0.8682927 (4)	total: 10m 58s	remaining: 18m 56s
11:	loss: 0.8446602	best: 0.8682927 (4)	total: 11m 4s	remaining: 16m 36s
12:	loss: 0.8487805	best: 0.8682927 (4)	total: 11m 11s	remaining: 14m 37s
13:	loss: 0.8599034	best: 0.8682927 (4)	total: 11m 29s	remain

## Fit CatBoost model

In [25]:
# Get best parameter of models
best_parameter = randomized_search_results['params']
# Create CatBoostClassifier
model_catboost = CatBoostClassifier(**best_parameter, task_type='GPU')
# Fit CatBoost
model_catboost.fit(X_train_un, y_train_un, cat_features = [2,3,4,5,6,7,8,9,10,11,12])

# Append score and model
list_model.append(['CatBoost', model_catboost.get_best_score(), model_catboost])

0:	learn: 0.6143064	total: 10.6ms	remaining: 518ms
1:	learn: 0.5805439	total: 20.3ms	remaining: 486ms
2:	learn: 0.5513370	total: 29.9ms	remaining: 468ms
3:	learn: 0.5280817	total: 39.7ms	remaining: 456ms
4:	learn: 0.5096263	total: 49.2ms	remaining: 442ms
5:	learn: 0.4959403	total: 58.8ms	remaining: 431ms
6:	learn: 0.4780001	total: 68.5ms	remaining: 421ms
7:	learn: 0.4651641	total: 78.2ms	remaining: 410ms
8:	learn: 0.4502458	total: 87.7ms	remaining: 400ms
9:	learn: 0.4388404	total: 97.3ms	remaining: 389ms
10:	learn: 0.4296538	total: 107ms	remaining: 379ms
11:	learn: 0.4197322	total: 116ms	remaining: 368ms
12:	learn: 0.4109223	total: 126ms	remaining: 358ms
13:	learn: 0.4078302	total: 131ms	remaining: 336ms
14:	learn: 0.3952962	total: 140ms	remaining: 328ms
15:	learn: 0.3847701	total: 150ms	remaining: 319ms
16:	learn: 0.3758305	total: 162ms	remaining: 314ms
17:	learn: 0.3682390	total: 171ms	remaining: 305ms
18:	learn: 0.3611891	total: 181ms	remaining: 296ms
19:	learn: 0.3513389	total: 191

# Testing all models and evaluate performance to Unknown data (testing data)

In [26]:
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
# Predict
for i in list_model:
  if i[0] != 'CatBoost':
    y_pred = i[2].predict(X_test)
    f1 = f1_score(y_test, y_pred)
  else :
    y_pred = i[2].predict(X_test_un, prediction_type='Class')
    f1 = f1_score(y_test_un, y_pred)
    
  print("Evaluation Performance", i[0])
  print("F1-Score ={:.4f}".format(f1))
  print()

Evaluation Performance SVC
F1-Score =0.8462

Evaluation Performance Random Forest
F1-Score =0.8459

Evaluation Performance XGBoost
F1-Score =0.8303

Evaluation Performance CatBoost
F1-Score =0.8551



# Result from Evaluation Performance
CatBoost has the highest F1-Score because the ability for handle categorical data and has gradient boosting concept, but we must note it need more computation time rather than other algorithm

But CatBoost can performance really well on large dataset and more robust to prevent overfitting

# Save CatBoost model

In [0]:
model_catboost.save_model("drive/My Drive/Colab Notebooks/credit_scoring/model_cs",
           format="cbm")