In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from mlxtend.classifier import StackingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, balanced_accuracy_score
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve, classification_report, precision_recall_curve
from sklearn.model_selection import train_test_split
from imblearn.metrics import geometric_mean_score, sensitivity_score, specificity_score
from xgboost import XGBClassifier
from shaphypetune import BoostRFE
from sklearn.base import BaseEstimator
from sklearn.utils.metaestimators import if_delegate_has_method

import torch
import re, pickle, random, os
import warnings
warnings.filterwarnings('ignore')

from collections import Counter

In [2]:
def seed_everything(seed=1234):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

RANDOM_STATE = 42
seed_everything(seed=RANDOM_STATE)

In [3]:
col_names = ["height","lenght","area","eccen","p_black","p_and","mean_tr","blackpix","blackand","wb_trans","class"]
df = pd.read_csv('page-blocks.csv', names = col_names, delim_whitespace=True)
df.shape

(5473, 11)

In [4]:
df.head()

Unnamed: 0,height,lenght,area,eccen,p_black,p_and,mean_tr,blackpix,blackand,wb_trans,class
0,5,7,35,1.4,0.4,0.657,2.33,14,23,6,1
1,6,7,42,1.167,0.429,0.881,3.6,18,37,5,1
2,6,18,108,3.0,0.287,0.741,4.43,31,80,7,1
3,5,7,35,1.4,0.371,0.743,4.33,13,26,3,1
4,6,3,18,0.5,0.5,0.944,2.25,9,17,4,1


In [5]:
df['class'].value_counts()

1    4913
2     329
5     115
4      88
3      28
Name: class, dtype: int64

In [6]:
df.isnull().sum()

height      0
lenght      0
area        0
eccen       0
p_black     0
p_and       0
mean_tr     0
blackpix    0
blackand    0
wb_trans    0
class       0
dtype: int64

In [7]:
df.describe()

Unnamed: 0,height,lenght,area,eccen,p_black,p_and,mean_tr,blackpix,blackand,wb_trans,class
count,5473.0,5473.0,5473.0,5473.0,5473.0,5473.0,5473.0,5473.0,5473.0,5473.0,5473.0
mean,10.473232,89.568244,1198.405628,13.753977,0.368642,0.785053,6.219278,365.930751,741.108167,106.662891,1.202631
std,18.960564,114.721758,4849.37695,30.703737,0.177757,0.170661,69.079021,1270.333082,1881.504302,167.308362,0.72147
min,1.0,1.0,7.0,0.007,0.052,0.062,1.0,7.0,7.0,1.0,1.0
25%,7.0,17.0,114.0,2.143,0.261,0.679,1.61,42.0,95.0,17.0,1.0
50%,8.0,41.0,322.0,5.167,0.337,0.803,2.07,108.0,250.0,49.0,1.0
75%,10.0,107.0,980.0,13.625,0.426,0.927,3.0,284.0,718.0,126.0,1.0
max,804.0,553.0,143993.0,537.0,1.0,1.0,4955.0,33017.0,46133.0,3212.0,5.0


# Train and Test Split

In [8]:
x = df.drop(['class'], axis=1)
y = df['class']

X_train,X_test,y_train,y_test=train_test_split(x, y, train_size=0.8, stratify = y, random_state=100)

In [9]:
y_train.shape, y_test.shape

((4378,), (1095,))

In [10]:
y_train.value_counts()/len(y_train)

1    0.897670
2    0.060073
5    0.021014
4    0.015989
3    0.005254
Name: class, dtype: float64

In [11]:
y_test.value_counts()/len(y_test)

1    0.897717
2    0.060274
5    0.021005
4    0.016438
3    0.004566
Name: class, dtype: float64

In [12]:
from sklearn.preprocessing import StandardScaler
Scaler_X = StandardScaler()
X_train = Scaler_X.fit_transform(X_train)
X_test = Scaler_X.transform(X_test)

# Model Building

In [13]:
model = list()
precision = list()
recall = list()
F1score = list()
AUCROC = list()
balanced_acc = list()
GMean = list()
sensi = list()
speci = list()

In [14]:
def test_eval(clf_model, X_test, y_test, algo=None):

    y_prob=clf_model.predict_proba(X_test)
    y_pred=clf_model.predict(X_test)

    print('Confusion Matrix')
    print('='*60)
    print(confusion_matrix(y_test,y_pred),"\n")
    print('Classification Report')
    print('='*60)
    print(classification_report(y_test,y_pred),"\n")
    print('='*60)
    print('AUC-ROC')
    print(roc_auc_score(y_test, y_prob, multi_class='ovr'), "\n")
    print('Balanced Accuracy')
    print(balanced_accuracy_score(y_test, y_pred),"\n")
    print('Geometric Mean Score')
    print(geometric_mean_score(y_test, y_pred, average='macro'),"\n")
    print('Sensitivity')
    print(sensitivity_score(y_test, y_pred, average='macro'),"\n")
    print('Specificity')
    print(specificity_score(y_test, y_pred, average='macro'))
          
    model.append(algo)
    precision.append(precision_score(y_test,y_pred, average='macro'))
    recall.append(recall_score(y_test,y_pred, average='macro'))
    F1score.append(f1_score(y_test,y_pred, average='macro'))
    AUCROC.append(roc_auc_score(y_test, y_prob, multi_class='ovr', average='macro'))
    balanced_acc.append(balanced_accuracy_score(y_test, y_pred))
    GMean.append(geometric_mean_score(y_test, y_pred, average='macro'))
    sensi.append(sensitivity_score(y_test, y_pred, average='macro'))
    speci.append(specificity_score(y_test, y_pred, average='macro'))


## Model-1: Logistic Regression

In [15]:
clf_LR = LogisticRegression(multi_class='ovr', solver='liblinear')
clf_LR.fit(X_train, y_train)

LogisticRegression(multi_class='ovr', solver='liblinear')

In [16]:
test_eval(clf_LR, X_test, y_test, 'Logistic Regression')

Confusion Matrix
[[976   5   1   0   1]
 [ 22  44   0   0   0]
 [  2   0   3   0   0]
 [  2   0   0  16   0]
 [ 16   0   0   0   7]] 

Classification Report
              precision    recall  f1-score   support

           1       0.96      0.99      0.98       983
           2       0.90      0.67      0.77        66
           3       0.75      0.60      0.67         5
           4       1.00      0.89      0.94        18
           5       0.88      0.30      0.45        23

    accuracy                           0.96      1095
   macro avg       0.90      0.69      0.76      1095
weighted avg       0.95      0.96      0.95      1095
 

AUC-ROC
0.9776208830946086 

Balanced Accuracy
0.6905564647313509 

Geometric Mean Score
0.7986476647364904 

Sensitivity
0.6905564647313509 

Specificity
0.9236581292989408


## Model-2: Decision Tree

In [17]:
clf_DT = DecisionTreeClassifier()
clf_DT.fit(X_train, y_train)

DecisionTreeClassifier()

In [18]:
test_eval(clf_DT, X_test, y_test, 'Decision Tree')

Confusion Matrix
[[970   3   2   2   6]
 [ 11  54   0   0   1]
 [  1   0   4   0   0]
 [  1   0   0  17   0]
 [  7   2   0   1  13]] 

Classification Report
              precision    recall  f1-score   support

           1       0.98      0.99      0.98       983
           2       0.92      0.82      0.86        66
           3       0.67      0.80      0.73         5
           4       0.85      0.94      0.89        18
           5       0.65      0.57      0.60        23

    accuracy                           0.97      1095
   macro avg       0.81      0.82      0.81      1095
weighted avg       0.97      0.97      0.97      1095
 

AUC-ROC
0.8942791264474726 

Balanced Accuracy
0.822923766391412 

Geometric Mean Score
0.889324880303748 

Sensitivity
0.822923766391412 

Specificity
0.9610838512969815


## Model-3: Gaussian NB

In [19]:
clf_NB = GaussianNB()
clf_NB.fit(X_train, y_train)

GaussianNB()

In [20]:
test_eval(clf_NB, X_test, y_test, 'Gaussian NB')

Confusion Matrix
[[919   5   1  42  16]
 [ 19  37   0   9   1]
 [  1   0   4   0   0]
 [  0   0   0  18   0]
 [ 11   1   1   1   9]] 

Classification Report
              precision    recall  f1-score   support

           1       0.97      0.93      0.95       983
           2       0.86      0.56      0.68        66
           3       0.67      0.80      0.73         5
           4       0.26      1.00      0.41        18
           5       0.35      0.39      0.37        23

    accuracy                           0.90      1095
   macro avg       0.62      0.74      0.63      1095
weighted avg       0.93      0.90      0.91      1095
 

AUC-ROC
0.9667176413839913 

Balanced Accuracy
0.7373607185124722 

Geometric Mean Score
0.8282228659218356 

Sensitivity
0.7373607185124722 

Specificity
0.9302816090062386


## Model-4: K-Nearest Neighbour

In [21]:
clf_KNN = KNeighborsClassifier()
clf_KNN.fit(X_train, y_train)

KNeighborsClassifier()

In [22]:
test_eval(clf_KNN, X_test, y_test, 'KNN')

Confusion Matrix
[[979   3   0   0   1]
 [ 15  51   0   0   0]
 [  2   0   3   0   0]
 [  3   1   0  12   2]
 [ 14   1   0   0   8]] 

Classification Report
              precision    recall  f1-score   support

           1       0.97      1.00      0.98       983
           2       0.91      0.77      0.84        66
           3       1.00      0.60      0.75         5
           4       1.00      0.67      0.80        18
           5       0.73      0.35      0.47        23

    accuracy                           0.96      1095
   macro avg       0.92      0.68      0.77      1095
weighted avg       0.96      0.96      0.96      1095
 

AUC-ROC
0.9513640768927576 

Balanced Accuracy
0.6766301700717199 

Geometric Mean Score
0.7965631053363789 

Sensitivity
0.6766301700717199 

Specificity
0.937754195494829


## MODEL-5 Support Vector Classifier

In [23]:
clf_SVC = SVC(probability=True)
clf_SVC.fit(X_train, y_train)

SVC(probability=True)

In [24]:
test_eval(clf_SVC, X_test, y_test, 'SVC')

Confusion Matrix
[[980   2   0   0   1]
 [ 16  50   0   0   0]
 [  5   0   0   0   0]
 [  4   0   0  13   1]
 [ 13   1   0   1   8]] 

Classification Report
              precision    recall  f1-score   support

           1       0.96      1.00      0.98       983
           2       0.94      0.76      0.84        66
           3       0.00      0.00      0.00         5
           4       0.93      0.72      0.81        18
           5       0.80      0.35      0.48        23

    accuracy                           0.96      1095
   macro avg       0.73      0.56      0.62      1095
weighted avg       0.95      0.96      0.95      1095
 

AUC-ROC
0.9795742944404665 

Balanced Accuracy
0.564914436952121 

Geometric Mean Score
0.7252143593253763 

Sensitivity
0.564914436952121 

Specificity
0.9310009314141345


# MODEL-6 XGBoost

In [25]:
class BoostRFEWrap(BaseEstimator, BoostRFE):

    @if_delegate_has_method(delegate='estimator')
    def predict_proba(self, X):
        return self.predict(X, method='predict_proba')


xgb_params = {'max_depth': (5,10),
          'learning_rate': (0.001, 0.3),
          'n_estimators': (25, 200),
          'reg_alpha' : (0.1, 1),
          'reg_lambda': (0.1, 1),
          'subsample': (0.5,  0.9),
          'colsample_bytree': (0.5,1),
          'min_child_weight': (0,10),
          'gamma': (0,1)}

clf_XGB = BoostRFEWrap(XGBClassifier(), param_grid=xgb_params, importance_type='shap_importances', train_importance=False, min_features_to_select=1, step=1)

In [26]:
xclf = clf_XGB.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=6, verbose=0)


512 trials detected for ('max_depth', 'learning_rate', 'n_estimators', 'reg_alpha', 'reg_lambda', 'subsample', 'colsample_bytree', 'min_child_weight', 'gamma')

trial: 0001 ### iterations: 00021 ### eval_score: 0.02374
trial: 0002 ### iterations: 00012 ### eval_score: 0.02557
trial: 0003 ### iterations: 00024 ### eval_score: 0.02557
trial: 0004 ### iterations: 00020 ### eval_score: 0.02648
trial: 0005 ### iterations: 00011 ### eval_score: 0.02374
trial: 0006 ### iterations: 00016 ### eval_score: 0.02374
trial: 0007 ### iterations: 00017 ### eval_score: 0.021
trial: 0008 ### iterations: 00009 ### eval_score: 0.02192
trial: 0009 ### iterations: 00013 ### eval_score: 0.02557
trial: 0010 ### iterations: 00011 ### eval_score: 0.02648
trial: 0011 ### iterations: 00024 ### eval_score: 0.02557
trial: 0012 ### iterations: 00010 ### eval_score: 0.02283
trial: 0013 ### iterations: 00006 ### eval_score: 0.02466
trial: 0014 ### iterations: 00013 ### eval_score: 0.02557
trial: 0015 ### iterations: 

trial: 0141 ### iterations: 00003 ### eval_score: 0.02374
trial: 0142 ### iterations: 00003 ### eval_score: 0.02466
trial: 0143 ### iterations: 00005 ### eval_score: 0.02831
trial: 0144 ### iterations: 00005 ### eval_score: 0.02831
trial: 0145 ### iterations: 00003 ### eval_score: 0.02831
trial: 0146 ### iterations: 00003 ### eval_score: 0.03014
trial: 0147 ### iterations: 00001 ### eval_score: 0.03379
trial: 0148 ### iterations: 00001 ### eval_score: 0.03379
trial: 0149 ### iterations: 00002 ### eval_score: 0.02557
trial: 0150 ### iterations: 00001 ### eval_score: 0.02557
trial: 0151 ### iterations: 00006 ### eval_score: 0.03196
trial: 0152 ### iterations: 00006 ### eval_score: 0.03196
trial: 0153 ### iterations: 00006 ### eval_score: 0.02922
trial: 0154 ### iterations: 00011 ### eval_score: 0.02922
trial: 0155 ### iterations: 00001 ### eval_score: 0.03196
trial: 0156 ### iterations: 00001 ### eval_score: 0.03196
trial: 0157 ### iterations: 00004 ### eval_score: 0.01918
trial: 0158 ##

trial: 0283 ### iterations: 00024 ### eval_score: 0.02466
trial: 0284 ### iterations: 00020 ### eval_score: 0.02557
trial: 0285 ### iterations: 00006 ### eval_score: 0.02192
trial: 0286 ### iterations: 00005 ### eval_score: 0.02192
trial: 0287 ### iterations: 00006 ### eval_score: 0.02192
trial: 0288 ### iterations: 00018 ### eval_score: 0.021
trial: 0289 ### iterations: 00014 ### eval_score: 0.02374
trial: 0290 ### iterations: 00021 ### eval_score: 0.02466
trial: 0291 ### iterations: 00025 ### eval_score: 0.02648
trial: 0292 ### iterations: 00032 ### eval_score: 0.02648
trial: 0293 ### iterations: 00012 ### eval_score: 0.02192
trial: 0294 ### iterations: 00017 ### eval_score: 0.02192
trial: 0295 ### iterations: 00014 ### eval_score: 0.02192
trial: 0296 ### iterations: 00010 ### eval_score: 0.021
trial: 0297 ### iterations: 00023 ### eval_score: 0.02283
trial: 0298 ### iterations: 00012 ### eval_score: 0.02374
trial: 0299 ### iterations: 00024 ### eval_score: 0.02557
trial: 0300 ### it

trial: 0425 ### iterations: 00011 ### eval_score: 0.0347
trial: 0426 ### iterations: 00013 ### eval_score: 0.02922
trial: 0427 ### iterations: 00002 ### eval_score: 0.03105
trial: 0428 ### iterations: 00001 ### eval_score: 0.03379
trial: 0429 ### iterations: 00005 ### eval_score: 0.02374
trial: 0430 ### iterations: 00004 ### eval_score: 0.02374
trial: 0431 ### iterations: 00004 ### eval_score: 0.02922
trial: 0432 ### iterations: 00003 ### eval_score: 0.02922
trial: 0433 ### iterations: 00001 ### eval_score: 0.03196
trial: 0434 ### iterations: 00001 ### eval_score: 0.03105
trial: 0435 ### iterations: 00002 ### eval_score: 0.0347
trial: 0436 ### iterations: 00002 ### eval_score: 0.0347
trial: 0437 ### iterations: 00005 ### eval_score: 0.0274
trial: 0438 ### iterations: 00001 ### eval_score: 0.02648
trial: 0439 ### iterations: 00000 ### eval_score: 0.03196
trial: 0440 ### iterations: 00000 ### eval_score: 0.03105
trial: 0441 ### iterations: 00011 ### eval_score: 0.0347
trial: 0442 ### ite

In [27]:
test_eval(xclf, X_test, y_test, 'XGB')

Confusion Matrix
[[981   1   1   0   0]
 [  7  59   0   0   0]
 [  1   0   4   0   0]
 [  1   0   0  17   0]
 [  8   0   0   1  14]] 

Classification Report
              precision    recall  f1-score   support

           1       0.98      1.00      0.99       983
           2       0.98      0.89      0.94        66
           3       0.80      0.80      0.80         5
           4       0.94      0.94      0.94        18
           5       1.00      0.61      0.76        23

    accuracy                           0.98      1095
   macro avg       0.94      0.85      0.89      1095
weighted avg       0.98      0.98      0.98      1095
 

AUC-ROC
0.9920514545680368 

Balanced Accuracy
0.8490089805123642 

Geometric Mean Score
0.9070595537130265 

Sensitivity
0.8490089805123642 

Specificity
0.9690793064232999


# MODEL-7 Stacking

In [28]:
clf_DT = DecisionTreeClassifier()
clf_SVC = SVC(probability=True)
clf_KNN = KNeighborsClassifier()
clf_LR = LogisticRegression()

sclf = StackingClassifier(classifiers=[clf_DT, clf_SVC, clf_KNN], use_probas=True, average_probas=False, meta_classifier=clf_LR)

In [29]:
sclf.fit(X_train, y_train)
test_eval(sclf, X_test, y_test, 'Stacking')

Confusion Matrix
[[969   6   1   1   6]
 [ 11  54   0   0   1]
 [  1   0   4   0   0]
 [  1   1   0  16   0]
 [ 11   1   0   1  10]] 

Classification Report
              precision    recall  f1-score   support

           1       0.98      0.99      0.98       983
           2       0.87      0.82      0.84        66
           3       0.80      0.80      0.80         5
           4       0.89      0.89      0.89        18
           5       0.59      0.43      0.50        23

    accuracy                           0.96      1095
   macro avg       0.82      0.79      0.80      1095
weighted avg       0.96      0.96      0.96      1095
 

AUC-ROC
0.9769285520157437 

Balanced Accuracy
0.7855222399589687 

Geometric Mean Score
0.8655482891551269 

Sensitivity
0.7855222399589687 

Specificity
0.9537270910350035


# MODEL-8 Bagging

In [30]:
clf_bg = BaggingClassifier(clf_DT)

In [31]:
clf_bg.fit(X_train, y_train)
test_eval(clf_bg, X_test, y_test, 'Bagging')

Confusion Matrix
[[975   3   1   2   2]
 [  9  57   0   0   0]
 [  1   0   4   0   0]
 [  1   0   0  17   0]
 [  9   1   0   1  12]] 

Classification Report
              precision    recall  f1-score   support

           1       0.98      0.99      0.99       983
           2       0.93      0.86      0.90        66
           3       0.80      0.80      0.80         5
           4       0.85      0.94      0.89        18
           5       0.86      0.52      0.65        23

    accuracy                           0.97      1095
   macro avg       0.88      0.82      0.85      1095
weighted avg       0.97      0.97      0.97      1095
 

AUC-ROC
0.9568896163711134 

Balanced Accuracy
0.8243363173063735 

Geometric Mean Score
0.890694542632175 

Sensitivity
0.8243363173063735 

Specificity
0.9623945368160788


In [32]:
clf_eval_df = pd.DataFrame({'model':model,
                            'precision':precision,
                            'recall':recall,
                            'f1-score':F1score,
                            'AUC-ROC':AUCROC,
                            'balanced_acc':balanced_acc,
                            'GMean':GMean,
                            'sensitivity':sensi,
                            'specificity':speci})
clf_eval_df

Unnamed: 0,model,precision,recall,f1-score,AUC-ROC,balanced_acc,GMean,sensitivity,specificity
0,Logistic Regression,0.89634,0.690556,0.760037,0.977621,0.690556,0.798648,0.690556,0.923658
1,Decision Tree,0.812344,0.822924,0.814787,0.894279,0.822924,0.889325,0.822924,0.961084
2,Gaussian NB,0.619559,0.737361,0.626693,0.966718,0.737361,0.828223,0.737361,0.930282
3,KNN,0.920885,0.67663,0.767523,0.951364,0.67663,0.796563,0.67663,0.937754
4,SVC,0.726928,0.564914,0.623439,0.979574,0.564914,0.725214,0.564914,0.931001
5,XGB,0.942149,0.849009,0.885624,0.992051,0.849009,0.90706,0.849009,0.969079
6,Stacking,0.824785,0.785522,0.802682,0.976929,0.785522,0.865548,0.785522,0.953727
7,Bagging,0.884294,0.824336,0.845374,0.95689,0.824336,0.890695,0.824336,0.962395


In [33]:
clf_eval_df.to_excel("D:\Skripsi\Final\project_pageblocks_final.xlsx")