# Códigos para modelos de classificação
Com diferentes opções de validação (Kfold, leave-one-out e holdout)
* Regressão Logística
* Naive Bayes
* KNN
* Árvore de Decisão
* Random Forest
* Gradient Boosting
* MLP
* SVM

In [51]:
# Bibliotecas auxiliares
import numpy as np
import pandas as pd
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import datasets
from scipy import stats
from time import time
import warnings
sns.set_style('whitegrid')
sns.set_context('notebook')

# Bibliotecas para modelagem
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.svm import SVC, LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import RandomizedSearchCV
#import lightgbm as lgb
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import make_scorer, f1_score, roc_auc_score, auc, roc_curve, precision_score, recall_score, classification_report, confusion_matrix, accuracy_score, log_loss
from sklearn.model_selection import StratifiedKFold, KFold, LeaveOneOut
from sklearn.neighbors import DistanceMetric

from CrossValidationClassification import model_classif_cv, model_classif_holdout, holdout_clf, leave_one_out_clf, kfold_clf

## Gerando base de dados (breast_cancer)
Base com resposta com 2 níveis

In [3]:
cancer = datasets.load_breast_cancer()

dados = pd.DataFrame(cancer.data)
dados.columns = cancer.feature_names
dados['TARGET'] = cancer.target

X = dados.drop('TARGET',axis=1)
Y = dados['TARGET']

dados.head(5)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,TARGET
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


## Regressão logística

In [4]:
LR = LogisticRegression(penalty='l2', 
                                     dual=False, 
                                     tol=0.0001, 
                                     C=1.0, 
                                     fit_intercept=True, 
                                     intercept_scaling=1, 
                                     class_weight=None, 
                                     random_state=None, 
                                     solver='liblinear', 
                                     max_iter=100, 
                                     multi_class='ovr', 
                                     verbose=0, 
                                     warm_start=False, 
                                     n_jobs=1)


In [5]:
# resposta binaria
metrics  = [f1_score, precision_score , recall_score,  accuracy_score, roc_auc_score]


cv_kfold = KFold(n_splits=10, random_state=12, shuffle=True)
cv_loo = LeaveOneOut()

model_classif_cv(LR, X, Y, cv_kfold, metrics)

Unnamed: 0,f1_score- 1.Treino,f1_score- 2.Teste,precision_score- 1.Treino,precision_score- 2.Teste,recall_score- 1.Treino,recall_score- 2.Teste,accuracy_score- 1.Treino,accuracy_score- 2.Teste,roc_auc_score- 1.Treino,roc_auc_score- 2.Teste,ks - 1.Treino,ks - 2.Teste
0,0.969325,0.927536,0.963415,0.888889,0.975309,0.969697,0.960938,0.912281,0.995009,0.991162,0.936893,0.909091
1,0.967841,0.956522,0.960486,0.916667,0.975309,1.0,0.958984,0.947368,0.996126,0.986111,0.93072,0.958333
2,0.968553,0.95122,0.9625,0.95122,0.974684,0.95122,0.960938,0.929825,0.994753,0.987805,0.935288,0.902439
3,0.96423,0.974359,0.95679,0.95,0.971787,1.0,0.955078,0.964912,0.994835,0.99446,0.933211,0.921053
4,0.96423,0.987013,0.953846,1.0,0.974843,0.974359,0.955078,0.982456,0.994197,0.998575,0.936232,0.974359
5,0.966667,0.983607,0.957958,0.967742,0.975535,1.0,0.957031,0.982456,0.994082,0.993827,0.927101,0.962963
6,0.964119,0.974359,0.956656,0.974359,0.971698,0.974359,0.955078,0.964912,0.994554,0.997151,0.930202,0.948718
7,0.966463,0.96875,0.957704,0.96875,0.975385,0.96875,0.957031,0.964912,0.994159,0.9975,0.923225,0.96875
8,0.96904,0.945946,0.960123,0.945946,0.978125,0.945946,0.960938,0.929825,0.995182,0.982432,0.932292,0.895946
9,0.964451,0.956522,0.96,0.970588,0.968944,0.942857,0.955166,0.946429,0.994553,0.995918,0.927238,0.952381


In [7]:
kfold_clf(LR, X, Y, k=10)

Média da Accuracy base de treinamento:  0.9572349384746589
Média da Accuracy base de teste:  0.950814536340852
Média do LogLoss base de validação:  1.6988310066985235
Média do LogLoss base de treinamento:  1.4770736646778448


In [8]:
leave_one_out_clf(LR, X, Y)

Média da Accuracy base de treinamento:  0.9592749820540111
Média da Accuracy base de teste:  0.9507908611599297


In [9]:
holdout_clf(LR, X, Y, test_s=0.3, strat = None)

Média da Accuracy base de treinamento:  0.9522613065326633
Média da Accuracy base de teste:  0.9649122807017544
Matriz de confusão base de treinamento: 
 [[137  12]
 [  7 242]]
Classification Report base de treinamento: 
               precision    recall  f1-score   support

           0       0.95      0.92      0.94       149
           1       0.95      0.97      0.96       249

    accuracy                           0.95       398
   macro avg       0.95      0.95      0.95       398
weighted avg       0.95      0.95      0.95       398

Matriz de confusão base de teste: 
 [[ 59   4]
 [  2 106]]
Classification Report base de teste: 
               precision    recall  f1-score   support

           0       0.97      0.94      0.95        63
           1       0.96      0.98      0.97       108

    accuracy                           0.96       171
   macro avg       0.97      0.96      0.96       171
weighted avg       0.96      0.96      0.96       171



In [11]:
# holdout_clf(LR, X, Y, test_s=0.3, strat = Y)

## Naive Bayes

In [12]:
NB = GaussianNB(priors=None)

In [13]:
# resposta binaria
metrics  = [f1_score, precision_score , recall_score,  accuracy_score, roc_auc_score]


cv_kfold = KFold(10, random_state=12, shuffle=True)
model_classif_cv(NB, X, Y, cv_kfold, metrics)

Unnamed: 0,f1_score- 1.Treino,f1_score- 2.Teste,precision_score- 1.Treino,precision_score- 2.Teste,recall_score- 1.Treino,recall_score- 2.Teste,accuracy_score- 1.Treino,accuracy_score- 2.Teste,roc_auc_score- 1.Treino,roc_auc_score- 2.Teste,ks - 1.Treino,ks - 2.Teste
0,0.95586,0.942857,0.942943,0.891892,0.969136,1.0,0.943359,0.929825,0.990265,0.981061,0.906357,0.867424
1,0.954407,0.956522,0.94012,0.916667,0.969136,1.0,0.941406,0.947368,0.989723,0.987374,0.904978,0.897727
2,0.954899,0.926829,0.938838,0.926829,0.971519,0.926829,0.943359,0.894737,0.990377,0.964939,0.91559,0.815549
3,0.956923,0.936709,0.939577,0.902439,0.974922,0.973684,0.945312,0.912281,0.989605,0.99169,0.909351,0.921053
4,0.952234,0.974359,0.933535,0.974359,0.971698,0.974359,0.939453,0.964912,0.988426,0.997151,0.89788,0.974359
5,0.954955,0.931034,0.938053,0.964286,0.972477,0.9,0.941406,0.929825,0.988561,0.98642,0.898653,0.9
6,0.950464,0.975,0.935976,0.95122,0.965409,1.0,0.9375,0.964912,0.988264,0.997151,0.89587,0.948718
7,0.94864,1.0,0.931751,1.0,0.966154,1.0,0.933594,1.0,0.987347,1.0,0.895681,1.0
8,0.955453,0.935065,0.939577,0.9,0.971875,0.972973,0.943359,0.912281,0.989046,0.989189,0.895833,0.972973
9,0.95586,0.942857,0.937313,0.942857,0.975155,0.942857,0.94347,0.928571,0.988943,0.993197,0.896979,0.942857


In [14]:
kfold_clf(NB, X, Y, k=10)

Média da Accuracy base de treinamento:  0.9410274275097466
Média da Accuracy base de teste:  0.9367794486215537
Média do LogLoss base de validação:  2.183594204983421
Média do LogLoss base de treinamento:  2.0368726595833997


In [15]:
leave_one_out_clf(NB, X, Y)

Média da Accuracy base de treinamento:  0.9419818559865342
Média da Accuracy base de teste:  0.9384885764499121


In [16]:
holdout_clf(NB, X, Y, test_s=0.3, strat = None)

Média da Accuracy base de treinamento:  0.9396984924623115
Média da Accuracy base de teste:  0.9415204678362573
Matriz de confusão base de treinamento: 
 [[133  16]
 [  8 241]]
Classification Report base de treinamento: 
               precision    recall  f1-score   support

           0       0.94      0.89      0.92       149
           1       0.94      0.97      0.95       249

    accuracy                           0.94       398
   macro avg       0.94      0.93      0.93       398
weighted avg       0.94      0.94      0.94       398

Matriz de confusão base de teste: 
 [[ 57   6]
 [  4 104]]
Classification Report base de teste: 
               precision    recall  f1-score   support

           0       0.93      0.90      0.92        63
           1       0.95      0.96      0.95       108

    accuracy                           0.94       171
   macro avg       0.94      0.93      0.94       171
weighted avg       0.94      0.94      0.94       171



## KNN

In [17]:
KNN = KNeighborsClassifier(n_neighbors=5, 
                           weights='uniform', 
                           leaf_size=30,
                           p=2, 
                           metric='minkowski', 
                           metric_params=None, 
                           n_jobs=1)

In [18]:
# resposta binaria
metrics  = [f1_score, precision_score , recall_score,  accuracy_score, roc_auc_score]

cv_kfold = KFold(10, random_state=12, shuffle=True)

model_classif_cv(KNN, X, Y, cv_kfold, metrics)

Unnamed: 0,f1_score- 1.Treino,f1_score- 2.Teste,precision_score- 1.Treino,precision_score- 2.Teste,recall_score- 1.Treino,recall_score- 2.Teste,accuracy_score- 1.Treino,accuracy_score- 2.Teste,roc_auc_score- 1.Treino,roc_auc_score- 2.Teste,ks - 1.Treino,ks - 2.Teste
0,0.963415,0.929577,0.951807,0.868421,0.975309,1.0,0.953125,0.912281,0.992571,0.909091,0.893486,0.833333
1,0.963526,0.901408,0.949102,0.842105,0.978395,0.969697,0.953125,0.877193,0.993031,0.958333,0.899659,0.806818
2,0.959627,0.95122,0.942073,0.95122,0.977848,0.95122,0.949219,0.929825,0.991927,0.950457,0.895182,0.82622
3,0.958398,0.947368,0.942424,0.947368,0.974922,0.947368,0.947266,0.929825,0.990969,0.970914,0.876476,0.868421
4,0.955316,0.95,0.936556,0.926829,0.974843,0.974359,0.943359,0.929825,0.990185,0.987892,0.872107,0.893162
5,0.957958,0.935484,0.941003,0.90625,0.975535,0.966667,0.945312,0.929825,0.991586,0.961728,0.884784,0.855556
6,0.955178,0.961039,0.93921,0.973684,0.971698,0.948718,0.943359,0.947368,0.990671,0.980769,0.880406,0.948718
7,0.954683,0.969697,0.937685,0.941176,0.972308,1.0,0.941406,0.964912,0.990473,0.978125,0.878832,0.92875
8,0.955316,0.931507,0.942249,0.944444,0.96875,0.918919,0.943359,0.912281,0.991829,0.977703,0.89375,0.85
9,0.960366,0.941176,0.943114,0.969697,0.978261,0.914286,0.949318,0.928571,0.992496,0.956463,0.893873,0.92381


In [19]:
kfold_clf(KNN, X, Y, k=10)

Média da Accuracy base de treinamento:  0.947080972831384
Média da Accuracy base de teste:  0.9262531328320801
Média do LogLoss base de validação:  2.5471658584582633
Média do LogLoss base de treinamento:  1.8277881131897928


In [20]:
leave_one_out_clf(KNN, X, Y)

Média da Accuracy base de treinamento:  0.9472728285353594
Média da Accuracy base de teste:  0.9332161687170475


In [21]:
holdout_clf(KNN, X, Y, test_s=0.3, strat = None)

Média da Accuracy base de treinamento:  0.9221105527638191
Média da Accuracy base de teste:  0.9590643274853801
Matriz de confusão base de treinamento: 
 [[126  23]
 [  8 241]]
Classification Report base de treinamento: 
               precision    recall  f1-score   support

           0       0.94      0.85      0.89       149
           1       0.91      0.97      0.94       249

    accuracy                           0.92       398
   macro avg       0.93      0.91      0.92       398
weighted avg       0.92      0.92      0.92       398

Matriz de confusão base de teste: 
 [[ 57   6]
 [  1 107]]
Classification Report base de teste: 
               precision    recall  f1-score   support

           0       0.98      0.90      0.94        63
           1       0.95      0.99      0.97       108

    accuracy                           0.96       171
   macro avg       0.96      0.95      0.96       171
weighted avg       0.96      0.96      0.96       171



## Árvore de decisão

In [24]:
DT = DecisionTreeClassifier(criterion='gini', 
                            splitter='best', 
                            max_depth=10, 
                            min_samples_split=2, 
                            min_samples_leaf=1, 
                            min_weight_fraction_leaf=0.0, 
                            max_features=None, 
                            random_state=None, 
                            max_leaf_nodes=None, 
                            class_weight=None)

In [26]:
# resposta binaria
metrics  = [f1_score, precision_score , recall_score,  accuracy_score, roc_auc_score]

cv_kfold = KFold(10, random_state=12, shuffle=True)

model_classif_cv(DT, X, Y, cv_kfold, metrics)

Unnamed: 0,f1_score- 1.Treino,f1_score- 2.Teste,precision_score- 1.Treino,precision_score- 2.Teste,recall_score- 1.Treino,recall_score- 2.Teste,accuracy_score- 1.Treino,accuracy_score- 2.Teste,roc_auc_score- 1.Treino,roc_auc_score- 2.Teste,ks - 1.Treino,ks - 2.Teste
0,1.0,0.925373,1.0,0.911765,1.0,0.939394,1.0,0.912281,1.0,0.907197,1.0,0.814394
1,1.0,0.955224,1.0,0.941176,1.0,0.969697,1.0,0.947368,1.0,0.943182,1.0,0.886364
2,1.0,0.95122,1.0,0.95122,1.0,0.95122,1.0,0.929825,1.0,0.91311,1.0,0.82622
3,1.0,0.948718,1.0,0.925,1.0,0.973684,1.0,0.929825,1.0,0.907895,1.0,0.815789
4,1.0,0.974359,1.0,0.974359,1.0,0.974359,1.0,0.964912,1.0,0.959402,1.0,0.918803
5,1.0,0.931034,1.0,0.964286,1.0,0.9,1.0,0.929825,1.0,0.931481,1.0,0.862963
6,1.0,0.947368,1.0,0.972973,1.0,0.923077,1.0,0.929825,1.0,0.933761,1.0,0.867521
7,1.0,0.984127,1.0,1.0,1.0,0.96875,1.0,0.982456,1.0,0.984375,1.0,0.96875
8,1.0,0.944444,1.0,0.971429,1.0,0.918919,1.0,0.929825,1.0,0.934459,1.0,0.868919
9,1.0,0.895522,1.0,0.9375,1.0,0.857143,1.0,0.875,1.0,0.880952,1.0,0.761905


In [27]:
kfold_clf(DT, X, Y, k=10)

Média da Accuracy base de treinamento:  1.0
Média da Accuracy base de teste:  0.9296365914786968
Média do LogLoss base de validação:  2.430289905993747
Média do LogLoss base de treinamento:  9.992007221626413e-16


In [28]:
leave_one_out_clf(KNN, X, Y)

Média da Accuracy base de treinamento:  0.9472728285353594
Média da Accuracy base de teste:  0.9332161687170475


In [29]:
holdout_clf(KNN, X, Y, test_s=0.3, strat = None)

Média da Accuracy base de treinamento:  0.9221105527638191
Média da Accuracy base de teste:  0.9590643274853801
Matriz de confusão base de treinamento: 
 [[126  23]
 [  8 241]]
Classification Report base de treinamento: 
               precision    recall  f1-score   support

           0       0.94      0.85      0.89       149
           1       0.91      0.97      0.94       249

    accuracy                           0.92       398
   macro avg       0.93      0.91      0.92       398
weighted avg       0.92      0.92      0.92       398

Matriz de confusão base de teste: 
 [[ 57   6]
 [  1 107]]
Classification Report base de teste: 
               precision    recall  f1-score   support

           0       0.98      0.90      0.94        63
           1       0.95      0.99      0.97       108

    accuracy                           0.96       171
   macro avg       0.96      0.95      0.96       171
weighted avg       0.96      0.96      0.96       171



## Random forest

In [30]:
RF = RandomForestClassifier(n_estimators=10, 
                            criterion='gini', 
                            max_depth=None, 
                            min_samples_split=2, 
                            min_samples_leaf=1, 
                            min_weight_fraction_leaf=0.0, 
                            max_features='auto', 
                            max_leaf_nodes=10, 
                            bootstrap=True, 
                            oob_score=False, 
                            n_jobs=1, 
                            random_state=None, 
                            verbose=0, 
                            warm_start=False, 
                            class_weight=None)

In [31]:
# resposta binaria
metrics  = [f1_score, precision_score , recall_score,  accuracy_score, roc_auc_score]

cv_kfold = KFold(10, random_state=12, shuffle=True)

model_classif_cv(RF, X, Y, cv_kfold, metrics)

Unnamed: 0,f1_score- 1.Treino,f1_score- 2.Teste,precision_score- 1.Treino,precision_score- 2.Teste,recall_score- 1.Treino,recall_score- 2.Teste,accuracy_score- 1.Treino,accuracy_score- 2.Teste,roc_auc_score- 1.Treino,roc_auc_score- 2.Teste,ks - 1.Treino,ks - 2.Teste
0,0.98773,0.956522,0.981707,0.916667,0.993827,1.0,0.984375,0.947368,0.998605,0.970328,0.967231,0.916667
1,0.986133,0.942857,0.984615,0.891892,0.987654,1.0,0.982422,0.929825,0.996109,0.954545,0.967757,0.92803
2,0.992151,0.926829,0.984424,0.926829,1.0,0.926829,0.990234,0.894737,0.996698,0.974085,0.978365,0.839939
3,0.992224,0.948718,0.984568,0.925,1.0,0.973684,0.990234,0.929825,0.99827,0.993075,0.974093,0.947368
4,0.984472,0.987013,0.972393,1.0,0.996855,0.974359,0.980469,0.982456,0.994156,0.995726,0.959638,0.974359
5,0.99239,0.966667,0.987879,0.966667,0.996942,0.966667,0.990234,0.964912,0.996082,0.996296,0.977668,0.933333
6,0.986003,0.987013,0.975385,1.0,0.996855,0.974359,0.982422,0.982456,0.997723,0.994302,0.971082,0.974359
7,0.987768,1.0,0.981763,1.0,0.993846,1.0,0.984375,1.0,0.997417,1.0,0.974726,1.0
8,0.992224,0.972222,0.987616,1.0,0.996875,0.945946,0.990234,0.964912,0.998983,0.997297,0.98125,0.972973
9,0.990712,0.927536,0.987654,0.941176,0.993789,0.914286,0.988304,0.910714,0.998155,0.989116,0.978082,0.914286


In [32]:
kfold_clf(RF, X, Y, k=10)

Média da Accuracy base de treinamento:  0.9824256822612085
Média da Accuracy base de teste:  0.9350563909774436
Média do LogLoss base de validação:  2.2431050798163343
Média do LogLoss base de treinamento:  0.607006204062552


In [33]:
leave_one_out_clf(RF, X, Y)

Média da Accuracy base de treinamento:  0.9817043738706401
Média da Accuracy base de teste:  0.9543057996485061


In [34]:
holdout_clf(RF, X, Y, test_s=0.3, strat = None)

Média da Accuracy base de treinamento:  0.9849246231155779
Média da Accuracy base de teste:  0.9766081871345029
Matriz de confusão base de treinamento: 
 [[145   4]
 [  2 247]]
Classification Report base de treinamento: 
               precision    recall  f1-score   support

           0       0.99      0.97      0.98       149
           1       0.98      0.99      0.99       249

    accuracy                           0.98       398
   macro avg       0.99      0.98      0.98       398
weighted avg       0.98      0.98      0.98       398

Matriz de confusão base de teste: 
 [[ 61   2]
 [  2 106]]
Classification Report base de teste: 
               precision    recall  f1-score   support

           0       0.97      0.97      0.97        63
           1       0.98      0.98      0.98       108

    accuracy                           0.98       171
   macro avg       0.97      0.97      0.97       171
weighted avg       0.98      0.98      0.98       171



## Gradient boosting

In [43]:
GB = GradientBoostingClassifier(loss='deviance', 
                                learning_rate=0.01, 
                                n_estimators=30, 
                                subsample=1.0, 
                                criterion='friedman_mse', 
                                min_samples_split=8, 
                                min_samples_leaf=5, 
                                min_weight_fraction_leaf=0.0, 
                                max_depth=3, 
                                init=None, 
                                random_state=None, 
                                max_features=None, 
                                verbose=0, 
                                max_leaf_nodes=None, 
                                warm_start=False)

In [44]:
# resposta binaria
metrics  = [f1_score, precision_score , recall_score,  accuracy_score, roc_auc_score]

cv_kfold = KFold(10, random_state=12, shuffle=True)

model_classif_cv(GB, X, Y, cv_kfold, metrics)

Unnamed: 0,f1_score- 1.Treino,f1_score- 2.Teste,precision_score- 1.Treino,precision_score- 2.Teste,recall_score- 1.Treino,recall_score- 2.Teste,accuracy_score- 1.Treino,accuracy_score- 2.Teste,roc_auc_score- 1.Treino,roc_auc_score- 2.Teste,ks - 1.Treino,ks - 2.Teste
0,0.948755,0.929577,0.902507,0.868421,1.0,1.0,0.931641,0.912281,0.996593,0.950758,0.970318,0.916667
1,0.971514,0.929577,0.944606,0.868421,1.0,1.0,0.962891,0.912281,0.990076,0.953283,0.935317,0.886364
2,0.961832,0.928571,0.929204,0.906977,0.996835,0.95122,0.951172,0.894737,0.995027,0.9375,0.951627,0.766768
3,0.957958,0.926829,0.919308,0.863636,1.0,1.0,0.945312,0.894737,0.991473,0.98615,0.942875,0.894737
4,0.957831,0.962025,0.919075,0.95,1.0,0.974359,0.945312,0.947368,0.996855,0.995726,0.964793,0.974359
5,0.968889,0.9375,0.939655,0.882353,1.0,1.0,0.958984,0.929825,0.985528,0.97284,0.906687,0.896296
6,0.970992,0.95,0.94362,0.926829,1.0,0.974359,0.962891,0.929825,0.991725,0.991453,0.935875,0.948718
7,0.961538,0.984615,0.925926,0.969697,1.0,1.0,0.949219,0.982456,0.99508,1.0,0.964689,1.0
8,0.963855,0.911392,0.930233,0.857143,1.0,0.972973,0.953125,0.877193,0.996663,0.986486,0.965625,0.918919
9,0.956909,0.944444,0.917379,0.918919,1.0,0.971429,0.94347,0.928571,0.994155,0.990476,0.948798,0.914286


In [45]:
kfold_clf(GB, X, Y, k=10)

Média da Accuracy base de treinamento:  0.9517688535575048
Média da Accuracy base de teste:  0.8964285714285714
Média do LogLoss base de validação:  3.577303408160599
Média do LogLoss base de treinamento:  1.6658831915769095


In [46]:
leave_one_out_clf(GB, X, Y)

Média da Accuracy base de treinamento:  0.957718631649298
Média da Accuracy base de teste:  0.9226713532513181


In [47]:
holdout_clf(GB, X, Y, test_s=0.3, strat = None)

Média da Accuracy base de treinamento:  0.9396984924623115
Média da Accuracy base de teste:  0.935672514619883
Matriz de confusão base de treinamento: 
 [[125  24]
 [  0 249]]
Classification Report base de treinamento: 
               precision    recall  f1-score   support

           0       1.00      0.84      0.91       149
           1       0.91      1.00      0.95       249

    accuracy                           0.94       398
   macro avg       0.96      0.92      0.93       398
weighted avg       0.94      0.94      0.94       398

Matriz de confusão base de teste: 
 [[ 52  11]
 [  0 108]]
Classification Report base de teste: 
               precision    recall  f1-score   support

           0       1.00      0.83      0.90        63
           1       0.91      1.00      0.95       108

    accuracy                           0.94       171
   macro avg       0.95      0.91      0.93       171
weighted avg       0.94      0.94      0.93       171



## MLP

In [48]:
MLP = MLPClassifier(hidden_layer_sizes=(20,15,10), 
                    activation='relu', 
                    solver='adam', 
                    alpha=0.0001, 
                    batch_size='auto', 
                    learning_rate='constant', 
                    learning_rate_init=0.001, 
                    power_t=0.5, 
                    max_iter=200, 
                    shuffle=True, 
                    random_state=None, 
                    tol=0.0001, 
                    verbose=False, 
                    warm_start=False, 
                    momentum=0.9, 
                    nesterovs_momentum=True, 
                    early_stopping=False, 
                    validation_fraction=0.1, 
                    beta_1=0.9, 
                    beta_2=0.999, 
                    epsilon=1e-08)

In [52]:
# resposta binaria
warnings.filterwarnings('ignore')
metrics  = [f1_score, precision_score , recall_score,  accuracy_score, roc_auc_score]

cv_kfold = KFold(10, random_state=12, shuffle=True)

model_classif_cv(MLP, X, Y, cv_kfold, metrics)

Unnamed: 0,f1_score- 1.Treino,f1_score- 2.Teste,precision_score- 1.Treino,precision_score- 2.Teste,recall_score- 1.Treino,recall_score- 2.Teste,accuracy_score- 1.Treino,accuracy_score- 2.Teste,roc_auc_score- 1.Treino,roc_auc_score- 2.Teste,ks - 1.Treino,ks - 2.Teste
0,0.952096,0.901408,0.924419,0.842105,0.981481,0.969697,0.9375,0.877193,0.978461,0.950758,0.862424,0.825758
1,0.948949,0.927536,0.923977,0.888889,0.975309,0.969697,0.933594,0.912281,0.972337,0.989899,0.844234,0.909091
2,0.945398,0.91358,0.932308,0.925,0.958861,0.902439,0.931641,0.877193,0.979527,0.940549,0.85682,0.8125
3,0.934551,0.948718,0.908284,0.925,0.962382,0.973684,0.916016,0.929825,0.972778,0.984765,0.83777,0.868421
4,0.94721,0.95,0.910145,0.926829,0.987421,0.974359,0.931641,0.929825,0.983807,0.994302,0.865201,0.948718
5,0.949102,0.9375,0.929619,0.882353,0.969419,1.0,0.933594,0.929825,0.974659,0.985185,0.845805,0.892593
6,0.954268,0.975,0.926036,0.95122,0.984277,1.0,0.941406,0.964912,0.985395,0.992877,0.872366,0.923077
7,0.944528,0.969697,0.921053,0.941176,0.969231,1.0,0.927734,0.964912,0.97731,0.97625,0.837664,0.92
8,0.943854,0.933333,0.917404,0.921053,0.971875,0.945946,0.927734,0.912281,0.972461,0.97027,0.836458,0.822973
9,0.947526,0.941176,0.915942,0.969697,0.981366,0.914286,0.931774,0.928571,0.984098,0.985034,0.870622,0.92381


In [55]:
kfold_clf(MLP, X, Y, k=10)

Média da Accuracy base de treinamento:  0.9281398483187134
Média da Accuracy base de teste:  0.9122180451127819
Média do LogLoss base de validação:  3.0319304595456704
Média do LogLoss base de treinamento:  2.4819993390968342


In [56]:
leave_one_out_clf(MLP, X, Y)

Média da Accuracy base de treinamento:  0.9201063145127354
Média da Accuracy base de teste:  0.9191564147627417


In [57]:
holdout_clf(MLP, X, Y, test_s=0.3, strat = None)

Média da Accuracy base de treinamento:  0.8793969849246231
Média da Accuracy base de teste:  0.9064327485380117
Matriz de confusão base de treinamento: 
 [[117  32]
 [ 16 233]]
Classification Report base de treinamento: 
               precision    recall  f1-score   support

           0       0.88      0.79      0.83       149
           1       0.88      0.94      0.91       249

    accuracy                           0.88       398
   macro avg       0.88      0.86      0.87       398
weighted avg       0.88      0.88      0.88       398

Matriz de confusão base de teste: 
 [[ 53  10]
 [  6 102]]
Classification Report base de teste: 
               precision    recall  f1-score   support

           0       0.90      0.84      0.87        63
           1       0.91      0.94      0.93       108

    accuracy                           0.91       171
   macro avg       0.90      0.89      0.90       171
weighted avg       0.91      0.91      0.91       171



## SVM

In [67]:
SVM = SVC(C=10, 
        gamma=0.01, 
        kernel='linear', 
        random_state=24)

In [62]:
kfold_clf(SVM, X, Y, k=10)

Média da Accuracy base de treinamento:  0.9689521655701755
Média da Accuracy base de teste:  0.9525689223057643
Média do LogLoss base de validação:  1.6382338565410302
Média do LogLoss base de treinamento:  1.0723698244020974


In [None]:
# leave_one_out_clf(SVM, X, Y)

In [63]:
holdout_clf(SVM, X, Y, test_s=0.3, strat = None)

Média da Accuracy base de treinamento:  0.9698492462311558
Média da Accuracy base de teste:  0.9707602339181286
Matriz de confusão base de treinamento: 
 [[140   9]
 [  3 246]]
Classification Report base de treinamento: 
               precision    recall  f1-score   support

           0       0.98      0.94      0.96       149
           1       0.96      0.99      0.98       249

    accuracy                           0.97       398
   macro avg       0.97      0.96      0.97       398
weighted avg       0.97      0.97      0.97       398

Matriz de confusão base de teste: 
 [[ 61   2]
 [  3 105]]
Classification Report base de teste: 
               precision    recall  f1-score   support

           0       0.95      0.97      0.96        63
           1       0.98      0.97      0.98       108

    accuracy                           0.97       171
   macro avg       0.97      0.97      0.97       171
weighted avg       0.97      0.97      0.97       171

