<a href="https://colab.research.google.com/github/sebastrogers/ensemble/blob/main/Ensemble.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from sklearn.ensemble import StackingClassifier

In [None]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

# models manual
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV

# Metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import auc
from sklearn.metrics import roc_curve
from sklearn.metrics import f1_score
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import cohen_kappa_score

# import mlflow
# import mlflow.sklearn

# for timestamp
import time
import calendar

# for plotting charts
import matplotlib.pyplot as plt

# for sending bot messages
import requests

In [None]:
# Location of the input CSV (a Google Drive path under /content/drive).
path = "/content/drive/MyDrive/DOUTORADO/projeto_arbovirose/base_de_dados/pre_processados/com_undersampling/baseCompleta-1617664035.csv"

# The file uses ';' as its field delimiter instead of the default ','.
df = pd.read_csv(filepath_or_buffer=path, delimiter=";")

# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,NU_IDADE_N,CS_SEXO,CS_GESTANT,CS_RACA,CS_ZONA,FEBRE,MIALGIA,CEFALEIA,EXANTEMA,VOMITO,...,DOR_RETRO,DIABETES,HEMATOLOG,HEPATOPAT,RENAL,HIPERTENSA,ACIDO_PEPT,AUTO_IMUNE,DIAS,CLASSI_FIN
0,4022.0,0,5.0,1.0,3.0,1.0,1.0,1.0,2.0,2.0,...,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1,CHIKUNGUNYA
1,4020.0,1,6.0,4.0,1.0,1.0,1.0,1.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,6,CHIKUNGUNYA
2,4061.0,0,6.0,4.0,1.0,1.0,1.0,1.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,32,CHIKUNGUNYA
3,4046.0,1,6.0,4.0,1.0,1.0,1.0,1.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2,CHIKUNGUNYA
4,4014.0,1,6.0,4.0,1.0,1.0,1.0,1.0,2.0,1.0,...,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2,CHIKUNGUNYA


In [None]:
# Alternative feature set ("MALIBU"), currently disabled:
# X = df[['FEBRE', 'MIALGIA', 'CEFALEIA', 'EXANTEMA', 'NAUSEA', 'DOR_COSTAS', 'CONJUNTVIT', 'ARTRITE', 'ARTRALGIA', 'PETEQUIA_N', 'DOR_RETRO', 'DIABETES', 'HIPERTENSA', 'DIAS', 'HEMATOLOG', 'HEPATOPAT', 'RENAL']]

# Active feature set ("BALBOA").
balboa_columns = [
    'CS_RACA', 'MIALGIA', 'ARTRITE', 'ARTRALGIA', 'FEBRE', 'EXANTEMA',
    'CEFALEIA', 'NAUSEA', 'DOR_COSTAS', 'PETEQUIA_N', 'LACO', 'DOR_RETRO',
    'HIPERTENSA',
]
X = df[balboa_columns]

# Target column holding the class label of each record.
y = df["CLASSI_FIN"]

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=46,
)

# Report how many rows ended up in each partition.
print(f"train shape {X_train.shape[0]}")
print(f"test shape {X_test.shape[0]}")

train shape 12020
test shape 5152


In [None]:
def shows_results_predict(y_test, y_pred, binary_classification, model_name, labels_multi=None):
    """Build a one-row DataFrame of classification metrics for a set of predictions.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_test``.
    binary_classification : bool
        When truthy, the binary metric set is computed. That branch relies on
        the positive class being ``1`` (scikit-learn's default ``pos_label``)
        and the negative class being ``0`` (used for specificity).
        When falsy, the multi-class metric set is computed over ``labels_multi``.
    model_name : object
        Value stored in the ``"model"`` column to identify the row.
    labels_multi : sequence of str, optional
        Class labels used by the multi-class branch, in the order the
        per-class columns should appear. Defaults to
        ``['DENGUE', 'CHIKUNGUNYA', 'OUTRAS_DOENCAS']``. Ignored in the
        binary branch.

    Returns
    -------
    pandas.DataFrame
        A single-row frame. Both modes contain ``model, acc, mcc, kappa,
        recall_macro, precision_macro, f1_score_macro``. The binary mode adds
        ``recall_class, precision_class, f1_score_class, specificity, auc``;
        the multi-class mode adds ``recall_<label>``, ``precision_<label>``
        and ``f1_score_<label>`` for every label, grouped by metric.
    """
    if labels_multi is None:
        labels_multi = ['DENGUE', 'CHIKUNGUNYA', 'OUTRAS_DOENCAS']
    else:
        # Materialise once so the labels can be iterated several times safely.
        labels_multi = list(labels_multi)

    # Metrics that are computed the same way in both modes.
    acc = accuracy_score(y_test, y_pred)
    mcc = matthews_corrcoef(y_test, y_pred)

    if binary_classification:
        columns_test = [
            "model",
            "acc", "mcc", "kappa",
            "recall_macro", "precision_macro", "f1_score_macro",
            "recall_class", "precision_class", "f1_score_class",
            "specificity", "auc"
        ]

        kappa = cohen_kappa_score(y_test, y_pred)

        recall_macro = recall_score(y_test, y_pred, average="macro")
        recall_class = recall_score(y_test, y_pred)

        # Specificity is the recall of the negative class (label 0).
        specificity = recall_score(y_test, y_pred, pos_label=0)

        precision_macro = precision_score(y_test, y_pred, average="macro")
        precision_class = precision_score(y_test, y_pred)

        f1_score_macro = f1_score(y_test, y_pred, average="macro")
        f1_score_class = f1_score(y_test, y_pred)

        # NOTE(review): the ROC curve is built from y_pred as given. If y_pred
        # holds hard class labels rather than scores, the curve has a single
        # operating point -- pass scores/probabilities for a full ROC AUC.
        fpr, tpr, _thresholds = roc_curve(y_test, y_pred, pos_label=1)
        auc_score = auc(fpr, tpr)

        row = [
            model_name,
            acc, mcc, kappa,
            recall_macro, precision_macro, f1_score_macro,
            recall_class, precision_class, f1_score_class,
            specificity, auc_score
        ]

    else:
        columns_test = [
            "model",
            "acc", "mcc", "kappa",
            "recall_macro", "precision_macro", "f1_score_macro",
        ]
        # Per-class columns are grouped by metric: all recalls, then all
        # precisions, then all f1 scores -- the same order the values are
        # appended to the row below.
        columns_test.extend(
            f"{metric}_{label}"
            for metric in ("recall", "precision", "f1_score")
            for label in labels_multi
        )

        kappa = cohen_kappa_score(y_test, y_pred, labels=labels_multi)

        recall_macro = recall_score(y_test, y_pred, average="macro", labels=labels_multi)
        precision_macro = precision_score(y_test, y_pred, average="macro", labels=labels_multi)
        f1_score_macro = f1_score(y_test, y_pred, average="macro", labels=labels_multi)

        row = [
            model_name,
            acc, mcc, kappa,
            recall_macro, precision_macro, f1_score_macro
        ]
        # average=None returns one score per label, in labels_multi order.
        for scorer in (recall_score, precision_score, f1_score):
            per_class = scorer(y_test, y_pred, average=None, labels=labels_multi)
            row.extend(per_class.ravel())

    return pd.DataFrame(data=[row], columns=columns_test)

In [None]:
# Base learners for the stack: two random forests that differ in tree count
# and split criterion.
# Previously tried alternatives (disabled):
#   ('rf',  RandomForestClassifier(n_estimators=200, criterion="gini"))
#   ('gbm', GradientBoostingClassifier(max_depth=3, n_estimators=200))
estimators = [
    (name, RandomForestClassifier(n_estimators=n_trees, criterion=split_rule))
    for name, n_trees, split_rule in (
        ('rf', 100, "entropy"),
        ('rfa', 50, "gini"),
    )
]

# A gradient-boosting model combines the base learners' predictions.
meta_learner = GradientBoostingClassifier()
clf = StackingClassifier(estimators=estimators, final_estimator=meta_learner)

# fit() returns the fitted estimator, which the notebook displays.
clf.fit(X_train, y_train)

StackingClassifier(estimators=[('rf',
                                RandomForestClassifier(criterion='entropy')),
                               ('rfa',
                                RandomForestClassifier(n_estimators=50))],
                   final_estimator=GradientBoostingClassifier())

In [None]:
# Repeat the stacking experiment and collect one metrics row per repetition.
# Each repetition is seeded with its own index, so the runs still differ from
# each other but the whole table can be regenerated exactly.
N_RUNS = 30

# Frames are gathered in a list and concatenated once at the end; calling
# pd.concat inside the loop re-copies all previous rows on every iteration
# and leaves every row with the same index label (0).
run_frames = []

for i in range(N_RUNS):
    estimators = [
        # ('rf', RandomForestClassifier(n_estimators=200, criterion="gini")),
        # ('gbm', GradientBoostingClassifier(max_depth=3, n_estimators=200))
        ('rf', RandomForestClassifier(n_estimators=100, criterion="entropy", random_state=i)),
        ('rfa', RandomForestClassifier(n_estimators=50, criterion="gini", random_state=i)),
    ]

    clf = StackingClassifier(
        estimators=estimators,
        final_estimator=GradientBoostingClassifier(random_state=i),
    )

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # Multi-class metrics; the run index doubles as the row's "model" id.
    temp_df = shows_results_predict(y_test, y_pred, False, i)
    run_frames.append(temp_df)

    # Progress indicator.
    print(i)

# ignore_index=True gives the rows a clean 0..N_RUNS-1 index.
final_df = pd.concat(run_frames, ignore_index=True)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29


In [None]:
# Display the per-run metrics table assembled by the repetition loop.
final_df

Unnamed: 0,model,acc,mcc,kappa,recall_macro,precision_macro,f1_score_macro,recall_DENGUE,recall_CHIKUNGUNYA,recall_OUTRAS_DOENCAS,precision_DENGUE,precision_CHIKUNGUNYA,precision_OUTRAS_DOENCAS,f1_score_DENGUE,f1_score_CHIKUNGUNYA,f1_score_OUTRAS_DOENCAS
0,0,0.619371,0.431642,0.42751,0.615775,0.614619,0.608963,0.434392,0.782217,0.630715,0.593776,0.65197,0.59811,0.50173,0.711179,0.61398
0,1,0.619565,0.430773,0.428183,0.616622,0.615188,0.611972,0.456561,0.746764,0.646542,0.58751,0.661515,0.596539,0.513823,0.70156,0.620534
0,2,0.620148,0.432742,0.428726,0.61659,0.615095,0.609762,0.432594,0.778841,0.638335,0.591319,0.655613,0.598352,0.499654,0.711934,0.617697
0,3,0.618595,0.430756,0.426271,0.614907,0.613906,0.607657,0.428999,0.785594,0.630129,0.593698,0.647796,0.600223,0.498087,0.710071,0.614813
0,4,0.619565,0.431549,0.427883,0.616105,0.614909,0.609981,0.443379,0.777153,0.627784,0.5939,0.654502,0.596325,0.507719,0.710574,0.61165
0,5,0.620536,0.432863,0.429359,0.617127,0.615681,0.611187,0.446375,0.775464,0.629543,0.59127,0.653081,0.602694,0.508706,0.70903,0.615826
0,6,0.620148,0.432607,0.428708,0.616587,0.615123,0.610024,0.438586,0.782217,0.628957,0.591754,0.653503,0.600112,0.503785,0.71209,0.614196
0,7,0.62073,0.432573,0.429755,0.617449,0.615384,0.6123,0.459557,0.773213,0.619578,0.58505,0.657102,0.604,0.514765,0.710445,0.61169
0,8,0.618789,0.430119,0.42683,0.615455,0.613573,0.609529,0.440983,0.76646,0.638921,0.585521,0.659245,0.595954,0.503076,0.708821,0.61669
0,9,0.622477,0.435464,0.432357,0.619179,0.617306,0.613661,0.452367,0.771525,0.633646,0.588006,0.656609,0.607303,0.511344,0.709444,0.620195


In [None]:
# Average every metric across the runs. The "model" column only holds the run
# identifier, so it is dropped first instead of being averaged like a metric.
final_df.drop(columns="model").mean()

model                       14.500000
acc                          0.619481
mcc                          0.431300
kappa                        0.427790
recall_macro                 0.616063
precision_macro              0.614494
f1_score_macro               0.610021
recall_DENGUE                0.444318
recall_CHIKUNGUNYA           0.774601
recall_OUTRAS_DOENCAS        0.629269
precision_DENGUE             0.588960
precision_CHIKUNGUNYA        0.654805
precision_OUTRAS_DOENCAS     0.599717
f1_score_DENGUE              0.506373
f1_score_CHIKUNGUNYA         0.709644
f1_score_OUTRAS_DOENCAS      0.614047
dtype: float64