### Libraries

In [None]:
import time
import joblib
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
from scipy.stats import ks_2samp
from IPython.display import Image

import shap
from sklearn.preprocessing import LabelEncoder, LabelBinarizer
from sklearn.model_selection import RepeatedStratifiedKFold, LeaveOneOut, StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.tree import export_graphviz
# from yellowbrick.model_selection import FeatureImportances

from imblearn.over_sampling import SMOTE, ADASYN


import joblib
import pickle
import xgboost as xgb
from plot_learning import *
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.model_selection import GridSearchCV, learning_curve, RandomizedSearchCV
from sklearn.metrics  import average_precision_score, make_scorer, roc_curve,f1_score, precision_score, recall_score, fbeta_score, auc, roc_auc_score, accuracy_score, confusion_matrix, classification_report,precision_recall_curve
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

import skfuzzy as fuzz
from skfuzzy import control as ctrl

warnings.filterwarnings('ignore')

In [None]:
import sklearn
sklearn.__version__

### Read 

In [None]:
# Silence library warnings and show every column when a DataFrame is displayed.
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

# Pre-split feature matrices, read from semicolon-delimited CSV files.
X_train = pd.read_csv("X_train.csv", sep = ";")
X_test = pd.read_csv("X_test.csv", sep = ";")

# Matching target files (the label column is accessed as `GR` in later cells).
y_train = pd.read_csv("y_train.csv", sep = ";")
y_test = pd.read_csv("y_test.csv", sep = ";")

# Preview the first rows of the training features.
X_train.head()

In [None]:
# List the feature names available in the training set.
X_train.columns

In [None]:
# Distinct values taken by the target column GR.
y_train.GR.unique()

In [None]:
# Number of training rows per value of the target column GR.
y_train.GR.value_counts()

In [None]:
# Remove variables: the same columns are dropped from both splits so that
# train and test keep an identical feature layout.
dropped_columns = ['H1RFV', 'H1LFV', 'H1RRO']
X_train = X_train.drop(columns=dropped_columns)
X_test = X_test.drop(columns=dropped_columns)

## Dataprep

In [None]:
def prepare_targets(y_train, y_test):
    """Encode the class labels of both splits as integers.

    The label encoder is fitted on ``y_train`` only; the mapping learned
    there is then applied to ``y_train`` and ``y_test`` alike.

    Args:
        y_train: Training labels used to learn the label -> integer mapping.
        y_test: Test labels, encoded with the mapping learned from ``y_train``.

    Returns:
        A tuple ``(y_train_enc, y_test_enc)`` with the encoded labels.
    """
    encoder = LabelEncoder().fit(y_train)
    return encoder.transform(y_train), encoder.transform(y_test)

In [None]:
# Integer-encode the targets of both splits (encoder fitted on the training labels).
y_train_enc, y_test_enc = prepare_targets(y_train, y_test)

In [None]:
# Encoded classes present in the training split and how often each occurs.
train_class_counts = Counter(y_train_enc)
print(train_class_counts.keys())
print(train_class_counts.values())

In [None]:
# Encoded classes present in the test split and how often each occurs.
test_class_counts = Counter(y_test_enc)
print(test_class_counts.keys())
print(test_class_counts.values())

In [None]:
# Standardise the features. The scaler learns its statistics from the
# training split only and is then applied unchanged to both splits, so no
# information from the test split enters the fit.
# (MinMaxScaler() was the alternative considered here.)
std = StandardScaler().fit(X_train)
X_train = std.transform(X_train)
X_test = std.transform(X_test)

In [None]:
# Persist the fitted scaler so the same transformation can be re-applied later.
# The context manager guarantees the file handle is flushed and closed
# (the bare open(...) used previously was never closed).
with open("./models/std.pickle", "wb") as scaler_file:
    pickle.dump(std, scaler_file)

## Model

In [None]:
def fit_and_print(model, X_test):
    """Evaluate an already fitted classifier on the test split and report it.

    Despite its name, this function does not fit anything: it only calls
    ``model.predict`` (and ``model.predict_proba`` when available).
    Ground truth is read from the module-level ``y_test_enc``.

    Output (nothing is returned):
      * a heatmap of the confusion matrix on a new matplotlib figure;
      * the printed classification report;
      * the printed per-class one-vs-rest ROC AUC and Gini (``2 * AUC - 1``).

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Feature matrix whose rows line up with ``y_test_enc``.
    """
    y_pred = model.predict(X_test)

    # NOTE(review): assumes exactly three encoded classes that correspond, in
    # order, to "A", "B", "C" - confirm against the label encoder's classes.
    class_names = ["A", "B", "C"]
    cm = confusion_matrix(y_test_enc, y_pred)
    # Keyword arguments on purpose: positionally, DataFrame takes
    # (data, index, columns), which the earlier call had swapped.
    cm_df = pd.DataFrame(cm, index=class_names, columns=class_names)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm_df, annot=True)
    print("Classification Report: \n", classification_report(y_test_enc, y_pred))

    lb = LabelBinarizer()
    lb.fit(y_test_enc)
    y_true_bin = lb.transform(y_test_enc)

    # ROC AUC is a ranking metric and needs continuous scores. Feeding it
    # binarized hard predictions collapses each class's ROC curve to a single
    # operating point. Use class probabilities whenever the model provides
    # them and their columns line up with the binarized ground truth.
    y_score = None
    if hasattr(model, "predict_proba"):
        proba = np.asarray(model.predict_proba(X_test))
        model_classes = getattr(model, "classes_", None)
        if (model_classes is not None
                and proba.shape == y_true_bin.shape
                and np.array_equal(model_classes, lb.classes_)):
            y_score = proba
    if y_score is None:
        # Fallback (previous behaviour): one-hot encoded hard predictions.
        y_score = lb.transform(y_pred)

    roc_auc = roc_auc_score(y_true_bin, y_score, multi_class='ovr', average=None)
    gini = 2 * roc_auc - 1
    print("Gini: ", gini)
    print("ROC AUC:: ", roc_auc)

In [None]:
def plot_learning_curve_cross_validation(df, title, ylim=None, train_sizes=None):
    """Plot mean train and cross-validation scores per search iteration.

    Each row of ``df`` is one evaluated configuration. The mean score is drawn
    as a line and the band around it spans one standard deviation either side.

    Args:
        df: DataFrame with the columns ``mean_train_score``,
            ``std_train_score``, ``mean_test_score`` and ``std_test_score``.
        title: Figure title.
        ylim: Optional ``(low, high)`` pair forwarded to ``plt.ylim``.
        train_sizes: x coordinates, one per row of ``df``. Defaults to
            ``1..len(df)``; the previous fixed default of 32 points failed
            for searches with a different number of iterations.

    Returns:
        The ``matplotlib.pyplot`` module, so the caller can keep customising
        or show the figure.
    """
    if train_sizes is None:
        train_sizes = np.arange(1, len(df) + 1)

    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Iterations")
    plt.ylabel("Score")
    plt.grid()

    # (mean, std, colour, legend label) for each curve; training is drawn first.
    curves = [
        (df.mean_train_score, df.std_train_score, "r", "Training score"),
        (df.mean_test_score, df.std_test_score, "g", "Cross-validation score"),
    ]

    # Shaded +/- 1 std bands first, then the mean lines on top of them.
    for mean, std_dev, colour, _ in curves:
        plt.fill_between(train_sizes, mean - std_dev, mean + std_dev,
                         alpha=0.1, color=colour)
    for mean, _, colour, label in curves:
        plt.plot(train_sizes, mean, 'o-', color=colour, label=label)

    plt.legend(loc="best")
    return plt

### Random Forest 

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

In [None]:
# Base estimator whose hyperparameters are tuned below (fixed seed).
RF = RandomForestClassifier(random_state = 23)
# Leave-one-out cross-validation: every sample is held out once as a
# single-item validation fold.
cv = LeaveOneOut()
# cv = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)

# Hyperparameter ranges explored by the Bayesian search.
RF_search_space = {"bootstrap": Categorical([True]), # bootstrap is pinned to True; False is not explored
        "criterion": Categorical(["gini", "entropy"]),
        "max_depth": Integer(3, 5, prior='uniform'),
        "max_features": Categorical(['sqrt','log2']), 
        "min_samples_leaf": Integer(2, 10, prior='uniform'),
        "min_samples_split": Integer(3, 15, prior='uniform'),
        "n_estimators": Integer(5, 20, prior='uniform')
    }

In [None]:
#https://scikit-optimize.github.io/stable/modules/generated/skopt.BayesSearchCV.html

# Bayesian hyperparameter search for the Random Forest: 32 configurations,
# scored by accuracy under `cv`, with a Gaussian-process ('GP') surrogate.
RF_bayes_search = BayesSearchCV(RF, RF_search_space, n_iter=32, return_train_score = True, # n_iter = number of configurations evaluated
                                    scoring="accuracy", n_jobs=-1, cv=cv, random_state = 100, optimizer_kwargs = {'base_estimator': 'GP'})

# Wall-clock time of the whole search, in seconds.
inicio = time.time()
RF_bayes_search.fit(X_train, y_train_enc) # no callback is passed, so nothing is printed per iteration
fim = time.time()
print("time train")
print(fim - inicio)
print("Acc")
# Best mean cross-validated accuracy found by the search.
print(RF_bayes_search.best_score_)
# Estimator built with the best configuration (presumably refitted on the
# full training set, the library default - confirm).
RF_best = RF_bayes_search.best_estimator_
print(RF_bayes_search.best_estimator_)
print(RF_bayes_search.best_params_)

In [None]:
# All evaluated Random Forest configurations, best-ranked first; show the top three.
results = pd.DataFrame(RF_bayes_search.cv_results_).sort_values(by = 'rank_test_score')
results.head(3)

In [None]:
# Mean/std of the train and validation scores for every search iteration,
# plotted in the order the configurations were evaluated.
score_columns = ['mean_test_score', 'std_test_score', 'mean_train_score', 'std_train_score']
results_cv2 = pd.DataFrame(RF_bayes_search.cv_results_)[score_columns]
plot_learning_curve_cross_validation(results_cv2, "RF learning curve - BayesSearchCV")

In [None]:
# Score summary (train and validation, mean and std) of the configuration
# selected by the Random Forest search.
results_cv2 = pd.DataFrame(RF_bayes_search.cv_results_)
best_params = results_cv2.loc[[RF_bayes_search.best_index_]]

best_params[['mean_test_score', 'std_test_score', 'mean_train_score', 'std_train_score']]

In [None]:
# (rows, columns) of the search results table: one row per evaluated configuration.
results.shape

In [None]:
# Learning curves of the tuned Random Forest under the same CV scheme.
# `plot_learning_curve` is not defined in this notebook; presumably it comes
# from the star-import of `plot_learning` - confirm.
g = plot_learning_curve(RF_best,"Random Forest learning curves",X_train, y_train_enc,cv=cv)

In [None]:
# Persist the tuned Random Forest to disk with joblib.
filename_vot = './models/RF_best_bayes.sav'
joblib.dump(RF_best, filename_vot)

Validation

In [None]:
# Test-set evaluation of the tuned Random Forest: confusion-matrix heatmap,
# classification report, per-class ROC AUC and Gini.
fit_and_print(RF_best, X_test)

### Support Vector Machine

https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html

In [None]:
# Base SVM; probability=True makes the fitted model expose predict_proba.
SVM = SVC(random_state = 242, probability=True)
# Leave-one-out cross-validation: every sample is held out once.
cv = LeaveOneOut()

# Hyperparameter ranges explored by the Bayesian search.
SVM_search_space = {
        'gamma': Real(0.01, 10, prior='uniform'), 
        'kernel': Categorical(['rbf', 'poly', 'sigmoid']), 
        'C': Real(0.01, 10, prior='uniform')
    }


In [None]:
# Bayesian hyperparameter search for the SVM: 32 configurations, scored by
# accuracy under `cv`, with a Gaussian-process ('GP') surrogate.
SVM_bayes_search = BayesSearchCV(SVM, SVM_search_space, n_iter=32, return_train_score = True,# n_iter = number of configurations evaluated
                                    scoring="accuracy", n_jobs=-1, cv=cv, random_state = 101, optimizer_kwargs = {'base_estimator': 'GP'} )

# Wall-clock time of the whole search, in seconds.
inicio = time.time()
SVM_bayes_search.fit(X_train, y_train_enc) # no callback is passed, so nothing is printed per iteration
fim = time.time()
print("time train")
print(fim - inicio)
print("Acc")
# Best mean cross-validated accuracy found by the search.
print(SVM_bayes_search.best_score_)
# Estimator built with the best configuration (presumably refitted on the
# full training set, the library default - confirm).
SVM_best = SVM_bayes_search.best_estimator_
print(SVM_bayes_search.best_estimator_)
print(SVM_bayes_search.best_params_)

In [None]:
# SVM_bayes_search.optimizer_results_

In [None]:
# All evaluated SVM configurations, best-ranked first; show the top three.
results = pd.DataFrame(SVM_bayes_search.cv_results_).sort_values(by = 'rank_test_score')
results.head(3)

In [None]:
# Mean/std of the train and validation scores for every search iteration,
# plotted in the order the configurations were evaluated.
score_columns = ['mean_test_score', 'std_test_score', 'mean_train_score', 'std_train_score']
results_cv2 = pd.DataFrame(SVM_bayes_search.cv_results_)[score_columns]
plot_learning_curve_cross_validation(results_cv2, "SVM learning curve - BayesSearchCV")

In [None]:
# Score summary (train and validation, mean and std) of the configuration
# selected by the SVM search.
results_cv2 = pd.DataFrame(SVM_bayes_search.cv_results_)
best_params = results_cv2.loc[[SVM_bayes_search.best_index_]]

best_params[['mean_test_score', 'std_test_score', 'mean_train_score', 'std_train_score']]

In [None]:
# Learning curves of the tuned SVM under the same CV scheme.
# `plot_learning_curve` is not defined in this notebook; presumably it comes
# from the star-import of `plot_learning` - confirm.
g = plot_learning_curve(SVM_best,"SVM learning curves",X_train,y_train_enc, cv=cv)

In [None]:
# Persist the tuned SVM to disk with joblib.
filename_vot = './models/SVM_best_bayes.sav'
joblib.dump(SVM_best, filename_vot)

Validation

In [None]:
# Test-set evaluation of the tuned SVM: confusion-matrix heatmap,
# classification report, per-class ROC AUC and Gini.
fit_and_print(SVM_best, X_test)

### Logistic Regression 

https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

In [None]:
# Base logistic regression (fixed seed); everything else is set by the search.
LR = LogisticRegression(random_state=25)
# Leave-one-out cross-validation: every sample is held out once.
cv = LeaveOneOut()

# Hyperparameter ranges explored by the Bayesian search. Single-value
# Categorical entries pin a setting rather than search over it.
# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases -
# confirm against the installed version.
LR_search_space = {
    'solver': Categorical(['saga']),  
    'penalty': Categorical(['l1','l2']),
    'tol': Real(1e-5, 1e-3, 'uniform'),
    'C': Real(1e-3, 10, 'uniform'),
    'fit_intercept': Categorical([True, False]),
    'class_weight': Categorical(['balanced']),
    'multi_class': Categorical(['multinomial'])
}

In [None]:
# Bayesian hyperparameter search for logistic regression: 32 configurations,
# scored by accuracy under `cv`, with a Gaussian-process ('GP') surrogate.
LR_bayes_search = BayesSearchCV(LR, LR_search_space, n_iter=32, return_train_score = True, # n_iter = number of configurations evaluated
                                    scoring="accuracy", n_jobs=-1, cv=cv, random_state = 102, optimizer_kwargs = {'base_estimator': 'GP'})

# Wall-clock time of the whole search, in seconds.
inicio = time.time()
LR_bayes_search.fit(X_train, y_train_enc) # no callback is passed, so nothing is printed per iteration
fim = time.time()
print("time train")
print(fim - inicio)
print("Acc")
# Best mean cross-validated accuracy found by the search.
print(LR_bayes_search.best_score_)
# Estimator built with the best configuration (presumably refitted on the
# full training set, the library default - confirm).
LR_best = LR_bayes_search.best_estimator_
print(LR_bayes_search.best_estimator_)
print(LR_bayes_search.best_params_)

In [None]:
# Mean/std of the train and validation scores for every search iteration,
# plotted in the order the configurations were evaluated.
score_columns = ['mean_test_score', 'std_test_score', 'mean_train_score', 'std_train_score']
results_cv2 = pd.DataFrame(LR_bayes_search.cv_results_)[score_columns]
plot_learning_curve_cross_validation(results_cv2, "LR learning curve - BayesSearchCV")

In [None]:
# Score summary (train and validation, mean and std) of the configuration
# selected by the logistic-regression search.
results_cv2 = pd.DataFrame(LR_bayes_search.cv_results_)
best_params = results_cv2.loc[[LR_bayes_search.best_index_]]

best_params[['mean_test_score', 'std_test_score', 'mean_train_score', 'std_train_score']]

In [None]:
# Learning curves of the tuned logistic regression under the same CV scheme.
# `plot_learning_curve` is not defined in this notebook; presumably it comes
# from the star-import of `plot_learning` - confirm.
g = plot_learning_curve(LR_best,"Logistic Regression learning curves",X_train, y_train_enc, cv=cv)

In [None]:
# Persist the tuned logistic regression to disk with joblib.
filename_vot = './models/LR_best_bayes.sav'
joblib.dump(LR_best, filename_vot)

Validation

In [None]:
# Test-set evaluation of the tuned logistic regression: confusion-matrix
# heatmap, classification report, per-class ROC AUC and Gini.
fit_and_print(LR_best, X_test)

### KNN

https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html

In [None]:
# Base k-nearest-neighbours classifier with library defaults.
KNN = KNeighborsClassifier()
# Leave-one-out cross-validation: every sample is held out once.
cv = LeaveOneOut()

# Hyperparameter ranges explored by the Bayesian search.
# NOTE(review): per the scikit-learn docs `leaf_size` affects tree
# construction/query speed and memory rather than predictions - consider
# whether it belongs in an accuracy-driven search.
KNN_search_space = {
    'weights': Categorical(['uniform', 'distance']),  
    'n_neighbors': Integer(3, 8, 'uniform'), 
    'p': Integer(1, 3, 'uniform'), 
    'metric': Categorical(['minkowski']), 
    'leaf_size': Integer(10, 20, prior='uniform')}

In [None]:
# Bayesian hyperparameter search for KNN: 32 configurations, scored by
# accuracy under `cv`, with a Gaussian-process ('GP') surrogate.
KNN_bayes_search = BayesSearchCV(KNN, KNN_search_space, n_iter=32, return_train_score = True, # n_iter = number of configurations evaluated
                                    scoring="accuracy", n_jobs=-1, cv=cv, random_state = 104, optimizer_kwargs = {'base_estimator': 'GP'})

# Wall-clock time of the whole search, in seconds.
inicio = time.time()
KNN_bayes_search.fit(X_train, y_train_enc) # no callback is passed, so nothing is printed per iteration
fim = time.time()
print("time train")
print(fim - inicio)
print("Acc")
# Best mean cross-validated accuracy found by the search.
print(KNN_bayes_search.best_score_)
# Estimator built with the best configuration (presumably refitted on the
# full training set, the library default - confirm).
KNN_best = KNN_bayes_search.best_estimator_
print(KNN_bayes_search.best_estimator_)
print(KNN_bayes_search.best_params_)

In [None]:
# Mean/std of the train and validation scores for every search iteration,
# plotted in the order the configurations were evaluated.
score_columns = ['mean_test_score', 'std_test_score', 'mean_train_score', 'std_train_score']
results_cv2 = pd.DataFrame(KNN_bayes_search.cv_results_)[score_columns]
plot_learning_curve_cross_validation(results_cv2, "KNN learning curve - BayesSearchCV")

In [None]:
# Score summary (train and validation, mean and std) of the configuration
# selected by the KNN search.
results_cv2 = pd.DataFrame(KNN_bayes_search.cv_results_)
best_params = results_cv2.loc[[KNN_bayes_search.best_index_]]

best_params[['mean_test_score', 'std_test_score', 'mean_train_score', 'std_train_score']]

In [None]:
# Learning curves of the tuned KNN model under the same CV scheme.
# `plot_learning_curve` is not defined in this notebook; presumably it comes
# from the star-import of `plot_learning` - confirm.
g = plot_learning_curve(KNN_best,"K-nearest neighbors learning curves", X_train, y_train_enc, cv=cv)

In [None]:
# Persist the tuned KNN model to disk with joblib.
filename_vot = './models/KNN_best_bayes.sav'
joblib.dump(KNN_best, filename_vot)

Validation

In [None]:
# Test-set evaluation of the tuned KNN model: confusion-matrix heatmap,
# classification report, per-class ROC AUC and Gini.
fit_and_print(KNN_best, X_test)

### GBM

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html

In [None]:
# Base gradient-boosting classifier (fixed seed).
GBM = GradientBoostingClassifier(random_state=32)
# Leave-one-out cross-validation: every sample is held out once.
cv = LeaveOneOut()
# Hyperparameter ranges explored by the Bayesian search; `loss` is pinned
# to a single value rather than searched.
GBM_search_space = {

        'max_depth': Integer(3, 5, prior='uniform'),
        'loss': Categorical(['log_loss']),
        'learning_rate' : Real(0.001, 0.05, prior='uniform') , 
        'n_estimators': Integer(5, 15, prior='uniform'),
        'criterion': Categorical(['friedman_mse', 'squared_error']),
        "max_features": Categorical(['sqrt','log2']), 
        "min_samples_leaf": Integer(2, 10, prior='uniform'),
        "min_samples_split": Integer(2, 10, prior='uniform')
    }

In [None]:
# Bayesian hyperparameter search for gradient boosting: 32 configurations,
# scored by accuracy under `cv`, with a Gaussian-process ('GP') surrogate.
GBM_bayes_search = BayesSearchCV(GBM, GBM_search_space, n_iter=32, return_train_score = True, # n_iter = number of configurations evaluated
                                    scoring="accuracy", n_jobs=-1, cv=cv, random_state = 106, optimizer_kwargs = {'base_estimator': 'GP'})

# Wall-clock time of the whole search, in seconds.
inicio = time.time()
GBM_bayes_search.fit(X_train, y_train_enc) # no callback is passed, so nothing is printed per iteration
fim = time.time()
print("time train")
print(fim - inicio)
print("Acc")
# Best mean cross-validated accuracy found by the search.
print(GBM_bayes_search.best_score_)
# Estimator built with the best configuration (presumably refitted on the
# full training set, the library default - confirm).
GBM_best = GBM_bayes_search.best_estimator_
print(GBM_bayes_search.best_estimator_)
print(GBM_bayes_search.best_params_)

In [None]:
# Mean/std of the train and validation scores for every search iteration,
# plotted in the order the configurations were evaluated.
score_columns = ['mean_test_score', 'std_test_score', 'mean_train_score', 'std_train_score']
results_cv2 = pd.DataFrame(GBM_bayes_search.cv_results_)[score_columns]
plot_learning_curve_cross_validation(results_cv2, "GBM learning curve - BayesSearchCV")

In [None]:
# Score summary (train and validation, mean and std) of the configuration
# selected by the gradient-boosting search.
results_cv2 = pd.DataFrame(GBM_bayes_search.cv_results_)
best_params = results_cv2.loc[[GBM_bayes_search.best_index_]]

best_params[['mean_test_score', 'std_test_score', 'mean_train_score', 'std_train_score']]

In [None]:
# Learning curves of the tuned gradient-boosting model under the same CV scheme.
# `plot_learning_curve` is not defined in this notebook; presumably it comes
# from the star-import of `plot_learning` - confirm.
g = plot_learning_curve(GBM_best,"GBM learning curves",X_train, y_train_enc,cv=cv)

In [None]:
# Persist the tuned gradient-boosting model to disk with joblib.
filename_vot = './models/GBM_best_bayes.sav'
joblib.dump(GBM_best, filename_vot)

Validation

In [None]:
# Test-set evaluation of the tuned gradient-boosting model: confusion-matrix
# heatmap, classification report, per-class ROC AUC and Gini.
fit_and_print(GBM_best, X_test)