### Libraries

In [None]:
import time
import joblib
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
from scipy.stats import ks_2samp
from IPython.display import Image

import shap
from sklearn.preprocessing import LabelEncoder, LabelBinarizer
from sklearn.model_selection import RepeatedStratifiedKFold, LeaveOneOut, StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.tree import export_graphviz, DecisionTreeClassifier
from yellowbrick.model_selection import FeatureImportances

from imblearn.over_sampling import SMOTE, ADASYN


import joblib
import pickle
import xgboost as xgb
from plot_learning import *
from sklearn.svm import SVC
from sklearn_rvm import EMRVC
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.model_selection import GridSearchCV, learning_curve, RandomizedSearchCV
from sklearn.metrics  import average_precision_score, make_scorer, roc_curve,f1_score, precision_score, recall_score, fbeta_score, auc, roc_auc_score, accuracy_score, confusion_matrix, classification_report,precision_recall_curve
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

from sklearn_genetic import GASearchCV
from sklearn_genetic.space import Continuous, Categorical, Integer
from sklearn.feature_selection import SequentialFeatureSelector

warnings.filterwarnings('ignore')

In [None]:
import sklearn
sklearn.__version__

### Read 

In [None]:
# Silence library warnings and let pandas display every column of wide frames.
warnings.filterwarnings("ignore")
pd.set_option("display.max_columns", None)

# Load the pre-split train / test tables (comma-separated files).
X_train_2, X_test_2 = (
    pd.read_csv(csv_name, sep=",") for csv_name in ("train_2.csv", "test_2.csv")
)

# Preview the first rows of the training table.
X_train_2.head()

In [None]:
# Show the column names available in the training table.
X_train_2.columns

In [None]:
# Targets are kept as single-column DataFrames (double brackets), so later
# cells can still write `y_train.Y`.
y_train, y_test = X_train_2[['Y']], X_test_2[['Y']]

# Predictor columns used by every model below; a single list keeps the
# train and test frames aligned.
feature_columns = ['CONICITY', 'RFV', 'RRO', 'H2RFV', 'PLY', 'LFV', 'CAPSPLICE']

X_train = X_train_2[feature_columns]
X_test = X_test_2[feature_columns]



In [None]:
# Class balance of the training target.
y_train.Y.value_counts()

## Dataprep

## Model

In [None]:
def fit_and_print(model, X_test, y_true=None):
    """Evaluate an already-fitted classifier on a hold-out set.

    Despite its name this helper does not fit anything. It predicts on
    ``X_test``, draws the confusion matrix as a heatmap, and prints the
    classification report, the Gini coefficient and the ROC AUC.

    Parameters
    ----------
    model : fitted estimator
        Must expose ``predict``. ``predict_proba`` or ``decision_function``
        is used for the ROC AUC when available.
    X_test : array-like
        Features of the evaluation set.
    y_true : array-like, optional
        True labels for ``X_test``. Defaults to the notebook-level ``y_test``
        so existing two-argument calls keep working.
    """
    if y_true is None:
        y_true = y_test

    y_pred = model.predict(X_test)

    cm_df = pd.DataFrame(confusion_matrix(y_true, y_pred))
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm_df, annot=True)
    print("Classification Report: \n", classification_report(y_true, y_pred))

    # ROC AUC is a ranking metric: it needs continuous scores. Feeding it the
    # hard labels from predict() collapses the curve to a single threshold and
    # under-reports the model, so prefer probabilities / decision values and
    # only fall back to the labels when the estimator offers nothing else.
    if hasattr(model, "predict_proba"):
        y_score = model.predict_proba(X_test)
        if y_score.shape[1] == 2:
            # Binary case: roc_auc_score expects the score of the second class.
            y_score = y_score[:, 1]
    elif hasattr(model, "decision_function"):
        y_score = model.decision_function(X_test)
    else:
        y_score = y_pred

    roc_auc = roc_auc_score(y_true, y_score, multi_class='ovr', average=None)
    gini = 2 * roc_auc - 1  # Gini coefficient derived from the AUC
    print("Gini: ", gini)
    print("ROC AUC:: ", roc_auc)

In [None]:
def plot_learning_curve_cross_validation(df, title, ylim=None, train_sizes=None):
    """Plot mean train / cross-validation scores with a +/- one std band.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``mean_train_score``, ``std_train_score``,
        ``mean_test_score`` and ``std_test_score`` (one row per point).
    title : str
        Figure title.
    ylim : tuple, optional
        ``(ymin, ymax)`` passed to ``plt.ylim``.
    train_sizes : array-like, optional
        X coordinates, one per row of ``df``. When omitted, ``1..len(df)`` is
        used. The previous fixed default of 64 points only worked for frames
        with exactly 64 rows and made matplotlib fail on any other length.

    Returns
    -------
    module
        The ``matplotlib.pyplot`` module, for chaining (e.g. ``.show()``).
    """
    if train_sizes is None:
        n_points = len(df)
        # Same values the old default produced when the frame had 64 rows.
        train_sizes = np.linspace(1, n_points, n_points)

    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Iterations")
    plt.ylabel("Score")

    train_scores_mean = df.mean_train_score
    train_scores_std = df.std_train_score
    test_scores_mean = df.mean_test_score
    test_scores_std = df.std_test_score
    plt.grid()

    # Shaded bands: mean +/- one standard deviation.
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")

    plt.legend(loc="best")
    return plt

### Decision Tree

In [None]:
DT = DecisionTreeClassifier(random_state=21)

# Leave-one-out cross-validation: one fold per training row.
# Alternatives that were tried before:
#   RepeatedStratifiedKFold(n_splits=2, n_repeats=2)
#   StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)
cv = LeaveOneOut()

DT_search_space = {
    "criterion": Categorical(["gini", "entropy"]),
    "max_depth": Integer(4, 8),
    # 'auto' is intentionally not searched: for classifiers it is the same as
    # 'sqrt', it was deprecated in scikit-learn 1.1 and removed in 1.3 (where
    # it makes every candidate using it fail).
    "max_features": Categorical(["sqrt", "log2"]),
    "min_samples_leaf": Integer(2, 10),
    "min_samples_split": Integer(8, 20),
}

In [None]:
# Genetic-algorithm hyper-parameter search for the decision tree.
DT_GA_search = GASearchCV(
    estimator=DT,
    param_grid=DT_search_space,
    cv=cv,
    scoring="accuracy",
    return_train_score=True,
    n_jobs=-1,
    population_size=10,
    generations=5,
    tournament_size=3,
    elitism=True,
    crossover_probability=0.8,
    mutation_probability=0.1,
    criteria="max",
    algorithm="eaMuPlusLambda",
    keep_top_k=4,
)

# Time the search with wall-clock timestamps taken around fit().
inicio = time.time()
DT_GA_search.fit(X_train, y_train)
fim = time.time()

DT_best = DT_GA_search.best_estimator_

# Report elapsed seconds, best CV accuracy, and the winning configuration.
print("time train", fim - inicio, sep="\n")
print("Acc", DT_GA_search.best_score_, sep="\n")
print(DT_GA_search.best_estimator_)
print(DT_GA_search.best_params_)

In [None]:
# Confusion matrix, classification report and ROC AUC / Gini of the tuned
# decision tree on the test split.
fit_and_print(DT_best, X_test)

### Random Forest 

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

In [None]:
RF = RandomForestClassifier(random_state=23)

# Leave-one-out cross-validation: one fold per training row.
# Alternatives that were tried before:
#   RepeatedStratifiedKFold(n_splits=2, n_repeats=2)
#   StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)
cv = LeaveOneOut()

RF_search_space = {
    # bootstrap accepts True or False; only True is searched here.
    "bootstrap": Categorical([True]),
    "criterion": Categorical(["gini", "entropy"]),
    "max_depth": Integer(4, 8),
    # 'auto' is intentionally not searched: for classifiers it is the same as
    # 'sqrt', it was deprecated in scikit-learn 1.1 and removed in 1.3 (where
    # it makes every candidate using it fail).
    "max_features": Categorical(["sqrt", "log2"]),
    "min_samples_leaf": Integer(2, 10),
    "min_samples_split": Integer(8, 30),
    "n_estimators": Integer(5, 20),
}

In [None]:
# GASearchCV reference:
# https://sklearn-genetic-opt.readthedocs.io/en/stable/api/gasearchcv.html

# Genetic-algorithm hyper-parameter search for the random forest.
RF_GA_search = GASearchCV(
    estimator=RF,
    param_grid=RF_search_space,
    cv=cv,
    scoring="accuracy",
    return_train_score=True,
    n_jobs=-1,
    population_size=10,
    generations=5,
    tournament_size=3,
    elitism=True,
    crossover_probability=0.8,
    mutation_probability=0.1,
    criteria="max",
    algorithm="eaMuPlusLambda",
    keep_top_k=4,
)

# Time the search with wall-clock timestamps taken around fit().
inicio = time.time()
RF_GA_search.fit(X_train, y_train)
fim = time.time()

RF_best = RF_GA_search.best_estimator_

# Report elapsed seconds, best CV accuracy, and the winning configuration.
print("time train", fim - inicio, sep="\n")
print("Acc", RF_GA_search.best_score_, sep="\n")
print(RF_GA_search.best_estimator_)
print(RF_GA_search.best_params_)

In [None]:
from sklearn_genetic.plots import plot_fitness_evolution

# Plot the "fitness" metric recorded by the random-forest GA search.
plt.figure(figsize=(10, 6))
plot = plot_fitness_evolution(RF_GA_search, metric="fitness")
plt.show()

In [None]:
# Candidates evaluated by the RF search, best test rank first; show the top 3.
results = pd.DataFrame(RF_GA_search.cv_results_)
results = results.sort_values(by='rank_test_score')
results.head(3)

In [None]:
# Keep only the aggregate train / test score columns of the RF search results.
results_cv2 = pd.DataFrame(RF_GA_search.cv_results_)[
    ['mean_test_score', 'std_test_score', 'mean_train_score', 'std_train_score']
]
# plot_learning_curve_cross_validation(results_cv2, "RF learning curve - GASearchCV")
results_cv2.head()

In [None]:
# results.shape

In [None]:
# Learning curves of the tuned random forest on the training data.
# plot_learning_curve is not defined in this notebook; presumably it comes
# from the `from plot_learning import *` star import — confirm.
g = plot_learning_curve(RF_best,"Random Forest learning curves",X_train, y_train,cv=cv)

In [None]:
# Persist the tuned random forest (assumes the ./models directory already exists).
filename_vot = './models/RF_best_GA_v2.sav'
joblib.dump(RF_best, filename_vot)

Validation

In [None]:
# Confusion matrix, classification report and ROC AUC / Gini of the tuned
# random forest on the test split.
fit_and_print(RF_best, X_test)

### Support Vector Machine

https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html

In [None]:
# probability=True makes predict_proba available, which plot_roc calls later.
SVM = SVC(probability=True, random_state=261)

# Leave-one-out cross-validation: one fold per training row.
# Alternative tried before: StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)
cv = LeaveOneOut()

# Search ranges for the kernel family, its coefficient and the regularisation strength.
SVM_search_space = {
    "gamma": Continuous(0.01, 10),
    "kernel": Categorical(["rbf", "poly", "sigmoid"]),
    "C": Continuous(0.1, 10),
}


In [None]:
# Genetic-algorithm hyper-parameter search for the support vector machine.
SVM_GA_search = GASearchCV(
    estimator=SVM,
    param_grid=SVM_search_space,
    cv=cv,
    scoring="accuracy",
    return_train_score=True,
    n_jobs=-1,
    population_size=10,
    generations=5,
    tournament_size=3,
    elitism=True,
    crossover_probability=0.8,
    mutation_probability=0.1,
    criteria="max",
    algorithm="eaMuPlusLambda",
    keep_top_k=4,
)

# Time the search with wall-clock timestamps taken around fit().
inicio = time.time()
SVM_GA_search.fit(X_train, y_train)
fim = time.time()

SVM_best = SVM_GA_search.best_estimator_

# Report elapsed seconds, best CV accuracy, and the winning configuration.
print("time train", fim - inicio, sep="\n")
print("Acc", SVM_GA_search.best_score_, sep="\n")
print(SVM_GA_search.best_estimator_)
print(SVM_GA_search.best_params_)

In [None]:
# Candidates evaluated by the SVM search, best test rank first; show the top 3.
results = pd.DataFrame(SVM_GA_search.cv_results_)
results = results.sort_values(by='rank_test_score')
results.head(3)

In [None]:
# Keep only the aggregate train / test score columns of the SVM search results.
results_cv2 = pd.DataFrame(SVM_GA_search.cv_results_)[
    ['mean_test_score', 'std_test_score', 'mean_train_score', 'std_train_score']
]
results_cv2.head()

In [None]:
# Learning curves of the tuned SVM on the training data.
# plot_learning_curve is not defined in this notebook; presumably it comes
# from the `from plot_learning import *` star import — confirm.
g = plot_learning_curve(SVM_best,"SVM learning curves",X_train,y_train,cv=cv)

In [None]:
# Persist the tuned SVM (assumes the ./models directory already exists).
filename_vot = './models/SVM_best_GA_v2.sav'
joblib.dump(SVM_best, filename_vot)

Validation

In [None]:
# Confusion matrix, classification report and ROC AUC / Gini of the tuned
# SVM on the test split.
fit_and_print(SVM_best, X_test)

### Multi-layer Perceptron

https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html

In [None]:
MLP = MLPClassifier(random_state=26)

# Leave-one-out cross-validation: one fold per training row.
# Alternative tried before: StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)
cv = LeaveOneOut()

# The network architecture is left at the estimator default; a layer-size
# search was sketched but is disabled:
#   'hidden_layer_sizes': [(100,), (60,70,50), (70,80,60)]
MLP_search_space = {
    "activation": Categorical(["tanh", "relu", "logistic", "identity"]),
    "solver": Categorical(["sgd", "adam"]),
    "alpha": Continuous(0.001, 0.1),
    "learning_rate": Categorical(["adaptive", "constant", "invscaling"]),
    "learning_rate_init": Continuous(0.001, 0.1),
}

In [None]:
# Genetic-algorithm hyper-parameter search for the multi-layer perceptron.
# A RandomizedSearchCV variant was used before and is kept for reference:
# MLP_model = RandomizedSearchCV(estimator= MLP, param_distributions = MLP_search_space, cv=cv, scoring="accuracy")
MLP_GA_search = GASearchCV(
    estimator=MLP,
    param_grid=MLP_search_space,
    cv=cv,
    scoring="accuracy",
    return_train_score=True,
    n_jobs=-1,
    population_size=10,
    generations=5,
    tournament_size=3,
    elitism=True,
    crossover_probability=0.8,
    mutation_probability=0.1,
    criteria="max",
    algorithm="eaMuPlusLambda",
    keep_top_k=4,
)

# Re-apply the warning filter right before the fit.
warnings.filterwarnings("ignore")

# Time the search with wall-clock timestamps taken around fit().
inicio = time.time()
MLP_GA_search.fit(X_train, y_train)
fim = time.time()

MLP_best = MLP_GA_search.best_estimator_

# Report elapsed seconds, best CV accuracy, and the winning configuration.
print("time train", fim - inicio, sep="\n")
print("Acc", MLP_GA_search.best_score_, sep="\n")
print(MLP_GA_search.best_estimator_)
print(MLP_GA_search.best_params_)

In [None]:
# Keep only the aggregate train / test score columns of the MLP search results.
results_cv2 = pd.DataFrame(MLP_GA_search.cv_results_)[
    ['mean_test_score', 'std_test_score', 'mean_train_score', 'std_train_score']
]
# plot_learning_curve_cross_validation(results_cv2, "MLP learning curve - GASearchCV")
results_cv2.head()

In [None]:
# Learning curves of the tuned MLP on the training data.
# plot_learning_curve is not defined in this notebook; presumably it comes
# from the `from plot_learning import *` star import — confirm.
g = plot_learning_curve(MLP_best,"Multi-layer Perceptron learning curves", X_train, y_train,cv=cv)

In [None]:
# Persist the tuned MLP (assumes the ./models directory already exists).
filename_vot = './models/MLP_best_GA_v2.sav'
joblib.dump(MLP_best, filename_vot)

Validation

In [None]:
# Confusion matrix, classification report and ROC AUC / Gini of the tuned
# MLP on the test split.
fit_and_print(MLP_best, X_test)

### GBM

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html

In [None]:
import re

import sklearn

GBM = GradientBoostingClassifier(random_state=31)

# Leave-one-out cross-validation: one fold per training row.
# Alternative tried before: StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)
cv = LeaveOneOut()

# scikit-learn 1.0 renamed the 'mse' split criterion to 'squared_error' and
# 1.2 removed the old spelling, so use whichever name the installed version
# accepts. Only the leading "major.minor" digits are parsed so that dev / rc
# version strings do not break the check.
_sk_major, _sk_minor = map(int, re.match(r"(\d+)\.(\d+)", sklearn.__version__).groups())
_mse_criterion = "squared_error" if (_sk_major, _sk_minor) >= (1, 0) else "mse"

GBM_search_space = {
    "max_depth": Integer(4, 8),
    # 'loss' is deliberately not searched. The only value that was listed,
    # 'deviance', is the estimator's default loss; that spelling was renamed
    # 'log_loss' in scikit-learn 1.1 and removed in 1.3. Relying on the
    # default gives the same model on every version.
    "learning_rate": Continuous(0.001, 0.01),
    "n_estimators": Integer(5, 20),
    "criterion": Categorical(["friedman_mse", _mse_criterion]),
    # 'auto' is intentionally not searched: for classifiers it is the same as
    # 'sqrt', it was deprecated in scikit-learn 1.1 and removed in 1.3.
    "max_features": Categorical(["sqrt", "log2"]),
    "min_samples_leaf": Integer(2, 8),
    "min_samples_split": Integer(5, 25),
}

In [None]:
# Genetic-algorithm hyper-parameter search for gradient boosting.
GBM_GA_search = GASearchCV(
    estimator=GBM,
    param_grid=GBM_search_space,
    cv=cv,
    scoring="accuracy",
    return_train_score=True,
    n_jobs=-1,
    population_size=10,
    generations=5,
    tournament_size=3,
    elitism=True,
    crossover_probability=0.8,
    mutation_probability=0.1,
    criteria="max",
    algorithm="eaMuPlusLambda",
    keep_top_k=4,
)

# Time the search with wall-clock timestamps taken around fit().
inicio = time.time()
GBM_GA_search.fit(X_train, y_train)
fim = time.time()

GBM_best = GBM_GA_search.best_estimator_

# Report elapsed seconds, best CV accuracy, and the winning configuration.
print("time train", fim - inicio, sep="\n")
print("Acc", GBM_GA_search.best_score_, sep="\n")
print(GBM_GA_search.best_estimator_)
print(GBM_GA_search.best_params_)

In [None]:
# Score of the finished GBM search on the training data itself (presumably the
# search's "accuracy" metric on the refitted best model — confirm). This is
# in-sample, so it is an optimistic number.
GBM_GA_search.score(X_train, y_train)

In [None]:
# Keep only the aggregate train / test score columns of the GBM search results.
results_cv2 = pd.DataFrame(GBM_GA_search.cv_results_)[
    ['mean_test_score', 'std_test_score', 'mean_train_score', 'std_train_score']
]
results_cv2.head()

In [None]:
# Learning curves of the tuned GBM on the training data.
# plot_learning_curve is not defined in this notebook; presumably it comes
# from the `from plot_learning import *` star import — confirm.
g = plot_learning_curve(GBM_best,"GBM learning curves",X_train, y_train,cv=cv)

In [None]:
# Persist the tuned GBM (assumes the ./models directory already exists).
filename_vot = './models/GBM_best_GA_v2.sav'
joblib.dump(GBM_best, filename_vot)

Validation

In [None]:
# Confusion matrix, classification report and ROC AUC / Gini of the tuned
# GBM on the test split.
fit_and_print(GBM_best, X_test)

### LGBM

https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMClassifier.html

In [None]:
LGBM = LGBMClassifier(random_state=32)

# Leave-one-out cross-validation: one fold per training row.
# Alternative tried before: StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)
cv = LeaveOneOut()

# NOTE(review): 'num_iterations' and 'n_estimators' look like two names for
# the same LightGBM setting (number of boosting rounds). If so, only one of
# them takes effect and the other is a wasted search dimension — confirm
# which range is intended and drop the other.
LGBM_search_space = {
    "num_iterations": Integer(25, 50),
    "learning_rate": Continuous(0.001, 0.01),
    "n_estimators": Integer(5, 40),
    # Single-value dimensions: fixed settings expressed through the search space.
    "boosting_type": Categorical(["goss"]),
    "objective": Categorical(["binary"]),
    "num_leaves": Integer(5, 20),
    "min_child_samples": Integer(2, 10),
    "reg_alpha": Continuous(0.01, 0.5),
}

In [None]:
# Genetic-algorithm hyper-parameter search for LightGBM.
LGBM_GA_search = GASearchCV(
    estimator=LGBM,
    param_grid=LGBM_search_space,
    cv=cv,
    scoring="accuracy",
    return_train_score=True,
    n_jobs=-1,
    population_size=10,
    generations=5,
    tournament_size=3,
    elitism=True,
    crossover_probability=0.8,
    mutation_probability=0.1,
    criteria="max",
    algorithm="eaMuPlusLambda",
    keep_top_k=4,
)

# Time the search with wall-clock timestamps taken around fit().
inicio = time.time()
LGBM_GA_search.fit(X_train, y_train)
fim = time.time()

LGBM_best = LGBM_GA_search.best_estimator_

# Report elapsed seconds, best CV accuracy, and the winning configuration.
print("time train", fim - inicio, sep="\n")
print("Acc", LGBM_GA_search.best_score_, sep="\n")
print(LGBM_GA_search.best_estimator_)
print(LGBM_GA_search.best_params_)

In [None]:
# Score of the finished LGBM search on the training data itself (presumably
# the search's "accuracy" metric on the refitted best model — confirm). This
# is in-sample, so it is an optimistic number.
LGBM_GA_search.score(X_train, y_train)

In [None]:
# Keep only the aggregate train / test score columns of the LGBM search results.
results_cv2 = pd.DataFrame(LGBM_GA_search.cv_results_)[
    ['mean_test_score', 'std_test_score', 'mean_train_score', 'std_train_score']
]
results_cv2.head()

In [None]:
# Learning curves of the tuned LGBM on the training data.
# plot_learning_curve is not defined in this notebook; presumably it comes
# from the `from plot_learning import *` star import — confirm.
g = plot_learning_curve(LGBM_best,"LGBM learning curves",X_train, y_train,cv=cv)

In [None]:
# Persist the tuned LGBM (assumes the ./models directory already exists).
filename_vot = './models/LGBM_best_GA_v2.sav'
joblib.dump(LGBM_best, filename_vot)

Validation

In [None]:
# Confusion matrix, classification report and ROC AUC / Gini of the tuned
# LGBM on the test split.
fit_and_print(LGBM_best, X_test)

In [None]:
from sklearn import metrics

In [None]:
def plot_roc(model, l):
    """Refit ``model`` on the training split and draw its test-set ROC curve.

    The estimator is fitted again on the notebook-level ``X_train`` /
    ``y_train`` (any classifier exposing ``predict_proba`` works, not only
    logistic regression). The second column of ``predict_proba`` on
    ``X_test`` is used as the score, and the curve is added to the current
    matplotlib axes with a legend entry of the form ``"<l> , AUC=<value>"``.

    Parameters
    ----------
    model : estimator with ``fit`` and ``predict_proba``
    l : str
        Legend label for this curve.
    """
    model.fit(X_train, y_train)

    # Score of the class in the second predict_proba column.
    second_class_scores = model.predict_proba(X_test)[:, 1]

    false_pos_rate, true_pos_rate, _ = metrics.roc_curve(y_test, second_class_scores)
    area = round(metrics.roc_auc_score(y_test, second_class_scores), 4)

    legend_entry = "".join((l, " , AUC=", str(area)))
    plt.plot(false_pos_rate, true_pos_rate, label=legend_entry)


In [None]:
# Legend names, paired positionally with the tuned estimators below.
labels = ['RF', 'SVM', "MLP", "GBM", "LGBM"]
models = [RF_best, SVM_best, MLP_best, GBM_best, LGBM_best]

# Start from a cleared figure 0 so re-running the cell does not stack curves.
plt.figure(0).clf()

# One ROC curve per model on the shared axes.
for model_label, estimator in zip(labels, models):
    plot_roc(estimator, model_label)

# Show the per-model AUC entries.
plt.legend()