### Libraries

In [None]:
import time
import shap
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
from collections import Counter
from IPython.display import Image
from sklearn.tree import export_graphviz

from imblearn.over_sampling import SMOTE, ADASYN
from skopt.space import Real, Categorical, Integer

from sklearn.preprocessing import LabelEncoder, LabelBinarizer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import joblib
from plot_roc import *

from sklearn.metrics  import confusion_matrix, classification_report, roc_auc_score

warnings.filterwarnings('ignore')

### Read data

In [None]:
# Silence library warnings and let pandas display every column of a frame.
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

# The four splits are semicolon-delimited CSV files in the working directory.
CSV_SEP = ";"
X_train, X_test = (
    pd.read_csv(path, sep=CSV_SEP) for path in ("X_train.csv", "X_test.csv")
)
y_train, y_test = (
    pd.read_csv(path, sep=CSV_SEP) for path in ("y_train.csv", "y_test.csv")
)

# Preview the first rows of the training features.
X_train.head()

In [None]:
# Drop the same three columns from both the training and the test features.
dropped_columns = ['H1RFV', 'H1LFV', 'H1RRO']

X_train = X_train.drop(columns=dropped_columns)
X_test = X_test.drop(columns=dropped_columns)

In [None]:
def prepare_targets(y_train, y_test):
    """Label-encode the train and test targets with one shared encoder.

    The encoder is fitted on ``y_train`` only and then applied to both
    inputs, so a label that appears only in ``y_test`` makes the transform
    fail.

    Returns:
        A ``(y_train_enc, y_test_enc)`` tuple of encoded label arrays.
    """
    encoder = LabelEncoder().fit(y_train)
    return encoder.transform(y_train), encoder.transform(y_test)

In [None]:
# Encode both target sets; y_test_enc is reused by the evaluation cells below.
y_train_enc, y_test_enc = prepare_targets(y_train, y_test)

In [None]:
# Class distribution of the GR column in the training targets.
y_train.GR.value_counts()

In [None]:
# Class distribution of the GR column in the test targets.
y_test.GR.value_counts()

In [None]:
# Display the encoded test labels.
y_test_enc

In [None]:
# y_test

In [None]:
# Preview the test features after the column drop above.
X_test.head(5)

## Predict

In [None]:
def fit_and_print(model, X_test, y_enc):
    """Show a confusion-matrix heatmap and print the classification report.

    Despite the name, nothing is fitted here: ``model`` is only asked to
    predict on ``X_test``, and those predictions are compared with ``y_enc``.
    """
    predictions = model.predict(X_test)
    matrix = confusion_matrix(y_enc, predictions)
    sns.heatmap(matrix, annot=True)
    report = classification_report(y_enc, predictions)
    print("Classification Report: \n", report)


def fit_and_print_v2(model, X_test, y_enc=None, class_names=("A", "B", "C")):
    """Evaluate an already-fitted classifier on ``X_test``.

    Draws a labelled confusion-matrix heatmap, prints the classification
    report, and prints the per-class one-vs-rest ROC AUC and Gini
    (``2 * AUC - 1``). Nothing is fitted, despite the name.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Features to score.
        y_enc: Encoded true labels for ``X_test``. When omitted, the
            notebook-level ``y_test_enc`` is used, which keeps existing
            two-argument calls working.
        class_names: Display names for the confusion-matrix rows and
            columns, in encoded-label order. Their count must match the
            number of classes in the confusion matrix.

    NOTE(review): the AUC is computed from hard class predictions that are
    one-hot encoded, not from predicted probabilities, so each class gets a
    single operating point. Confirm this is intended.
    NOTE(review): ``plt`` is not imported explicitly in this notebook; it is
    presumably provided by ``from plot_roc import *``. Confirm.
    """
    if y_enc is None:
        # Backward-compatible default: the globally encoded test labels.
        y_enc = y_test_enc

    y_pred = model.predict(X_test)

    # Pass index/columns by keyword: positionally, DataFrame takes
    # (data, index, columns), which the previous call had swapped.
    labels = list(class_names)
    cm = confusion_matrix(y_enc, y_pred)
    cm_df = pd.DataFrame(cm, index=labels, columns=labels)

    plt.figure(figsize=(8, 6))
    # fmt="d" prints integer counts instead of the default ".2g", which
    # switches to scientific notation for counts of 100 or more.
    sns.heatmap(cm_df, annot=True, fmt="d")
    print("Classification Report: \n", classification_report(y_enc, y_pred))

    # One-hot encode truth and predictions so roc_auc_score can return one
    # score per class (average=None).
    lb = LabelBinarizer()
    lb.fit(y_enc)
    y_true_bin = lb.transform(y_enc)
    y_pred_bin = lb.transform(y_pred)

    roc_auc = roc_auc_score(y_true_bin, y_pred_bin, multi_class='ovr', average=None)
    gini = 2 * roc_auc - 1
    print("Gini: ", gini)
    print("ROC AUC:: ", roc_auc)

In [None]:
# Path of the persisted scaler object.
file_std = "./models/std.pickle"

# joblib.load unpickles the file, so only load artifacts from a trusted source.
# Presumably a fitted StandardScaler, given the name - confirm.
std = joblib.load(file_std)

In [None]:
# Apply the loaded transformer to both feature sets; the *_std variables are
# what the model evaluation cells below consume.
X_train_std = std.transform(X_train)
X_test_std = std.transform(X_test)

### Random Forest 

In [None]:
# Path of the persisted Random Forest model.
filename = './models/RF_best_bayes.sav'


# Load the model and predict hard class labels on the transformed test set.
RF_best = joblib.load(filename)
y_pred = RF_best.predict(X_test_std)
# plot_roc_curve is not defined in this file; presumably it comes from
# `from plot_roc import *`. It receives predicted labels, not probabilities.
plot_roc_curve(y_test_enc, y_pred)

In [None]:
# fit_and_print(RF_best, X_test, y_test_enc)
# Confusion matrix, classification report and per-class AUC/Gini.
fit_and_print_v2(RF_best, X_test_std)

In [None]:
# Second predict_proba column; prob_test is not referenced again in the
# visible part of this notebook.
prob_test = RF_best.predict_proba(X_test_std)[:,1]

In [None]:
data=pd.DataFrame()
data['GR'] = y_test.GR
data["Probability"] = RF_best.predict_proba(X_test_std)[:,0]
plt.xlim(.0, 1.)
g = sns.kdeplot(data["Probability"][(data["GR"] == "A") & (data["Probability"].notnull())], color="Blue", shade = True)
g = sns.kdeplot(data["Probability"][(data["GR"] == "B") & (data["Probability"].notnull())], ax =g, color="Green", shade= True)
g = sns.kdeplot(data["Probability"][(data["GR"] == "C") & (data["Probability"].notnull())], ax =g, color="Purple", shade= True)
g.set_xlabel("Probability")
g.set_ylabel("Count")
g = g.legend(["A","B", "C"])

In [None]:
plt.xlim(.0, 1.)

sns.kdeplot(
    data=data, x="Probability", hue="GR",
    cumulative=True, common_norm=False, common_grid=False, hue_order=['A', 'B', 'C'], palette=["Blue", "Green", 'Purple'],
)

In [None]:
data=pd.DataFrame()
data['GR'] = y_test.GR
data["Probability"] = RF_best.predict_proba(X_test_std)[:,1]
plt.xlim(.0, 1.)
g = sns.kdeplot(data["Probability"][(data["GR"] == "A") & (data["Probability"].notnull())], color="Blue", shade = True)
g = sns.kdeplot(data["Probability"][(data["GR"] == "B") & (data["Probability"].notnull())], ax =g, color="Green", shade= True)
g = sns.kdeplot(data["Probability"][(data["GR"] == "C") & (data["Probability"].notnull())], ax =g, color="Purple", shade= True)
g.set_xlabel("Probability")
g.set_ylabel("Count")
g = g.legend(["A","B", "C"])

In [None]:
plt.xlim(.0, 1.)

sns.kdeplot(
    data=data, x="Probability", hue="GR",
    cumulative=True, common_norm=False, common_grid=False, hue_order=['A', 'B', 'C'], palette=["Blue", "Green", 'Purple'],
)

In [None]:
data=pd.DataFrame()
data['GR'] = y_test.GR
data["Probability"] = RF_best.predict_proba(X_test_std)[:,2]
plt.xlim(.0, 1.)
g = sns.kdeplot(data["Probability"][(data["GR"] == "A") & (data["Probability"].notnull())], color="Blue", shade = True)
g = sns.kdeplot(data["Probability"][(data["GR"] == "B") & (data["Probability"].notnull())], ax =g, color="Green", shade= True)
g = sns.kdeplot(data["Probability"][(data["GR"] == "C") & (data["Probability"].notnull())], ax =g, color="Purple", shade= True)
g.set_xlabel("Probability")
g.set_ylabel("Count")
g = g.legend(["A","B", "C"])

In [None]:
plt.xlim(.0, 1.)

sns.kdeplot(
    data=data, x="Probability", hue="GR",
    cumulative=True, common_norm=False, common_grid=False, hue_order=['A', 'B', 'C'], palette=["Blue", "Green", 'Purple'],
)

In [None]:
# Display the first fitted tree of the forest.
RF_best.estimators_[0]

In [None]:
# Handles for the first two trees plus feature and class names. In the
# visible source they are referenced only by commented-out export_graphviz calls.
tree_small_0 = RF_best.estimators_[0]
tree_small_1 = RF_best.estimators_[1]
feature_list = list(X_train.columns)
name_list = ['A', 'B', 'C']

In [None]:
# export_graphviz(tree_small_0, out_file = 'tree_0_smote.dot', feature_names = feature_list, class_names = name_list,  rounded = True, precision = 1, filled = True)
# export_graphviz(tree_small_1, out_file = 'tree_1_smote.dot', feature_names = feature_list, class_names = name_list,  rounded = True, precision = 1, filled = True)

# !dot -Tpng tree.dot -o small_tree.png -Gdpi=600

In [None]:
# Image(filename = 'images/small_tree_0_smote.png')

In [None]:
# Image(filename = 'images/small_tree_1_smote.png')

In [None]:
# Horizontal bar chart of the (at most) 40 most important features.
TOP_N = 40
names_classifiers = [("Random Forest", RF_best)]
name, classifier = names_classifiers[0]

importances = classifier.feature_importances_
# Indices sorted by descending importance, truncated to the top TOP_N.
indices = np.argsort(importances)[::-1][:TOP_N]

g = sns.barplot(y=X_train.columns[indices], x=importances[indices], orient='h')
g.set_xlabel("Relative importance", fontsize=12)
g.set_ylabel("Features", fontsize=12)
g.tick_params(labelsize=9)
g.set_title(name + " feature importance")

In [None]:
# Build a tree explainer for the Random Forest and compute SHAP values on the
# transformed training features.
explainer = shap.TreeExplainer(RF_best)
shap_values = explainer.shap_values(X_train_std)

In [None]:
# indices = np.argsort(shap_values)[::-1]
# X_train.columns[indices]

In [None]:
# Summary plot over all classes; feature names come from X_train because
# X_train_std is passed without them here.
shap.summary_plot(shap_values, X_train_std, class_names= ['A', 'B', 'C'], feature_names = X_train.columns.tolist())

In [None]:
# One summary plot per class.
# NOTE(review): shap_values[k] assumes a per-class sequence; the return layout
# of shap_values depends on the installed shap version - confirm.
shap.summary_plot(shap_values[0], X_train_std, feature_names = X_train.columns.tolist())
shap.summary_plot(shap_values[1], X_train_std, feature_names = X_train.columns.tolist())
shap.summary_plot(shap_values[2], X_train_std, feature_names = X_train.columns.tolist())

In [None]:
# Dependence plot of every feature for shap_values[0]; the untransformed
# X_train values are used for display. The loop variable reuses (and
# overwrites) the `name` set in the feature-importance cell.
for name in X_train.columns:
    shap.dependence_plot(name, shap_values[0], X_train_std, display_features=X_train)

In [None]:
# Same dependence plots for shap_values[1].
for name in X_train.columns:
    shap.dependence_plot(name, shap_values[1], X_train_std, display_features=X_train)

In [None]:
# Same dependence plots for shap_values[2].
for name in X_train.columns:
    shap.dependence_plot(name, shap_values[2], X_train_std, display_features=X_train)

In [None]:
# Force plots for the training row at position 1, one per class, annotated
# with the untransformed feature values.
shap.force_plot(explainer.expected_value[0], shap_values[0][1,:], X_train.iloc[1,:], matplotlib=True)
shap.force_plot(explainer.expected_value[1], shap_values[1][1,:], X_train.iloc[1,:], matplotlib=True)
shap.force_plot(explainer.expected_value[2], shap_values[2][1,:], X_train.iloc[1,:], matplotlib=True)

In [None]:
# NOTE(review): this shows the row at position 0, while the force plots above
# explain the row at position 1 - confirm which row was intended.
X_train.head(1)

In [None]:
# Display the raw (unencoded) training targets.
y_train

In [None]:
# shap.initjs()
# shap.force_plot(explainer.expected_value[1], shap_values[1][:,:], X_resampled.iloc[:,:])

### Support Vector Machine

In [None]:
filename = './models/SVM_best_bayes.sav'

SVM_best = joblib.load(filename)
y_pred_svm = SVM_best.predict(X_test)
plot_roc_curve(y_test_enc, y_pred_svm)

In [None]:
# fit_and_print_v2(SVM_best, X_test)

fit_and_print_v2(SVM_best, X_test_std)

### Logistic Regression 

In [None]:
filename = './models/LR_best_bayes.sav'

LR_best = joblib.load(filename)
y_pred = LR_best.predict(X_test)
plot_roc_curve(y_test_enc, y_pred)

In [None]:
# fit_and_print(LR_best, X_test, y_test_enc)

fit_and_print_v2(LR_best, X_test_std)

### KNN

In [None]:
filename = './models/KNN_best_bayes.sav'

KNN_best = joblib.load(filename)
y_pred = KNN_best.predict(X_test)
plot_roc_curve(y_test_enc, y_pred)

In [None]:
# fit_and_print(KNN_best, X_test, y_test_enc)
fit_and_print_v2(KNN_best, X_test_std)

In [None]:
# fit_and_print(KNN_best, X_train, y_train_enc)

### GBM

In [None]:
filename = './models/GBM_best_bayes.sav'

GBM_best = joblib.load(filename)
y_pred = GBM_best.predict(X_test)
plot_roc_curve(y_test_enc, y_pred)

In [None]:
# fit_and_print(GBM_best, X_test, y_test_enc)
fit_and_print_v2(GBM_best, X_test_std)