### Libraries

In [None]:
import time
import joblib
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
from scipy.stats import ks_2samp
from IPython.display import Image

import shap
from sklearn.preprocessing import LabelEncoder, LabelBinarizer
from sklearn.model_selection import RepeatedStratifiedKFold, LeaveOneOut, StratifiedShuffleSplit, StratifiedKFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.tree import export_graphviz
from yellowbrick.model_selection import FeatureImportances

from imblearn.over_sampling import SMOTE, ADASYN


import joblib
import pickle
import xgboost as xgb
# from plot_learning import *
from sklearn.svm import SVC
from sklearn_rvm import EMRVC
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.model_selection import GridSearchCV, learning_curve, RandomizedSearchCV
from sklearn.metrics  import average_precision_score, make_scorer, roc_curve,f1_score, precision_score, recall_score, fbeta_score, auc, roc_auc_score, accuracy_score, confusion_matrix, classification_report,precision_recall_curve
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

from sklearn_genetic import GASearchCV
from sklearn_genetic.space import Continuous, Categorical, Integer
from sklearn.feature_selection import SequentialFeatureSelector

from sklearn.manifold import TSNE
from mpl_toolkits.mplot3d import Axes3D

from statsmodels.stats.contingency_tables import mcnemar
from mlxtend.evaluate import mcnemar_table

from sklearn.mixture import GaussianMixture

from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering

from scipy.cluster import hierarchy

from sklearn.decomposition import PCA

warnings.filterwarnings('ignore')

In [None]:
import sklearn
sklearn.__version__

### Read data

In [None]:
# Silence library warnings and display every column when printing frames.
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

# Both input files are semicolon-delimited CSVs in the working directory.
train = pd.read_csv("train.csv", sep = ";")
test = pd.read_csv("test.csv", sep = ";")

# Preview the first rows of the training frame.
train.head()

In [None]:
train.columns

In [None]:
train.Y.value_counts()

In [None]:
test.Y.value_counts()

In [None]:
# Feature columns passed to every detector and embedding in this notebook.
VARS = ['CONICITY', 'RFV','RRO', 'H2RFV',
        'PLY','LFV', 'CAPSPLICE']
# Alternative, reduced feature set (disabled):
# VARS = ['CONICITY', 'RRO', 'H2RFV']

In [None]:
# Target in the {1, -1} convention: 1 where Y == 0, -1 for every other value.
train['Y2'] = np.where(train['Y'] == 0, 1, -1)

In [None]:
def metrics(y_test, y_pred):
    """Plot the confusion matrix and print a classification summary.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, compared against ``y_test`` by every metric below.

    Draws the confusion matrix as a heatmap on a new figure, then prints the
    classification report, the Gini coefficient (``2 * AUC - 1``) and the
    ROC AUC. Returns nothing.
    """
    cm_df = pd.DataFrame(confusion_matrix(y_test, y_pred))
    plt.figure(figsize=(8,6))
    # Cells hold integer counts: fmt="d" avoids the default ".2g" format,
    # which renders counts of 100 or more in scientific notation.
    sns.heatmap(cm_df, annot=True, fmt="d")
    print("Classification Report: \n", classification_report(y_test, y_pred))

    roc_auc = roc_auc_score(y_test, y_pred, multi_class = 'ovr', average=None)
    gini = 2*roc_auc -1
    print("Gini: ",gini)
    print("ROC AUC:: ",roc_auc)

In [None]:
train2 = train[train['Y']==0]
train2.shape

In [None]:
from pyod.models.ecod import ECOD

from pyod.models.inne import INNE

In [None]:
# ECOD detector from pyod, using all available cores.
# NOTE(review): contamination=0.5 presumably sets the share of training rows
# thresholded as outliers; the model is fitted on train2 (rows with Y == 0
# only), so confirm that flagging about half of those rows is intended.
pyecod = ECOD(contamination=0.5,n_jobs=-1)
# Alternative detector (disabled):
# pyecod = INNE(contamination=0.6)

In [None]:
pyecod.fit(train2[VARS])

In [None]:
# Score and classify the test set with the fitted ECOD model.
test['scores_pyecod']=pyecod.decision_function(test[VARS])
test['pyecod']=pyecod.predict(test[VARS])
# Sign remapping is disabled here: presumably pyod's predict already returns
# 0/1 labels with 1 = outlier — verify against the pyod documentation.
# test['pyecod'] = test['pyecod'].apply(lambda x: 1 if x<0 else 0)

In [None]:
# Score and classify the full training frame (not only the Y == 0 subset
# the model was fitted on).
train['scores_pyecod']=pyecod.decision_function(train[VARS])
train['pyecod']=pyecod.predict(train[VARS])
# Sign remapping is disabled here: presumably pyod's predict already returns
# 0/1 labels with 1 = outlier — verify against the pyod documentation.
# train['pyecod'] = train['pyecod'].apply(lambda x: 1 if x<0 else 0)

In [None]:
sns.kdeplot(
    data=test, x="scores_pyecod", hue=test.Y,
    cumulative=True, common_norm=False, common_grid=True,
)

In [None]:
metrics(test.Y, test.pyecod)

In [None]:
metrics(train.Y, train.pyecod)

### IsolationForest

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.IsolationForest.html

In [None]:
from sklearn.ensemble import IsolationForest

In [None]:
# A single stratified 70/30 split serves as the validation scheme.
cv = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=65)

# cv = LeaveOneOut()

# NOTE(review): no random_state is passed, so the search result presumably
# varies between runs — confirm whether reproducibility matters here.
forest_s = IsolationForest()
# NOTE(review): f1sc is built but never passed to the search below.
f1sc = make_scorer(f1_score)

# Hyper-parameter grid, explored exhaustively by GridSearchCV.
ps = {"n_estimators": (100, 256), 
      "max_features": (0.5, 0.7, 0.8, 0.9, 1.0),
      "contamination":(0.49, 0.5)}
# NOTE(review): with the default positive label, 'recall' presumably measures
# recall on Y2 == 1, i.e. the rows with Y == 0 — confirm this is the class
# whose recall should drive the selection.
search = GridSearchCV(estimator=forest_s, param_grid=ps, scoring='recall', cv=cv)
search.fit(train[VARS], train.Y2)

In [None]:
df = pd.DataFrame(search.cv_results_)
df[["mean_test_score", "std_test_score", "params"]].sort_values(by=["mean_test_score"], ascending=False).head()

In [None]:
search.best_estimator_

In [None]:
# Final IsolationForest, fitted without labels on the whole training frame.
# NOTE(review): the hyper-parameters are hard-coded — presumably copied from
# the grid-search results; confirm they still match search.best_params_.
iforest=IsolationForest(n_estimators=256, max_samples='auto',max_features=0.9, contamination = 0.5, random_state=75)
iforest.fit(train[VARS])

In [None]:
# Decision scores plus a 0/1 flag: negative predictions become 1, the rest 0.
test['scores_forest'] = iforest.decision_function(test[VARS])
test['iForest'] = np.where(iforest.predict(test[VARS]) < 0, 1, 0)

In [None]:
# Decision scores plus a 0/1 flag: negative predictions become 1, the rest 0.
train['scores_forest'] = iforest.decision_function(train[VARS])
train['iForest'] = np.where(iforest.predict(train[VARS]) < 0, 1, 0)

In [None]:
test.Y.value_counts()

In [None]:
sns.kdeplot(
    data=test, x="scores_forest", hue=test.Y,
    cumulative=True, common_norm=False, common_grid=True,
)

In [None]:
test.head(2)

In [None]:
metrics(test.Y, test.iForest)

In [None]:
# NOTE(review): hard-coded counts, presumably read off the confusion matrix
# plotted above (a ratio of the form a / (a + b)); confirm their origin and
# consider deriving the value from the data instead.
(17)/(17+2)

In [None]:
metrics(train.Y, train.iForest)

In [None]:
# Tree-based SHAP explainer built on the fitted IsolationForest.
exp = shap.TreeExplainer(iforest) #Explainer
# SHAP values for the training features, computed over train[VARS].
shap_values = exp.shap_values(train[VARS])  #Calculate SHAP values
# Load the JavaScript needed by the interactive SHAP plots in the notebook.
shap.initjs()

In [None]:
shap.summary_plot(shap_values, train[VARS])

In [None]:
shap.summary_plot(shap_values, train[VARS],plot_type="bar")

In [None]:
shap.force_plot(exp.expected_value, shap_values[2],features =train[VARS].iloc[2,:] ,feature_names =train[VARS].columns)

In [None]:
shap.bar_plot(shap_values[2],features =train[VARS].iloc[2,:] ,feature_names =train[VARS].columns)

In [None]:
train.head(10)

### AutoEncoder

In [None]:
# from pyod.models.auto_encoder import AutoEncoder

In [None]:
# clf = AutoEncoder(hidden_neurons =[25, 2, 2, 25])
# clf.fit(train[VARS])

In [None]:
# # Get the outlier scores for the train data
# y_train_scores = clf.decision_scores_
# # Predict the anomaly scores
# y_test_scores = clf.decision_function(test[VARS])  # outlier scores
# y_test_scores = pd.Series(y_test_scores)

In [None]:
# # Plot it!
# import matplotlib.pyplot as plt
# plt.hist(y_test_scores, bins='auto')
# plt.title("Histogram for Model Clf1 Anomaly Scores")
# plt.show()

In [None]:
# df_test = test[VARS].copy()
# df_test['score'] = y_test_scores
# df_test['cluster'] = np.where(df_test['score']<2.5, 0, 1)
# df_test['cluster'].value_counts()
# df_test.groupby('cluster').mean()

In [None]:
# df_test['cluster'].value_counts()

In [None]:
# pd.crosstab(test.Y, df_test['cluster'])

### OneClassSVM

https://scikit-learn.org/stable/modules/generated/sklearn.svm.OneClassSVM.html

In [None]:
from sklearn import linear_model
from sklearn.svm import OneClassSVM

In [None]:
# One-Class SVM with an RBF kernel, fitted without labels on the whole
# training frame.
ocsvm = OneClassSVM(gamma='scale', nu=0.5, kernel='rbf')
ocsvm.fit(train[VARS])

In [None]:
# 0/1 flag (negative predictions become 1, the rest 0) plus decision scores.
train['OCSVM'] = np.where(ocsvm.predict(train[VARS]) < 0, 1, 0)
train['score_OCSVM'] = ocsvm.decision_function(train[VARS])

In [None]:
# 0/1 flag (negative predictions become 1, the rest 0) plus decision scores.
test['OCSVM'] = np.where(ocsvm.predict(test[VARS]) < 0, 1, 0)
test['score_OCSVM'] = ocsvm.decision_function(test[VARS])

In [None]:
sns.kdeplot(
    data=test, x="score_OCSVM", hue=test.Y,
    cumulative=True, common_norm=False, common_grid=True,
)

In [None]:
metrics(test.Y, test.OCSVM)

In [None]:
metrics(train.Y, train.OCSVM)

### LocalOutlierFactor

https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.LocalOutlierFactor.html

In [None]:
from sklearn.neighbors import LocalOutlierFactor

In [None]:
# Local Outlier Factor over 4 neighbours with the Minkowski metric at p=2.
# novelty=True is what makes predict/decision_function available after fit.
lof = LocalOutlierFactor(n_neighbors=4, novelty=True, p=2, metric = 'minkowski', contamination=0.5)
lof.fit(train[VARS])

In [None]:
# NOTE(review): lof was created with novelty=True; scikit-learn documents that
# predict/decision_function are then meant for new, unseen data only, so these
# training-set results may be unreliable — confirm before interpreting them.
train['LOF']=lof.predict(train[VARS])
train['score_LOF']=lof.decision_function(train[VARS])
# Convert the prediction sign into a 0/1 flag: negative values become 1.
train['LOF'] = train['LOF'].apply(lambda x: 1 if x<0 else 0)

In [None]:
# 0/1 flag (negative predictions become 1, the rest 0) plus decision scores.
test['LOF'] = np.where(lof.predict(test[VARS]) < 0, 1, 0)
test['score_LOF'] = lof.decision_function(test[VARS])

In [None]:
sns.kdeplot(
    data=test, x="score_LOF", hue=test.Y,
    cumulative=True, common_norm=False, common_grid=True,
)

In [None]:
metrics(test.Y, test.LOF)

In [None]:
metrics(train.Y, train.LOF)

### EllipticEnvelope

https://scikit-learn.org/stable/modules/generated/sklearn.covariance.EllipticEnvelope.html

In [None]:
from sklearn.covariance import EllipticEnvelope

In [None]:
# Elliptic envelope fitted without labels on the whole training frame.
# NOTE(review): contamination is 0.3 here while the other detectors in this
# notebook use 0.5 — confirm the difference is deliberate.
ell = EllipticEnvelope(contamination=0.3)
ell.fit(train[VARS])

In [None]:
# 0/1 flag (negative predictions become 1, the rest 0) plus decision scores.
train['EE'] = np.where(ell.predict(train[VARS]) < 0, 1, 0)
train['score_EE'] = ell.decision_function(train[VARS])

In [None]:
# 0/1 flag (negative predictions become 1, the rest 0) plus decision scores.
test['EE'] = np.where(ell.predict(test[VARS]) < 0, 1, 0)
test['score_EE'] = ell.decision_function(test[VARS])

In [None]:
sns.kdeplot(
    data=test, x="score_EE", hue=test.Y,
    cumulative=True, common_norm=False, common_grid=True,
)

In [None]:
# cm = confusion_matrix(test.Y, test.EE)
# cm_df = pd.DataFrame(cm)                      
# plt.figure(figsize=(8,6))  
# sns.heatmap(cm_df, annot=True)
# print("Classification Report: \n", classification_report(test.Y, test.EE))  

In [None]:
metrics(test.Y, test.EE)

In [None]:
metrics(train.Y, train.EE)

### ROC Curve and AUC

In [None]:
def plot_roc(y_pred, y_test, l):
    """Add one ROC curve to the current matplotlib axes.

    Parameters
    ----------
    y_pred : array-like
        Predicted labels or scores for the positive class.
    y_test : array-like
        Ground-truth binary labels.
    l : str
        Legend label; the AUC rounded to 3 decimals is appended to it.

    Note the argument order: predictions first, ground truth second.
    """
    y_pred = np.array(y_pred)
    y_test = np.array(y_test)

    # roc_curve / roc_auc_score come from the notebook's top-level imports.
    # Importing the sklearn ``metrics`` module here would rebind the name of
    # the notebook's own metrics() helper and break any later call to it.
    fpr, tpr, _ = roc_curve(y_test, y_pred)
    auc_value = round(roc_auc_score(y_test, y_pred), 3)

    plt.plot(fpr,tpr,label=l+ " , AUC="+str(auc_value))
    plt.xlabel('Taxa falsos positivos')
    plt.ylabel('Taxa verdadeiros positivos')

In [None]:
labels = ['OCSVM', 'iForest', "EE", "LOF"]

# Start from a cleared figure 0.
plt.figure(0).clf()

# One ROC curve per detector, drawn from its 0/1 prediction column in test.
for detector in labels:
    plot_roc(test[detector], test.Y, detector)

# Legend entries carry each detector's AUC.
plt.legend()

In [None]:
# Concatenate all classifier results
# Side-by-side 0/1 predictions of the four detectors on the test set.
ensemble = pd.concat([test.iForest, test.OCSVM, test.LOF, test.EE],axis=1)

# Spearman correlation between detectors, computed once and reused.
corr = ensemble.corr(method='spearman')

# Boolean mask hiding the diagonal and the upper triangle (the matrix is
# symmetric). It is built from the matrix shape rather than from its values,
# so an upper-triangle coefficient of exactly 0 is hidden as well.
corrmat = np.triu(np.ones_like(corr, dtype=bool))

g = sns.heatmap(corr,cmap="coolwarm",annot=True, mask = corrmat, fmt=".2f")

In [None]:
def voting(c1, c2, c3, c4):
    """Combine four binary detector flags into one ensemble flag.

    Parameters
    ----------
    c1, c2, c3, c4
        Individual detector outputs; a value equal to 1 counts as a vote.

    Returns
    -------
    int
        1 when at least two of the four inputs equal 1, otherwise 0.
    """
    # "Any pair agrees on 1" is the same as "at least two votes"; counting the
    # votes replaces the pairwise chain, whose three- and four-way branches
    # could never be reached.
    votes = sum(1 for flag in (c1, c2, c3, c4) if flag == 1)
    return 1 if votes >= 2 else 0

# Detector flag columns, in the order expected by voting().
detector_cols = ['iForest', 'OCSVM', 'LOF', 'EE']


def _vote_row(row):
    """Feed one row's four detector flags to voting()."""
    return voting(*(row[col] for col in detector_cols))


# Row-wise ensemble flag for both frames, stored as integers.
test['Ensemble'] = test.apply(_vote_row, axis=1)
test['Ensemble'] = test['Ensemble'].astype(int)

train['Ensemble'] = train.apply(_vote_row, axis=1)
train['Ensemble'] = train['Ensemble'].astype(int)
test.head()

In [None]:
# Confusion matrix (as a heatmap) and classification report for the ensemble
# flag on the test set.
cm = confusion_matrix(test.Y, test.Ensemble)
cm_df = pd.DataFrame(cm)                      
plt.figure(figsize=(8,6))  
sns.heatmap(cm_df, annot=True)
print("Classification Report: \n", classification_report(test.Y, test.Ensemble))  

In [None]:
# ROC AUC of the ensemble flag on the test set, and the Gini coefficient
# derived from it (2 * AUC - 1).
roc_auc = roc_auc_score(test.Y, test.Ensemble, multi_class = 'ovr', average=None)
gini = 2*roc_auc -1
print("Gini: ",gini)
print("ROC AUC:: ",roc_auc)

In [None]:
labels = ['OCSVM', 'iForest', 'Ensemble', "EE", "LOF"]

# Start from a cleared figure 0.
plt.figure(0).clf()

# One ROC curve per detector (and the ensemble), drawn from its 0/1
# prediction column in test.
for detector in labels:
    plot_roc(test[detector], test.Y, detector)

# Legend entries carry each curve's AUC.
plt.legend()

### McNemar’s Test for Classifiers

Fail to Reject Null Hypothesis: Classifiers have a similar proportion of errors on the test set. 

Reject Null Hypothesis: Classifiers have a different proportion of errors on the test set.

In [None]:
pd.crosstab(test.LOF, test.iForest)

In [None]:
table1 = mcnemar_table(y_target=test.Y, 
                   y_model1=test.LOF, 
                   y_model2=test.iForest)

table1

In [None]:
pd.crosstab(test.OCSVM, test.iForest)

In [None]:
table2 = mcnemar_table(y_target=test.Y, 
                   y_model1=test.OCSVM, 
                   y_model2=test.iForest)

table2

In [None]:
pd.crosstab(test.LOF, test.OCSVM)

In [None]:
table3 = mcnemar_table(y_target=test.Y, 
                   y_model1=test.LOF, 
                   y_model2=test.OCSVM)

table3

In [None]:
pd.crosstab(test.LOF, test.EE)

In [None]:
table4 = mcnemar_table(y_target=test.Y, 
                   y_model1=test.LOF, 
                   y_model2=test.EE)
table4

In [None]:
pd.crosstab(test.OCSVM, test.EE)

In [None]:
table5 = mcnemar_table(y_target=test.Y, 
                   y_model1=test.OCSVM, 
                   y_model2=test.EE)

table5

In [None]:
pd.crosstab(test.EE, test.iForest)

In [None]:
table6 = mcnemar_table(y_target=test.Y, 
                   y_model1=test.EE, 
                   y_model2=test.iForest)

table6

In [None]:
def test_hip(table):
    """Run McNemar's test on a contingency table and print the verdict.

    Prints the test statistic and p-value, then states whether the null
    hypothesis is rejected at the 0.05 significance level.
    """
    outcome = mcnemar(table, exact=True,  correction=True)
    print('statistic=%.3f, p-value=%.3f' % (outcome.statistic, outcome.pvalue))

    significance = 0.05
    # A p-value above the significance level means H0 is not rejected.
    verdict = (
        'Same proportions of errors (fail to reject H0)'
        if outcome.pvalue > significance
        else 'Different proportions of errors (reject H0)'
    )
    print(verdict)


def test_hip2(table):
    """Return the p-value of the exact McNemar test for ``table``."""
    return mcnemar(table, exact=True).pvalue

In [None]:
# Each heading is paired with the contingency table it reports on.
comparisons = [
    ('LOF vs IsolationForest', table1),
    ('OCSVM vs IsolationForest', table2),
    ('LOF vs OCSVM', table3),
    ('LOF vs EE', table4),
    ('OCSVM vs EE', table5),
    ('IsolationForest vs EE', table6),
]
for heading, contingency in comparisons:
    print(heading)
    test_hip(contingency)

In [None]:
# McNemar p-value for every pairwise contingency table, in table order.
tables = [table1, table2, table3, table4, table5, table6]
p_value = [test_hip2(contingency) for contingency in tables]

In [None]:
p_value

### t-SNE

https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html

In [None]:
def tsne_scatter(features, labels, dimensions=2, save_as='graph.png'):
    """Embed ``features`` with t-SNE and scatter-plot the result by label.

    Parameters
    ----------
    features : array-like
        Data passed to ``TSNE.fit_transform``.
    labels : array-like
        One label per row of ``features``; rows labelled 1 are drawn in red,
        rows labelled 0 in green.
    dimensions : int, default 2
        Number of embedding dimensions; must be 2 or 3.
    save_as : str, default 'graph.png'
        Path the figure is saved to before it is shown.

    Raises
    ------
    ValueError
        If ``dimensions`` is neither 2 nor 3.
    """
    if dimensions not in (2, 3):
        raise ValueError('tsne_scatter can only plot in 2d or 3d (What are you? An alien that can visualise >3d?). Make sure the "dimensions" argument is in (2, 3)')

    # t-SNE dimensionality reduction
    features_embedded = TSNE(n_components=dimensions, random_state=23).fit_transform(features)

    # A plain array makes the label comparison element-wise for list input too.
    labels = np.asarray(labels)

    # Create exactly one axes with the right projection, so the 3D plot is not
    # drawn on top of a leftover 2D axes.
    fig = plt.figure(figsize=(8,8))
    ax = fig.add_subplot(111, projection='3d' if dimensions == 3 else None)

    # Same marker for both classes; only colour and opacity differ.
    for value, colour, opacity in ((1, 'r', 0.7), (0, 'g', 0.3)):
        points = features_embedded[labels == value]
        # Unpacking the transposed array always yields one coordinate array
        # per dimension, even when no row carries this label.
        ax.scatter(
            *points.T,
            marker='o',
            color=colour,
            s=2,
            alpha=opacity,
            label=value
        )

    # Save first, then display.
    plt.legend(loc='best')
    plt.savefig(save_as)
    plt.show()

In [None]:
# 2-component t-SNE embedding of the training features.
n_components = 2
tsne = TSNE(n_components)
tsne_result = tsne.fit_transform(train[VARS])
tsne_result.shape
# The result has one row per training sample and n_components columns.

# Scatter the embedding coloured by the ground-truth label Y; most of what
# follows is plot styling rather than t-SNE.
tsne_result_df = pd.DataFrame({'tsne_1': tsne_result[:,0], 'tsne_2': tsne_result[:,1], 'label': train.Y})
fig, ax = plt.subplots(1)
sns.scatterplot(x='tsne_1', y='tsne_2', hue='label', data=tsne_result_df, ax=ax,s=120)
# Same limits on both axes, padded by 5 units around the embedding's range.
lim = (tsne_result.min()-5, tsne_result.max()+5)
ax.set_xlim(lim)
ax.set_ylim(lim)
ax.set_aspect('equal')
# Anchor the legend outside the plotting area, to the right.
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0)

### 3D Plot

In [None]:
train['CONJUNTO'] = 'train'
test['CONJUNTO'] = 'test'

In [None]:
df = pd.concat([train, test], axis=0)
df.shape

In [None]:
# 3-component t-SNE for the 3D view.
# NOTE(review): no random_state is set, so the embedding presumably changes
# between runs; newer scikit-learn releases also appear to rename n_iter to
# max_iter — confirm against the installed version.
tsne = TSNE(n_components=3, verbose=1, perplexity=30, n_iter=3000)

In [None]:
X_reduce = tsne.fit_transform(df[VARS])

In [None]:
# Name the three embedding coordinates. tsne_3 must read column 2 of the
# embedding; reading column 0 would only duplicate tsne_1.
tsne_result_df = pd.DataFrame({'tsne_1': X_reduce[:,0], 'tsne_2': X_reduce[:,1], 'tsne_3': X_reduce[:,2]})

# Give both frames a fresh 0..n-1 index so the column-wise concat lines up
# row by row (df stacks train and test, so its index repeats).
tsne_result_df = tsne_result_df.reset_index(drop=True)
data = df.reset_index(drop=True)

data2 = pd.concat([data, tsne_result_df], axis=1)
# data2= data2[data2['CONJUNTO']=='test']
data2.head()

In [None]:
import snips as snp  # my snippets
# snp.prettyplot(matplotlib)  # my aesthetic preferences for plotting
# %matplotlib inline

from mpl_toolkits.mplot3d import Axes3D

In [None]:
fig = plt.figure(figsize=(8,8))
ax = fig.add_subplot(111, projection='3d')
ax.set_xlabel("tSNE_1")
ax.set_ylabel("tSNE_2")
ax.set_zlabel("tSNE_3")

# Every embedded point, as a small blue dot.
ax.scatter(X_reduce[:,0], X_reduce[:,1], zs=X_reduce[:,2], s=10, lw=2, c='blue')

# Hollow red circles around the points flagged by the IsolationForest.
flagged = X_reduce[(data2.iForest==1).to_numpy()]
ax.scatter(flagged[:, 0], flagged[:, 1], zs=flagged[:, 2],
           lw=2, facecolors="none", edgecolors="red", s=80, label="anomalia")

# Red crosses on the points whose ground-truth label Y is 1.
labelled = X_reduce[(data2.Y==1).to_numpy()]
ax.scatter(labelled[:, 0], labelled[:, 1], zs=labelled[:, 2],
           lw=2, s=50, marker="x", c="red", label="outlier")
ax.legend()

In [None]:
# 2-component t-SNE for the density plot over the training set.
# NOTE(review): no random_state is set, so the embedding presumably changes
# between runs; newer scikit-learn releases also appear to rename n_iter to
# max_iter — confirm against the installed version.
tsne2 = TSNE(n_components=2, verbose=1, perplexity=30, n_iter=3000)

In [None]:
X_reduce2 = tsne2.fit_transform(train[VARS])

In [None]:
tsne_result_df2 = pd.DataFrame({'tsne_1': X_reduce2[:,0], 'tsne_2': X_reduce2[:,1]})

tsne_result_df2 = tsne_result_df2.reset_index(drop=True)
data3 = train.reset_index(drop=True)

data4 = pd.concat([data3, tsne_result_df2], axis=1)
data4.head()

In [None]:
sns.kdeplot(
    data=data4, x="tsne_1", y="tsne_2", hue="iForest", fill=True
)