In [42]:
import sys; sys.path.insert(0, '..')

In [43]:
import matplotlib.pyplot as plt
import numpy as np
from joblib import dump, load
from matplotlib.colors import ListedColormap
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.metrics import accuracy_score, confusion_matrix, make_scorer
from sklearn.model_selection import (GridSearchCV, cross_validate,
                                     train_test_split)
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from tqdm import tqdm

from scripts.get_data import ceramiche_DB

In [44]:
classifiers = {
    "Neural net": (
                MLPClassifier(),
                {
                    'hidden_layer_sizes': list(range(2,30)),
                    'activation': ['identity', 'logistic', 'tanh', 'relu'],
                }
            ),


    "Nearest Neighbors": (
                    KNeighborsClassifier(),
                    {
                        'n_neighbors': list(range(1,11)), 
                        'weights': ['uniform', 'distance']
                    }
                ),
    "SVC": (
                    SVC(),
                    [{
                        'kernel': ['linear', 'rbf', 'sigmoid'],
                        'C': [x/10 for x in range(1, 11)],
                    },
                    {
                        'kernel': ['poly'],
                        'degree': list(range(2,6)),
                        'C': [x/10 for x in range(1, 11)],
                    }]
                ),
    "Decision Tree": (
                    DecisionTreeClassifier(),
                    {
                        'criterion': ['gini', 'entropy'], 
                        'min_samples_split':list(range(2,11)),
                        'min_samples_leaf': list(range(1,6)),
                    }
                ),
    "Random Forest": (
                    RandomForestClassifier(n_jobs=-1),
                    {
                        'n_estimators': list(range(1, 101, 10)), 
                        'criterion': ['gini', 'entropy'], 
                        'min_samples_split':list(range(2, 11)),
                        'min_samples_leaf': list(range(1, 6)),
                        'oob_score': [True, False],
                    }
                ),
    "Naive Bayes": (
                    GaussianNB(),
                    {}
                ),

    # "Gaussian Process": GaussianProcessClassifier(1.0 * RBF(1.0)),
    # "AdaBoost": AdaBoostClassifier(),
    # "QDA": QuadraticDiscriminantAnalysis(),
}

In [45]:
def sensibility(y_true, y_predict):
    c_matrix = confusion_matrix(y_true, y_predict)
    return c_matrix[1][1]/sum(c_matrix[1])


def specificity(y_true, y_predict):
    c_matrix = confusion_matrix(y_true, y_predict)
    return c_matrix[0][0]/sum(c_matrix[0])


scores = {
    'sensibility' : make_scorer(sensibility, greater_is_better=True),
    'specificity' : make_scorer(specificity, greater_is_better=True),
    'accuracy' : make_scorer(accuracy_score, greater_is_better=True)
}

In [46]:
X, y = ceramiche_DB()
# X_train, X_test, y_train, y_test = train_test_split(X, y)

XLRDError: Excel xlsx file; not supported

In [None]:
#Se voi eseguire di nuovo la computazione invece che caricare quella cacheata cambia questa variabile
NEW_COMPUTATION = False

if NEW_COMPUTATION:
    result = {}

    import warnings
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        for name, (clf, params) in tqdm(classifiers.items()):
            grid = GridSearchCV(clf, params, scoring='accuracy', cv = 5, n_jobs = -1)
            s = cross_validate(grid, X, y, cv=5, scoring = scores, return_estimator=True)
            result[name] = s
else:
    result = load('../scripts/supervised_dump.joblib')


In [None]:
rows = ['sensibility', 'specificity', 'accuracy'] #score.keys()
columns = ['mean', 'std']

row_format ="{:>10.4}" * (len(columns) + 1)
col_format ="{:>10.4}" * (len(columns) + 1)

for n, r in result.items():
    print(n.upper())

    data = [(r["test_"+x].mean(), r["test_"+x].std()) for x in rows]

    print(col_format.format("", *columns))
    for name, row in zip(rows, data):
        print(row_format.format(name, *row))

    print()
    print()


NEURAL NET
                mean       std
      sens    0.9287     0.114
      spec       0.8    0.1633
      accu    0.8947   0.06795


NEAREST NEIGHBORS
                mean       std
      sens    0.9522   0.06874
      spec    0.9333    0.1333
      accu    0.9477   0.04959


SVC
                mean       std
      sens    0.9404   0.09114
      spec    0.8667    0.1633
      accu    0.9212   0.05538


DECISION TREE
                mean       std
      sens    0.9037    0.1227
      spec    0.9333    0.1333
      accu    0.9129   0.08039


RANDOM FOREST
                mean       std
      sens       1.0       0.0
      spec    0.9333    0.1333
      accu    0.9818   0.03636


NAIVE BAYES
                mean       std
      sens       1.0       0.0
      spec       1.0       0.0
      accu       1.0       0.0




In [None]:
# dump(result, '../scripts/supervised_dump.joblib')