In [1]:
import sys; sys.path.insert(0, '..')

In [2]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.metrics import accuracy_score, make_scorer, confusion_matrix
from sklearn.model_selection import (GridSearchCV, cross_validate,
                                     train_test_split)
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from scripts.get_data import ceramiche_DB
from tqdm import tqdm

In [5]:
classifiers = {
    "Neural net": (
                MLPClassifier(),
                {
                    'hidden_layer_sizes': list(range(2,30)),
                    'activation': ['identity', 'logistic', 'tanh', 'relu'],
                }
            ),


    "Nearest Neighbors": (
                    KNeighborsClassifier(),
                    {'n_neighbors': list(range(1,11)), 'weights': ['uniform', 'distance']}
                ),
    "SVC": (
                    SVC(),
                    [{
                        'kernel': ['linear', 'rbf', 'sigmoid'],
                        'C': [x/10 for x in range(1, 11)],
                    },
                    {
                        'kernel': ['poly'],
                        'degree': list(range(2,6)),
                        'C': [x/10 for x in range(1, 11)],
                    }]
                ),
    "Decision Tree": (
                    DecisionTreeClassifier(),
                    {
                        'criterion': ['gini', 'entropy'], 
                        'min_samples_split':list(range(2,11)),
                        'min_samples_leaf': list(range(1,6)),
                    }
                ),
    "Random Forest": (
                    RandomForestClassifier(n_jobs=-1),
                    {
                        'n_estimators': list(range(1, 101, 10)), 
                        'criterion': ['gini', 'entropy'], 
                        'min_samples_split':list(range(2, 11)),
                        'min_samples_leaf': list(range(1, 6)),
                        'oob_score': [True, False],
                    }
                ),
    "Naive Bayes": (
                    GaussianNB(),
                    {}
                ),

    # "Gaussian Process": GaussianProcessClassifier(1.0 * RBF(1.0)),
    # "AdaBoost": AdaBoostClassifier(),
    # "QDA": QuadraticDiscriminantAnalysis(),
}

In [6]:
def sensibility(y_true, y_predict):
    c_matrix = confusion_matrix(y_true, y_predict)
    return c_matrix[1][1]/sum(c_matrix[1])


def specificity(y_true, y_predict):
    c_matrix = confusion_matrix(y_true, y_predict)
    return c_matrix[0][0]/sum(c_matrix[0])


scores = {
    'sensibility' : make_scorer(sensibility, greater_is_better=True),
    'specificity' : make_scorer(specificity, greater_is_better=True),
    'accuracy' : make_scorer(accuracy_score, greater_is_better=True)
}

In [7]:
X, y = ceramiche_DB()
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [8]:
result = {}

import warnings
with warnings.catch_warnings():
    warnings.simplefilter("ignore")

    for name, (clf, params) in tqdm(classifiers.items()):
        grid = GridSearchCV(clf, params, scoring='accuracy', cv = 5, n_jobs = -1)
        s = cross_validate(grid, X, y, cv=5, scoring = scores)
        result[name] = s


100%|██████████| 6/6 [50:44<00:00, 507.47s/it]


In [9]:
rows = ['sensibility', 'specificity', 'accuracy']
columns = ['mean', 'std']

row_format ="{:>20}" * (len(columns) + 1)
col_format ="{:>20}" * (len(columns) + 1)

for n, r in result.items():
    print(n.upper())

    data = [(r["test_"+x].mean(), r["test_"+x].std()) for x in rows]

    print(col_format.format("", *columns))
    for name, row in zip(rows, data):
        print(row_format.format(name, *row))

    print()
    print()


NEURAL NET
                                    mean                 std
         sensibility  0.9882352941176471 0.02352941176470589
         specificity  0.7666666666666666 0.32659863237109044
            accuracy  0.9276679841897233  0.0844370913571189


NEAREST NEIGHBORS
                                    mean                 std
         sensibility  0.9522058823529411 0.06874115230894264
         specificity  0.9333333333333333 0.13333333333333336
            accuracy  0.9466403162055336 0.05120064795165622


SVC
                                    mean                 std
         sensibility                 1.0                 0.0
         specificity  0.8666666666666666 0.16329931618554522
            accuracy  0.9636363636363636 0.04453617714151235


DECISION TREE
                                    mean                 std
         sensibility  0.9147058823529411 0.08225440533632127
         specificity  0.9333333333333333 0.13333333333333336
            accuracy  0.91936758