In [1]:
import sys; sys.path.insert(0, '..')

In [2]:
import matplotlib.pyplot as plt
import numpy as np
from joblib import dump, load
from matplotlib.colors import ListedColormap
import pandas as  pd
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.metrics import accuracy_score, confusion_matrix, make_scorer
from sklearn.model_selection import (GridSearchCV, cross_validate,
                                     train_test_split)
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from tqdm import tqdm

from scripts.get_data import ceramiche_no_ripetizioni_DB

In [3]:
classifiers = {
    "Neural net": (
                MLPClassifier(),
                {
                    'hidden_layer_sizes': list(range(2,30)),
                    'activation': ['identity', 'logistic', 'tanh', 'relu'],
                }
            ),


    "Nearest Neighbors": (
                    KNeighborsClassifier(),
                    {'n_neighbors': list(range(1,11)), 'weights': ['uniform', 'distance']}
                ),
    "SVC": (
                    SVC(),
                    [{
                        'kernel': ['linear', 'rbf', 'sigmoid'],
                        'C': [x/10 for x in range(1, 11)],
                    },
                    {
                        'kernel': ['poly'],
                        'degree': list(range(2,6)),
                        'C': [x/10 for x in range(1, 11)],
                    }]
                ),
    "Decision Tree": (
                    DecisionTreeClassifier(),
                    {
                        'criterion': ['gini', 'entropy'], 
                        'min_samples_split':list(range(2,11)),
                        'min_samples_leaf': list(range(1,6)),
                    }
                ),
    "Random Forest": (
                    RandomForestClassifier(n_jobs=-1),
                    {
                        'n_estimators': list(range(1, 101, 10)), 
                        'criterion': ['gini', 'entropy'], 
                        'min_samples_split':list(range(2, 11)),
                        'min_samples_leaf': list(range(1, 6)),
                        'oob_score': [True, False],
                    }
                ),
    "Naive Bayes": (
                    GaussianNB(),
                    {}
                ),

    # "Gaussian Process": GaussianProcessClassifier(1.0 * RBF(1.0)),
    # "AdaBoost": AdaBoostClassifier(),
    # "QDA": QuadraticDiscriminantAnalysis(),
}

In [4]:
def sensibility(y_true, y_predict):
    c_matrix = confusion_matrix(y_true, y_predict)
    return c_matrix[1][1]/sum(c_matrix[1])


def specificity(y_true, y_predict):
    c_matrix = confusion_matrix(y_true, y_predict)
    return c_matrix[0][0]/sum(c_matrix[0])


scorers = {
    'sensibility' : make_scorer(sensibility, greater_is_better=True),
    'specificity' : make_scorer(specificity, greater_is_better=True),
    'accuracy' : make_scorer(accuracy_score, greater_is_better=True)
}

# NESTED CROSS VALIDATION
X, y = ceramiche_no_ripetizioni_DB()
X_train, X_test, y_train, y_test = train_test_split(X, y)
file_name = '../scripts/supervised_no_repetitions_NCV_dump.joblib'

#Se voi eseguire di nuovo la computazione invcece che caricare quella cachata cambia questa variabile
NEW_COMPUTATION = False

if NEW_COMPUTATION:
    result = {}

    import warnings
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        for name, (clf, params) in tqdm(classifiers.items()):
            grid = GridSearchCV(clf, params, scoring='accuracy', cv = 5, n_jobs = -1)
            s = cross_validate(grid, X, y, cv=5, scoring = scorers, return_estimator=True)
            result[name] = s
else:
    result = load(file_name)


In [5]:
# TEST CON LE RIPETIZIONI

X_train, X_test, y_train, y_test = ceramiche_no_ripetizioni_DB(ripetizioni=True)
file_name = '../scripts/supervised_no_repetitions_repet_dump.joblib'

#Se voi eseguire di nuovo la computazione invcece che caricare quella cachata cambia questa variabile
NEW_COMPUTATION = False

if NEW_COMPUTATION:
    result = {}

    import warnings
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        for name, (clf, params) in tqdm(classifiers.items()):
            grid = GridSearchCV(clf, params, scoring='accuracy', cv = 5, n_jobs = -1).fit(X_train, y_train)

            result[name] = dict(
                    [('test_' + scorer_name,
                     s(grid, X_test, y_test))
                    for scorer_name, s in scorers.items()]
                )
            result[name]['estimator'] = (grid,) #dentro una tupla per essere uguale al dizionario generato da cross_validate
else:
    result = load(file_name)



In [6]:
rows = ['sensibility', 'specificity', 'accuracy']
columns = ['mean', 'std']

row_format ="{:>10.4}" * (len(columns) + 1)
col_format ="{:>10.4}" * (len(columns) + 1)

for n, r in result.items():
    print(n.upper())

    #NB se si usa il test con le ripetizioni avremmo sempre std = 0
    data = [(r["test_"+x].mean(), r["test_"+x].std()) for x in rows]

    print(col_format.format("", *columns))
    for name, row in zip(rows, data):
        print(row_format.format(name, *row))

    print()
    print()


NEURAL NET
                mean       std
      sens       1.0       0.0
      spec      0.95       0.0
      accu    0.9851       0.0


NEAREST NEIGHBORS
                mean       std
      sens    0.9787       0.0
      spec       1.0       0.0
      accu    0.9851       0.0


SVC
                mean       std
      sens       1.0       0.0
      spec      0.95       0.0
      accu    0.9851       0.0


DECISION TREE
                mean       std
      sens    0.8085       0.0
      spec      0.95       0.0
      accu    0.8507       0.0


RANDOM FOREST
                mean       std
      sens       1.0       0.0
      spec       1.0       0.0
      accu       1.0       0.0


NAIVE BAYES
                mean       std
      sens    0.9787       0.0
      spec       1.0       0.0
      accu    0.9851       0.0




In [7]:
#Rimuovere il commento per salvare i nuovi modelli
dump(result, file_name)

['../scripts/supervised_no_repetitions_repet_dump.joblib']

In [13]:
pd.read_excel('../dati_ceramiche_classi_no_ripetizioni.xlsx', index_col=0, usecols=[
    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], sheet_name=4)

Unnamed: 0,chiusi,K,Ca,Ti,Cr,Mn,Fe,Zn,Rb,Sr
1,A10_25,12.858469,69.126564,1.152565,0.180259,0.491615,15.944721,0.043699,0.032774,0.169334
2,_1,13.485708,68.796399,1.034664,0.163676,0.47349,15.806395,0.046764,0.035073,0.15783
3,c3-738,18.771078,57.095643,2.016727,0.283286,0.438419,21.071091,0.087684,0.053959,0.182113
4,_a,18.945925,56.942268,2.096395,0.032654,0.248171,21.421108,0.07837,0.058777,0.176332
5,c30-110,17.638601,42.115252,3.490913,0.586611,1.207729,34.679089,0.10352,0.063262,0.115022
6,_a,18.940317,44.753785,2.566556,0.46111,0.974421,32.060205,0.078302,0.060901,0.104402
7,c203-1,11.775384,58.54901,2.652535,0.368896,0.784635,25.547488,0.070266,0.06441,0.187376
8,_bis,12.011662,58.51895,2.553936,0.338192,0.705539,25.55102,0.069971,0.06414,0.186589
9,a,14.353377,56.733937,2.127952,0.21966,0.658979,25.617792,0.054915,0.054915,0.178473
10,_bis,16.088906,55.124077,2.146871,0.214687,0.669319,25.453053,0.063143,0.063143,0.176801


In [14]:
tarquina_singolo = pd.read_excel('../dati_ceramiche_classi_no_ripetizioni.xlsx', index_col=0, usecols=[
    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], sheet_name=4)

non_tarquina_singolo = pd.read_excel('../dati_ceramiche_classi_no_ripetizioni.xlsx', index_col=0, usecols=[
    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], sheet_name=5)

X_singolo = pd.concat([tarquina_singolo, non_tarquina_singolo])
y_singolo = [1]*len(tarquina_singolo) + \
    [0] * len(non_tarquina_singolo)

for name, r in result.items():
    X_singolo[name] = r['estimator'][0].predict([a[1] for a in X_singolo.iloc[:,1:10].iterrows()])

X_singolo['y reale'] = y_singolo

X_singolo.to_excel('kasdgf.xlsx')

In [20]:
# Valotazione con i singoli come test
X_singolo = pd.concat([tarquina_singolo, non_tarquina_singolo])
print('Accuracy:\n')
for name, r in result.items():
    print(name, accuracy_score(y_singolo, r['estimator'][0].predict(X_singolo.iloc[:,1:10])))

Accuracy:

Neural net 0.875
Nearest Neighbors 0.875
SVC 0.875
Decision Tree 0.75
Random Forest 1.0
Naive Bayes 0.9375
