# Progetto Mobd
    Authors: Giacomo Solfizi, Edoardo Rossi
    Project: MOBD
    Description: Script to generate and save best classifier.


## Import Packages

In [1]:
# Pacchetti principali
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Rimpiazzo NaN
from sklearn.impute import KNNImputer

# Elimino Outliers Multivariati
from collections import Counter
from sklearn.cluster import DBSCAN
from imblearn.over_sampling import SMOTE

# Split Dati
import sklearn.model_selection as model_select
import sklearn.metrics as metrics

# Pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import BaggingClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA


# Salvataggio Classificatore
import pickle
# Path of the file the best classifier is pickled to and later reloaded from.
best_classificator = 'best_path.sav'

## Preparo secondo pickle

In [5]:
# Creiamo la classe che rimpiazza gli outlier, che ci servirà per la pipeline: KNNImputer non rimpiazza direttamente gli outlier
# ma i NaN, quindi bisogna ridefinire anche i metodi fit e transform.

class KNNReplacerIQR(KNNImputer):
    """Pipeline-compliant outlier replacer based on the IQR rule.

    During ``fit`` the per-column bounds ``Q1 - 1.5 * IQR`` and
    ``Q3 + 1.5 * IQR`` are computed. Every value outside those bounds is
    turned into NaN and then filled in by a ``KNNImputer`` that is stored in
    the ``imputer`` attribute.

    Parameters
    ----------
    n_neighbors : int, default=2
        Number of neighbours used by the inner ``KNNImputer``.

    Attributes
    ----------
    lower_bound, upper_bound : pandas.Series or None
        Per-column IQR bounds computed by ``fit`` (``None`` before fitting).
    imputer : KNNImputer
        Imputer that replaces the masked outliers; rebuilt on every ``fit``.
    """

    def __init__(self, n_neighbors=2):
        super().__init__(n_neighbors=n_neighbors)
        self.lower_bound = None
        self.upper_bound = None
        # Kept so the attribute exists before fitting; ``fit`` rebuilds it
        # from the current ``n_neighbors`` value.
        self.imputer = KNNImputer(n_neighbors=n_neighbors)

    def _mask_outliers(self, x):
        """Returns a new DataFrame where out-of-bound values are NaN.

        The input is never modified: ``DataFrame.mask`` builds a new object
        instead of writing into the caller's data.
        """
        frame = pd.DataFrame(x)
        is_outlier = (frame < self.lower_bound) | (frame > self.upper_bound)
        return frame.mask(is_outlier, np.nan)

    def fit(self, x, y=None):
        """Computes the IQR bounds and fits the imputer on the masked data.

        Parameters
        ----------
        x : array-like or DataFrame
            Training data; it is wrapped in a ``pandas.DataFrame``.
        y : ignored
            Present only for pipeline compatibility.

        Returns
        -------
        self
        """
        frame = pd.DataFrame(x)
        q1 = frame.quantile(0.25)
        q3 = frame.quantile(0.75)
        iqr = q3 - q1
        self.lower_bound = q1 - (1.5 * iqr)
        self.upper_bound = q3 + (1.5 * iqr)
        # Rebuild the imputer here so that a value assigned through
        # set_params (e.g. ``replacer__n_neighbors`` in a grid search) is
        # actually used; an imputer created only in __init__ would keep the
        # constructor-time value.
        self.imputer = KNNImputer(n_neighbors=self.n_neighbors)
        self.imputer.fit(self._mask_outliers(frame))
        return self

    def transform(self, x, y=None):
        """Detects outliers and replaces them with the imputer.

        Parameters
        ----------
        x : array-like or DataFrame
            Data to clean; left untouched by this method.
        y : ignored
            Present only for pipeline compatibility.

        Returns
        -------
        The output of ``KNNImputer.transform`` on the masked data.
        """
        return self.imputer.transform(self._mask_outliers(x))

In [6]:
# Main pipeline: outlier replacement -> polynomial features -> scaling ->
# PCA -> L1-based feature selection -> bagged QDA classifier.
Pipe_Knn_Bagging = Pipeline([
    ('replacer', KNNReplacerIQR(n_neighbors=2)),
    ('pre-process-QDA', PolynomialFeatures(degree=3)),
    ('scaler', StandardScaler()),
    ('decomposition', PCA(random_state=42)),
    ('feature_selection',
     SelectFromModel(LinearSVC(C=0.01, penalty="l1", dual=False))),
    ('classifier',
     BaggingClassifier(
         # Passed positionally on purpose: the keyword was renamed from
         # `base_estimator` to `estimator` in scikit-learn 1.2 and the old
         # name was removed in 1.4, while the first positional slot is the
         # wrapped estimator in both old and new releases.
         QDA(reg_param=0.001, store_covariance=True, tol=0.001),
         n_jobs=-1,
         random_state=42)),
])

# Hyper-parameter grid: a single candidate, i.e. the already selected value.
grid_Pipe_Knn_Bagging = {'replacer__n_neighbors': [2]}

# 5-fold cross-validated search scored with macro F1; the best candidate is
# refitted on the data passed to fit().
gs_Bagging_knn = model_select.GridSearchCV(Pipe_Knn_Bagging,
                                           param_grid=grid_Pipe_Knn_Bagging,
                                           scoring='f1_macro',
                                           cv=5,
                                           refit=True,
                                           n_jobs=-1)
# List of grid-search objects to run.
grids = [gs_Bagging_knn]
# Maps the position in `grids` to a human-readable name (also used as the
# prefix of the results file).
grid_dict_pipe = {0: 'BEST PIPELINE'}

In [7]:
# Fit the grid search objects and look for the best model.
# NOTE(review): train_x, train_y and target are not defined in the visible
# cells; they must be created by an earlier cell before this one can run.
print("\nMODEL OPTIMIZATIONS STARTED")
best_f1 = 0.0
best_idx = 0
best_pipe = None
for idx, pipe_gs in enumerate(grids):
    print('Currently trying model: %s' % grid_dict_pipe[idx])
    print(pipe_gs)

    # Perform grid search.
    pipe_gs.fit(train_x, train_y[target])

    # Dump detailed scores on a file. The context manager closes the file
    # even if one of the statements below raises.
    with open(grid_dict_pipe[idx] + '_results.txt', 'w') as results_file:
        # Print scores and update bests.
        print("\nGrid scores:")
        means = pipe_gs.cv_results_['mean_test_score']
        stds = pipe_gs.cv_results_['std_test_score']
        for mean, std, params in zip(means, stds,
                                     pipe_gs.cv_results_['params']):
            # The reported spread is two standard deviations.
            print("%0.4f (+/-%0.03f) for %r" % (mean, std * 2, params))
            results_file.write("%0.4f (+/-%0.03f) for %r\n"
                               % (mean, std * 2, params))
        print("\nBest parameters:")
        print(pipe_gs.best_params_)
        print("\nBest score: %0.4f" % pipe_gs.best_score_)
        # Keep the refitted estimator of the best-scoring search.
        if pipe_gs.best_score_ > best_f1:
            best_f1 = pipe_gs.best_score_
            best_idx = idx
            best_pipe = pipe_gs.best_estimator_
        results_file.write("\nBest parameters:\n%r\n" % pipe_gs.best_params_)
        results_file.write("\nBest score: %0.4f\n" % pipe_gs.best_score_)


MODEL OPTIMIZATIONS STARTED
Currently trying model: BEST PIPELINE
GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('replacer', KNNReplacerIQR()),
                                       ('pre-process-QDA',
                                        PolynomialFeatures(degree=3)),
                                       ('scaler', StandardScaler()),
                                       ('decomposition', PCA(random_state=42)),
                                       ('feature_selection',
                                        SelectFromModel(estimator=LinearSVC(C=0.01,
                                                                            dual=False,
                                                                            penalty='l1'))),
                                       ('classifier',
                                        BaggingClassifier(base_estimator=QuadraticDiscriminantAnalysis(reg_param=0.001,
                                                                

NameError: name 'train_x' is not defined

In [8]:
def show_confusion_matrix(cm, f1_score, title):
    """Draws an annotated heat map of a confusion matrix.

    Each cell shows its count and, on a second line, its share of the sum
    of all cells. The macro F1 value is appended to the x-axis label.

    Parameters
    ----------
    cm : numpy.ndarray
        Confusion matrix to plot.
    f1_score : float
        Score printed below the x-axis label.
    title : str
        Title of the figure.
    """
    # Build one text label per cell: absolute count plus percentage.
    cell_counts = cm.flatten()
    cell_shares = cell_counts / np.sum(cm)
    cell_labels = []
    for count, share in zip(cell_counts, cell_shares):
        count_text = "{0:0.0f}\n".format(count)
        share_text = "{0:.2%}".format(share)
        cell_labels.append(f"{count_text}{share_text}".strip())
    annotations = np.asarray(cell_labels).reshape(cm.shape[0], cm.shape[1])

    # Render the heat map, then decorate the returned axes.
    axes = sns.heatmap(cm,
                       annot=annotations,
                       fmt="",
                       cmap="YlGnBu",
                       cbar=False,
                       linewidths=1.0)
    axes.set(title=title,
             xlabel='Predicted class\n\nF1 macro: %0.4f' % f1_score,
             ylabel='Actual class')
    plt.show()

In [34]:
def evaluate_classifier(classifier, data_x, data_y, matrix_title='', show=True):
    """Scores a fitted classifier with macro F1 and optionally plots it.

    Parameters
    ----------
    classifier : object
        Fitted model exposing ``predict``.
    data_x : array-like
        Samples handed to ``classifier.predict``.
    data_y : array-like
        True labels for ``data_x``.
    matrix_title : str, default=''
        Title of the confusion-matrix plot.
    show : bool, default=True
        When true, the confusion matrix is displayed.

    Returns
    -------
    float
        Macro-averaged F1 score of the predictions.
    """
    predictions = classifier.predict(data_x)
    cm = metrics.confusion_matrix(data_y, predictions)
    macro_f1 = metrics.f1_score(data_y, predictions, average='macro')
    if not show:
        return macro_f1
    show_confusion_matrix(cm, macro_f1, matrix_title)
    return macro_f1

In [35]:
# Serialise the best pipeline found by the grid search to disk.
with open(best_classificator, 'wb') as model_file:
    pickle.dump(best_pipe, model_file)

In [39]:
# Reload the saved pipeline and score it, to check that it survived the
# save/load round trip.
# NOTE(review): pickle.load executes arbitrary code; only load files that
# this notebook produced itself.
# NOTE(review): x, y and target are not defined in the visible cells.
with open(best_classificator, 'rb') as model_file:
    model = pickle.load(model_file)
print(
    '\n(Post-save) Dataset F1 macro: %0.4f'
    % evaluate_classifier(model, x, y[target], show=False)
)


(Post-save) Dataset F1 macro: 0.9252
