## Valutazione Progetto

In [27]:
# Packege principali
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Rimpiazzo NaN
from sklearn.impute import KNNImputer

# ELimino Outliers Multivariati
from collections import Counter
from sklearn.cluster import DBSCAN
from imblearn.over_sampling import SMOTE

# Spit Dati
import sklearn.model_selection as model_select
import sklearn.metrics as metrics

# Pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import BaggingClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA


# Salvataggio Classificatore
import pickle
# Path della best_classificator.
target = 'CLASS'
pipeline_path = 'best_pipeline2.sav'

In [28]:
def show_confusion_matrix(cm, f1_score, title):
    """Displays confusion matrix with annotations."""
    # Create annotations label.
    group_counts = ["{0:0.0f}\n".format(value) for value in cm.flatten()]
    group_percentages =\
        ["{0:.2%}".format(value) for value in cm.flatten() / np.sum(cm)]
    box_labels =\
        [f"{v1}{v2}".strip() for v1, v2 in zip(group_counts, group_percentages)]
    box_labels = np.asarray(box_labels).reshape(cm.shape[0], cm.shape[1])
    # Show confusion matrix with heat map.
    sns.heatmap(cm,
                annot=box_labels,
                fmt="",
                cmap="YlGnBu",
                cbar=False,
                linewidths=1.0)\
        .set(title=title,
             xlabel='Predicted class\nF1 macro: %0.4f' % f1_score,
             ylabel='Actual class')
    plt.show()

In [29]:
def evaluation():
    """Evaluates our classifier on the test set."""
    # Load our classifier.
    with open('best_pipeline.sav', 'rb') as model_file:
        best_pipeline = pickle.load(model_file)

    # Load test set.
    testset = pd.read_csv('training_set.csv')
    print("TEST SET IMPORTED")

    # Separate features and labels.
    x = testset.drop('CLASS', axis=1)
    y = testset['CLASS']

    evaluate_classifier(best_pipeline, x, y)

In [30]:
def evaluate_classifier(classifier, data_x, data_y, matrix_title='', show=True):
    """Preprocesses test set and evaluates classifiers."""
    pred_y = classifier.predict(data_x)
    confusion_matrix = metrics.confusion_matrix(data_y, pred_y)
    f1_score = metrics.f1_score(data_y, pred_y, average='macro')
    print('\nTest set F1 macro score: %0.4f .\n' % f1_score)
    if show:
        show_confusion_matrix(confusion_matrix, f1_score, matrix_title)
    return f1_score

In [31]:
# Creiamo la classe che rimpiazza gli outliers che ci servira per pipeline e perchè KNN non rimpiazza direttamente gli outliers
# ma i nan quindi bisogna anche ricreare il metodo fit e il trasform

class KNNReplacerIQR(KNNImputer):
    """Pipeline-compliant KNNReplacer, based on IQR."""

    def __init__(self, n_neighbors=2):
        super().__init__(n_neighbors=n_neighbors)
        self.lower_bound = None
        self.upper_bound = None
        self.imputer = KNNImputer(n_neighbors=n_neighbors)

    def fit(self, x, y=None):
        """Computes IQR bound and fits the imputer on the data."""
        x = pd.DataFrame(x)
        q1 = x.quantile(0.25)
        q3 = x.quantile(0.75)
        iqr = q3 - q1
        self.lower_bound = q1 - (1.5* iqr)
        self.upper_bound = q3 + (1.5* iqr)
        self.imputer.fit(
            x.where(~((x < self.lower_bound) | (x > self.upper_bound)), np.nan)
        )
        return self

    def transform(self, x, y=None):
        """Detects outliers and replaces them with the imputer."""
        x = pd.DataFrame(x)
        x.where(~((x < self.lower_bound) | (x > self.upper_bound)),
                np.nan,
                inplace=True)
        return self.imputer.transform(x)

In [34]:
def main():
    """Performs analysis and determines the best model for this problem."""
    # Read dataset.
    dataset_path = 'training_set.csv'
    dataset = pd.read_csv(dataset_path)
    print("DATASET IMPORTED")
    print('\nDataset shape:', dataset.shape)
    print(dataset.describe())
    print('\nLast dataset entries:', dataset.tail())

    # Separate features and target labels.
    x = dataset.drop(target, axis=1)
    y = dataset[[target]]
    features_list = x.columns.values.tolist()
    
    # Elimino nan iniziali
    imputer = KNNImputer(n_neighbors=2)
    dati_imputati_0 = imputer.fit(x)
    dati_imputati_1 = dati_imputati_0.transform(x)
    # Dataset senza NaN
    X_imputer = pd.DataFrame(dati_imputati_1)

    # Faccio clustering per eliminare gli Outliers Multivariati
    Scanner = DBSCAN(eps=6.6, min_samples=10,n_jobs = -1).fit(X_imputer)
    Outliers = X_imputer[Scanner.labels_==-1].index.values
    x_final = pd.DataFrame(X_imputer).drop(index = Outliers)
    y_final = y.drop(index = Outliers)

    # Sovracampiono il dataset con valori simili
    oversample = SMOTE()
    x_final, y_final = oversample.fit_resample(x_final, y_final)

    # Split dataset in training set and test set.
    train_x, test_x, train_y, test_y = \
        model_select.train_test_split(x_final, y_final,
                                      test_size=0.2,
                                      random_state=42,
                                      stratify=y_final)
    print('\nTraining set shape:', train_x.shape, train_y.shape)
    print('Test set shape:', test_x.shape, test_y.shape)

    # Display data proportions after splitting.
    #show_classes_proportions(y, 'Dataset classes proportions')
    #show_classes_proportions(train_y, 'Training set classes proportions')
    #show_classes_proportions(test_y, 'Test set classes proportions')

    # Define pipelines for preprocessing.
    Pipe_Knn_Bagging = Pipeline([   ('replacer', KNNReplacerIQR(n_neighbors = 2)),
                                    ('pre-process-QDA',PolynomialFeatures(degree=3)),
                                    ('scaler',StandardScaler()),
                                    ('decomposition',PCA(random_state = 42)),
                                    ('feature_selection', SelectFromModel(LinearSVC(C=0.01, penalty="l1", dual=False))),
                                    ('classifier',BaggingClassifier(
                                                 base_estimator=QDA(reg_param = 0.001,store_covariance =True,tol = 0.001),
                                                 n_jobs = -1,
                                                 random_state = 42))
                                ])

    # Set the parameters grids.
    grid_Pipe_Knn_Bagging = {'replacer__n_neighbors': [2]}


    # Define grid searches for each pipeline.
    gs_Bagging_knn = model_select.GridSearchCV( Pipe_Knn_Bagging ,
                                               param_grid =  grid_Pipe_Knn_Bagging,
                                               scoring='f1_macro',
                                               cv=5,
                                               refit=True,
                                               n_jobs=-1)

    # Lista pipeline
    grids = [gs_Bagging_knn]
    #Dizionario delle pipeline
    grid_dict_pipe = {0:'BEST PIPELINE'}

    # Fit the grid search objects and look for the best model.
    print("\nMODEL OPTIMIZATIONS STARTED")
    best_f1 = 0.0
    best_idx = 0
    best_pipe = None
    for idx, pipe_gs in enumerate(grids):
        print('Currently trying model: %s' % grid_dict_pipe[idx])

        # Perform grid search.
        pipe_gs.fit(train_x, train_y[target])

        # Dump detailed scores on a file.
        results_file = open(grid_dict_pipe[idx] + '_results.txt', 'w')

        # Print scores and update bests.
        print("\nGrid scores:")
        means = pipe_gs.cv_results_['mean_test_score']
        stds = pipe_gs.cv_results_['std_test_score']
        for mean, std, params in zip(means, stds,
                                     pipe_gs.cv_results_['params']):
            print("%0.4f (+/-%0.03f) for %r" % (mean, std * 2, params))
            results_file.write("%0.4f (+/-%0.03f) for %r\n"
                               % (mean, std * 2, params))
        print("\nBest parameters:")
        print(pipe_gs.best_params_)
        print("\nBest score: %0.4f" % pipe_gs.best_score_)
        if pipe_gs.best_score_ > best_f1:
            best_f1 = pipe_gs.best_score_
            best_idx = idx
            best_pipe = pipe_gs.best_estimator_
        results_file.write("\nBest parameters:\n%r\n" % pipe_gs.best_params_)
        results_file.write("\nBest score: %0.4f\n" % pipe_gs.best_score_)

        results_file.close()

    print('\nPipeline with best training set F1 macro score: %s'
          % grid_dict_pipe[best_idx])

    # Show information and plots about best preprocessing pipeline.
    #data_preparation_info(train_x, features_list, best_pipe)

    # Evaluates the pipeline on the test set.
    
    # Serialize and dump the best model.

    with open(pipeline_path, 'wb') as model_file:
        pickle.dump(best_pipe, model_file)

   

In [35]:
# Start the script.
if __name__ == '__main__':
    main()

DATASET IMPORTED

Dataset shape: (8000, 21)
                F1           F2           F3           F4           F5  \
count  7994.000000  7994.000000  7999.000000  7999.000000  7996.000000   
mean     -0.013077    -0.261413    -0.356239    -0.107298    -0.505798   
std       1.006235     1.852793     1.794600     3.038362     1.818965   
min      -4.181155    -6.980290    -7.563245   -13.133565    -9.011808   
25%      -0.698506    -1.441144    -1.564076    -1.854212    -1.735180   
50%      -0.028194    -0.261095    -0.375098    -0.022912    -0.511772   
75%       0.666096     0.944857     0.824168     1.757135     0.691109   
max       3.774161     7.155359     6.774458    10.975842     6.420768   

                F6           F7           F8           F9          F10  ...  \
count  7994.000000  7996.000000  7991.000000  7994.000000  7994.000000  ...   
mean      0.170845    -0.142636     0.135534    -0.004581     0.017338  ...   
std       3.802454     1.901893     1.846124     1.0

In [40]:
 # Reload best model and check if the save went well.
with open(pipeline_path, 'rb') as model_file:
    model = pickle.load(model_file)
print('\n(Post-save) Dataset F1 macro: %0.4f'
      % evaluate_classifier(model,
                            x,
                            y[target],
                            show=False))

NameError: name 'x' is not defined

In [38]:
print('\nTest set F1 macro: %0.4f'
          % evaluate_classifier(best_pipe,
                                test_x,
                                test_y[target],
                                'Test Set Confusion matrix'))

# Refit the best pipeline on the whole dataset.
print("\nRE-FITTING BEST PIPELINE ON WHOLE DATASET")
best_pipe = best_pipe.fit(x, y[target])
print('\n(Pre-save) Dataset F1 macro: %0.4f'
      % evaluate_classifier(best_pipe,
                            x,
                            y[target],
                            'Dataset Confusion matrix'))


NameError: name 'best_pipe' is not defined