## Import

In [24]:
import sys
import pandas as pd
import numpy as np
import seaborn as sns
import sklearn.metrics as metrics
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

In [10]:
# Load the dataset from CSV and display it as the cell output.
# NOTE(review): the variable is named `testset` but the file read is
# 'training_set.csv' — confirm which split this is meant to be.
# NOTE(review): the displayed frame has an 'Unnamed: 0' column, presumably an
# index that was saved with the CSV — confirm whether it should be kept.
testset = pd.read_csv('training_set.csv')
testset

Unnamed: 0,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,...,F12,F13,F14,F15,F16,F17,F18,F19,F20,CLASS
0,-0.555694,-0.848258,0.132180,-4.061760,1.661394,2.219988,0.360537,2.537116,-0.613588,2.078144,...,-2.492234,0.808907,-1.078887,3.438161,2.372122,1.899934,2.372122,2.219416,0.132180,2
1,0.293193,-2.628978,-1.154407,0.538828,-0.169857,3.487574,0.443397,-0.006410,-0.125778,1.223669,...,-1.723842,1.772836,0.467387,-1.501851,-3.599221,-0.968531,-3.599221,1.127776,-1.154407,0
2,-2.078656,-0.834492,1.241461,1.010122,-1.638526,0.247378,-1.887390,-1.331368,-2.159086,0.002788,...,-1.686278,-1.047410,-1.133299,-1.953928,-1.149684,1.111692,-1.149684,0.134184,1.241461,1
3,-1.294256,-2.804065,-1.335397,-1.351379,-0.327137,1.199219,0.262458,0.825120,-0.638883,1.660732,...,4.437570,-0.093413,2.637345,-2.415704,-4.679002,0.511314,-4.679002,0.805571,-1.335397,0
4,-0.525611,0.024948,1.609361,-0.248425,1.533188,0.580862,0.049771,-0.430270,-0.714264,-0.186867,...,0.404803,-0.733368,1.288384,-1.646543,-1.020989,0.658584,-1.020989,1.412792,1.609361,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7995,0.405098,0.592920,2.441859,-1.134919,2.248627,2.509097,-0.227617,0.275321,-0.274926,-0.595862,...,-0.169544,-0.255823,2.695956,-2.220396,-2.432903,-0.131110,-2.432903,-0.479939,2.441859,1
7996,-0.404388,1.813804,-2.483380,2.093664,0.139423,2.617027,0.489677,1.387914,-0.363073,0.030530,...,0.124103,0.056482,-0.333988,-1.204211,-0.224673,1.521622,-0.224673,-1.096852,-2.483380,2
7997,0.233546,-1.003142,-2.121826,1.581558,1.152723,-0.987152,0.337969,-4.654229,-0.417682,-1.260857,...,2.469214,2.782867,0.888288,0.335607,-2.248307,-1.386542,-2.248307,0.934043,-2.121826,0
7998,1.120382,0.194409,-0.672968,0.005154,2.290353,4.112554,0.720367,0.563533,-1.009534,-1.551473,...,0.607569,3.102179,4.237942,-0.710731,-0.359420,-1.500903,-0.359420,-0.658633,-0.672968,1


# Data preparation

In [11]:
def get_na_count(df):
    """Return the total number of missing cells in ``df``.

    Missing cells are whatever ``df.isna()`` flags; they are summed per
    column first and then across columns.
    """
    missing_per_column = df.isna().sum()
    return missing_per_column.sum()

In [12]:
# Total number of missing values across the whole loaded dataset.
get_na_count(testset)

100

In [20]:
def _transform_with_step(pipeline, step_name, data, nan_error=None):
    """Apply one named pipeline step to ``data`` and wrap the result in a DataFrame.

    When ``nan_error`` is given and the transformed data still contains
    missing values, a RuntimeError carrying that message is raised.
    """
    step = pipeline.named_steps[step_name]
    transformed = pd.DataFrame(step.transform(data))
    if nan_error is not None and get_na_count(transformed) != 0:
        raise RuntimeError(nan_error)
    return transformed


def data_preparation_info(train_x, feats_names, pipeline):
    """Print and plot diagnostics for each preparation stage of the training set.

    The 'imputer', 'replacer' and 'scaler' steps of ``pipeline`` are applied
    in that order through their ``transform`` method only, so the pipeline is
    assumed to be already fitted (TODO confirm against the caller).

    Args:
        train_x: Training features; handed to the imputer step as-is.
        feats_names: Original feature names. No longer used for plotting
            (see the PCA comment in the body); kept so existing calls keep
            working.
        pipeline: Object exposing ``named_steps`` with the three steps above.

    Raises:
        RuntimeError: If missing values remain after the imputer or the
            replacer step. This replaces the former ``print`` +
            ``sys.exit(1)``, which raises SystemExit and is meant for
            terminating scripts rather than reporting a failure from a
            notebook helper.
    """
    # Missing values before any processing.
    print('\nMissing values')
    print('Train NaNs: ', get_na_count(train_x))

    # Missing-value filling (the original comment describes the imputer as
    # KNN-based; the step itself is defined outside this function).
    train_x = _transform_with_step(
        pipeline, 'imputer', train_x,
        nan_error='Missing values filling failed.')

    # Outliers processing: box plots before and after the replacer step.
    print('\nOutliers')
    show_boxplot_features(train_x, 'Training set with outliers')
    train_x = _transform_with_step(
        pipeline, 'replacer', train_x,
        nan_error='Outliers processing failed.')
    show_boxplot_features(train_x, 'Training set with replaced outliers')

    # Scale the training set.
    print('\nScaling')
    train_x = _transform_with_step(pipeline, 'scaler', train_x)
    print("Training set features properties after scaling:")
    print(train_x.describe())
    show_boxplot_features(train_x, 'Training set after scaling')

    # PCA variance information. explained_variance_ratio_ holds one value per
    # principal component (sorted by decreasing variance), not one per input
    # feature, so the bars are labelled by component instead of by the
    # original feature names.
    pca = PCA(random_state=42).fit(train_x)
    variance_ratios = pca.explained_variance_ratio_
    component_names = [f'PC{i}' for i in range(1, len(variance_ratios) + 1)]
    show_histogram_features(variance_ratios,
                            component_names,
                            'Explained variance ratio by principal component')
    

# Data visualization

In [16]:
def show_classes_proportions(dataset, title):
    """Print and plot how the rows of ``dataset`` are spread over ``CLASS``.

    Prints ``title`` followed by the normalized value counts of
    ``dataset['CLASS']``, draws a seaborn count plot of that column titled
    ``title``, and returns the normalized counts.
    """
    class_shares = dataset['CLASS'].value_counts(normalize=True)

    print('\n' + title)
    print(class_shares)

    axes = sns.countplot(x='CLASS', data=dataset)
    axes.set(title=title)
    plt.show()

    return class_shares

In [17]:
def show_histogram_features(data, feat_names, title):
    """Show a bar chart with one bar per entry of ``feat_names``.

    ``feat_names`` supplies the x positions/labels and ``data`` the bar
    heights; ``title`` is used as the axes title.
    """
    _, axes = plt.subplots()
    axes.bar(x=feat_names, height=data)
    axes.set_title(title)
    plt.show()

In [18]:
# Displays confusion matrix with annotations
def show_confusion_matrix(cm, f1_score, title):
    """Show ``cm`` as an annotated heat map.

    Every cell is annotated with its count and, on a second line, its share
    of the sum of all cells. ``f1_score`` is printed under the x-axis label
    and ``title`` is used as the plot title.
    """
    # Build one "count\npercentage" annotation per cell, in flattened order.
    flat_counts = cm.flatten()
    flat_shares = cm.flatten() / np.sum(cm)
    cell_texts = []
    for count, share in zip(flat_counts, flat_shares):
        count_text = "{0:0.0f}\n".format(count)
        share_text = "{0:.2%}".format(share)
        cell_texts.append(f"{count_text}{share_text}".strip())
    n_rows, n_cols = cm.shape[0], cm.shape[1]
    box_labels = np.asarray(cell_texts).reshape(n_rows, n_cols)

    # fmt="" because the annotations are already formatted strings.
    axes = sns.heatmap(cm,
                       annot=box_labels,
                       fmt="",
                       cmap="YlGnBu",
                       cbar=False,
                       linewidths=1.0)
    axes.set(title=title,
             xlabel='Predicted class\n\nF1 macro: %0.4f' % f1_score,
             ylabel='Actual class')
    plt.show()

In [19]:
def show_boxplot_features(dataset, title):
    """Draw a seaborn box plot of ``dataset`` titled ``title`` and show it."""
    axes = sns.boxplot(data=dataset)
    axes.set(title=title)
    plt.show()

# Data evaluation

In [23]:
def evaluate_classifier(classifier, data_x, data_y, matrix_title='', show=True):
    """Score ``classifier`` on ``(data_x, data_y)`` and return its macro F1.

    Predictions come from ``classifier.predict(data_x)``; the confusion
    matrix and the macro-averaged F1 score are computed against ``data_y``.
    When ``show`` is truthy the confusion matrix is also plotted with
    ``matrix_title`` as its title.

    NOTE(review): no preprocessing happens in this function itself —
    presumably ``classifier`` is a pipeline that preprocesses inside
    ``predict``; confirm against the caller.
    """
    predictions = classifier.predict(data_x)
    cm = metrics.confusion_matrix(data_y, predictions)
    macro_f1 = metrics.f1_score(data_y, predictions, average='macro')
    if not show:
        return macro_f1
    show_confusion_matrix(cm, macro_f1, matrix_title)
    return macro_f1