Autor: Matyáš Sládek <br>
Rok: 2020 <br>

Tento soubor obsahuje funkce pro výpis výsledků selekce atributů, optimalizace parametrů či klasifikace.

Tato buňka vypíše informace o optimalizaci parametrů. <br>
V proměnné <code>datasets</code> je možné vybrat, pro které datové sady mají být informace vypasány. <br>
V proměnné <code>feature_extraction_libraries</code> je možné vybrat, pro atributy které z extrakčních knihoven mají být informace vypasány. <br>
V proměnné <code>feature_sets</code> je možné vybrat, pro které sady atributů mají být informace vypasány. <br>
V proměnné <code>classifiers</code> je možné vybrat, pro které klasifikátory mají být informace vypasány. <br>
V proměnné <code>validation_types</code> je možné vybrat, pro které typy validací (u optimalizace) mají být informace vypasány. <br>
Nastavením proměnné <code>show_all_trials</code> na hodnotu True je možné vysat informace o každé iteraci optimalizace.

In [1]:
import joblib
from datetime import timedelta
import optuna
import copy

def get_duration(elem):
    return(elem.datetime_complete - elem.datetime_start)

datasets = [
    'EBD',
    'FMA',
    'GTZAN'
]

feature_extraction_libraries = [
#     'librosa',
    'essentia'
]

feature_sets = [
    'all',
#     'opt_feature_set_FS_VS',
#     'opt_feature_set_BE_VS'
]

classifiers = [
        # sklearn classifiers
#         'LogisticRegression',
#         'KNeighborsClassifier',
#         'MLPClassifier',
#         'DecisionTreeClassifier',
#         'SVC_linear',
#         'SVC_rbf',
        
        # sklearn ensemble classifiers
#         'RandomForestClassifier',
        
        # other classifiers
        'XGBClassifier'
]

validation_types = [
    'VS',
    'CV'
]

show_all_trials = False

for dataset in datasets:
    print('#'*100)
    print('Results for dataset \033[1m{}\033[0m:'.format(dataset))
    
    for library in feature_extraction_libraries:
        print('*'*100)
        print('Results for library \033[1m{}\033[0m:'.format(library))
        for feature_set in feature_sets:
            print('-'*100)
            print('Results for feature_set \033[1m{}\033[0m:'.format(feature_set))
            for classifier in classifiers:
                for validation_type in validation_types:
                    study = joblib.load('../metadata/optuna_studies/{}_{}_{}_{}_{}.pkl'.format(dataset, library, feature_set, classifier, validation_type)) 
                    print('')

                    if show_all_trials:
                        for trial in study.trials:
                            if trial.state == optuna.trial.TrialState.COMPLETE:
                                print('{} {} {} {} {}'.format(trial.number, trial.state, trial.value, str(trial.datetime_complete - trial.datetime_start).split('.')[0], trial.user_attrs['params']))
                            else:
                                print('{} {}'.format(trial.number, trial.state))

                    print('\nOptimisation info about classifier \033[1m{}\033[0m with validation type \033[1m{}\033[0m:'.format(classifier, validation_type))
                    print('Total optimisation runtime: \033[1m{}\033[0m\n'.format(str(study.trials[len(study.trials)-1].datetime_complete - study.trials[0].datetime_start).split('.')[0]))
                    print('Best trial:\n\tNumber:   \033[1m{}\033[0m\n\tScore:    \033[1m{}\033[0m\n\tRuntime:  \033[1m{}\033[0m\n\tParams:   \033[1m{}\033[0m'.format(study.best_trial.number, study.best_trial.value, str(study.best_trial.datetime_complete - study.best_trial.datetime_start).split('.')[0], study.best_trial.user_attrs['params']))

####################################################################################################
Results for dataset EBD:
****************************************************************************************************
Results for library essentia:
----------------------------------------------------------------------------------------------------
Results for feature_set all:


FileNotFoundError: [Errno 2] No such file or directory: '../metadata/optuna_studies/EBD_essentia_all_XGBClassifier_VS.pkl'

Tato buňka vypíše informace o selekci atributů. <br>
V proměnné <code>datasets</code> je možné vybrat, pro které datové sady mají být informace vypasány. <br>
V proměnné <code>feature_extraction_libraries</code> je možné vybrat, pro atributy které z extrakčních knihoven mají být informace vypasány. <br>
V proměnné <code>feature_sets</code> je možné vybrat, pro které sady atributů mají být informace vypasány. <br>
V proměnné <code>classifiers</code> je možné vybrat, pro které klasifikátory mají být informace vypasány. <br>

In [2]:
import pandas as pd
import numpy as np
import sys
import json

try:
    with open('../metadata/misc/optimised_feature_sets.json') as f:
        optimised_feature_sets = json.load(f)   
except Exception as e:
    print('Failed to read file: "../metadata/misc/optimised_feature_sets.json"!', file=sys.stderr)
    print('Error: {}'.format(repr(e)), file=sys.stderr)
    
datasets = [
    'EBD',
#     'FMA',
#     'GTZAN'
]

feature_extraction_libraries = [
#     'librosa',
    'essentia'
]

feature_sets = [
    'opt_feature_set_FS_VS',
    'opt_feature_set_BE_VS'
]

classifiers = [
        # sklearn classifiers
#         'LogisticRegression',
        'KNeighborsClassifier',
#         'MLPClassifier',
#         'DecisionTreeClassifier',
#         'SVC_linear',
#         'SVC_rbf',
        
        # sklearn ensemble classifiers
#         'RandomForestClassifier',
        
        # other classifiers
#         'XGBClassifier'
]

for dataset in datasets:
    print('#'*100)
    print('Results for dataset \033[1m{}\033[0m:'.format(dataset))
    
    for library in feature_extraction_libraries:
        print('*'*100)
        print('Results for library \033[1m{}\033[0m:'.format(library))
        for feature_set_name in feature_sets:
            print('-'*100)
            print('Results for feature_set \033[1m{}\033[0m:'.format(feature_set_name))
            for classifier in classifiers:
                feature_set = optimised_feature_sets[dataset][library][classifier][feature_set_name]
                print('Feature subset info for classifier \033[1m{}\033[0m:'.format(classifier))
                print('Features selected: \033[1m{}\033[0m'.format(len(feature_set)))
                print('Features: \033[1m{}\033[0m'.format(feature_set))

####################################################################################################
Results for dataset EBD:
****************************************************************************************************
Results for library essentia:
----------------------------------------------------------------------------------------------------
Results for feature_set opt_feature_set_FS_VS:
Feature subset info for classifier KNeighborsClassifier:
Features selected: 8
Features: ['bpm_histogram', 'spectral_contrast_valleys', 'melbands_spread', 'barkbands_crest', 'danceability', 'chords_scale', 'melbands_flatness_db', 'loudness_ebu128']
----------------------------------------------------------------------------------------------------
Results for feature_set opt_feature_set_BE_VS:
Feature subset info for classifier KNeighborsClassifier:
Features selected: 66
Features: ['average_loudness', 'barkbands', 'barkbands_crest', 'barkbands_flatness_db', 'barkbands_skewness', 'beats_cou

Tato buňka vypíše informace o dosaženém skóre a době trénování a klasifikace pro vybrané klasifikační algoritmy sady atributů. <br>
V proměnné <code>datasets</code> je možné vybrat, pro které datové sady mají být informace vypasány. <br>
V proměnné <code>feature_extraction_libraries</code> je možné vybrat, pro atributy které z extrakčních knihoven mají být informace vypasány. <br>
V proměnné <code>feature_sets</code> je možné vybrat, pro které sady atributů mají být informace vypasány. <br>
V proměnné <code>control_sets</code> je možné vybrat, pro které testovací/validační sady mají být informace vypasány. <br>
Nastavením proměnné <code>load_f1_score</code> na hodnotu True je možné načíst hodnoty F1 skóre, při hodnotě False se vypíše klasické skóre (počet správně / počet celkem) <br>

In [2]:
import IPython.display as ipd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

if __name__ == "__main__":
       
    def highlight_max(s):
        is_max = s == s.max()
        return ['background-color: yellow' if v else '' for v in is_max]
        
    datasets = [
        'EBD',
        'FMA',
        'GTZAN'
    ]
    
    feature_extraction_libraries = [
#         'librosa',
        'essentia'
    ]
    
    feature_sets = [
        'all',
        'opt_feature_set_FS_VS',
        'opt_feature_set_BE_VS'
    ]
    
    control_sets = [
#         'validation_set',
        'test_set'
    ]
    
    load_f1_score = False

    scs = pd.DataFrame()
    rts = pd.DataFrame()
    
    for dataset in datasets:
        for library in feature_extraction_libraries:
            for control_set in control_sets:
                
                scores = pd.read_csv("../metadata/scores/scores_{}_{}_{}_{}.csv".format(dataset, library, control_set, 'F1_score' if load_f1_score else 'accuracy_score'), index_col=0, header=[0])
                scs = scs.append(scores)
                runtimes = pd.read_csv("../metadata/runtimes/runtimes_{}_{}_{}.csv".format(dataset, library, control_set), index_col=0, header=[0])
                rts = rts.append(runtimes)

#     cols = [c for c in scs.columns if 'default' not in c]
#     cols = [c for c in cols if 'VS' not in c]
#     scs = scs[cols]
#     rts = rts[cols]
     
    index = scs.index.unique()
    index = pd.MultiIndex.from_product([['EBD', 'FMA', 'GTZAN'], list(index)])
    scs = scs.reset_index(drop=True)
    scs = pd.DataFrame(data=scs.values, index=index, columns=scs.columns.str.replace('_default',''))
    indices_to_drop = [x for x in ['all', 'opt_feature_set_FS_VS', 'opt_feature_set_BE_VS'] if x not in feature_sets]
    if len(indices_to_drop) != 0:
        scs = scs.drop(indices_to_drop, level=1)
    scs.to_csv('../../bp/classification_scores.csv')  
    scs = scs.style.apply(highlight_max, axis=1)
    ipd.display(scs.format("{:.2%}"))

    index = rts.index.unique()
    index = pd.MultiIndex.from_product([['EBD', 'FMA', 'GTZAN'], list(index)])
    rts = rts.reset_index(drop=True)
    rts = pd.DataFrame(data=rts.values, index=index, columns=rts.columns.str.replace('_default',''))
    if len(indices_to_drop) != 0:
        rts = rts.drop(indices_to_drop, level=1)
    rts.to_csv('../../bp/classification_runtimes.csv')  
    ipd.display(rts)

Unnamed: 0,Unnamed: 1,LogisticRegression,LogisticRegression_optimised_VS,KNeighborsClassifier,KNeighborsClassifier_optimised_VS,MLPClassifier,MLPClassifier_optimised_CV,MLPClassifier_optimised_VS,DecisionTreeClassifier,DecisionTreeClassifier_optimised_VS,SVC_linear,SVC_linear_optimised_CV,SVC_linear_optimised_VS,SVC_rbf,SVC_rbf_optimised_VS,RandomForestClassifier,RandomForestClassifier_optimised_VS,XGBClassifier,XGBClassifier_optimised_CV,XGBClassifier_optimised_VS
EBD,all,85.53%,85.05%,53.11%,61.00%,83.73%,85.89%,86.36%,72.85%,72.49%,84.57%,84.57%,84.57%,78.23%,85.41%,79.55%,81.94%,86.24%,87.56%,86.36%
EBD,opt_feature_set_FS_VS,82.42%,83.97%,78.11%,78.11%,85.53%,nan%,85.65%,70.10%,73.21%,80.50%,nan%,80.50%,83.01%,84.69%,82.66%,82.30%,83.25%,nan%,84.33%
EBD,opt_feature_set_BE_VS,85.65%,85.65%,62.56%,70.45%,83.49%,nan%,85.41%,71.65%,73.56%,84.21%,nan%,84.33%,84.45%,84.45%,81.34%,83.85%,86.48%,nan%,86.12%
FMA,all,54.56%,60.94%,47.62%,51.94%,50.69%,62.81%,62.69%,37.06%,42.81%,53.81%,60.94%,60.94%,60.00%,61.94%,58.06%,58.50%,62.88%,64.50%,64.56%
FMA,opt_feature_set_FS_VS,58.63%,58.63%,49.50%,50.06%,49.94%,nan%,54.69%,35.75%,40.25%,56.06%,nan%,56.06%,61.38%,61.38%,56.94%,57.00%,60.69%,nan%,61.94%
FMA,opt_feature_set_BE_VS,54.44%,59.88%,47.75%,52.50%,57.19%,nan%,61.50%,39.69%,42.81%,53.06%,nan%,59.75%,61.50%,61.50%,58.81%,59.13%,63.50%,nan%,64.19%
GTZAN,all,80.00%,81.00%,71.00%,68.00%,80.50%,82.00%,82.00%,57.50%,58.00%,82.00%,82.00%,82.00%,75.00%,80.00%,76.50%,76.50%,77.00%,83.50%,81.00%
GTZAN,opt_feature_set_FS_VS,75.00%,75.00%,69.50%,70.50%,78.50%,nan%,80.50%,51.00%,53.00%,75.00%,nan%,75.00%,77.00%,77.00%,67.50%,64.00%,71.50%,nan%,70.50%
GTZAN,opt_feature_set_BE_VS,79.00%,79.00%,71.50%,75.50%,79.50%,nan%,81.50%,54.00%,58.00%,81.50%,nan%,81.50%,78.00%,82.00%,77.00%,78.50%,78.00%,nan%,78.00%


Unnamed: 0,Unnamed: 1,LogisticRegression,LogisticRegression_optimised_VS,KNeighborsClassifier,KNeighborsClassifier_optimised_VS,MLPClassifier,MLPClassifier_optimised_CV,MLPClassifier_optimised_VS,DecisionTreeClassifier,DecisionTreeClassifier_optimised_VS,SVC_linear,SVC_linear_optimised_CV,SVC_linear_optimised_VS,SVC_rbf,SVC_rbf_optimised_VS,RandomForestClassifier,RandomForestClassifier_optimised_VS,XGBClassifier,XGBClassifier_optimised_CV,XGBClassifier_optimised_VS
EBD,all,0:00:16,0:00:03,0:00:00,0:00:03,0:00:06,0:00:37,0:00:44,0:00:10,0:00:18,0:00:34,0:00:35,0:00:34,0:00:58,0:00:35,0:00:01,0:01:05,0:00:48,0:02:50,0:02:08
EBD,opt_feature_set_FS_VS,0:00:01,0:00:00,0:00:00,0:00:00,0:00:02,,0:00:02,0:00:00,0:00:00,0:00:01,,0:00:01,0:00:03,0:00:03,0:00:00,0:00:00,0:00:04,,0:00:09
EBD,opt_feature_set_BE_VS,0:00:11,0:00:11,0:00:00,0:00:01,0:00:06,,0:00:40,0:00:06,0:00:06,0:00:33,,0:00:33,0:00:12,0:00:12,0:00:01,0:01:22,0:00:48,,0:01:58
FMA,all,0:00:34,0:00:06,0:00:00,0:00:11,0:00:14,0:00:44,0:00:52,0:00:22,0:00:27,0:02:34,0:02:42,0:02:42,0:03:22,0:02:59,0:00:03,0:04:36,0:01:27,0:06:50,0:02:55
FMA,opt_feature_set_FS_VS,0:00:03,0:00:03,0:00:00,0:00:00,0:00:12,,0:00:05,0:00:08,0:00:05,0:00:11,,0:00:11,0:00:23,0:00:23,0:00:01,0:00:31,0:00:32,,0:02:01
FMA,opt_feature_set_BE_VS,0:00:31,0:00:10,0:00:00,0:00:10,0:00:14,,0:00:44,0:00:21,0:00:27,0:02:13,,0:02:21,0:02:07,0:02:07,0:00:03,0:05:35,0:01:27,,0:03:50
GTZAN,all,0:00:07,0:00:02,0:00:00,0:00:00,0:00:01,0:00:09,0:00:08,0:00:01,0:00:01,0:00:02,0:00:02,0:00:02,0:00:03,0:00:03,0:00:00,0:00:12,0:00:15,0:00:26,0:00:19
GTZAN,opt_feature_set_FS_VS,0:00:00,0:00:00,0:00:00,0:00:00,0:00:00,,0:00:01,0:00:00,0:00:00,0:00:00,,0:00:00,0:00:00,0:00:00,0:00:00,0:00:00,0:00:01,,0:00:02
GTZAN,opt_feature_set_BE_VS,0:00:02,0:00:02,0:00:00,0:00:00,0:00:01,,0:00:08,0:00:01,0:00:01,0:00:01,,0:00:01,0:00:01,0:00:01,0:00:00,0:00:07,0:00:10,,0:00:07
