Autor: Matyáš Sládek <br>
Rok: 2020 <br>

Tento soubor slouží k optimalizaci parametrů klasifikačních algoritmů pomocí algoritmů GridSampler a TPESampler z knihovny Optuna. <br>

Tato buňka importuje potřebné knihovny.

In [None]:
%%capture
import os
import sys
import time
from datetime import timedelta
import json
import pickle
import warnings

import numpy as np
import pandas as pd
from optuna.exceptions import ExperimentalWarning

# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from hpoptimise_optuna import HPoptimise

Tato buňka obsahuje funkci pro načtení a zpracování potřebných dat a spuštění optimalizace parametrů.

In [None]:
def _one_hot_encode(features):
    """
    One-hot encodes every object-typed column of the features dataframe.

    Each encoded column is dropped and replaced by its dummy columns (first category dropped),
    re-indexed under the same first two column levels so the three-level column layout is kept.

    Parameters:

    features: Dataframe of extracted features with three-level column index

    Returns the dataframe with categorical columns replaced by dummy columns.
    """

    for column in features.select_dtypes(include='object'):   # For each categorical column
        dummy_columns = pd.get_dummies(features[column], drop_first=True)   # Encode the column values
        features = features.drop(columns=column)   # Drop the original column from the dataframe
        dummy_columns.columns = pd.MultiIndex.from_product(
            [[column[0]], [column[1]], ['{}'.format(c) for c in dummy_columns.columns]],
            names=features.columns.names
        )   # Reindex for consistency with the rest of the dataframe
        features = pd.concat([features, dummy_columns], axis=1).sort_index(axis=1)   # Append the dummy columns

    return features


def _split_and_scale(X, y, parameters):
    """
    Splits the samples into training (and optionally validation) data and standardises them.

    20 % of the samples is always put aside as test data, which is not used here.
    If cross-validation is not selected, a part of the remaining training data is reserved as validation set.
    The scaler is fitted on the training data only and then applied to the validation data.

    Parameters:

    X:          Feature values of all samples
    y:          Encoded genre labels of all samples
    parameters: Dictionary of optimisation settings ('use_cross_validation' and 'validation_set_size' are read)

    Returns a tuple (X_train, y_train, X_val, y_val), X_val and y_val are None when cross-validation is used.
    """

    # Split the samples to train and test data (test data is not used)
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    X_val = None
    y_val = None

    # If validation set is selected, split the training data to training and validation sets
    if not parameters['use_cross_validation']:
        X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=parameters['validation_set_size'], random_state=42, stratify=y_train)

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    if X_val is not None:
        X_val = scaler.transform(X_val)

    return X_train, y_train, X_val, y_val


def _optimise_classifier(optimised_params, dataset, library, feature_set_name, classifier_name, classifier, validation_type, data, parameters):
    """
    Optimises hyper parameters of a single classifier on a single feature set and stores the result.

    Parameters:

    optimised_params: Dictionary of optimised hyper parameters, updated in place
    dataset:          Name of the dataset the features were extracted from
    library:          Name of the library the features were extracted with
    feature_set_name: Name of the feature set the classifier is optimised on
    classifier_name:  Name of the optimised classifier
    classifier:       Classifier object passed on to the optimiser
    validation_type:  'CV' for cross-validation or 'VS' for validation set
    data:             Tuple (X_train, y_train, X_val, y_val)
    parameters:       Dictionary of optimisation settings passed on to the optimiser
    """

    X_train, y_train, X_val, y_val = data

    # Add the feature set name to the optimised parameters dictionary if not there already
    optimised_params.setdefault(feature_set_name, {})

    # Initialise the optimiser
    estim = HPoptimise(
        X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val,
        classifier_name=classifier_name, classifier=classifier,
        filepath='../metadata/optuna_studies/{}_{}_{}_{}_{}.pkl'.format(dataset, library, feature_set_name, classifier_name, validation_type),
        other_params=parameters
    )

    print('Optimising \033[1m{}\033[0m classifier hyper parameters on feature set \033[1m{}\033[0m:'.format(classifier_name, feature_set_name))

    # Store the start time of the optimisation of current classifier
    t_start = time.time()

    # Start the optimisation process
    with warnings.catch_warnings():   # Suppress warnings regarding experimental features
        warnings.simplefilter('ignore', category=ExperimentalWarning)
        estim.optimise()

    # Store the best parameters for selected validation type (the classifier entry is created if missing)
    best_params = estim.best_params()
    optimised_params[feature_set_name].setdefault(classifier_name, {})[validation_type] = best_params

    print('\nOptimisation finished in ~\033[1m{}\033[0m.'.format(str(timedelta(seconds=(time.time() - t_start))).split(".")[0]))
    print('Best params: \033[1m{}\033[0m'.format(best_params))


def optimise(optimised_params, dataset, library, classifiers, parameters):
    """
    Optimises hyper parameters of classifiers on given features.

    Loads the extracted features and the track-genre list of the given dataset, prepares the data
    and runs the hyper parameter optimisation of every selected classifier on every selected feature set.
    The best parameters are stored in optimised_params as
    optimised_params[feature_set_name][classifier_name][validation_type].

    Parameters:

    optimised_params: Dictionary of optimised hyper parameters, updated in place
    dataset:          Name of a dataset to indicate which extracted features to load (dataset from which the features were extracted)
    library:          Name of a library to indicate which extracted features to load (library with which the features were extracted)
    classifiers:      Dictionary containing classifiers to be optimised (classifier name -> classifier object)
    parameters:       Dictionary of optimisation settings; 'use_cross_validation', 'validation_set_size',
                      'use_all_features' and the 'use_opt_feature_set_*' flags are read here,
                      the whole dictionary is passed on to the optimiser

    Returns optimised_params on success, or -1 if any of the required files could not be read
    (the error is printed to the standard error output).
    """

    # Shortcut for the type of selected hyper parameter validation method
    validation_type = 'CV' if parameters['use_cross_validation'] else 'VS'

    # Names of optimised feature sets selected for hyper parameter optimisation
    # (FS = forward selection, BE = backward elimination, CV = cross-validation, VS = validation set)
    optimised_feature_sets_names = [
        name
        for name in ('opt_feature_set_FS_CV', 'opt_feature_set_FS_VS', 'opt_feature_set_BE_CV', 'opt_feature_set_BE_VS')
        if parameters['use_' + name]
    ]

    # If any of the optimised feature sets is selected, load the optimised feature sets
    if optimised_feature_sets_names:
        try:
            with open('../metadata/misc/optimised_feature_sets.json') as f:
                optimised_feature_sets = json.load(f)
        except Exception as e:
            print('Failed to read file: "../metadata/misc/optimised_feature_sets.json"!', file=sys.stderr)
            print('Error: {}'.format(repr(e)), file=sys.stderr)
            return -1

    # Load specified extracted features
    try:
        features = pd.read_csv('../metadata/features/features_{}_{}.csv'.format(dataset, library), index_col=0, header=[0, 1, 2])
    except Exception as e:
        print('Failed to read file: "../metadata/features/features_{}_{}.csv"!'.format(dataset, library), file=sys.stderr)
        print('Error: {}'.format(repr(e)), file=sys.stderr)
        return -1

    # Perform One-hot encoding on categorical features
    features = _one_hot_encode(features)

    feature_names = list(features.columns.levels[0])   # Get names of all features
    feature_sets = {}   # Stores non-optimised feature sets

    # If all features is selected, add all feature names to the dictionary of feature sets to be used
    if parameters['use_all_features']:
        feature_sets['all'] = feature_names

    # Load track-genre list
    try:
        genres = pd.read_csv("../metadata/track_genre_lists/{}_track_genre_list.csv".format(dataset), index_col=0, header=0)
    except Exception as e:
        print('Failed to read file: "../metadata/track_genre_lists/{}_track_genre_list.csv"!'.format(dataset), file=sys.stderr)
        print('Error: {}'.format(repr(e)), file=sys.stderr)
        return -1

    # Keep only the tracks present in the features (tracks removed from features because of corruption or other reason are dropped)
    genres = genres.loc[features.index]

    # Encode genre labels
    y = LabelEncoder().fit_transform(np.ravel(genres))

    # For each of the non-optimised feature sets, the data is prepared once and shared by all classifiers
    for feature_set_name, feature_set in feature_sets.items():
        data = _split_and_scale(features[feature_set].values, y, parameters)

        # For each of the selected classifiers, perform hyper parameter optimisation
        for classifier_name, classifier in classifiers.items():
            _optimise_classifier(optimised_params, dataset, library, feature_set_name, classifier_name, classifier, validation_type, data, parameters)
            print('-'*100)

    # For each of the optimised feature sets, the feature set differs per classifier
    for feature_set_name in optimised_feature_sets_names:

        # For each of the selected classifiers
        for classifier_name, classifier in classifiers.items():

            # Get the optimised feature set, skip the classifier if it has none
            try:
                feature_set = optimised_feature_sets[dataset][library][classifier_name][feature_set_name]
            except KeyError:
                print('Optimised feature set {} for classifier {} not found!'.format(feature_set_name, classifier_name), file=sys.stderr)
                continue

            data = _split_and_scale(features[feature_set].values, y, parameters)

            # Perform hyper parameter optimisation
            _optimise_classifier(optimised_params, dataset, library, feature_set_name, classifier_name, classifier, validation_type, data, parameters)
            print('-'*100)

    return optimised_params

Tato buňka slouží k nastavení potřebných parametrů a výběru datových sad, sad atributů a klasifikátorů, pro které má být optimalizace provedena. <br>
Detailní popis optimalizovaných parametrů je možné pro klasifikátor XGBClassifier nalézt zde <https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn> a pro klasifikátory z knihovny scikit-learn zde <https://scikit-learn.org/stable/>. <br>
Pro jaké datové sady má být optimalizace provedena je možné zvolit v proměnné <code>datasets</code> odkomentováním/zakomentováním příslušných záznamů. <br>
Při použití vlastní datové sady je nutné do proměnné <code>datasets</code> přidat stejný název datové sady, jako byl použit pro extrakci atributů v souboru <strong>feature_extraction.ipynb</strong> <br>
Dále je nutné zvolit, jestli optimalizaci provést na atributech extrahovaných pomocí knihovny Librosa či Essentia odkomentováním/zakomentováním příslušných záznamů v proměnné <code>feature_extraction_libraries</code>. <br>
Pro které klasifikátory má být optimalizace provedena je možné zvolit v proměnné <code>classifiers</code> odkomentováním/zakomentováním příslušných záznamů. <br>
Při použití jiného klasifikátoru je nutné jej přidat do proměnné <code>classifiers</code> ve formátu {zvolený_název_klasifikátoru}:{odkaz_na_objekt_klasifikátoru}. <br>
Parametry klasifikátorů je možné upravit v proměnné <code>default_params</code> v souboru <strong>hpoptimise_optuna.py</strong>, případně pro nový klasifikátor přidat záznam ve formátu {zvolený_název_klasifikátoru}:{slovník_parametrů}, kde zvolený název klasifikátoru musí odpovídat názvu v proměnné <code>classifiers</code>. <br>
Dále je pro případný nový klasifikátor nutné přidat všechny náležitosti (optimalizační funkce atd.) dle konvencí v souboru <strong>hpoptimise_optuna.py</strong>. <br>
V proměnné <code>parameters</code> je možné nastavit průběh optimalizace. <br>
<br>
Popis parametrů: <br>
<ul>
    <li><code>max_trials</code> Hodnota značí maximální počet iterací algoritmu TPESampler, u algoritmu GridSampler ignorováno</li>
    <li><code>max_trials_no_change</code> Hodnota značí maximální počet iterací algoritmu TPESampler bez navýšení skóre, po kterých bude optimalizace zastavena, u algoritmu GridSampler ignorováno</li>
    <li><code>use_cross_validation</code> Hodnota True znamená použití křížové validace, hodnota False pak validace na validační sadě</li>
    <li><code>cross_validation_num_of_folds</code> Značí, kolikanásobná bude křížová validace, pokud je tato metoda zvolena</li>
    <li><code>validation_set_size</code> Značí, jaká část trénovacích dat bude vyhrazena jako validační, pokud je tato metoda zvolena</li>
    <li><code>use_all_features</code> Hodnota True znamená použití celé sady atributů</li>
    <li><code>use_opt_feature_set_FS_CV</code> Hodnota True znamená použití sady atributů vybrané pomocí metody dopředné selekce a křížové validace</li>
    <li><code>use_opt_feature_set_FS_VS</code> Hodnota True znamená použití sady atributů vybrané pomocí metody dopředné selekce a validace na validační sadě</li>
    <li><code>use_opt_feature_set_BE_CV</code> Hodnota True znamená použití sady atributů vybrané pomocí metody zpětné eliminace a křížové validace</li>
    <li><code>use_opt_feature_set_BE_VS</code> Hodnota True znamená použití sady atributů vybrané pomocí metody zpětné eliminace a validace na validační sadě</li>
</ul>

Klasifikátory <code>MLPClassifier</code>, <code>DecisionTreeClassifier</code>, <code>RandomForestClassifier</code> a <code>XGBClassifier</code> používají algoritmus TPESampler a klasifikátory <code>LogisticRegression</code>, <code>KNeighborsClassifier</code>, <code>SVC_linear</code> a <code>SVC_rbf</code> používají algoritmus GridSampler. Stejně jako u selekce atributů je při nastavování parametrů třeba brát v potaz, že optimalizace je velmi výpočetně náročný proces, obzvláště pak v případě optimalizace algoritmů <code>RandomForestClassifier</code> a <code>XGBClassifier</code>.<br>

In [None]:
if __name__ == "__main__":
    # Entry point of the notebook: selects datasets, feature extraction libraries, classifiers and
    # optimisation settings, runs the optimisation for every dataset/library combination
    # and saves the collected best hyper parameters to a .json file.

    # List of datasets whose features should be used, unwanted can be commented out
    datasets = [
#         'EBD',
#         'FMA',
        'GTZAN'
    ]

    # List of feature extraction libraries whose features should be used, unwanted can be commented out
    feature_extraction_libraries = [
#         'librosa',
        'essentia'
    ]

    # Dictionary of classifier names and objects which hyper parameters should be optimised, unwanted can be commented out
    classifiers = {
        # sklearn classifiers
#         'LogisticRegression':LogisticRegression,
        'KNeighborsClassifier':KNeighborsClassifier,
#         'MLPClassifier':MLPClassifier,
#         'DecisionTreeClassifier':DecisionTreeClassifier,
#         'SVC_linear':SVC,
#         'SVC_rbf':SVC,

        # sklearn ensemble classifiers
#         'RandomForestClassifier':RandomForestClassifier,

        # other classifiers
#         'XGBClassifier':XGBClassifier,
    }

    # Settings of the optimisation process
    parameters = {
        'max_trials': 1000,   # Determines the maximum number of optimisation iterations
        'max_trials_no_change': 100,   # Determines the maximum number of optimisation iterations with no improvement of score, after which the optimisation will be stopped
        'use_cross_validation': False,   # Set to True to use cross-validation on the entire training data, else a portion of the train data will be reserved as validation set and cross-validation will not be used
        'cross_validation_num_of_folds': 5,   # Determines the number of cross-validation folds, ignored if 'use_cross_validation' parameter is set to False
        'validation_set_size': 0.2,   # Determines the size of the portion of training data, which will be reserved as validation set, ignored if 'use_cross_validation' parameter is set to True
        'use_all_features': True,   # Use all available features extracted with selected library
        'use_opt_feature_set_FS_CV': False,   # Set this to True to use feature set optimised with forward selection and cross-validation
        'use_opt_feature_set_FS_VS': True,   # Set this to True to use feature set optimised with forward selection and validation set
        'use_opt_feature_set_BE_CV': False,   # Set this to True to use feature set optimised with backward elimination and cross-validation
        'use_opt_feature_set_BE_VS': True,   # Set this to True to use feature set optimised with backward elimination and validation set
    }

    # Load optimised hyper parameters file if available, otherwise start with an empty dictionary
    try:
        with open('../metadata/misc/optimised_hyper_parameters.json') as f:
            optimised_params = json.load(f)
    except FileNotFoundError:
        optimised_params = {}

    if parameters['use_cross_validation']:
        print('Using cross-validation.')
    else:
        print('Using validation set.')

    t_start = time.time()   # Store the start time of the hyper parameter optimisation

    for dataset in datasets:   # For each of the selected datasets

        # Get the dataset entry of the optimised hyper parameters dictionary, create it if not there already
        dataset_params = optimised_params.setdefault(dataset, {})

        for library in feature_extraction_libraries:   # For each of the selected extraction libraries

            # Get the extraction library entry of the dataset entry, create it if not there already
            library_params = dataset_params.setdefault(library, {})

            print('#'*100)
            print('Optimising hyper parameters on \033[1m{}\033[0m dataset features extracted with \033[1m{}\033[0m library:\n'.format(dataset, library))
            t = time.time()   # Store the start time of hyper parameter optimisation for selected dataset and extraction library features
            optimise(library_params, dataset, library, classifiers, parameters)   # Start the optimisation, results are written into library_params in place
            print("\nClassifiers hyper parameter optimisation on \033[1m{}\033[0m dataset features extracted with \033[1m{}\033[0m library finished in ~\033[1m{}\033[0m.".format(dataset, library, str(timedelta(seconds=(time.time() - t))).split(".")[0]))
            print('#'*100)

            # Save optimised hyper parameters to .json file after every dataset/library combination
            with open('../metadata/misc/optimised_hyper_parameters.json', 'w') as json_file:
                json.dump(optimised_params, json_file)

    print("Classifiers hyper parameter optimisation on all selected datasets and all selected features finished in ~\033[1m{}\033[0m.".format(str(timedelta(seconds=(time.time() - t_start))).split(".")[0]))