# XGBoost

Toujours avec des les features musicales agrégées Librosa, nous entraînons ici un modèle de classification *XGBoost*.

### I. Chargement, preprocessing des données et sélection des variables

Comme pour la forêt aléatoire, nous reprenons le processus et les idées de sélection des variables développées dans *neural_net.ipynb*.

In [1]:
import numpy as np
import pandas as pd
from utils.load_data import fma_load


features = fma_load('data/fma_metadata/features.csv')
tracks = fma_load('data/fma_metadata/tracks.csv')

y = tracks['track','genre_top']

corr = features.corr()

threshold = 0.75

corr_matrix = corr.copy()

np.fill_diagonal(corr_matrix.values, 0) # Diagonal to zero

strong_corr = (corr_matrix.abs() > threshold)

pairs = []
for i in range(len(corr_matrix.columns)):
    for j in range(i+1, len(corr_matrix.columns)):
        if strong_corr.iloc[i, j]:
            pairs.append({
                "feature_1": corr_matrix.columns[i],
                "feature_2": corr_matrix.columns[j],
                "correlation": corr_matrix.iloc[i, j]
            })


features.drop(columns=['chroma_cqt','chroma_cens'],inplace=True)
features = features.loc[:, features.columns.get_level_values('statistics') != 'median']

features

feature,chroma_stft,chroma_stft,chroma_stft,chroma_stft,chroma_stft,chroma_stft,chroma_stft,chroma_stft,chroma_stft,chroma_stft,...,tonnetz,tonnetz,tonnetz,tonnetz,zcr,zcr,zcr,zcr,zcr,zcr
statistics,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,...,std,std,std,std,kurtosis,max,mean,min,skew,std
number,01,02,03,04,05,06,07,08,09,10,...,03,04,05,06,01,01,01,01,01,01
track_id,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
2,-1.006041,-0.634076,-0.233752,-0.120917,0.004806,1.218982,0.969103,-0.884986,-0.469190,-0.515475,...,0.038974,0.054125,0.012226,0.012111,5.758890,0.459473,0.085629,0.000000,2.089872,0.061448
3,-0.951502,-0.660734,-1.050015,-0.977441,-0.343043,-0.515404,-0.973297,-1.261086,-1.132458,-0.953374,...,0.051151,0.063831,0.014212,0.017740,2.824694,0.466309,0.084578,0.000000,1.716724,0.069330
5,-0.794551,-1.264806,-0.664387,-0.405196,-0.022688,0.014883,-0.190766,-0.507027,-0.868905,-0.952605,...,0.084997,0.040730,0.012691,0.014759,6.808415,0.375000,0.053114,0.000000,2.193303,0.044861
10,0.954931,-1.266404,0.030425,-0.646823,0.664217,0.082626,-1.241245,0.102074,-0.742071,0.670849,...,0.088197,0.074358,0.017952,0.013921,21.434212,0.452148,0.077515,0.000000,3.542325,0.040800
20,-0.048443,-0.543755,-1.336000,-0.410307,-0.684526,-0.960676,-1.108473,-1.320151,-0.754329,-1.029887,...,0.105521,0.095003,0.022492,0.021355,16.669037,0.469727,0.047225,0.000977,3.189831,0.030993
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155316,-0.959207,-0.130793,-0.635587,-0.251981,-0.656534,-0.238574,-0.372219,-0.661802,-0.297405,-0.533092,...,0.102859,0.128410,0.022547,0.019816,4.448255,0.172852,0.028773,0.003906,0.955388,0.012385
155317,-0.965068,-0.679953,-1.051353,-0.852524,-0.872679,-0.740262,-0.737960,-1.060661,-1.029553,-0.949072,...,0.135479,0.132964,0.023548,0.026527,3.270612,0.196289,0.031116,0.002441,1.283060,0.019059
155318,-0.771414,-0.438261,-0.888968,-0.730804,-0.868208,-0.623147,-0.855489,-0.950127,-0.725640,-0.694402,...,0.089910,0.108324,0.017540,0.020471,2.356727,0.212891,0.038450,0.003418,0.828569,0.017904
155319,-0.984837,-0.198621,-0.923624,-0.825457,-1.035743,-0.919991,-0.655853,-0.887433,-0.398968,-0.233844,...,0.092314,0.088311,0.018328,0.017936,6.188604,0.167480,0.041480,0.004883,1.818740,0.020133


In [9]:
df_flat = features.copy()
df_flat.columns = ['_'.join(col).strip() for col in df_flat.columns.values]

df_flat['genre_top'] = y

df_flat.dropna(inplace=True)

df_flat

Unnamed: 0_level_0,chroma_stft_kurtosis_01,chroma_stft_kurtosis_02,chroma_stft_kurtosis_03,chroma_stft_kurtosis_04,chroma_stft_kurtosis_05,chroma_stft_kurtosis_06,chroma_stft_kurtosis_07,chroma_stft_kurtosis_08,chroma_stft_kurtosis_09,chroma_stft_kurtosis_10,...,tonnetz_std_04,tonnetz_std_05,tonnetz_std_06,zcr_kurtosis_01,zcr_max_01,zcr_mean_01,zcr_min_01,zcr_skew_01,zcr_std_01,genre_top
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,-1.006041,-0.634076,-0.233752,-0.120917,0.004806,1.218982,0.969103,-0.884986,-0.469190,-0.515475,...,0.054125,0.012226,0.012111,5.758890,0.459473,0.085629,0.000000,2.089872,0.061448,Hip-Hop
3,-0.951502,-0.660734,-1.050015,-0.977441,-0.343043,-0.515404,-0.973297,-1.261086,-1.132458,-0.953374,...,0.063831,0.014212,0.017740,2.824694,0.466309,0.084578,0.000000,1.716724,0.069330,Hip-Hop
5,-0.794551,-1.264806,-0.664387,-0.405196,-0.022688,0.014883,-0.190766,-0.507027,-0.868905,-0.952605,...,0.040730,0.012691,0.014759,6.808415,0.375000,0.053114,0.000000,2.193303,0.044861,Hip-Hop
10,0.954931,-1.266404,0.030425,-0.646823,0.664217,0.082626,-1.241245,0.102074,-0.742071,0.670849,...,0.074358,0.017952,0.013921,21.434212,0.452148,0.077515,0.000000,3.542325,0.040800,Pop
134,-0.972450,-1.047674,-1.312579,-1.120184,-1.280836,-1.215888,-1.006120,-0.915774,-0.781248,-1.012533,...,0.058766,0.016322,0.015819,4.731087,0.419434,0.064370,0.000000,1.806106,0.054623,Hip-Hop
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155315,-0.931622,-0.823569,-1.235423,-1.213406,-0.999892,-0.672000,-1.219753,-1.381937,-1.270755,-1.129524,...,0.108577,0.016631,0.014705,6.947788,0.188477,0.037708,0.002930,1.764233,0.018679,Rock
155316,-0.959207,-0.130793,-0.635587,-0.251981,-0.656534,-0.238574,-0.372219,-0.661802,-0.297405,-0.533092,...,0.128410,0.022547,0.019816,4.448255,0.172852,0.028773,0.003906,0.955388,0.012385,Rock
155317,-0.965068,-0.679953,-1.051353,-0.852524,-0.872679,-0.740262,-0.737960,-1.060661,-1.029553,-0.949072,...,0.132964,0.023548,0.026527,3.270612,0.196289,0.031116,0.002441,1.283060,0.019059,Rock
155318,-0.771414,-0.438261,-0.888968,-0.730804,-0.868208,-0.623147,-0.855489,-0.950127,-0.725640,-0.694402,...,0.108324,0.017540,0.020471,2.356727,0.212891,0.038450,0.003418,0.828569,0.017904,Rock


## II. Création du XGBoost

On retrace ici les choix d'entraînement de notre modèle *XGBoost*.

In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


y = df_flat["genre_top"].dropna()

X = df_flat.drop(["genre_top"],axis=1).loc[y.index]

le = LabelEncoder()
y_encoded = le.fit_transform(y) #necessary for xgb
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)

Comme pour la forêt aléatoire, afin d'identifier rapidemment les meilleurs hyper-paramètres, on utilise `RandomizedSearchCV` mais cette fois-ci sur 3 *folds* pour gagner du temps de calcul (le modèle étant plus complexe qu'une forêt aléatoire).

(De même, ce code prend longtemps à tourner, nous suggérons de sauter les deux cellules suviantes.)

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import f1_score, make_scorer

f1_weighted = make_scorer(f1_score, average="weighted")

param_dist = {
    "max_depth": [4, 6, 8, 10],
    "min_child_weight": [1, 5, 10],
    "subsample": [0.7, 0.8, 1.0],
    "colsample_bytree": [0.7, 0.8, 1.0],
}

cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

search = RandomizedSearchCV(
    xgb,
    param_distributions=param_dist,
    n_iter=20,
    scoring=f1_weighted,
    cv=cv,
    verbose=2,
    n_jobs=-1
)

search.fit(X_train, y_train)


Fitting 3 folds for each of 20 candidates, totalling 60 fits


[CV] END colsample_bytree=0.8, max_depth=4, min_child_weight=10, subsample=0.7; total time= 4.5min
[CV] END colsample_bytree=0.8, max_depth=4, min_child_weight=10, subsample=0.7; total time= 4.6min
[CV] END colsample_bytree=0.8, max_depth=4, min_child_weight=10, subsample=0.7; total time= 4.7min
[CV] END colsample_bytree=1.0, max_depth=8, min_child_weight=10, subsample=0.7; total time=10.2min
[CV] END colsample_bytree=1.0, max_depth=8, min_child_weight=10, subsample=0.7; total time=10.6min
[CV] END colsample_bytree=1.0, max_depth=8, min_child_weight=10, subsample=0.7; total time=10.6min
[CV] END colsample_bytree=1.0, max_depth=8, min_child_weight=5, subsample=1.0; total time=12.2min
[CV] END colsample_bytree=1.0, max_depth=8, min_child_weight=5, subsample=1.0; total time=12.2min
[CV] END colsample_bytree=0.7, max_depth=6, min_child_weight=5, subsample=1.0; total time= 7.8min
[CV] END colsample_bytree=1.0, max_depth=8, min_child_weight=5, subsample=1.0; total time=12.6min
[CV] END colsa

0,1,2
,"estimator  estimator: estimator object An object of that type is instantiated for each grid point. This is assumed to implement the scikit-learn estimator interface. Either estimator needs to provide a ``score`` function, or ``scoring`` must be passed.","XGBClassifier...ree=None, ...)"
,"param_distributions  param_distributions: dict or list of dicts Dictionary with parameters names (`str`) as keys and distributions or lists of parameters to try. Distributions must provide a ``rvs`` method for sampling (such as those from scipy.stats.distributions). If a list is given, it is sampled uniformly. If a list of dicts is given, first a dict is sampled uniformly, and then a parameter is sampled using that dict as above.","{'colsample_bytree': [0.7, 0.8, ...], 'max_depth': [4, 6, ...], 'min_child_weight': [1, 5, ...], 'subsample': [0.7, 0.8, ...]}"
,"n_iter  n_iter: int, default=10 Number of parameter settings that are sampled. n_iter trades off runtime vs quality of the solution.",20
,"scoring  scoring: str, callable, list, tuple or dict, default=None Strategy to evaluate the performance of the cross-validated model on the test set. If `scoring` represents a single score, one can use: - a single string (see :ref:`scoring_string_names`); - a callable (see :ref:`scoring_callable`) that returns a single value; - `None`, the `estimator`'s  :ref:`default evaluation criterion ` is used. If `scoring` represents multiple scores, one can use: - a list or tuple of unique strings; - a callable returning a dictionary where the keys are the metric  names and the values are the metric scores; - a dictionary with metric names as keys and callables as values. See :ref:`multimetric_grid_search` for an example. If None, the estimator's score method is used.",make_scorer(f...rage=weighted)
,"n_jobs  n_jobs: int, default=None Number of jobs to run in parallel. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details. .. versionchanged:: v0.20  `n_jobs` default changed from 1 to None",-1
,"refit  refit: bool, str, or callable, default=True Refit an estimator using the best found parameters on the whole dataset. For multiple metric evaluation, this needs to be a `str` denoting the scorer that would be used to find the best parameters for refitting the estimator at the end. Where there are considerations other than maximum score in choosing a best estimator, ``refit`` can be set to a function which returns the selected ``best_index_`` given the ``cv_results_``. In that case, the ``best_estimator_`` and ``best_params_`` will be set according to the returned ``best_index_`` while the ``best_score_`` attribute will not be available. The refitted estimator is made available at the ``best_estimator_`` attribute and permits using ``predict`` directly on this ``RandomizedSearchCV`` instance. Also for multiple metric evaluation, the attributes ``best_index_``, ``best_score_`` and ``best_params_`` will only be available if ``refit`` is set and all of them will be determined w.r.t this specific scorer. See ``scoring`` parameter to know more about multiple metric evaluation. See :ref:`this example ` for an example of how to use ``refit=callable`` to balance model complexity and cross-validated score. .. versionchanged:: 0.20  Support for callable added.",True
,"cv  cv: int, cross-validation generator or an iterable, default=None Determines the cross-validation splitting strategy. Possible inputs for cv are: - None, to use the default 5-fold cross validation, - integer, to specify the number of folds in a `(Stratified)KFold`, - :term:`CV splitter`, - An iterable yielding (train, test) splits as arrays of indices. For integer/None inputs, if the estimator is a classifier and ``y`` is either binary or multiclass, :class:`StratifiedKFold` is used. In all other cases, :class:`KFold` is used. These splitters are instantiated with `shuffle=False` so the splits will be the same across calls. Refer :ref:`User Guide ` for the various cross-validation strategies that can be used here. .. versionchanged:: 0.22  ``cv`` default value if None changed from 3-fold to 5-fold.",StratifiedKFo... shuffle=True)
,"verbose  verbose: int Controls the verbosity: the higher, the more messages. - >1 : the computation time for each fold and parameter candidate is  displayed; - >2 : the score is also displayed; - >3 : the fold and candidate parameter indexes are also displayed  together with the starting time of the computation.",2
,"pre_dispatch  pre_dispatch: int, or str, default='2*n_jobs' Controls the number of jobs that get dispatched during parallel execution. Reducing this number can be useful to avoid an explosion of memory consumption when more jobs get dispatched than CPUs can process. This parameter can be: - None, in which case all the jobs are immediately created and spawned. Use  this for lightweight and fast-running jobs, to avoid delays due to on-demand  spawning of the jobs - An int, giving the exact number of total jobs that are spawned - A str, giving an expression as a function of n_jobs, as in '2*n_jobs'",'2*n_jobs'
,"random_state  random_state: int, RandomState instance or None, default=None Pseudo random number generator state used for random uniform sampling from lists of possible values instead of scipy.stats distributions. Pass an int for reproducible output across multiple function calls. See :term:`Glossary `.",

0,1,2
,"objective  objective: typing.Union[str, xgboost.sklearn._SklObjWProto, typing.Callable[[typing.Any, typing.Any], typing.Tuple[numpy.ndarray, numpy.ndarray]], NoneType] Specify the learning task and the corresponding learning objective or a custom objective function to be used. For custom objective, see :doc:`/tutorials/custom_metric_obj` and :ref:`custom-obj-metric` for more information, along with the end note for function signatures.",'multi:softprob'
,"base_score  base_score: typing.Union[float, typing.List[float], NoneType] The initial prediction score of all instances, global bias.",
,booster,
,"callbacks  callbacks: typing.Optional[typing.List[xgboost.callback.TrainingCallback]] List of callback functions that are applied at end of each iteration. It is possible to use predefined callbacks by using :ref:`Callback API `. .. note::  States in callback are not preserved during training, which means callback  objects can not be reused for multiple training sessions without  reinitialization or deepcopy. .. code-block:: python  for params in parameters_grid:  # be sure to (re)initialize the callbacks before each run  callbacks = [xgb.callback.LearningRateScheduler(custom_rates)]  reg = xgboost.XGBRegressor(**params, callbacks=callbacks)  reg.fit(X, y)",
,colsample_bylevel  colsample_bylevel: typing.Optional[float] Subsample ratio of columns for each level.,
,colsample_bynode  colsample_bynode: typing.Optional[float] Subsample ratio of columns for each split.,
,colsample_bytree  colsample_bytree: typing.Optional[float] Subsample ratio of columns when constructing each tree.,0.8
,"device  device: typing.Optional[str] .. versionadded:: 2.0.0 Device ordinal, available options are `cpu`, `cuda`, and `gpu`.",
,"early_stopping_rounds  early_stopping_rounds: typing.Optional[int] .. versionadded:: 1.6.0 - Activates early stopping. Validation metric needs to improve at least once in  every **early_stopping_rounds** round(s) to continue training. Requires at  least one item in **eval_set** in :py:meth:`fit`. - If early stopping occurs, the model will have two additional attributes:  :py:attr:`best_score` and :py:attr:`best_iteration`. These are used by the  :py:meth:`predict` and :py:meth:`apply` methods to determine the optimal  number of trees during inference. If users want to access the full model  (including trees built after early stopping), they can specify the  `iteration_range` in these inference methods. In addition, other utilities  like model plotting can also use the entire model. - If you prefer to discard the trees after `best_iteration`, consider using the  callback function :py:class:`xgboost.callback.EarlyStopping`. - If there's more than one item in **eval_set**, the last entry will be used for  early stopping. If there's more than one metric in **eval_metric**, the last  metric will be used for early stopping.",
,enable_categorical  enable_categorical: bool See the same parameter of :py:class:`DMatrix` for details.,False


In [None]:
print("Best params:", search.best_params_)
print("Best CV F1:", search.best_score_)

Best params: {'subsample': 0.8, 'min_child_weight': 10, 'max_depth': 10, 'colsample_bytree': 0.8}
Best CV F1: 0.6735860490211225


In [12]:
from xgboost import XGBClassifier

xgb = XGBClassifier(
    max_depth=10,
    min_child_weight=10,
    n_estimators=500,
    colsample_bytree=0.8,
    learning_rate=0.1,
    eval_metric='mlogloss',
    use_label_encoder=False,
    random_state=42
)

xgb.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    verbose=True
)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[0]	validation_0-mlogloss:2.42850
[1]	validation_0-mlogloss:2.26636
[2]	validation_0-mlogloss:2.14001
[3]	validation_0-mlogloss:2.03460
[4]	validation_0-mlogloss:1.94586
[5]	validation_0-mlogloss:1.86960
[6]	validation_0-mlogloss:1.80307
[7]	validation_0-mlogloss:1.74399
[8]	validation_0-mlogloss:1.69187
[9]	validation_0-mlogloss:1.64488
[10]	validation_0-mlogloss:1.60203
[11]	validation_0-mlogloss:1.56274
[12]	validation_0-mlogloss:1.52698
[13]	validation_0-mlogloss:1.49524
[14]	validation_0-mlogloss:1.46535
[15]	validation_0-mlogloss:1.43828
[16]	validation_0-mlogloss:1.41251
[17]	validation_0-mlogloss:1.38976
[18]	validation_0-mlogloss:1.36814
[19]	validation_0-mlogloss:1.34784
[20]	validation_0-mlogloss:1.32869
[21]	validation_0-mlogloss:1.31078
[22]	validation_0-mlogloss:1.29480
[23]	validation_0-mlogloss:1.27971
[24]	validation_0-mlogloss:1.26593
[25]	validation_0-mlogloss:1.25264
[26]	validation_0-mlogloss:1.24058
[27]	validation_0-mlogloss:1.22905
[28]	validation_0-mlogloss:1.2

0,1,2
,"objective  objective: typing.Union[str, xgboost.sklearn._SklObjWProto, typing.Callable[[typing.Any, typing.Any], typing.Tuple[numpy.ndarray, numpy.ndarray]], NoneType] Specify the learning task and the corresponding learning objective or a custom objective function to be used. For custom objective, see :doc:`/tutorials/custom_metric_obj` and :ref:`custom-obj-metric` for more information, along with the end note for function signatures.",'multi:softprob'
,"base_score  base_score: typing.Union[float, typing.List[float], NoneType] The initial prediction score of all instances, global bias.",
,booster,
,"callbacks  callbacks: typing.Optional[typing.List[xgboost.callback.TrainingCallback]] List of callback functions that are applied at end of each iteration. It is possible to use predefined callbacks by using :ref:`Callback API `. .. note::  States in callback are not preserved during training, which means callback  objects can not be reused for multiple training sessions without  reinitialization or deepcopy. .. code-block:: python  for params in parameters_grid:  # be sure to (re)initialize the callbacks before each run  callbacks = [xgb.callback.LearningRateScheduler(custom_rates)]  reg = xgboost.XGBRegressor(**params, callbacks=callbacks)  reg.fit(X, y)",
,colsample_bylevel  colsample_bylevel: typing.Optional[float] Subsample ratio of columns for each level.,
,colsample_bynode  colsample_bynode: typing.Optional[float] Subsample ratio of columns for each split.,
,colsample_bytree  colsample_bytree: typing.Optional[float] Subsample ratio of columns when constructing each tree.,0.8
,"device  device: typing.Optional[str] .. versionadded:: 2.0.0 Device ordinal, available options are `cpu`, `cuda`, and `gpu`.",
,"early_stopping_rounds  early_stopping_rounds: typing.Optional[int] .. versionadded:: 1.6.0 - Activates early stopping. Validation metric needs to improve at least once in  every **early_stopping_rounds** round(s) to continue training. Requires at  least one item in **eval_set** in :py:meth:`fit`. - If early stopping occurs, the model will have two additional attributes:  :py:attr:`best_score` and :py:attr:`best_iteration`. These are used by the  :py:meth:`predict` and :py:meth:`apply` methods to determine the optimal  number of trees during inference. If users want to access the full model  (including trees built after early stopping), they can specify the  `iteration_range` in these inference methods. In addition, other utilities  like model plotting can also use the entire model. - If you prefer to discard the trees after `best_iteration`, consider using the  callback function :py:class:`xgboost.callback.EarlyStopping`. - If there's more than one item in **eval_set**, the last entry will be used for  early stopping. If there's more than one metric in **eval_metric**, the last  metric will be used for early stopping.",
,enable_categorical  enable_categorical: bool See the same parameter of :py:class:`DMatrix` for details.,False


In [13]:
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

y_pred = xgb.predict(X_test)

# to have classes labels instead of numbers
y_pred_labels = le.inverse_transform(y_pred)
y_test_labels = le.inverse_transform(y_test)

print(classification_report(y_test_labels, y_pred_labels))
accuracy = accuracy_score(y_test_labels, y_pred_labels)
print("Accuracy:", accuracy)

                     precision    recall  f1-score   support

              Blues       1.00      0.09      0.17        22
          Classical       0.85      0.83      0.84       246
            Country       0.70      0.18      0.29        39
         Electronic       0.67      0.74      0.70      1874
       Experimental       0.64      0.75      0.69      2122
               Folk       0.69      0.64      0.66       561
            Hip-Hop       0.74      0.66      0.70       710
       Instrumental       0.52      0.28      0.36       416
      International       0.84      0.46      0.60       278
               Jazz       0.71      0.21      0.32       114
Old-Time / Historic       0.94      0.95      0.94       111
              Other       0.00      0.00      0.00         5
                Pop       0.53      0.18      0.27       466
               Rock       0.76      0.85      0.80      2836
           Soul-RnB       0.75      0.09      0.15        35
             Spoken    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


On constate que très vite la fonction de perte atteint un plancher autour de 0.98. Nous essayons donc d'utiliser de l'*early-stopping*. Même si les résultats sont légèrement moins bons, on diminue presque par deux la durée de l'entraînement ce qui constitue un gain de temps non négligeable.

In [14]:
xgb = XGBClassifier(
    max_depth=10,
    min_child_weight=10,
    eval_metric="mlogloss",
    learning_rate=0.1,
    n_estimators=1000,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    early_stopping_rounds=50
)

xgb.fit(
    X_train,y_train, eval_set=[(X_test,y_test)],
    verbose=True
)

[0]	validation_0-mlogloss:2.43159
[1]	validation_0-mlogloss:2.27037
[2]	validation_0-mlogloss:2.14419
[3]	validation_0-mlogloss:2.03920
[4]	validation_0-mlogloss:1.95128
[5]	validation_0-mlogloss:1.87595
[6]	validation_0-mlogloss:1.80929
[7]	validation_0-mlogloss:1.75008
[8]	validation_0-mlogloss:1.69804
[9]	validation_0-mlogloss:1.65006
[10]	validation_0-mlogloss:1.60699
[11]	validation_0-mlogloss:1.56769
[12]	validation_0-mlogloss:1.53280
[13]	validation_0-mlogloss:1.49997
[14]	validation_0-mlogloss:1.46983
[15]	validation_0-mlogloss:1.44285
[16]	validation_0-mlogloss:1.41700
[17]	validation_0-mlogloss:1.39293
[18]	validation_0-mlogloss:1.37076
[19]	validation_0-mlogloss:1.35077
[20]	validation_0-mlogloss:1.33265
[21]	validation_0-mlogloss:1.31524
[22]	validation_0-mlogloss:1.29892
[23]	validation_0-mlogloss:1.28383
[24]	validation_0-mlogloss:1.26977
[25]	validation_0-mlogloss:1.25606
[26]	validation_0-mlogloss:1.24425
[27]	validation_0-mlogloss:1.23240
[28]	validation_0-mlogloss:1.2

0,1,2
,"objective  objective: typing.Union[str, xgboost.sklearn._SklObjWProto, typing.Callable[[typing.Any, typing.Any], typing.Tuple[numpy.ndarray, numpy.ndarray]], NoneType] Specify the learning task and the corresponding learning objective or a custom objective function to be used. For custom objective, see :doc:`/tutorials/custom_metric_obj` and :ref:`custom-obj-metric` for more information, along with the end note for function signatures.",'multi:softprob'
,"base_score  base_score: typing.Union[float, typing.List[float], NoneType] The initial prediction score of all instances, global bias.",
,booster,
,"callbacks  callbacks: typing.Optional[typing.List[xgboost.callback.TrainingCallback]] List of callback functions that are applied at end of each iteration. It is possible to use predefined callbacks by using :ref:`Callback API `. .. note::  States in callback are not preserved during training, which means callback  objects can not be reused for multiple training sessions without  reinitialization or deepcopy. .. code-block:: python  for params in parameters_grid:  # be sure to (re)initialize the callbacks before each run  callbacks = [xgb.callback.LearningRateScheduler(custom_rates)]  reg = xgboost.XGBRegressor(**params, callbacks=callbacks)  reg.fit(X, y)",
,colsample_bylevel  colsample_bylevel: typing.Optional[float] Subsample ratio of columns for each level.,
,colsample_bynode  colsample_bynode: typing.Optional[float] Subsample ratio of columns for each split.,
,colsample_bytree  colsample_bytree: typing.Optional[float] Subsample ratio of columns when constructing each tree.,0.8
,"device  device: typing.Optional[str] .. versionadded:: 2.0.0 Device ordinal, available options are `cpu`, `cuda`, and `gpu`.",
,"early_stopping_rounds  early_stopping_rounds: typing.Optional[int] .. versionadded:: 1.6.0 - Activates early stopping. Validation metric needs to improve at least once in  every **early_stopping_rounds** round(s) to continue training. Requires at  least one item in **eval_set** in :py:meth:`fit`. - If early stopping occurs, the model will have two additional attributes:  :py:attr:`best_score` and :py:attr:`best_iteration`. These are used by the  :py:meth:`predict` and :py:meth:`apply` methods to determine the optimal  number of trees during inference. If users want to access the full model  (including trees built after early stopping), they can specify the  `iteration_range` in these inference methods. In addition, other utilities  like model plotting can also use the entire model. - If you prefer to discard the trees after `best_iteration`, consider using the  callback function :py:class:`xgboost.callback.EarlyStopping`. - If there's more than one item in **eval_set**, the last entry will be used for  early stopping. If there's more than one metric in **eval_metric**, the last  metric will be used for early stopping.",50
,enable_categorical  enable_categorical: bool See the same parameter of :py:class:`DMatrix` for details.,False


In [15]:
y_pred = xgb.predict(X_test)

y_pred_labels = le.inverse_transform(y_pred)
y_test_labels = le.inverse_transform(y_test)

print(classification_report(y_test_labels, y_pred_labels))
accuracy = accuracy_score(y_test_labels, y_pred_labels)
print("Accuracy:", accuracy)

                     precision    recall  f1-score   support

              Blues       0.50      0.05      0.08        22
          Classical       0.86      0.81      0.83       246
            Country       0.83      0.13      0.22        39
         Electronic       0.66      0.73      0.70      1874
       Experimental       0.63      0.75      0.68      2122
               Folk       0.65      0.63      0.64       561
            Hip-Hop       0.73      0.65      0.69       710
       Instrumental       0.52      0.28      0.37       416
      International       0.84      0.44      0.58       278
               Jazz       0.80      0.18      0.29       114
Old-Time / Historic       0.92      0.92      0.92       111
              Other       0.00      0.00      0.00         5
                Pop       0.50      0.14      0.22       466
               Rock       0.75      0.84      0.79      2836
           Soul-RnB       0.75      0.09      0.15        35
             Spoken    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


## III. Optimisation des performances

Comme pour la forêt aléatoire, ous tentons ici d'améliorer les performances du modèle en sélectionnant plus finement les *features* selon leur importance dans la prédiction.

In [9]:
importances = pd.Series(
    xgb.feature_importances_,
    index=X_train.columns
).sort_values(ascending=False)

Les *chroma STFT max* sont encore moins importantes que pour la forêt aléatoire et les retirer ne change toujours que peu de choses aux résultats.

In [None]:
importances[280:]

mfcc_skew_17          0.001906
mfcc_kurtosis_07      0.001901
mfcc_skew_18          0.001889
mfcc_kurtosis_12      0.001886
mfcc_skew_19          0.001848
tonnetz_skew_03       0.001828
tonnetz_skew_04       0.001766
tonnetz_skew_05       0.001726
chroma_stft_max_03    0.000000
chroma_stft_max_02    0.000000
chroma_stft_max_01    0.000000
chroma_stft_max_04    0.000000
chroma_stft_max_08    0.000000
chroma_stft_max_05    0.000000
chroma_stft_max_06    0.000000
chroma_stft_max_07    0.000000
chroma_stft_max_12    0.000000
chroma_stft_max_09    0.000000
chroma_stft_max_10    0.000000
chroma_stft_max_11    0.000000
dtype: float32

On applique là encore un tri plus fin de l'importance des *features*.

In [17]:
from sklearn.inspection import permutation_importance

r = permutation_importance(
    xgb,
    X_test,
    y_test,
    n_repeats=5,
    scoring="f1_weighted",
    random_state=42
)

perm_importance = pd.Series(
    r.importances_mean,
    index=X_test.columns
).sort_values()


In [18]:
top_50_features=perm_importance[250:].index.tolist()
top_100_features=perm_importance[200:].index.tolist()
top_150_features=perm_importance[150:].index.tolist()

perm_importance

mfcc_skew_12                -0.000714
mfcc_kurtosis_13            -0.000648
chroma_stft_skew_05         -0.000637
chroma_stft_skew_12         -0.000582
chroma_stft_kurtosis_12     -0.000560
                               ...   
spectral_contrast_skew_03    0.006253
tonnetz_std_04               0.006720
spectral_contrast_mean_04    0.007539
spectral_contrast_mean_02    0.010918
mfcc_max_04                  0.010977
Length: 300, dtype: float64

Nous essayons comme pour la forêt aléatoire de nous en tenir à moins de *features* pour gagner en efficacité.

In [21]:
# 50 features
X=df_flat[top_50_features]
X_train_50, X_test_50, y_train_50, y_test_50 = train_test_split(X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)

# 100 features
X=df_flat[top_100_features]
X_train_100, X_test_100, y_train_100, y_test_100 = train_test_split(X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)

# 150 features
X=df_flat[top_150_features]
X_train_150, X_test_150, y_train_150, y_test_150 = train_test_split(X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)

L'observation faite pour la forêt aléatoire est encore plus vraie ici. Avec 150 *features* nous avons quasiment les mêmes résultats qu'avec le *dataframe* complet (0.690 contre 0.694 d'*accuracy*).

In [24]:
xgb = XGBClassifier(
    max_depth=10,
    min_child_weight=10,
    eval_metric="mlogloss",
    learning_rate=0.1,
    n_estimators=1000,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    early_stopping_rounds=50
)

xgb.fit(
    X_train_50,y_train_50, eval_set=[(X_test_50,y_test_50)],
    verbose=False # no print of the evaluation
)
y_pred_50 = xgb.predict(X_test_50)

xgb.fit(
    X_train_100,y_train_100, eval_set=[(X_test_100,y_test_100)],
    verbose=False
)
y_pred_100 = xgb.predict(X_test_100)

xgb.fit(
    X_train_150,y_train_150, eval_set=[(X_test_150,y_test_150)],
    verbose=False
)
y_pred_150 = xgb.predict(X_test_150)


print("Accuracy with top 50 features :", accuracy_score(y_test_50, y_pred_50))
print("Accuracy with top 100 features :", accuracy_score(y_test_100, y_pred_100))
print("Accuracy with top 150 features :", accuracy_score(y_test_150, y_pred_150))

y_pred_labels = le.inverse_transform(y_pred_150)
y_test_labels = le.inverse_transform(y_test_150)

print(classification_report(y_test_150, y_pred_150))

Accuracy with top 50 features : 0.6764112903225806
Accuracy with top 100 features : 0.6895161290322581
Accuracy with top 150 features : 0.6900201612903226
              precision    recall  f1-score   support

           0       0.85      0.79      0.82       246
           1       0.66      0.72      0.69      1874
           2       0.62      0.74      0.68      2122
           3       0.67      0.62      0.65       561
           4       0.74      0.68      0.71       710
           5       0.52      0.32      0.40       416
           6       0.81      0.42      0.56       278
           7       0.71      0.19      0.30       114
           8       0.94      0.92      0.93       111
           9       0.62      0.05      0.09       101
          10       0.47      0.15      0.23       466
          11       0.76      0.85      0.80      2836
          12       0.70      0.52      0.59        85

    accuracy                           0.69      9920
   macro avg       0.70      0.54