# Random Forest

Toujours avec les features musicales agrégées Librosa, nous entraînons ici un modèle de forêt aléatoire.

## I. Chargement, preprocessing des données et sélection des variables

Nous reprenons le processus et les idées de sélection des variables développées dans *neural_net.ipynb*.

In [None]:
import numpy as np
import pandas as pd
from utils.load_data import fma_load


# Read the track metadata and the feature table with the project's FMA loader.
tracks = fma_load('data/fma_metadata/tracks.csv')
features = fma_load('data/fma_metadata/features.csv')

# Prediction target: the column stored under the ('track', 'genre_top') key.
y = tracks[('track', 'genre_top')]

# Pairwise correlation between all feature columns.
corr = features.corr()

# Two features are considered strongly correlated above this absolute value.
threshold = 0.75

# Zero the diagonal on an explicit NumPy copy instead of writing through
# `DataFrame.values`: that array is not guaranteed to be a writable view of
# the frame (it is read-only under pandas Copy-on-Write).
corr_values = corr.to_numpy(copy=True)
np.fill_diagonal(corr_values, 0)
corr_matrix = pd.DataFrame(corr_values, index=corr.index, columns=corr.columns)

strong_corr = (corr_matrix.abs() > threshold)

# List every strongly correlated pair exactly once (upper triangle, i < j).
# np.nonzero walks the mask in row-major order, so the pairs come out in the
# same order a nested "for i / for j > i" loop would produce.
upper_mask = np.triu(strong_corr.to_numpy(), k=1)
pairs = [
    {
        "feature_1": corr_matrix.columns[i],
        "feature_2": corr_matrix.columns[j],
        "correlation": corr_matrix.iat[i, j],
    }
    for i, j in zip(*np.nonzero(upper_mask))
]


# Remove the 'chroma_cqt' and 'chroma_cens' column groups in place, then keep
# only the columns whose 'statistics' level is something other than 'median'.
features.drop(columns=['chroma_cqt', 'chroma_cens'], inplace=True)
is_not_median = features.columns.get_level_values('statistics') != 'median'
features = features.loc[:, is_not_median]

features

feature,chroma_stft,chroma_stft,chroma_stft,chroma_stft,chroma_stft,chroma_stft,chroma_stft,chroma_stft,chroma_stft,chroma_stft,...,tonnetz,tonnetz,tonnetz,tonnetz,zcr,zcr,zcr,zcr,zcr,zcr
statistics,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,...,std,std,std,std,kurtosis,max,mean,min,skew,std
number,01,02,03,04,05,06,07,08,09,10,...,03,04,05,06,01,01,01,01,01,01
track_id,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
2,-1.006041,-0.634076,-0.233752,-0.120917,0.004806,1.218982,0.969103,-0.884986,-0.469190,-0.515475,...,0.038974,0.054125,0.012226,0.012111,5.758890,0.459473,0.085629,0.000000,2.089872,0.061448
3,-0.951502,-0.660734,-1.050015,-0.977441,-0.343043,-0.515404,-0.973297,-1.261086,-1.132458,-0.953374,...,0.051151,0.063831,0.014212,0.017740,2.824694,0.466309,0.084578,0.000000,1.716724,0.069330
5,-0.794551,-1.264806,-0.664387,-0.405196,-0.022688,0.014883,-0.190766,-0.507027,-0.868905,-0.952605,...,0.084997,0.040730,0.012691,0.014759,6.808415,0.375000,0.053114,0.000000,2.193303,0.044861
10,0.954931,-1.266404,0.030425,-0.646823,0.664217,0.082626,-1.241245,0.102074,-0.742071,0.670849,...,0.088197,0.074358,0.017952,0.013921,21.434212,0.452148,0.077515,0.000000,3.542325,0.040800
20,-0.048443,-0.543755,-1.336000,-0.410307,-0.684526,-0.960676,-1.108473,-1.320151,-0.754329,-1.029887,...,0.105521,0.095003,0.022492,0.021355,16.669037,0.469727,0.047225,0.000977,3.189831,0.030993
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155316,-0.959207,-0.130793,-0.635587,-0.251981,-0.656534,-0.238574,-0.372219,-0.661802,-0.297405,-0.533092,...,0.102859,0.128410,0.022547,0.019816,4.448255,0.172852,0.028773,0.003906,0.955388,0.012385
155317,-0.965068,-0.679953,-1.051353,-0.852524,-0.872679,-0.740262,-0.737960,-1.060661,-1.029553,-0.949072,...,0.135479,0.132964,0.023548,0.026527,3.270612,0.196289,0.031116,0.002441,1.283060,0.019059
155318,-0.771414,-0.438261,-0.888968,-0.730804,-0.868208,-0.623147,-0.855489,-0.950127,-0.725640,-0.694402,...,0.089910,0.108324,0.017540,0.020471,2.356727,0.212891,0.038450,0.003418,0.828569,0.017904
155319,-0.984837,-0.198621,-0.923624,-0.825457,-1.035743,-0.919991,-0.655853,-0.887433,-0.398968,-0.233844,...,0.092314,0.088311,0.018328,0.017936,6.188604,0.167480,0.041480,0.004883,1.818740,0.020133


Afin de réduire l'impact du déséquilibre entre les classes, nous décidons de réduire le nombre de classes à 13, en regroupant les plus rares sous la mention "Other" (Country : 194, Soul-RnB : 175, Blues : 110 et Easy Listening : 24, soit 503 titres).

In [4]:
from collections import Counter

# Genres with fewer than this many tracks are merged into a single "Other" class.
MIN_CLASS_SIZE = 400

# Work on a copy with single-level column names built by joining the levels
# of each original column key with underscores.
df_flat = features.copy()
df_flat.columns = ['_'.join(col).strip() for col in df_flat.columns.values]

# Attach the target and drop every row that has any missing value.
df_flat['genre_top'] = y
df_flat.dropna(inplace=True)

counts = Counter(df_flat['genre_top'])

rare_classes = [cls for cls, c in counts.items() if c < MIN_CLASS_SIZE]

# Relabel through a plain object column with where/isin rather than
# Series.replace: adding the new "Other" label with `replace` is what the
# recorded warning points at.
# NOTE(review): presumably the pandas deprecation of `replace` on
# categorical data - confirm the dtype returned by the loader.
genre_labels = df_flat["genre_top"].astype(object)
df_flat["genre_top_grouped"] = genre_labels.where(
    ~genre_labels.isin(rare_classes), "Other"
)

df_flat

  df_flat["genre_top_grouped"] = df_flat["genre_top"].replace(


Unnamed: 0_level_0,chroma_stft_kurtosis_01,chroma_stft_kurtosis_02,chroma_stft_kurtosis_03,chroma_stft_kurtosis_04,chroma_stft_kurtosis_05,chroma_stft_kurtosis_06,chroma_stft_kurtosis_07,chroma_stft_kurtosis_08,chroma_stft_kurtosis_09,chroma_stft_kurtosis_10,...,tonnetz_std_05,tonnetz_std_06,zcr_kurtosis_01,zcr_max_01,zcr_mean_01,zcr_min_01,zcr_skew_01,zcr_std_01,genre_top,genre_top_grouped
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,-1.006041,-0.634076,-0.233752,-0.120917,0.004806,1.218982,0.969103,-0.884986,-0.469190,-0.515475,...,0.012226,0.012111,5.758890,0.459473,0.085629,0.000000,2.089872,0.061448,Hip-Hop,Hip-Hop
3,-0.951502,-0.660734,-1.050015,-0.977441,-0.343043,-0.515404,-0.973297,-1.261086,-1.132458,-0.953374,...,0.014212,0.017740,2.824694,0.466309,0.084578,0.000000,1.716724,0.069330,Hip-Hop,Hip-Hop
5,-0.794551,-1.264806,-0.664387,-0.405196,-0.022688,0.014883,-0.190766,-0.507027,-0.868905,-0.952605,...,0.012691,0.014759,6.808415,0.375000,0.053114,0.000000,2.193303,0.044861,Hip-Hop,Hip-Hop
10,0.954931,-1.266404,0.030425,-0.646823,0.664217,0.082626,-1.241245,0.102074,-0.742071,0.670849,...,0.017952,0.013921,21.434212,0.452148,0.077515,0.000000,3.542325,0.040800,Pop,Pop
134,-0.972450,-1.047674,-1.312579,-1.120184,-1.280836,-1.215888,-1.006120,-0.915774,-0.781248,-1.012533,...,0.016322,0.015819,4.731087,0.419434,0.064370,0.000000,1.806106,0.054623,Hip-Hop,Hip-Hop
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155315,-0.931622,-0.823569,-1.235423,-1.213406,-0.999892,-0.672000,-1.219753,-1.381937,-1.270755,-1.129524,...,0.016631,0.014705,6.947788,0.188477,0.037708,0.002930,1.764233,0.018679,Rock,Rock
155316,-0.959207,-0.130793,-0.635587,-0.251981,-0.656534,-0.238574,-0.372219,-0.661802,-0.297405,-0.533092,...,0.022547,0.019816,4.448255,0.172852,0.028773,0.003906,0.955388,0.012385,Rock,Rock
155317,-0.965068,-0.679953,-1.051353,-0.852524,-0.872679,-0.740262,-0.737960,-1.060661,-1.029553,-0.949072,...,0.023548,0.026527,3.270612,0.196289,0.031116,0.002441,1.283060,0.019059,Rock,Rock
155318,-0.771414,-0.438261,-0.888968,-0.730804,-0.868208,-0.623147,-0.855489,-0.950127,-0.725640,-0.694402,...,0.017540,0.020471,2.356727,0.212891,0.038450,0.003418,0.828569,0.017904,Rock,Rock


## II. Création de la forêt aléatoire

On retrace ici les choix d'entraînement de la forêt aléatoire.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split


# Target: the grouped genre label, without missing entries.
y = df_flat["genre_top_grouped"].dropna()

# Predictors: every column except the two label columns, restricted to the
# rows that still have a target.
label_columns = ["genre_top", "genre_top_grouped"]
X = df_flat.drop(columns=label_columns).loc[y.index]

# 80/20 split, stratified on y so that train and test keep the same class
# proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

On commence par faire une *5-fold-cross-validation* sur plusieurs combinaisons d'hyper-paramètres grâce à `RandomizedSearchCV`. L'idée est, plutôt que de tester toutes les combinaisons, d'en essayer plusieurs au hasard et de garder la plus performante. Comme métrique, on choisit le score F1 pondéré, qui tient compte du déséquilibre entre les classes.

(Ce code est long à tourner, nous vous suggérons de plutôt sauter les deux cellules qui suivent.)

In [10]:
from sklearn.model_selection import RandomizedSearchCV

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold

# Base estimator and CV splitter are defined here so the cell runs on a fresh
# kernel; they were previously taken from names not defined at this point.
# NOTE(review): the recorded output only shows `random_state=42` for the
# estimator and `StratifiedKFold(..., shuffle=True)` with 5 folds; the other
# settings mirror the final model trained in this notebook - confirm.
rf = RandomForestClassifier(class_weight='balanced', n_jobs=-1, random_state=42)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Hyper-parameter values the random search samples from.
param_dist = {
    'n_estimators': [200, 300, 500],
    'max_depth': [10, 20, 30, None],
    'max_features': ['sqrt', 'log2', 0.5],
    'min_samples_leaf': [1, 2, 5],
}

# Evaluate 20 randomly drawn combinations, scored by weighted F1 under
# cross-validation.
search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=20,
    scoring='f1_weighted',
    cv=cv,
    verbose=2,
    random_state=42
)

search.fit(X_train, y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV] END max_depth=30, max_features=0.5, min_samples_leaf=2, n_estimators=500; total time= 3.6min
[CV] END max_depth=30, max_features=0.5, min_samples_leaf=2, n_estimators=500; total time= 3.5min
[CV] END max_depth=30, max_features=0.5, min_samples_leaf=2, n_estimators=500; total time= 3.7min
[CV] END max_depth=30, max_features=0.5, min_samples_leaf=2, n_estimators=500; total time= 3.8min
[CV] END max_depth=30, max_features=0.5, min_samples_leaf=2, n_estimators=500; total time= 3.5min
[CV] END max_depth=10, max_features=log2, min_samples_leaf=1, n_estimators=300; total time=   4.7s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=1, n_estimators=300; total time=   4.7s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=1, n_estimators=300; total time=   4.9s
[CV] END max_depth=10, max_features=log2, min_samples_leaf=1, n_estimators=300; total time=   4.6s
[CV] END max_depth=10, max_features=log2, min_sample

0,1,2
,"estimator  estimator: estimator object An object of that type is instantiated for each grid point. This is assumed to implement the scikit-learn estimator interface. Either estimator needs to provide a ``score`` function, or ``scoring`` must be passed.",RandomForestC...ndom_state=42)
,"param_distributions  param_distributions: dict or list of dicts Dictionary with parameters names (`str`) as keys and distributions or lists of parameters to try. Distributions must provide a ``rvs`` method for sampling (such as those from scipy.stats.distributions). If a list is given, it is sampled uniformly. If a list of dicts is given, first a dict is sampled uniformly, and then a parameter is sampled using that dict as above.","{'max_depth': [10, 20, ...], 'max_features': ['sqrt', 'log2', ...], 'min_samples_leaf': [1, 2, ...], 'n_estimators': [200, 300, ...]}"
,"n_iter  n_iter: int, default=10 Number of parameter settings that are sampled. n_iter trades off runtime vs quality of the solution.",20
,"scoring  scoring: str, callable, list, tuple or dict, default=None Strategy to evaluate the performance of the cross-validated model on the test set. If `scoring` represents a single score, one can use: - a single string (see :ref:`scoring_string_names`); - a callable (see :ref:`scoring_callable`) that returns a single value; - `None`, the `estimator`'s  :ref:`default evaluation criterion ` is used. If `scoring` represents multiple scores, one can use: - a list or tuple of unique strings; - a callable returning a dictionary where the keys are the metric  names and the values are the metric scores; - a dictionary with metric names as keys and callables as values. See :ref:`multimetric_grid_search` for an example. If None, the estimator's score method is used.",'f1_weighted'
,"n_jobs  n_jobs: int, default=None Number of jobs to run in parallel. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details. .. versionchanged:: v0.20  `n_jobs` default changed from 1 to None",
,"refit  refit: bool, str, or callable, default=True Refit an estimator using the best found parameters on the whole dataset. For multiple metric evaluation, this needs to be a `str` denoting the scorer that would be used to find the best parameters for refitting the estimator at the end. Where there are considerations other than maximum score in choosing a best estimator, ``refit`` can be set to a function which returns the selected ``best_index_`` given the ``cv_results_``. In that case, the ``best_estimator_`` and ``best_params_`` will be set according to the returned ``best_index_`` while the ``best_score_`` attribute will not be available. The refitted estimator is made available at the ``best_estimator_`` attribute and permits using ``predict`` directly on this ``RandomizedSearchCV`` instance. Also for multiple metric evaluation, the attributes ``best_index_``, ``best_score_`` and ``best_params_`` will only be available if ``refit`` is set and all of them will be determined w.r.t this specific scorer. See ``scoring`` parameter to know more about multiple metric evaluation. See :ref:`this example ` for an example of how to use ``refit=callable`` to balance model complexity and cross-validated score. .. versionchanged:: 0.20  Support for callable added.",True
,"cv  cv: int, cross-validation generator or an iterable, default=None Determines the cross-validation splitting strategy. Possible inputs for cv are: - None, to use the default 5-fold cross validation, - integer, to specify the number of folds in a `(Stratified)KFold`, - :term:`CV splitter`, - An iterable yielding (train, test) splits as arrays of indices. For integer/None inputs, if the estimator is a classifier and ``y`` is either binary or multiclass, :class:`StratifiedKFold` is used. In all other cases, :class:`KFold` is used. These splitters are instantiated with `shuffle=False` so the splits will be the same across calls. Refer :ref:`User Guide ` for the various cross-validation strategies that can be used here. .. versionchanged:: 0.22  ``cv`` default value if None changed from 3-fold to 5-fold.",StratifiedKFo... shuffle=True)
,"verbose  verbose: int Controls the verbosity: the higher, the more messages. - >1 : the computation time for each fold and parameter candidate is  displayed; - >2 : the score is also displayed; - >3 : the fold and candidate parameter indexes are also displayed  together with the starting time of the computation.",2
,"pre_dispatch  pre_dispatch: int, or str, default='2*n_jobs' Controls the number of jobs that get dispatched during parallel execution. Reducing this number can be useful to avoid an explosion of memory consumption when more jobs get dispatched than CPUs can process. This parameter can be: - None, in which case all the jobs are immediately created and spawned. Use  this for lightweight and fast-running jobs, to avoid delays due to on-demand  spawning of the jobs - An int, giving the exact number of total jobs that are spawned - A str, giving an expression as a function of n_jobs, as in '2*n_jobs'",'2*n_jobs'
,"random_state  random_state: int, RandomState instance or None, default=None Pseudo random number generator state used for random uniform sampling from lists of possible values instead of scipy.stats distributions. Pass an int for reproducible output across multiple function calls. See :term:`Glossary `.",42

0,1,2
,"n_estimators  n_estimators: int, default=100 The number of trees in the forest. .. versionchanged:: 0.22  The default value of ``n_estimators`` changed from 10 to 100  in 0.22.",500
,"criterion  criterion: {""gini"", ""entropy"", ""log_loss""}, default=""gini"" The function to measure the quality of a split. Supported criteria are ""gini"" for the Gini impurity and ""log_loss"" and ""entropy"" both for the Shannon information gain, see :ref:`tree_mathematical_formulation`. Note: This parameter is tree-specific.",'gini'
,"max_depth  max_depth: int, default=None The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.",30
,"min_samples_split  min_samples_split: int or float, default=2 The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. - If float, then `min_samples_split` is a fraction and  `ceil(min_samples_split * n_samples)` are the minimum  number of samples for each split. .. versionchanged:: 0.18  Added float values for fractions.",2
,"min_samples_leaf  min_samples_leaf: int or float, default=1 The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least ``min_samples_leaf`` training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression. - If int, then consider `min_samples_leaf` as the minimum number. - If float, then `min_samples_leaf` is a fraction and  `ceil(min_samples_leaf * n_samples)` are the minimum  number of samples for each node. .. versionchanged:: 0.18  Added float values for fractions.",5
,"min_weight_fraction_leaf  min_weight_fraction_leaf: float, default=0.0 The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided.",0.0
,"max_features  max_features: {""sqrt"", ""log2"", None}, int or float, default=""sqrt"" The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a fraction and  `max(1, int(max_features * n_features_in_))` features are considered at each  split. - If ""sqrt"", then `max_features=sqrt(n_features)`. - If ""log2"", then `max_features=log2(n_features)`. - If None, then `max_features=n_features`. .. versionchanged:: 1.1  The default of `max_features` changed from `""auto""` to `""sqrt""`. Note: the search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features.",'sqrt'
,"max_leaf_nodes  max_leaf_nodes: int, default=None Grow trees with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.",
,"min_impurity_decrease  min_impurity_decrease: float, default=0.0 A node will be split if this split induces a decrease of the impurity greater than or equal to this value. The weighted impurity decrease equation is the following::  N_t / N * (impurity - N_t_R / N_t * right_impurity  - N_t_L / N_t * left_impurity) where ``N`` is the total number of samples, ``N_t`` is the number of samples at the current node, ``N_t_L`` is the number of samples in the left child, and ``N_t_R`` is the number of samples in the right child. ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, if ``sample_weight`` is passed. .. versionadded:: 0.19",0.0
,"bootstrap  bootstrap: bool, default=True Whether bootstrap samples are used when building trees. If False, the whole dataset is used to build each tree.",True


In [11]:
# Keep the estimator refitted by the search with the winning parameters.
best_rf = search.best_estimator_

# Show the selected hyper-parameters and their cross-validated score.
search_summary = (
    ("Best params:", search.best_params_),
    ("Best CV score (weighted F1):", search.best_score_),
)
for label, value in search_summary:
    print(label, value)

Best params: {'n_estimators': 500, 'min_samples_leaf': 5, 'max_features': 'sqrt', 'max_depth': 30}
Best CV score (weighted F1): 0.6051986506374469


On *fit* donc le modèle le plus performant.

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Hyper-parameters of the retained model, spelled out explicitly.
rf_params = {
    "n_estimators": 500,
    "max_depth": 30,
    "min_samples_leaf": 5,
    "max_features": "sqrt",
    "class_weight": 'balanced',
    "n_jobs": -1,
    "random_state": 42,
}
rf = RandomForestClassifier(**rf_params)

rf.fit(X_train, y_train)


0,1,2
,"n_estimators  n_estimators: int, default=100 The number of trees in the forest. .. versionchanged:: 0.22  The default value of ``n_estimators`` changed from 10 to 100  in 0.22.",500
,"criterion  criterion: {""gini"", ""entropy"", ""log_loss""}, default=""gini"" The function to measure the quality of a split. Supported criteria are ""gini"" for the Gini impurity and ""log_loss"" and ""entropy"" both for the Shannon information gain, see :ref:`tree_mathematical_formulation`. Note: This parameter is tree-specific.",'gini'
,"max_depth  max_depth: int, default=None The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.",30
,"min_samples_split  min_samples_split: int or float, default=2 The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. - If float, then `min_samples_split` is a fraction and  `ceil(min_samples_split * n_samples)` are the minimum  number of samples for each split. .. versionchanged:: 0.18  Added float values for fractions.",2
,"min_samples_leaf  min_samples_leaf: int or float, default=1 The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least ``min_samples_leaf`` training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression. - If int, then consider `min_samples_leaf` as the minimum number. - If float, then `min_samples_leaf` is a fraction and  `ceil(min_samples_leaf * n_samples)` are the minimum  number of samples for each node. .. versionchanged:: 0.18  Added float values for fractions.",5
,"min_weight_fraction_leaf  min_weight_fraction_leaf: float, default=0.0 The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided.",0.0
,"max_features  max_features: {""sqrt"", ""log2"", None}, int or float, default=""sqrt"" The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a fraction and  `max(1, int(max_features * n_features_in_))` features are considered at each  split. - If ""sqrt"", then `max_features=sqrt(n_features)`. - If ""log2"", then `max_features=log2(n_features)`. - If None, then `max_features=n_features`. .. versionchanged:: 1.1  The default of `max_features` changed from `""auto""` to `""sqrt""`. Note: the search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features.",'sqrt'
,"max_leaf_nodes  max_leaf_nodes: int, default=None Grow trees with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.",
,"min_impurity_decrease  min_impurity_decrease: float, default=0.0 A node will be split if this split induces a decrease of the impurity greater than or equal to this value. The weighted impurity decrease equation is the following::  N_t / N * (impurity - N_t_R / N_t * right_impurity  - N_t_L / N_t * left_impurity) where ``N`` is the total number of samples, ``N_t`` is the number of samples at the current node, ``N_t_L`` is the number of samples in the left child, and ``N_t_R`` is the number of samples in the right child. ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, if ``sample_weight`` is passed. .. versionadded:: 0.19",0.0
,"bootstrap  bootstrap: bool, default=True Whether bootstrap samples are used when building trees. If False, the whole dataset is used to build each tree.",True


In [12]:
from sklearn.metrics import classification_report, confusion_matrix

# Predict on the test split and print the per-class metrics.
y_pred = rf.predict(X_test)

report = classification_report(y_test, y_pred)
print(report)


                     precision    recall  f1-score   support

          Classical       0.66      0.82      0.73       246
         Electronic       0.61      0.66      0.64      1874
       Experimental       0.60      0.60      0.60      2122
               Folk       0.45      0.67      0.54       561
            Hip-Hop       0.63      0.63      0.63       710
       Instrumental       0.39      0.36      0.38       416
      International       0.57      0.35      0.43       278
               Jazz       0.64      0.18      0.29       114
Old-Time / Historic       0.84      0.95      0.89       111
              Other       1.00      0.03      0.06       101
                Pop       0.36      0.06      0.11       466
               Rock       0.73      0.78      0.76      2836
             Spoken       0.53      0.54      0.54        85

           accuracy                           0.63      9920
          macro avg       0.62      0.51      0.51      9920
       weighted avg   

Pour être sûrs que `RandomizedSearchCV` n'occulte pas un modèle plus performant, nous avons aussi testé à la main quelques autres configurations proches du modèle ci-dessus. Il se trouve qu'en baissant le nombre minimum d'échantillons par feuille, l'accuracy est légèrement meilleure, mais au prix d'un moins bon score F1 pondéré. On retombe sur un arbitrage entre bien prédire les classes dominantes et bien prédire les classes rares. Nous gardons le premier modèle, plus équilibré.

In [None]:
from sklearn.metrics import accuracy_score

# Same configuration as the retained model, except min_samples_leaf=3.
rf2 = RandomForestClassifier(
    n_estimators=500,
    max_depth=30,
    min_samples_leaf=3,
    max_features="sqrt",
    class_weight='balanced',
    n_jobs=-1,
    random_state=42,
)
rf2.fit(X_train, y_train)

y_pred2 = rf2.predict(X_test)
print(classification_report(y_test, y_pred2))

# Accuracy of both models on the same test split, side by side.
for label, predictions in (
    ("Accuracy from best :", y_pred),
    ("Accuracy from second model :", y_pred2),
):
    print(label, accuracy_score(y_test, predictions))

                     precision    recall  f1-score   support

          Classical       0.77      0.80      0.79       246
         Electronic       0.60      0.67      0.64      1874
       Experimental       0.57      0.67      0.62      2122
               Folk       0.51      0.62      0.56       561
            Hip-Hop       0.71      0.59      0.64       710
       Instrumental       0.46      0.28      0.35       416
      International       0.76      0.31      0.44       278
               Jazz       0.65      0.11      0.19       114
Old-Time / Historic       0.90      0.91      0.91       111
              Other       1.00      0.01      0.02       101
                Pop       0.52      0.03      0.06       466
               Rock       0.71      0.81      0.76      2836
             Spoken       0.59      0.49      0.54        85

           accuracy                           0.64      9920
          macro avg       0.67      0.49      0.50      9920
       weighted avg   

## III. Optimisation des performances

Nous tentons ici d'améliorer les performances du modèle en sélectionnant plus finement les *features* selon leur importance dans la prédiction.

In [None]:
# The forest's built-in `feature_importances_`, labelled with the training
# column names and ordered from largest to smallest.
importances = pd.Series(rf.feature_importances_, index=X_train.columns)
importances = importances.sort_values(ascending=False)

Il semblerait que les features *chroma STFT max* soient les moins utiles. Nous avons essayé de les retirer et cela a peu d'impact sur les résultats, nous encourageant donc à trier encore davantage nos *features*.

In [None]:
# Tail of the ranking: every entry after the first 280 positions.
importances.iloc[280:]

chroma_stft_min_08    1.391866e-03
chroma_stft_min_03    1.391404e-03
chroma_stft_min_04    1.374834e-03
chroma_stft_min_12    1.368643e-03
chroma_stft_min_10    1.364518e-03
chroma_stft_min_01    1.358080e-03
chroma_stft_min_05    1.316114e-03
chroma_stft_min_07    1.283552e-03
chroma_stft_max_06    3.509443e-06
chroma_stft_max_02    1.821985e-06
chroma_stft_max_09    1.622223e-06
chroma_stft_max_07    1.584831e-06
chroma_stft_max_01    1.036087e-06
chroma_stft_max_11    8.650726e-07
chroma_stft_max_12    4.630517e-07
chroma_stft_max_04    4.609511e-07
chroma_stft_max_10    0.000000e+00
chroma_stft_max_08    0.000000e+00
chroma_stft_max_03    0.000000e+00
chroma_stft_max_05    0.000000e+00
dtype: float64

Même principe mais en mesurant cette fois-ci la dépendance du modèle aux *features*. Si en modifiant les valeurs d'une *feature* (par permutations), la qualité de la prédiction chute, alors la *feature* est très importante.

In [15]:
from sklearn.inspection import permutation_importance

# Permutation importance of each feature, measured as the change in weighted
# F1 over 5 shuffles per feature.
# NOTE(review): the ranking is computed on the test split, and the feature
# subsets derived from it are later scored on a test split drawn with the
# same seed - consider ranking on a separate validation split instead.
r = permutation_importance(
    rf,
    X_test,
    y_test,
    scoring="f1_weighted",
    n_repeats=5,
    random_state=42,
)

# Mean importance per feature, sorted from least to most important.
perm_importance = pd.Series(r.importances_mean, index=X_test.columns)
perm_importance = perm_importance.sort_values()


Cette fois-ci, on élague drastiquement le nombre de features en ne gardant tout d'abord que les 50 plus importantes, puis 100 puis 150.

In [None]:
# perm_importance is sorted in ascending order, so the k most important
# features are its last k entries. Slicing from the end keeps the lists at
# exactly 50 / 100 / 150 names whatever the total number of features is
# (fixed start offsets only do so when there are exactly 300).
top_50_features = perm_importance.iloc[-50:].index.tolist()
top_100_features = perm_importance.iloc[-100:].index.tolist()
top_150_features = perm_importance.iloc[-150:].index.tolist()

perm_importance

tonnetz_std_01              -0.000795
tonnetz_std_05              -0.000717
tonnetz_min_04              -0.000524
mfcc_max_03                 -0.000486
chroma_stft_kurtosis_07     -0.000484
                               ...   
mfcc_max_04                  0.002741
mfcc_std_06                  0.002801
spectral_contrast_mean_02    0.002955
spectral_contrast_mean_01    0.003271
rmse_mean_01                 0.003372
Length: 300, dtype: float64

In [None]:
# Same split settings for every subset, so only the columns differ.
split_kwargs = dict(test_size=0.2, random_state=42, stratify=y)

# Rows are selected with y.index, as in the full-feature split, so X and y
# always describe the same tracks.

# 50 features
X = df_flat.loc[y.index, top_50_features]
X_train_50, X_test_50, y_train_50, y_test_50 = train_test_split(X, y, **split_kwargs)

# 100 features
X = df_flat.loc[y.index, top_100_features]
X_train_100, X_test_100, y_train_100, y_test_100 = train_test_split(X, y, **split_kwargs)

# 150 features
X = df_flat.loc[y.index, top_150_features]
X_train_150, X_test_150, y_train_150, y_test_150 = train_test_split(X, y, **split_kwargs)

Il se trouve qu'avec 150, voire 100 *features*, nous sommes très proches des résultats obtenus avec toutes les features, suggérant qu'il est possible de s'en tenir à 150 *features* si l'on souhaite économiser du temps de calcul.

In [19]:
rf = RandomForestClassifier(
    n_estimators=500,
    max_depth=30,
    min_samples_leaf=5,
    max_features="sqrt",
    class_weight='balanced',
    n_jobs=-1,
    random_state=42,
)

# Refit the same estimator on each reduced feature set in turn (50, then 100,
# then 150) and predict on the matching test split. `rf` ends up fitted on
# the 150-feature data.
subset_splits = (
    (X_train_50, y_train_50, X_test_50),
    (X_train_100, y_train_100, X_test_100),
    (X_train_150, y_train_150, X_test_150),
)
y_pred_50, y_pred_100, y_pred_150 = [
    rf.fit(train_X, train_y).predict(test_X)
    for train_X, train_y, test_X in subset_splits
]

# Accuracy for each subset size, then the full report for 150 features.
for label, truth, predicted in (
    ("Accuracy with top 50 features :", y_test_50, y_pred_50),
    ("Accuracy with top 100 features :", y_test_100, y_pred_100),
    ("Accuracy with top 150 features :", y_test_150, y_pred_150),
):
    print(label, accuracy_score(truth, predicted))

print(classification_report(y_test_150, y_pred_150))

Accuracy with top 50 features : 0.6047379032258065
Accuracy with top 100 features : 0.6139112903225806
Accuracy with top 150 features : 0.6211693548387097
                     precision    recall  f1-score   support

          Classical       0.68      0.83      0.75       246
         Electronic       0.62      0.65      0.63      1874
       Experimental       0.61      0.59      0.60      2122
               Folk       0.44      0.67      0.53       561
            Hip-Hop       0.59      0.64      0.61       710
       Instrumental       0.36      0.38      0.37       416
      International       0.55      0.35      0.43       278
               Jazz       0.49      0.18      0.27       114
Old-Time / Historic       0.84      0.93      0.88       111
              Other       1.00      0.03      0.06       101
                Pop       0.34      0.07      0.11       466
               Rock       0.73      0.78      0.75      2836
             Spoken       0.56      0.52      0.54 