In [1]:
%matplotlib inline

In [2]:
import warnings

import pandas as pd

from report import mlflow_log_classification_report, mlflow_log_model
import constants

from sklearn.pipeline import Pipeline

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.ensemble import RandomForestClassifier

import mlflow
import mlflow.sklearn
from mlflow.data.pandas_dataset import PandasDataset

In [3]:
# Suppress noisy warnings so they do not clutter the cell outputs:
# the setuptools/distutils notice, plus every FutureWarning and UserWarning.
warnings.filterwarnings(action="ignore", message="Setuptools is replacing distutils.")
for noisy_category in (FutureWarning, UserWarning):
    warnings.simplefilter(action="ignore", category=noisy_category)

# Music Genre Classifier Model Selection

## Optimizing Random Forest Classifier

In [4]:
# Length of the audio samples in seconds; it selects the feature file to load
# and is embedded in the MLflow experiment names.
sample_length = 30

In [5]:
# Relative path of the per-song feature CSV for the chosen sample length.
# The f-string interpolates `sample_length` on its own; no `.format()` call is needed.
data_file = f"../data/{sample_length}_seconds_song_features.csv"

In [6]:
# Load the per-song feature table. low_memory=False disables chunked parsing,
# so column dtypes are inferred from the whole file.
songs = pd.read_csv(filepath_or_buffer=data_file, low_memory=False)

In [7]:
# Preview the loaded feature table (pandas truncates the display of large frames).
songs

Unnamed: 0,zero_crossings_max,zero_crossings_min,zero_crossings_mean,zero_crossings_std,zero_crossings_kurtosis,zero_crossings_skew,centroid_max,centroid_min,centroid_mean,centroid_std,...,mfcc_11_skew,mfcc_12_max,mfcc_12_min,mfcc_12_mean,mfcc_12_std,mfcc_12_kurtosis,mfcc_12_skew,tempo,genre,file
0,0.752441,0.009766,0.144590,0.082585,9.540830,2.444111,7412.694108,1022.153015,3302.342044,893.866324,...,0.002264,21.563793,-23.538502,0.544136,7.403120,0.046500,-0.178789,117.453835,pop,pop.00024.wav
1,0.682129,0.011719,0.145827,0.108406,4.443655,1.947596,8040.036703,965.917225,2940.712732,1290.550571,...,0.053169,24.365047,-36.742607,-3.502535,8.535687,0.208987,-0.296028,99.384014,pop,pop.00058.wav
2,0.676270,0.018066,0.121279,0.089060,13.139049,3.260256,7513.716630,698.671998,2885.707646,988.136171,...,-0.240175,15.289233,-25.785180,-2.685357,6.333065,0.292852,-0.318905,161.499023,pop,pop.00008.wav
3,0.674316,0.003906,0.072507,0.074976,27.751176,4.805041,7523.680993,753.456471,1996.214762,978.055235,...,-0.073378,19.543030,-21.030888,-3.200028,5.797513,0.272921,0.051250,112.347147,pop,pop.00079.wav
4,0.671387,0.007812,0.142882,0.092810,2.602808,1.200830,7387.704967,595.120877,3126.086716,972.528281,...,0.632891,27.504042,-34.374443,-1.224632,8.387035,0.740220,-0.310612,103.359375,pop,pop.00078.wav
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
922,0.429688,0.013184,0.154277,0.057703,2.119194,0.693468,5543.291625,1080.617808,2615.398850,585.437364,...,0.149398,37.331573,-31.768559,-7.373089,7.614615,0.856139,0.197147,99.384014,disco,disco.00048.wav
923,0.484863,0.058594,0.169988,0.051547,7.649568,2.056028,5212.853055,1473.049674,2592.271461,506.006198,...,-0.172077,13.354691,-36.042137,-12.028731,7.713210,0.087752,0.043065,123.046875,disco,disco.00052.wav
924,0.454590,0.014648,0.131841,0.062179,2.818245,1.299597,5822.532186,851.802097,2739.275175,699.574463,...,-0.494983,19.650800,-27.985397,0.808960,7.346756,0.232449,-0.351014,112.347147,disco,disco.00012.wav
925,0.419434,0.035645,0.138467,0.050052,4.400630,1.600552,6249.889744,1345.064509,3075.215095,620.704023,...,-0.080007,16.779268,-32.086174,-2.831664,7.741563,0.192752,-0.446212,103.359375,disco,disco.00029.wav


In [8]:
# Target column: the genre label of every song.
song_genres = songs.loc[:, "genre"]

In [9]:
# Encoder for the genre labels; it is fitted in the next cell and its `classes_`
# are later passed as `target_names` to the classification report.
label_encoder = LabelEncoder()

In [10]:
# Fit the encoder on the genre column and keep the encoded labels.
# NOTE(review): no later cell visible here reads `encoded_song_genres`; the
# train/test splits are built from the raw `song_genres` strings — confirm intent.
encoded_song_genres = label_encoder.fit_transform(y=song_genres)

In [11]:
# Feature matrix: every column except the target ("genre") and the source
# file name ("file").
song_features = songs.drop(columns=["genre", "file"])

## Test, train and validation split

In [12]:
# Show the shared seed used for the train/validation/test splits below.
constants.RANDOM_STATE

1984

In [13]:
# Hold out 10% of the songs as the final test set; the remaining 90% is split
# again into training and validation data in the next cell.
(
    song_features_intermediate_train,
    song_features_test,
    song_genres_intermediate_train,
    song_genres_test,
) = train_test_split(
    song_features,
    song_genres,
    test_size=0.1,
    random_state=constants.RANDOM_STATE,
)

In [14]:
# Split the non-test data into training (80%) and validation (20%) sets.
# NOTE(review): the split is not stratified, and the validation report further
# down shows a support of 1 for "rock" — consider passing `stratify=`.
(
    song_features_train,
    song_features_val,
    song_genres_train,
    song_genres_val,
) = train_test_split(
    song_features_intermediate_train,
    song_genres_intermediate_train,
    test_size=0.2,
    random_state=constants.RANDOM_STATE,
)

## Preparing training pipeline

In [15]:
# Training pipeline: standardize the features, then fit a random forest.
# The step names are referenced by the grid-search parameter keys
# ("random_forest__..."), so they must stay in sync with `param_grid`.
# NOTE(review): "sndard_scaler" looks like a typo for "standard_scaler"; it is
# left unchanged here because step names can appear in logged parameters.
train_pipeline = Pipeline([
    ("sndard_scaler", StandardScaler()),
    # Placeholder for an optional dimensionality-reduction step; scikit-learn
    # treats a None step as a pass-through.
    ("reduce_dimension", None),
    # Seed the forest with the same constant as the data splits so the
    # grid-search results are reproducible across kernel restarts.
    ("random_forest", RandomForestClassifier(random_state=constants.RANDOM_STATE)),
])

## Finding Best Random Forest Classifier

In [29]:
# Wrap the loaded frame as an MLflow dataset, with the CSV path recorded as its
# source. This must be defined before the `mlflow.log_input(dataset)` calls below.
dataset: PandasDataset = mlflow.data.from_pandas(df=songs, source=data_file)

### By min sample split

In [17]:
# Resolve the MLflow experiment ID for the "min samples split" search.
# `mlflow.create_experiment` raises if the name already exists, which would
# break re-running the notebook, so reuse an existing experiment when present.
experiment_name = f"Random Forest, min sample split - {sample_length} sec"
existing_experiment = mlflow.get_experiment_by_name(experiment_name)
if existing_experiment is not None:
    experiment = existing_experiment.experiment_id
else:
    experiment = mlflow.create_experiment(name=experiment_name)

In [18]:
# Start an MLflow run inside the experiment; it stays active until
# `mlflow.end_run()` is called at the end of this section.
run = mlflow.start_run(experiment_id=experiment)

In [19]:
# Attach the raw CSV file and the dataset description to the active run.
mlflow.log_artifact(local_path=data_file)
mlflow.log_input(dataset=dataset)

In [20]:
# Hyper-parameter grid: maximum tree depth crossed with the minimum number of
# samples required to split a node.
min_samples_split_grid = {
    "random_forest__max_depth": [10, 20, 50, 100, 150, 200, 400, None],
    "random_forest__min_samples_split": [2, 3, 5, 10, 20],
}

# Exhaustive search over the grid with 5-fold cross-validation and 8 parallel jobs.
grid_search = GridSearchCV(
    estimator=train_pipeline,
    param_grid=min_samples_split_grid,
    cv=5,
    n_jobs=8,
)

In [21]:
# Run the grid search on the training split only; validation data stays unseen.
grid_search.fit(X=song_features_train, y=song_genres_train)

In [22]:
# Show the parameter combination that scored best in cross-validation.
grid_search.best_params_

{'random_forest__max_depth': 150, 'random_forest__min_samples_split': 3}

In [23]:
# Score of the best pipeline on the data it was trained on.
train_score = grid_search.best_estimator_.score(X=song_features_train, y=song_genres_train)

In [24]:
# Display the training score.
train_score

1.0

In [25]:
# Score of the best pipeline on the held-out validation split.
validation_score = grid_search.best_estimator_.score(X=song_features_val, y=song_genres_val)

In [26]:
# Display the validation score; compare it with the training score to gauge over-fitting.
validation_score

0.7604790419161677

In [27]:
# Per-genre classification report for the best pipeline on the validation split.
# The project helper prints the report; presumably it also logs it to the
# active MLflow run — confirm in `report`.
mlflow_log_classification_report(
    song_features_val,
    song_genres_val,
    grid_search.best_estimator_,
    target_names=label_encoder.classes_,
)

              precision    recall  f1-score   support

       blues       0.80      0.63      0.71        19
   classical       0.86      0.90      0.88        20
     country       0.67      0.62      0.65        16
       disco       0.75      0.79      0.77        19
      hiphop       0.88      0.75      0.81        20
        jazz       0.67      0.67      0.67        15
       metal       0.89      0.84      0.86        19
         pop       0.75      0.90      0.82        20
      reggae       0.71      0.67      0.69        18
        rock       0.20      1.00      0.33         1

    accuracy                           0.76       167
   macro avg       0.72      0.78      0.72       167
weighted avg       0.78      0.76      0.76       167



In [28]:
# Hand the fitted search and both scores to the project logging helper
# (presumably records the model and metrics on the active run — confirm in `report`).
mlflow_log_model(grid_search, train_score, validation_score)
# Close the run started for this section so the next section can start its own.
mlflow.end_run()

### By min samples leaf

In [None]:
# Resolve the MLflow experiment ID for the "min samples leaf" search.
# `mlflow.create_experiment` raises if the name already exists, which would
# break re-running the notebook, so reuse an existing experiment when present.
experiment_name = f"Random Forest, min sample leaf - {sample_length} sec"
existing_experiment = mlflow.get_experiment_by_name(experiment_name)
if existing_experiment is not None:
    experiment = existing_experiment.experiment_id
else:
    experiment = mlflow.create_experiment(name=experiment_name)

In [None]:
# Start an MLflow run inside the experiment; it stays active until
# `mlflow.end_run()` is called at the end of this section.
run = mlflow.start_run(experiment_id=experiment)

In [None]:
# Attach the raw CSV file and the dataset description to the active run.
mlflow.log_artifact(local_path=data_file)
mlflow.log_input(dataset=dataset)

In [None]:
# Hyper-parameter grid: maximum tree depth crossed with the minimum number of
# samples required at a leaf node.
min_samples_leaf_grid = {
    "random_forest__max_depth": [10, 20, 50, 100, 150, 200, 400, None],
    "random_forest__min_samples_leaf": [1, 2, 3, 5, 10, 20],
}

# Exhaustive search over the grid with 5-fold cross-validation and 8 parallel jobs.
grid_search = GridSearchCV(
    estimator=train_pipeline,
    param_grid=min_samples_leaf_grid,
    cv=5,
    n_jobs=8,
)

In [None]:
# Run the grid search on the training split only; validation data stays unseen.
grid_search.fit(X=song_features_train, y=song_genres_train)

In [None]:
# Show the parameter combination that scored best in cross-validation.
grid_search.best_params_

In [None]:
# Score of the best pipeline on the data it was trained on.
train_score = grid_search.best_estimator_.score(X=song_features_train, y=song_genres_train)

In [None]:
# Display the training score.
train_score

In [None]:
# Score of the best pipeline on the held-out validation split.
validation_score = grid_search.best_estimator_.score(X=song_features_val, y=song_genres_val)

In [None]:
# Display the validation score; compare it with the training score to gauge over-fitting.
validation_score

In [None]:
# Per-genre classification report for the best pipeline on the validation split.
# Presumably the project helper also logs the report to the active MLflow
# run — confirm in `report`.
mlflow_log_classification_report(
    song_features_val,
    song_genres_val,
    grid_search.best_estimator_,
    target_names=label_encoder.classes_,
)

In [None]:
# Hand the fitted search and both scores to the project logging helper
# (presumably records the model and metrics on the active run — confirm in `report`).
mlflow_log_model(grid_search, train_score, validation_score)
# Close the run started for this section.
mlflow.end_run()