# Music Genre Classification Using Deep Learning
Data source: [FMA: A Dataset For Music Analysis](https://github.com/mdeff/fma)

Yetong Chen

* This notebook evaluates standard classifiers from scikit-learn on the provided features.
* Moreover, it evaluates Deep Learning models on the provided features.

In [1]:
#! pip install python-dotenv pydot requests tqdm

In [2]:
import time
import os

import IPython.display as ipd
# import tqdm
from tqdm.notebook import tqdm
import numpy as np
import pandas as pd
import random
import keras
from keras.models import Sequential
from keras.layers import Activation, Dense, Conv1D, Conv2D, MaxPooling1D, Flatten, Reshape

from sklearn.utils import shuffle
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder, LabelBinarizer, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
#from sklearn.gaussian_process import GaussianProcessClassifier
#from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset, random_split

import utils

In [3]:
os.environ['AUDIO_DIR'] = 'D:\\code\\BIA667\\fma\\data\\fma_small'

In [4]:
AUDIO_DIR = os.environ.get('AUDIO_DIR')

tracks = utils.load('data/fma_metadata/tracks.csv')
features = utils.load('data/fma_metadata/features.csv')
echonest = utils.load('data/fma_metadata/echonest.csv')

np.testing.assert_array_equal(features.index, tracks.index)
assert echonest.index.isin(tracks.index).all()

tracks.shape, features.shape, echonest.shape

((106574, 52), (106574, 518), (13129, 249))

In [5]:
print(AUDIO_DIR)

D:\code\BIA667\fma\data\fma_small


In [6]:
def set_seed(seed_value=42):
    """Set seed for reproducibility."""
    random.seed(seed_value)  # Python random module
    np.random.seed(seed_value)  # Numpy module
    torch.manual_seed(seed_value)  # PyTorch random number generator for CPU
    
    # if using CUDA
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)  # if using multi-GPU

# Use a known seed to initialize random number generator for reproducibility
set_seed(42)

## Subset

In [7]:
subset = tracks.index[tracks['set', 'subset'] <= 'medium']

assert subset.isin(tracks.index).all()
assert subset.isin(features.index).all()

features_all = features.join(echonest, how='inner').sort_index(axis=1)
print('Not enough Echonest features: {}'.format(features_all.shape))

tracks = tracks.loc[subset]
features_all = features.loc[subset]

tracks.shape, features_all.shape

Not enough Echonest features: (13129, 767)


((25000, 52), (25000, 518))

In [8]:
train = tracks.index[tracks['set', 'split'] == 'training']
val = tracks.index[tracks['set', 'split'] == 'validation']
test = tracks.index[tracks['set', 'split'] == 'test']

print('{} training examples, {} validation examples, {} testing examples'.format(*map(len, [train, val, test])))

genres = list(LabelEncoder().fit(tracks['track', 'genre_top']).classes_)
#genres = list(tracks['track', 'genre_top'].unique())
print('Top genres ({}): {}'.format(len(genres), genres))
genres = list(MultiLabelBinarizer().fit(tracks['track', 'genres_all']).classes_)
print('All genres ({}): {}'.format(len(genres), genres))

19922 training examples, 2505 validation examples, 2573 testing examples
Top genres (16): ['Blues', 'Classical', 'Country', 'Easy Listening', 'Electronic', 'Experimental', 'Folk', 'Hip-Hop', 'Instrumental', 'International', 'Jazz', 'Old-Time / Historic', 'Pop', 'Rock', 'Soul-RnB', 'Spoken']
All genres (151): [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 25, 26, 27, 30, 31, 32, 33, 36, 37, 38, 41, 42, 43, 45, 46, 47, 49, 53, 58, 63, 64, 65, 66, 70, 71, 74, 76, 77, 79, 81, 83, 85, 86, 88, 89, 90, 92, 94, 97, 98, 100, 101, 102, 103, 107, 109, 111, 113, 117, 118, 125, 130, 137, 138, 166, 167, 169, 171, 172, 174, 177, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 214, 224, 232, 236, 240, 247, 250, 267, 286, 296, 297, 311, 314, 322, 337, 359, 360, 361, 362, 374, 378, 400, 401, 404, 428, 439, 440, 441, 442, 443, 456, 468, 491, 495, 502, 504, 514, 524, 538, 539, 542, 580, 602, 619, 651, 659, 695, 741, 763, 808, 810, 811, 906, 1032, 1060, 1193, 1235]


## 1 Baselines

### 1.1 Pre-processing

In [24]:
def pre_process(tracks, features, columns, multi_label=False, verbose=False):
    if not multi_label:
        # Assign an integer value to each genre.
        enc = LabelEncoder()
        labels = tracks['track', 'genre_top']
        #y = enc.fit_transform(tracks['track', 'genre_top'])
    else:
        # Create an indicator matrix.
        enc = MultiLabelBinarizer()
        labels = tracks['track', 'genres_all']
        #labels = tracks['track', 'genres']

    # Split in training, validation and testing sets.
    y_train = enc.fit_transform(labels[train])
    y_val = enc.transform(labels[val])
    y_test = enc.transform(labels[test])
    X_train = features.loc[train, columns].to_numpy()
    X_val = features.loc[val, columns].to_numpy()
    X_test = features.loc[test, columns].to_numpy()

    X_train, y_train = shuffle(X_train, y_train, random_state=42)

    # Standardize features by removing the mean and scaling to unit variance.
    scaler = StandardScaler(copy=False)
    scaler.fit_transform(X_train)
    scaler.transform(X_val)
    scaler.transform(X_test)

    # print(f'X_train shape: {X_train.shape}, y_train shape: {y_train.shape}')
    # print(f'X_val shape: {X_val.shape}, y_val shape: {y_val.shape}')
    # print(f'X_test shape: {X_test.shape}, y_test shape: {y_test.shape}')


    return y_train, y_val, y_test, X_train, X_val, X_test

### 1.2 Single genre

In [25]:
def test_classifiers_features(classifiers, feature_sets, multi_label=False):
    columns = list(classifiers.keys()).insert(0, 'dim')
    scores = pd.DataFrame(columns=columns, index=feature_sets.keys())
    times = pd.DataFrame(columns=classifiers.keys(), index=feature_sets.keys())
    for fset_name, fset in tqdm(feature_sets.items(), desc='features'):
        y_train, y_val, y_test, X_train, X_val, X_test = pre_process(tracks, features_all, fset, multi_label)
        scores.loc[fset_name, 'dim'] = X_train.shape[1]
        for clf_name, clf in classifiers.items():  # tqdm_notebook(classifiers.items(), desc='classifiers', leave=False):
            t = time.process_time()
            clf.fit(X_train, y_train)
            score = clf.score(X_test, y_test)
            scores.loc[fset_name, clf_name] = score
            times.loc[fset_name, clf_name] = time.process_time() - t
    return scores, times

def format_scores(scores):
    def highlight(s):
        is_max = s == max(s[1:])
        return ['background-color: yellow' if v else '' for v in is_max]
    scores = scores.style.apply(highlight, axis=1)
    return scores.format('{:.2%}', subset=pd.IndexSlice[:, scores.columns[1]:])

In [26]:
classifiers = {
    'LR': LogisticRegression(max_iter=500),
    'kNN': KNeighborsClassifier(n_neighbors=200),
    'SVCrbf': SVC(kernel='rbf'),
    'SVCpoly1': SVC(kernel='poly', degree=1),
    'linSVC1': SVC(kernel="linear"),
    'linSVC2': LinearSVC(),
    #GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True),
    'DT': DecisionTreeClassifier(max_depth=5),
    'RF': RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    'AdaBoost': AdaBoostClassifier(n_estimators=10),
    'MLP1': MLPClassifier(hidden_layer_sizes=(100,), max_iter=2000),
    'MLP2': MLPClassifier(hidden_layer_sizes=(200, 50), max_iter=2000),
    'NB': GaussianNB(),
    'QDA': QuadraticDiscriminantAnalysis(),
}

feature_sets = {
#    'echonest_audio': ('echonest', 'audio_features'),
#    'echonest_social': ('echonest', 'social_features'),
#    'echonest_temporal': ('echonest', 'temporal_features'),
#    'echonest_audio/social': ('echonest', ('audio_features', 'social_features')),
#    'echonest_all': ('echonest', ('audio_features', 'social_features', 'temporal_features')),
}
for name in features.columns.levels[0]:
    feature_sets[name] = name
feature_sets.update({
    'mfcc/contrast': ['mfcc', 'spectral_contrast'],
    'mfcc/contrast/chroma': ['mfcc', 'spectral_contrast', 'chroma_cens'],
    'mfcc/contrast/centroid': ['mfcc', 'spectral_contrast', 'spectral_centroid'],
    'mfcc/contrast/chroma/centroid': ['mfcc', 'spectral_contrast', 'chroma_cens', 'spectral_centroid'],
    'mfcc/contrast/chroma/centroid/tonnetz': ['mfcc', 'spectral_contrast', 'chroma_cens', 'spectral_centroid', 'tonnetz'],
    'mfcc/contrast/chroma/centroid/zcr': ['mfcc', 'spectral_contrast', 'chroma_cens', 'spectral_centroid', 'zcr'],
    'all_non-echonest': list(features.columns.levels[0])
})

scores, times = test_classifiers_features(classifiers, feature_sets)

ipd.display(format_scores(scores))
ipd.display(times.style.format('{:.4f}'))

features:   0%|          | 0/18 [00:00<?, ?it/s]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Unnamed: 0,dim,LR,kNN,SVCrbf,SVCpoly1,linSVC1,linSVC2,DT,RF,AdaBoost,MLP1,MLP2,NB,QDA
chroma_cens,84.0,39.45%,37.50%,42.29%,38.63%,39.29%,39.18%,35.68%,33.42%,30.86%,38.98%,33.46%,9.99%,24.64%
chroma_cqt,84.0,40.19%,40.03%,44.27%,39.95%,41.39%,40.61%,35.45%,35.72%,35.72%,42.09%,36.92%,1.55%,3.54%
chroma_stft,84.0,43.72%,43.92%,48.31%,43.65%,44.35%,43.30%,39.88%,40.03%,35.25%,46.95%,41.16%,4.20%,5.95%
mfcc,140.0,58.03%,54.99%,60.98%,59.66%,59.19%,57.21%,45.82%,45.12%,41.31%,50.45%,52.47%,41.86%,48.39%
rmse,7.0,36.92%,38.52%,38.90%,37.70%,37.58%,37.35%,38.63%,37.19%,34.67%,38.55%,38.83%,11.78%,15.04%
spectral_bandwidth,7.0,40.58%,45.39%,44.46%,40.38%,40.46%,40.50%,42.91%,44.07%,37.47%,45.51%,43.57%,36.18%,34.16%
spectral_centroid,7.0,42.75%,45.36%,45.71%,42.09%,42.09%,42.17%,42.67%,43.88%,42.60%,47.30%,45.94%,33.31%,36.11%
spectral_contrast,49.0,51.77%,49.55%,54.45%,49.59%,51.81%,48.97%,43.53%,43.92%,39.53%,50.33%,44.31%,39.41%,41.78%
spectral_rolloff,7.0,41.78%,46.25%,47.53%,41.43%,41.62%,41.47%,45.36%,45.47%,41.66%,48.54%,47.65%,28.49%,28.53%
tonnetz,42.0,40.11%,37.31%,42.25%,40.23%,40.15%,39.64%,35.91%,36.69%,34.16%,40.65%,31.79%,22.31%,23.05%


Unnamed: 0,LR,kNN,SVCrbf,SVCpoly1,linSVC1,linSVC2,DT,RF,AdaBoost,MLP1,MLP2,NB,QDA
chroma_cens,9.875,4.3125,73.8281,60.0,274.0938,134.4375,3.5469,0.3906,9.0469,148.4844,123.3438,0.0938,0.1406
chroma_cqt,10.4844,3.8438,80.5625,73.5781,416.9844,131.7812,3.2969,0.3594,8.5312,95.7031,114.1875,0.0938,0.1562
chroma_stft,1.7031,3.625,64.5,64.75,257.875,125.5938,3.2812,0.4219,8.4375,83.4062,100.7812,0.0781,0.1875
mfcc,13.1094,4.8281,90.2031,65.2031,243.1562,136.0469,6.3906,0.4219,17.25,166.7656,54.5,0.1406,0.3594
rmse,1.2812,0.7812,43.6406,25.9531,31.0781,17.9219,0.2969,0.4062,1.1562,64.4531,247.0938,0.0,0.0156
spectral_bandwidth,1.7344,0.6875,44.1875,26.7656,38.2344,19.2344,0.3125,0.4062,1.125,68.6562,91.25,0.0,0.0
spectral_centroid,1.7344,0.7344,41.7812,29.0625,39.2031,18.2344,0.2969,0.4062,1.1406,52.0,419.0938,0.0,0.0
spectral_contrast,8.6875,3.7031,42.2031,35.6875,89.2188,62.5,2.3125,0.5312,6.3125,135.4375,159.625,0.0,0.2031
spectral_rolloff,1.0156,0.7031,41.25,25.9375,35.875,18.375,0.2031,0.3125,0.9375,62.3594,169.1406,0.0,0.0
tonnetz,0.8281,3.5938,54.5625,38.3906,94.7656,63.7812,1.8438,0.4219,5.2344,86.8594,176.2812,0.0469,0.0781


### 1.3 Multiple genres

Todo:
* Ignore rare genres? Count them higher up in the genre tree? On the other hand it's not much tracks.

In [27]:
classifiers = {
    #LogisticRegression(),
    'LR': OneVsRestClassifier(LogisticRegression(max_iter= 1000)),
    'SVC': OneVsRestClassifier(SVC()),
    'MLP': MLPClassifier(max_iter=700),
}

feature_sets = {
#    'echonest_audio': ('echonest', 'audio_features'),
#    'echonest_temporal': ('echonest', 'temporal_features'),
    'mfcc': 'mfcc',
    'mfcc/contrast/chroma/centroid/tonnetz': ['mfcc', 'spectral_contrast', 'chroma_cens', 'spectral_centroid', 'tonnetz'],
    'mfcc/contrast/chroma/centroid/zcr': ['mfcc', 'spectral_contrast', 'chroma_cens', 'spectral_centroid', 'zcr'],
}

scores, times = test_classifiers_features(classifiers, feature_sets, multi_label=True)

ipd.display(format_scores(scores))
ipd.display(times.style.format('{:.4f}'))

features:   0%|          | 0/3 [00:00<?, ?it/s]



Unnamed: 0,dim,LR,SVC,MLP
mfcc,140.0,11.19%,12.13%,10.69%
mfcc/contrast/chroma/centroid/tonnetz,322.0,12.98%,13.41%,8.82%
mfcc/contrast/chroma/centroid/zcr,287.0,12.90%,13.64%,9.56%


Unnamed: 0,LR,SVC,MLP
mfcc,31.0625,1114.2969,324.2969
mfcc/contrast/chroma/centroid/tonnetz,115.7188,2576.6406,394.3594
mfcc/contrast/chroma/centroid/zcr,147.2656,2241.5781,322.5469


## 2 Convolutional Neural Network

### 2.1 Build the CNN Model
Construct a simple Convolutional Neural Network (CNN) that consists of three convolutional layers and two fully connected layers to identify genres from music features

In [13]:
class MusicGenreCNN(nn.Module):
    def __init__(self, num_classes=16):
        super(MusicGenreCNN, self).__init__()
        self.num_classes = num_classes
        self.conv_layers = nn.Sequential(
            nn.Conv1d(in_channels=1, out_channels=64, kernel_size=3, padding=1), 
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2),

            nn.Conv1d(in_channels=64, out_channels=128, kernel_size=3, padding=1),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2),
            nn.Dropout(0.5),

            nn.Conv1d(in_channels=128, out_channels=256, kernel_size=3, padding=1), 
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2),
        )
        
        self.fc = None

    def forward(self, x):
        x = self.conv_layers(x)
        x = x.view(x.size(0), -1)

        if self.fc is None:
            num_features = x.shape[1]
            self.fc = nn.Sequential(
                nn.Linear(num_features, 512),
                nn.ReLU(),
                nn.Linear(512, self.num_classes)
            )
            self.fc = self.fc.to(x.device)
        x = self.fc(x)
        return x

### 2.2 Train and evaluate the Model

In [14]:
def train_model(model, tracks, features_all, features, device,
                multi_label=False, epochs=30, batch_size=64, patience = 10):
    y_train, y_val, y_test, X_train, X_val, X_test = pre_process(tracks, features_all, features, multi_label)
    
    # Convert arrays to PyTorch tensors
    X_train, y_train = torch.tensor(X_train, dtype=torch.float32), torch.tensor(y_train, dtype=torch.long)
    X_val, y_val = torch.tensor(X_val, dtype=torch.float32), torch.tensor(y_val, dtype=torch.long)
    X_test, y_test = torch.tensor(X_test, dtype=torch.float32), torch.tensor(y_test, dtype=torch.long)
    
    # Create TensorDataset
    train_dataset = TensorDataset(X_train, y_train)
    val_dataset = TensorDataset(X_val, y_val)
    test_dataset = TensorDataset(X_test, y_test)
    
    # Create data loaders
    train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
    valid_loader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)
    
    # move the model to device
    model = model.to(device)

    # history
    history = {'train_loss': [],
              'train_acc': [],
              'valid_loss': [],
              'valid_acc': []}

    # set up loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    #optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    # scheduler = ReduceLROnPlateau(optimizer, 'min', patience=patience, factor=0.1)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1)

    best_val_loss = float('inf')
    patience_counter = 0
    best_model = None

    # traning loop
    print('Training Starts:')
    num_total_steps = len(train_loader)
    for epoch in range(epochs):
        model.train()
        train_loss = 0
        train_acc = 0
        for inputs, labels in train_loader:
            inputs, labels= inputs.to(device).unsqueeze(1), labels.to(device)
            # print("inputs after unsqueeze:\n", inputs)
            # forward
            outputs = model(inputs)
            cur_train_loss = criterion(outputs, labels)
            # backward
            cur_train_loss.backward()
            optimizer.step()  # optimizer update all model parameters
            optimizer.zero_grad()  # set gradient to zero, avoid gradient accumulating
            # loss
            train_loss += cur_train_loss.item()
            # acc
            _, pred_class = torch.max(outputs, 1)
            train_acc += (pred_class == labels).float().mean().item()

        # valid
        model.eval()
        with torch.no_grad():
            val_loss = 0
            val_acc = 0
            for inputs, labels in valid_loader:
                inputs, labels= inputs.to(device).unsqueeze(1), labels.to(device)
                outputs = model(inputs)
                cur_valid_loss = criterion(outputs, labels)
                val_loss += cur_valid_loss.item()
                _, pred_class = torch.max(outputs, 1)
                val_acc += (pred_class == labels).float().mean().item()

        # print & record
        train_loss = train_loss / len(train_loader)
        train_acc = train_acc / len(train_loader)
        val_loss = val_loss / len(valid_loader)
        val_acc = val_acc / len(valid_loader)
        print(f"Epoch:{epoch + 1} / {epochs}, train loss:{train_loss:.5f}, train acc: {train_acc:.5f}, valid loss:{val_loss:.5f}, valid acc:{val_acc:.5f}")
        history['train_loss'].append(train_loss)
        history['train_acc'].append(train_acc)
        history['valid_loss'].append(val_loss)
        history['valid_acc'].append(val_acc)

        scheduler.step()

        # Check if current epoch's loss is less than the best observed loss
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_model = model.state_dict()  # Save the best model
            patience_counter = 0  # Reset patience counter
        else:
            patience_counter += 1  # Increment patience counter

        # Check if patience limit is reached
        if patience_counter >= patience:
            print("Early stopping triggered")
            break

    # Load the best model
    model.load_state_dict(best_model)

    model.eval()
    total_correct = 0
    total_samples = 0
    test_labels = []
    test_preds = []
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels= inputs.to(device).unsqueeze(1), labels.to(device)
            outputs = model(inputs)
            _, preds = torch.max(outputs, 1)
            correct = preds.eq(labels.view_as(preds))
            total_correct += correct.sum().item()
            total_samples += labels.size(0)
            test_labels.extend(labels.cpu().numpy())
            test_preds.extend(preds.cpu().numpy())

    test_acc = total_correct / total_samples
    print(f'Test Accuracy: {test_acc:.4f}')
    print(classification_report(test_labels, test_preds))

    return test_acc, history

In [15]:
feature_sets = {}
feature_sets.update({
    'mfcc/contrast': ['mfcc', 'spectral_contrast'],
    'mfcc/contrast/chroma': ['mfcc', 'spectral_contrast', 'chroma_cens'],
    'mfcc/contrast/centroid': ['mfcc', 'spectral_contrast', 'spectral_centroid'],
    'mfcc/contrast/chroma/centroid': ['mfcc', 'spectral_contrast', 'chroma_cens', 'spectral_centroid'],
    'mfcc/contrast/chroma/centroid/tonnetz': ['mfcc', 'spectral_contrast', 'chroma_cens', 'spectral_centroid', 'tonnetz'],
    'mfcc/contrast/chroma/centroid/zcr': ['mfcc', 'spectral_contrast', 'chroma_cens', 'spectral_centroid', 'zcr'],
    'all_non-echonest': list(features.columns.levels[0])
})
cnn_acc_sets = {}
cnn_times = {}
for fset_name, fset in tqdm(feature_sets.items(), desc='features'):
    cnn_model = MusicGenreCNN(num_classes=16)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f'Feature set: {fset_name}\n')
    t = time.process_time()
    test_acc_cnn, hist_cnn = train_model(cnn_model, tracks, features_all, fset, device,
                                         multi_label=False, epochs=200, batch_size=64, patience = 10)
    cnn_acc_sets[fset_name] = test_acc_cnn
    cnn_times[fset_name] = time.process_time() - t
    print('-'*70)

features:   0%|          | 0/7 [00:00<?, ?it/s]

Feature set: mfcc/contrast

Training Starts:
Epoch:1 / 200, train loss:1.93473, train acc: 0.46787, valid loss:1.59389, valid acc:0.55104
Epoch:2 / 200, train loss:1.58137, train acc: 0.54273, valid loss:1.40854, valid acc:0.59401
Epoch:3 / 200, train loss:1.45455, train acc: 0.57400, valid loss:1.32250, valid acc:0.61749
Epoch:4 / 200, train loss:1.38207, train acc: 0.59385, valid loss:1.28598, valid acc:0.62643
Epoch:5 / 200, train loss:1.33097, train acc: 0.60756, valid loss:1.26032, valid acc:0.62565
Epoch:6 / 200, train loss:1.29437, train acc: 0.61438, valid loss:1.23256, valid acc:0.63893
Epoch:7 / 200, train loss:1.26354, train acc: 0.62123, valid loss:1.21880, valid acc:0.63859
Epoch:8 / 200, train loss:1.23440, train acc: 0.63094, valid loss:1.20381, valid acc:0.65030
Epoch:9 / 200, train loss:1.21349, train acc: 0.63710, valid loss:1.19072, valid acc:0.64835
Epoch:10 / 200, train loss:1.19556, train acc: 0.63535, valid loss:1.18873, valid acc:0.64640
Epoch:11 / 200, train lo

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00         8
           1       0.74      0.81      0.77        62
           2       0.00      0.00      0.00        18
           3       0.00      0.00      0.00         6
           4       0.66      0.84      0.74       632
           5       0.37      0.28      0.32       225
           6       0.33      0.34      0.34       152
           7       0.66      0.64      0.65       220
           8       0.42      0.26      0.32       174
           9       0.49      0.33      0.40       102
          10       0.84      0.41      0.55        39
          11       0.88      0.98      0.93        51
          12       0.20      0.03      0.06       119
          13       0.70      0.88      0.78       711
          14       0.00      0.00      0.00        42
          15       0.00      0.00      0.00        12

    accuracy                           0.62      2573
   macro avg       0.39   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Feature set: mfcc/contrast/centroid

Training Starts:
Epoch:1 / 200, train loss:1.92390, train acc: 0.49106, valid loss:1.61269, valid acc:0.54870
Epoch:2 / 200, train loss:1.56395, train acc: 0.56062, valid loss:1.41392, valid acc:0.58307
Epoch:3 / 200, train loss:1.44078, train acc: 0.58614, valid loss:1.32862, valid acc:0.61745
Epoch:4 / 200, train loss:1.36957, train acc: 0.60007, valid loss:1.28937, valid acc:0.62843
Epoch:5 / 200, train loss:1.31927, train acc: 0.61171, valid loss:1.25299, valid acc:0.62218
Epoch:6 / 200, train loss:1.28575, train acc: 0.61759, valid loss:1.24397, valid acc:0.62999
Epoch:7 / 200, train loss:1.25253, train acc: 0.62834, valid loss:1.21645, valid acc:0.64332
Epoch:8 / 200, train loss:1.22571, train acc: 0.63302, valid loss:1.21413, valid acc:0.64180
Epoch:9 / 200, train loss:1.20639, train acc: 0.63758, valid loss:1.19982, valid acc:0.64214
Epoch:10 / 200, train loss:1.18199, train acc: 0.64303, valid loss:1.19577, valid acc:0.64449
Epoch:11 / 200,

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00         8
           1       0.80      0.76      0.78        62
           2       0.00      0.00      0.00        18
           3       0.00      0.00      0.00         6
           4       0.66      0.82      0.73       632
           5       0.39      0.34      0.36       225
           6       0.28      0.23      0.25       152
           7       0.65      0.59      0.62       220
           8       0.44      0.23      0.30       174
           9       0.49      0.37      0.42       102
          10       0.94      0.38      0.55        39
          11       0.84      0.96      0.90        51
          12       0.35      0.06      0.10       119
          13       0.67      0.89      0.77       711
          14       0.00      0.00      0.00        42
          15       0.00      0.00      0.00        12

    accuracy                           0.62      2573
   macro avg       0.41   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00         8
           1       0.76      0.89      0.82        62
           2       0.00      0.00      0.00        18
           3       0.00      0.00      0.00         6
           4       0.67      0.80      0.73       632
           5       0.37      0.32      0.35       225
           6       0.32      0.30      0.31       152
           7       0.63      0.65      0.64       220
           8       0.44      0.21      0.28       174
           9       0.42      0.15      0.22       102
          10       1.00      0.23      0.38        39
          11       0.91      0.98      0.94        51
          12       0.33      0.08      0.12       119
          13       0.66      0.90      0.76       711
          14       0.00      0.00      0.00        42
          15       0.00      0.00      0.00        12

    accuracy                           0.61      2573
   macro avg       0.41   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00         8
           1       0.83      0.81      0.82        62
           2       0.00      0.00      0.00        18
           3       0.00      0.00      0.00         6
           4       0.67      0.82      0.74       632
           5       0.39      0.36      0.38       225
           6       0.31      0.38      0.34       152
           7       0.68      0.61      0.64       220
           8       0.47      0.24      0.31       174
           9       0.55      0.25      0.35       102
          10       0.82      0.36      0.50        39
          11       0.93      0.98      0.95        51
          12       0.29      0.04      0.07       119
          13       0.69      0.88      0.77       711
          14       0.00      0.00      0.00        42
          15       0.00      0.00      0.00        12

    accuracy                           0.62      2573
   macro avg       0.41   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Training Starts:
Epoch:1 / 200, train loss:1.94501, train acc: 0.44080, valid loss:1.64946, valid acc:0.53585
Epoch:2 / 200, train loss:1.60883, train acc: 0.52545, valid loss:1.46972, valid acc:0.56871
Epoch:3 / 200, train loss:1.48238, train acc: 0.56395, valid loss:1.39677, valid acc:0.59644
Epoch:4 / 200, train loss:1.40169, train acc: 0.58960, valid loss:1.33786, valid acc:0.61324
Epoch:5 / 200, train loss:1.35108, train acc: 0.60367, valid loss:1.30101, valid acc:0.62578
Epoch:6 / 200, train loss:1.30890, train acc: 0.61404, valid loss:1.26503, valid acc:0.63398
Epoch:7 / 200, train loss:1.27091, train acc: 0.62643, valid loss:1.23936, valid acc:0.64609
Epoch:8 / 200, train loss:1.24000, train acc: 0.63209, valid loss:1.21757, valid acc:0.64727
Epoch:9 / 200, train loss:1.21633, train acc: 0.63845, valid loss:1.21875, valid acc:0.65078
Epoch:10 / 200, train loss:1.19244, train acc: 0.64311, valid loss:1.22013, valid acc:0.64414
Epoch:11 / 200, train loss:1.17504, train acc: 0.647

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [16]:
print('Accuracy of CNN model using different feature sets:\nFeature set\tAccuracy')
for name in list(cnn_acc_sets.keys()):
    print(f'{name}:\t{cnn_acc_sets[name]:.4f}')
print('Training time of CNN model using different feature sets:\nFeature set\tTime')
for name in list(cnn_times.keys()):
    print(f'{name}:\t{cnn_times[name]:.4f}s')

Accuracy of CNN model using different feature sets:
Feature set	Accuracy
mfcc/contrast:	0.6250
mfcc/contrast/chroma:	0.6238
mfcc/contrast/centroid:	0.6183
mfcc/contrast/chroma/centroid:	0.6137
mfcc/contrast/chroma/centroid/tonnetz:	0.6242
mfcc/contrast/chroma/centroid/zcr:	0.6168
all_non-echonest:	0.6269
Training time of CNN model using different feature sets:
Feature set	Time
mfcc/contrast:	31.7812s
mfcc/contrast/chroma:	46.0938s
mfcc/contrast/centroid:	27.9375s
mfcc/contrast/chroma/centroid:	30.0469s
mfcc/contrast/chroma/centroid/tonnetz:	39.2344s
mfcc/contrast/chroma/centroid/zcr:	37.0312s
all_non-echonest:	45.4844s


## 3 Recurrent Neural Networks

In [17]:
class MusicGenreRNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes, bidirectional=False, dropout=0.5):
        super(MusicGenreRNN, self).__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.bidirectional = bidirectional
        
        # LSTM layer
        self.lstm = nn.LSTM(input_size=input_size, 
                            hidden_size=hidden_size, 
                            num_layers=num_layers, 
                            batch_first=True, 
                            dropout=dropout if num_layers > 1 else 0, 
                            bidirectional=bidirectional)
        
        # Fully connected layer
        multiplier = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_size * multiplier, num_classes)

    def forward(self, x):
        # Set initial hidden and cell states 
        h0 = torch.zeros(self.num_layers * (2 if self.bidirectional else 1), x.size(0), self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers * (2 if self.bidirectional else 1), x.size(0), self.hidden_size).to(x.device)
        
        # Forward propagate LSTM
        out, _ = self.lstm(x, (h0, c0))  # out: tensor of shape (batch_size, seq_length, hidden_size)
        
        # Decode the hidden state of the last time step
        out = self.fc(out[:, -1, :])
        return out

In [18]:
y_train, y_val, y_test, X_train, X_val, X_test = pre_process(tracks, features_all, ['mfcc', 'spectral_contrast', 'chroma_cens'], multi_label=False)
print(y_train.shape, y_val.shape, y_test.shape, X_train.shape, X_val.shape, X_test.shape)
print(X_train.shape[1])

(19922,) (2505,) (2573,) (19922, 273) (2505, 273) (2573, 273)
273


In [19]:
feature_sets = {}
feature_sets.update({
    'mfcc/contrast': ['mfcc', 'spectral_contrast'],
    'mfcc/contrast/chroma': ['mfcc', 'spectral_contrast', 'chroma_cens'],
    'mfcc/contrast/centroid': ['mfcc', 'spectral_contrast', 'spectral_centroid'],
    'mfcc/contrast/chroma/centroid': ['mfcc', 'spectral_contrast', 'chroma_cens', 'spectral_centroid'],
    'mfcc/contrast/chroma/centroid/tonnetz': ['mfcc', 'spectral_contrast', 'chroma_cens', 'spectral_centroid', 'tonnetz'],
    'mfcc/contrast/chroma/centroid/zcr': ['mfcc', 'spectral_contrast', 'chroma_cens', 'spectral_centroid', 'zcr'],
    'all_non-echonest': list(features.columns.levels[0])
})
rnn_acc_sets = {}
rnn_times = {}
for fset_name, fset in tqdm(feature_sets.items(), desc='features'):
    y_train, y_val, y_test, X_train, X_val, X_test = pre_process(tracks, features_all, fset, multi_label=False)
    input_size = X_train.shape[1]
    rnn_model = MusicGenreRNN(input_size, hidden_size=512, num_layers=4, num_classes=16, bidirectional=True, dropout=0.5)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f'Feature set: {fset_name}\n')
    t = time.process_time()
    test_acc_rnn, hist_rnn = train_model(rnn_model, tracks, features_all, fset, device,
                                         multi_label=False, epochs=200, batch_size=64, patience = 10)
    rnn_acc_sets[fset_name] = test_acc_rnn
    rnn_times[fset_name] = time.process_time() - t
    print('-'*70)

features:   0%|          | 0/7 [00:00<?, ?it/s]

Feature set: mfcc/contrast

Training Starts:
Epoch:1 / 200, train loss:1.42500, train acc: 0.56262, valid loss:1.20225, valid acc:0.64058
Epoch:2 / 200, train loss:1.15608, train acc: 0.63987, valid loss:1.16403, valid acc:0.64766
Epoch:3 / 200, train loss:1.05324, train acc: 0.66938, valid loss:1.20464, valid acc:0.65464
Epoch:4 / 200, train loss:0.97505, train acc: 0.69137, valid loss:1.14285, valid acc:0.66636
Epoch:5 / 200, train loss:0.88638, train acc: 0.71818, valid loss:1.20844, valid acc:0.65777
Epoch:6 / 200, train loss:0.80797, train acc: 0.74035, valid loss:1.21449, valid acc:0.65855
Epoch:7 / 200, train loss:0.73338, train acc: 0.76155, valid loss:1.29924, valid acc:0.64362
Epoch:8 / 200, train loss:0.66036, train acc: 0.78670, valid loss:1.34990, valid acc:0.62643
Epoch:9 / 200, train loss:0.59129, train acc: 0.80784, valid loss:1.41046, valid acc:0.63273
Epoch:10 / 200, train loss:0.52805, train acc: 0.82816, valid loss:1.48801, valid acc:0.63307
Epoch:11 / 200, train lo

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Feature set: mfcc/contrast/chroma

Training Starts:
Epoch:1 / 200, train loss:1.40808, train acc: 0.56764, valid loss:1.18564, valid acc:0.64918
Epoch:2 / 200, train loss:1.12196, train acc: 0.65492, valid loss:1.14865, valid acc:0.66871
Epoch:3 / 200, train loss:1.00324, train acc: 0.68451, valid loss:1.20758, valid acc:0.64844
Epoch:4 / 200, train loss:0.89861, train acc: 0.71511, valid loss:1.20066, valid acc:0.64766
Epoch:5 / 200, train loss:0.79146, train acc: 0.74719, valid loss:1.19959, valid acc:0.65191
Epoch:6 / 200, train loss:0.69634, train acc: 0.77906, valid loss:1.28788, valid acc:0.64722
Epoch:7 / 200, train loss:0.60369, train acc: 0.80906, valid loss:1.36834, valid acc:0.63984
Epoch:8 / 200, train loss:0.51696, train acc: 0.83556, valid loss:1.39854, valid acc:0.65039
Epoch:9 / 200, train loss:0.44503, train acc: 0.85719, valid loss:1.47855, valid acc:0.63828
Epoch:10 / 200, train loss:0.37820, train acc: 0.87655, valid loss:1.58882, valid acc:0.63546
Epoch:11 / 200, t

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Feature set: mfcc/contrast/centroid

Training Starts:
Epoch:1 / 200, train loss:1.42834, train acc: 0.56262, valid loss:1.21205, valid acc:0.64570
Epoch:2 / 200, train loss:1.15742, train acc: 0.63733, valid loss:1.17020, valid acc:0.65820
Epoch:3 / 200, train loss:1.04945, train acc: 0.67006, valid loss:1.23486, valid acc:0.62604
Epoch:4 / 200, train loss:0.96342, train acc: 0.69409, valid loss:1.19220, valid acc:0.64136
Epoch:5 / 200, train loss:0.88008, train acc: 0.71685, valid loss:1.20554, valid acc:0.64996
Epoch:6 / 200, train loss:0.79509, train acc: 0.74360, valid loss:1.25167, valid acc:0.65113
Epoch:7 / 200, train loss:0.72390, train acc: 0.76667, valid loss:1.26876, valid acc:0.64913
Epoch:8 / 200, train loss:0.64328, train acc: 0.79040, valid loss:1.33481, valid acc:0.64410
Epoch:9 / 200, train loss:0.58980, train acc: 0.80919, valid loss:1.40337, valid acc:0.65308
Epoch:10 / 200, train loss:0.51464, train acc: 0.83265, valid loss:1.55639, valid acc:0.60964
Epoch:11 / 200,

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Feature set: mfcc/contrast/chroma/centroid

Training Starts:
Epoch:1 / 200, train loss:1.40065, train acc: 0.57030, valid loss:1.23331, valid acc:0.62183
Epoch:2 / 200, train loss:1.11810, train acc: 0.65336, valid loss:1.17876, valid acc:0.63624
Epoch:3 / 200, train loss:0.99719, train acc: 0.68924, valid loss:1.15177, valid acc:0.65312
Epoch:4 / 200, train loss:0.89035, train acc: 0.71808, valid loss:1.16375, valid acc:0.65508
Epoch:5 / 200, train loss:0.78870, train acc: 0.75017, valid loss:1.21039, valid acc:0.65699
Epoch:6 / 200, train loss:0.68583, train acc: 0.78058, valid loss:1.24940, valid acc:0.64957
Epoch:7 / 200, train loss:0.59265, train acc: 0.81159, valid loss:1.40140, valid acc:0.64336
Epoch:8 / 200, train loss:0.51167, train acc: 0.83776, valid loss:1.41728, valid acc:0.64839
Epoch:9 / 200, train loss:0.44384, train acc: 0.85826, valid loss:1.50820, valid acc:0.64453
Epoch:10 / 200, train loss:0.38059, train acc: 0.87890, valid loss:1.57424, valid acc:0.63277
Epoch:11

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Feature set: mfcc/contrast/chroma/centroid/zcr

Training Starts:
Epoch:1 / 200, train loss:1.39676, train acc: 0.57310, valid loss:1.18431, valid acc:0.65195
Epoch:2 / 200, train loss:1.11292, train acc: 0.65740, valid loss:1.13483, valid acc:0.65933
Epoch:3 / 200, train loss:0.99337, train acc: 0.69094, valid loss:1.17789, valid acc:0.65273
Epoch:4 / 200, train loss:0.88051, train acc: 0.72219, valid loss:1.22796, valid acc:0.64414
Epoch:5 / 200, train loss:0.77821, train acc: 0.75384, valid loss:1.25039, valid acc:0.64332
Epoch:6 / 200, train loss:0.67778, train acc: 0.78510, valid loss:1.29841, valid acc:0.64062
Epoch:7 / 200, train loss:0.59095, train acc: 0.81326, valid loss:1.33922, valid acc:0.63941
Epoch:8 / 200, train loss:0.50462, train acc: 0.84319, valid loss:1.38323, valid acc:0.64644
Epoch:9 / 200, train loss:0.43335, train acc: 0.86164, valid loss:1.49518, valid acc:0.64527
Epoch:10 / 200, train loss:0.36423, train acc: 0.88404, valid loss:1.65322, valid acc:0.64019
Epoc

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00         8
           1       0.68      0.77      0.72        62
           2       0.00      0.00      0.00        18
           3       0.00      0.00      0.00         6
           4       0.69      0.79      0.74       632
           5       0.34      0.40      0.37       225
           6       0.28      0.23      0.25       152
           7       0.72      0.61      0.66       220
           8       0.33      0.20      0.25       174
           9       0.62      0.36      0.46       102
          10       0.46      0.49      0.48        39
          11       0.74      0.98      0.84        51
          12       0.12      0.12      0.12       119
          13       0.72      0.80      0.75       711
          14       0.17      0.02      0.04        42
          15       0.22      0.17      0.19        12

    accuracy                           0.60      2573
   macro avg       0.38   

In [20]:
print('Accuracy of RNN model using different feature sets:\nFeature set\tAccuracy')
for name in list(rnn_acc_sets.keys()):
    print(f'{name}:\t{rnn_acc_sets[name]:.4f}')
print('Training time of RNN model using different feature sets:\nFeature set\tTime')
for name in list(rnn_times.keys()):
    print(f'{name}:\t{rnn_times[name]:.4f}s')

Accuracy of RNN model using different feature sets:
Feature set	Accuracy
mfcc/contrast:	0.6012
mfcc/contrast/chroma:	0.6024
mfcc/contrast/centroid:	0.6102
mfcc/contrast/chroma/centroid:	0.6218
mfcc/contrast/chroma/centroid/tonnetz:	0.6156
mfcc/contrast/chroma/centroid/zcr:	0.5962
all_non-echonest:	0.6059
Training time of RNN model using different feature sets:
Feature set	Time
mfcc/contrast:	20.9062s
mfcc/contrast/chroma:	22.3594s
mfcc/contrast/centroid:	22.8438s
mfcc/contrast/chroma/centroid:	25.0156s
mfcc/contrast/chroma/centroid/tonnetz:	23.2188s
mfcc/contrast/chroma/centroid/zcr:	18.3281s
all_non-echonest:	23.1875s


## 4 CNN-RNN model

In [21]:
class MusicGenreCNNRNN(nn.Module):
    def __init__(self, num_classes):
        super(MusicGenreCNNRNN, self).__init__()
        # define CNN layers
        self.cnn = nn.Sequential(
            nn.Conv1d(in_channels=1, out_channels=64, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.MaxPool1d(2),
            
            nn.Conv1d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.MaxPool1d(2),
            nn.Dropout(0.5),
            
            nn.Conv1d(in_channels=128, out_channels=256, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.MaxPool1d(2),

            nn.Conv1d(in_channels=256, out_channels=512, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.MaxPool1d(2),
        )
        
        # LSTM layer
        self.rnn = nn.LSTM(
            input_size=512, 
            hidden_size=512,
            num_layers=4,
            batch_first=True,
            dropout=0.5
        )
        
        # Fully connected layer
        self.fc = nn.Linear(512, num_classes)
        
    def forward(self, x):
        x = self.cnn(x)
        x = x.transpose(1, 2)  # (batch, seq, feature)
        x, (h_n, c_n) = self.rnn(x)
        x = self.fc(x[:, -1, :])
        
        return x

In [22]:
feature_sets = {}
feature_sets.update({
    'mfcc/contrast': ['mfcc', 'spectral_contrast'],
    'mfcc/contrast/chroma': ['mfcc', 'spectral_contrast', 'chroma_cens'],
    'mfcc/contrast/centroid': ['mfcc', 'spectral_contrast', 'spectral_centroid'],
    'mfcc/contrast/chroma/centroid': ['mfcc', 'spectral_contrast', 'chroma_cens', 'spectral_centroid'],
    'mfcc/contrast/chroma/centroid/tonnetz': ['mfcc', 'spectral_contrast', 'chroma_cens', 'spectral_centroid', 'tonnetz'],
    'mfcc/contrast/chroma/centroid/zcr': ['mfcc', 'spectral_contrast', 'chroma_cens', 'spectral_centroid', 'zcr'],
    'all_non-echonest': list(features.columns.levels[0])
})
cnn_rnn_acc_sets = {}
cnn_rnn_times = {}
for fset_name, fset in tqdm(feature_sets.items(), desc='features'):
    cnn_rnn_model = MusicGenreCNNRNN(num_classes=16)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f'Feature set: {fset_name}\n')
    t = time.process_time()
    test_acc_cnn_rnn, hist_cnn_rnn = train_model(cnn_rnn_model, tracks, features_all, fset, device,
                                         multi_label=False, epochs=200, batch_size=64, patience = 10)
    cnn_rnn_acc_sets[fset_name] = test_acc_cnn_rnn
    cnn_rnn_times[fset_name] = time.process_time() - t
    print('-'*70)

features:   0%|          | 0/7 [00:00<?, ?it/s]

Feature set: mfcc/contrast

Training Starts:
Epoch:1 / 200, train loss:1.93068, train acc: 0.37908, valid loss:1.65315, valid acc:0.47444
Epoch:2 / 200, train loss:1.60457, train acc: 0.49359, valid loss:1.42089, valid acc:0.56992
Epoch:3 / 200, train loss:1.47978, train acc: 0.53316, valid loss:1.33280, valid acc:0.59171
Epoch:4 / 200, train loss:1.40928, train acc: 0.55068, valid loss:1.30362, valid acc:0.60699
Epoch:5 / 200, train loss:1.36588, train acc: 0.56712, valid loss:1.25399, valid acc:0.63160
Epoch:6 / 200, train loss:1.31617, train acc: 0.58121, valid loss:1.26344, valid acc:0.61241
Epoch:7 / 200, train loss:1.29147, train acc: 0.58927, valid loss:1.23863, valid acc:0.61910
Epoch:8 / 200, train loss:1.25254, train acc: 0.59811, valid loss:1.25085, valid acc:0.62305
Epoch:9 / 200, train loss:1.23067, train acc: 0.60490, valid loss:1.23420, valid acc:0.62773
Epoch:10 / 200, train loss:1.20519, train acc: 0.61477, valid loss:1.24520, valid acc:0.62500
Epoch:11 / 200, train lo

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Training Starts:
Epoch:1 / 200, train loss:2.04443, train acc: 0.33113, valid loss:1.84159, valid acc:0.43937
Epoch:2 / 200, train loss:1.76479, train acc: 0.44325, valid loss:1.59368, valid acc:0.51589
Epoch:3 / 200, train loss:1.60605, train acc: 0.49102, valid loss:1.51902, valid acc:0.53503
Epoch:4 / 200, train loss:1.51189, train acc: 0.51935, valid loss:1.43095, valid acc:0.54753
Epoch:5 / 200, train loss:1.44026, train acc: 0.53777, valid loss:1.35849, valid acc:0.57105
Epoch:6 / 200, train loss:1.39403, train acc: 0.55632, valid loss:1.34593, valid acc:0.57496
Epoch:7 / 200, train loss:1.35087, train acc: 0.56553, valid loss:1.31726, valid acc:0.57565
Epoch:8 / 200, train loss:1.31443, train acc: 0.57788, valid loss:1.27539, valid acc:0.60191
Epoch:9 / 200, train loss:1.28519, train acc: 0.58905, valid loss:1.26194, valid acc:0.61211
Epoch:10 / 200, train loss:1.25102, train acc: 0.60284, valid loss:1.29145, valid acc:0.60503
Epoch:11 / 200, train loss:1.22625, train acc: 0.607

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Training Starts:
Epoch:1 / 200, train loss:1.95978, train acc: 0.38028, valid loss:1.57688, valid acc:0.51519
Epoch:2 / 200, train loss:1.61044, train acc: 0.49130, valid loss:1.42371, valid acc:0.55069
Epoch:3 / 200, train loss:1.49944, train acc: 0.52312, valid loss:1.37739, valid acc:0.57370
Epoch:4 / 200, train loss:1.43880, train acc: 0.53999, valid loss:1.33133, valid acc:0.59054
Epoch:5 / 200, train loss:1.38636, train acc: 0.55984, valid loss:1.33850, valid acc:0.59570
Epoch:6 / 200, train loss:1.35132, train acc: 0.56976, valid loss:1.34968, valid acc:0.59640
Epoch:7 / 200, train loss:1.31902, train acc: 0.58426, valid loss:1.27225, valid acc:0.61558
Epoch:8 / 200, train loss:1.29227, train acc: 0.58539, valid loss:1.29672, valid acc:0.60577
Epoch:9 / 200, train loss:1.25498, train acc: 0.59935, valid loss:1.28452, valid acc:0.61675
Epoch:10 / 200, train loss:1.24023, train acc: 0.60465, valid loss:1.24087, valid acc:0.62813
Epoch:11 / 200, train loss:1.22225, train acc: 0.610

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Training Starts:
Epoch:1 / 200, train loss:2.06453, train acc: 0.31944, valid loss:1.92677, valid acc:0.42448
Epoch:2 / 200, train loss:1.81508, train acc: 0.42505, valid loss:1.71020, valid acc:0.47730
Epoch:3 / 200, train loss:1.66442, train acc: 0.46319, valid loss:1.60322, valid acc:0.50543
Epoch:4 / 200, train loss:1.57444, train acc: 0.49166, valid loss:1.55228, valid acc:0.50326
Epoch:5 / 200, train loss:1.49838, train acc: 0.51382, valid loss:1.40160, valid acc:0.57886
Epoch:6 / 200, train loss:1.42978, train acc: 0.54062, valid loss:1.36046, valid acc:0.57648
Epoch:7 / 200, train loss:1.37747, train acc: 0.55726, valid loss:1.33675, valid acc:0.58194
Epoch:8 / 200, train loss:1.32783, train acc: 0.57646, valid loss:1.30233, valid acc:0.59957
Epoch:9 / 200, train loss:1.29558, train acc: 0.58304, valid loss:1.26685, valid acc:0.61007
Epoch:10 / 200, train loss:1.26648, train acc: 0.59405, valid loss:1.30837, valid acc:0.60894
Epoch:11 / 200, train loss:1.23831, train acc: 0.601

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Training Starts:
Epoch:1 / 200, train loss:2.06885, train acc: 0.31736, valid loss:1.90426, valid acc:0.43546
Epoch:2 / 200, train loss:1.83099, train acc: 0.42014, valid loss:1.64533, valid acc:0.47448
Epoch:3 / 200, train loss:1.64514, train acc: 0.46978, valid loss:1.49771, valid acc:0.52852
Epoch:4 / 200, train loss:1.55899, train acc: 0.49460, valid loss:1.42725, valid acc:0.53589
Epoch:5 / 200, train loss:1.49054, train acc: 0.51868, valid loss:1.40684, valid acc:0.54796
Epoch:6 / 200, train loss:1.44648, train acc: 0.53435, valid loss:1.37695, valid acc:0.55308
Epoch:7 / 200, train loss:1.40082, train acc: 0.54480, valid loss:1.33139, valid acc:0.58668
Epoch:8 / 200, train loss:1.36717, train acc: 0.55898, valid loss:1.29230, valid acc:0.58863
Epoch:9 / 200, train loss:1.33968, train acc: 0.56859, valid loss:1.30356, valid acc:0.58941
Epoch:10 / 200, train loss:1.30771, train acc: 0.58183, valid loss:1.27043, valid acc:0.59449
Epoch:11 / 200, train loss:1.27739, train acc: 0.589

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Training Starts:
Epoch:1 / 200, train loss:2.01229, train acc: 0.34521, valid loss:1.81930, valid acc:0.39076
Epoch:2 / 200, train loss:1.72028, train acc: 0.45360, valid loss:1.63000, valid acc:0.46311
Epoch:3 / 200, train loss:1.59138, train acc: 0.48934, valid loss:1.50144, valid acc:0.52995
Epoch:4 / 200, train loss:1.50521, train acc: 0.51615, valid loss:1.43990, valid acc:0.54288
Epoch:5 / 200, train loss:1.43787, train acc: 0.53784, valid loss:1.36980, valid acc:0.56914
Epoch:6 / 200, train loss:1.38671, train acc: 0.55726, valid loss:1.31201, valid acc:0.58511
Epoch:7 / 200, train loss:1.35003, train acc: 0.56444, valid loss:1.33023, valid acc:0.58320
Epoch:8 / 200, train loss:1.30934, train acc: 0.58053, valid loss:1.31289, valid acc:0.59288
Epoch:9 / 200, train loss:1.26484, train acc: 0.59488, valid loss:1.25529, valid acc:0.60738
Epoch:10 / 200, train loss:1.23463, train acc: 0.60628, valid loss:1.23032, valid acc:0.62144
Epoch:11 / 200, train loss:1.21975, train acc: 0.609

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Training Starts:
Epoch:1 / 200, train loss:2.06749, train acc: 0.29595, valid loss:1.83568, valid acc:0.40807
Epoch:2 / 200, train loss:1.70410, train acc: 0.45700, valid loss:1.55284, valid acc:0.50612
Epoch:3 / 200, train loss:1.55741, train acc: 0.50256, valid loss:1.43437, valid acc:0.53924
Epoch:4 / 200, train loss:1.48828, train acc: 0.52304, valid loss:1.41153, valid acc:0.55330
Epoch:5 / 200, train loss:1.44393, train acc: 0.53762, valid loss:1.34206, valid acc:0.57951
Epoch:6 / 200, train loss:1.40333, train acc: 0.54618, valid loss:1.33681, valid acc:0.58034
Epoch:7 / 200, train loss:1.36485, train acc: 0.55747, valid loss:1.31822, valid acc:0.59605
Epoch:8 / 200, train loss:1.33701, train acc: 0.56844, valid loss:1.29861, valid acc:0.59757
Epoch:9 / 200, train loss:1.30370, train acc: 0.57833, valid loss:1.30570, valid acc:0.59839
Epoch:10 / 200, train loss:1.29015, train acc: 0.58141, valid loss:1.27621, valid acc:0.60104
Epoch:11 / 200, train loss:1.26062, train acc: 0.589

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [23]:
print('Accuracy of CNN-RNN model using different feature sets:\nFeature set\tAccuracy')
for name in list(cnn_rnn_acc_sets.keys()):
    print(f'{name}:\t{cnn_rnn_acc_sets[name]:.4f}')
print('Training time of CNN-RNN model using different feature sets:\nFeature set\tTime')
for name in list(cnn_rnn_times.keys()):
    print(f'{name}:\t{cnn_rnn_times[name]:.4f}s')

Accuracy of CNN-RNN model using different feature sets:
Feature set	Accuracy
mfcc/contrast:	0.5869
mfcc/contrast/chroma:	0.5939
mfcc/contrast/centroid:	0.5935
mfcc/contrast/chroma/centroid:	0.6090
mfcc/contrast/chroma/centroid/tonnetz:	0.6055
mfcc/contrast/chroma/centroid/zcr:	0.5834
all_non-echonest:	0.5966
Training time of CNN-RNN model using different feature sets:
Feature set	Time
mfcc/contrast:	63.2500s
mfcc/contrast/chroma:	52.6562s
mfcc/contrast/centroid:	53.3594s
mfcc/contrast/chroma/centroid:	41.5312s
mfcc/contrast/chroma/centroid/tonnetz:	57.7031s
mfcc/contrast/chroma/centroid/zcr:	41.6250s
all_non-echonest:	183.3594s
