# Neural Net classification of music genre

## I. Loading and processing data

In [65]:
import numpy as np
import pandas as pd
from utils.load_data import fma_load

features = fma_load('data/fma_metadata/features.csv')

features

feature,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,...,tonnetz,tonnetz,tonnetz,zcr,zcr,zcr,zcr,zcr,zcr,zcr
statistics,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,...,std,std,std,kurtosis,max,mean,median,min,skew,std
number,01,02,03,04,05,06,07,08,09,10,...,04,05,06,01,01,01,01,01,01,01
track_id,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
2,7.180653,5.230309,0.249321,1.347620,1.482478,0.531371,1.481593,2.691455,0.866868,1.341231,...,0.054125,0.012226,0.012111,5.758890,0.459473,0.085629,0.071289,0.000000,2.089872,0.061448
3,1.888963,0.760539,0.345297,2.295201,1.654031,0.067592,1.366848,1.054094,0.108103,0.619185,...,0.063831,0.014212,0.017740,2.824694,0.466309,0.084578,0.063965,0.000000,1.716724,0.069330
5,0.527563,-0.077654,-0.279610,0.685883,1.937570,0.880839,-0.923192,-0.927232,0.666617,1.038546,...,0.040730,0.012691,0.014759,6.808415,0.375000,0.053114,0.041504,0.000000,2.193303,0.044861
10,3.702245,-0.291193,2.196742,-0.234449,1.367364,0.998411,1.770694,1.604566,0.521217,1.982386,...,0.074358,0.017952,0.013921,21.434212,0.452148,0.077515,0.071777,0.000000,3.542325,0.040800
20,-0.193837,-0.198527,0.201546,0.258556,0.775204,0.084794,-0.289294,-0.816410,0.043851,-0.804761,...,0.095003,0.022492,0.021355,16.669037,0.469727,0.047225,0.040039,0.000977,3.189831,0.030993
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155316,-0.490129,0.463834,2.321970,-0.084352,1.662914,2.115189,-0.237794,5.695442,0.830353,1.951819,...,0.128410,0.022547,0.019816,4.448255,0.172852,0.028773,0.028320,0.003906,0.955388,0.012385
155317,-0.461559,-0.229601,-0.496632,-0.422033,0.130612,-0.263825,-0.628103,-0.082687,-0.229483,-0.492753,...,0.132964,0.023548,0.026527,3.270612,0.196289,0.031116,0.027832,0.002441,1.283060,0.019059
155318,0.552473,-0.110498,-0.532014,0.263131,-0.224011,-0.530972,1.713526,1.418444,1.325197,0.120333,...,0.108324,0.017540,0.020471,2.356727,0.212891,0.038450,0.037109,0.003418,0.828569,0.017904
155319,-0.176901,0.187208,-0.050664,0.368843,0.066005,-0.857354,-0.780860,0.626281,-0.630938,-0.787229,...,0.088311,0.018328,0.017936,6.188604,0.167480,0.041480,0.038086,0.004883,1.818740,0.020133


In [66]:
tracks = fma_load('data/fma_metadata/tracks.csv')

In [67]:
tracks['track'].columns

Index(['bit_rate', 'comments', 'composer', 'date_created', 'date_recorded',
       'duration', 'favorites', 'genre_top', 'genres', 'genres_all',
       'information', 'interest', 'language_code', 'license', 'listens',
       'lyricist', 'number', 'publisher', 'tags', 'title'],
      dtype='object')

In [68]:
y = tracks['track','genre_top']

In [69]:
print(f'Nombre de labels: {y.value_counts()}')

Nombre de labels: (track, genre_top)
Rock                   14182
Experimental           10608
Electronic              9372
Hip-Hop                 3552
Folk                    2803
Pop                     2332
Instrumental            2079
International           1389
Classical               1230
Jazz                     571
Old-Time / Historic      554
Spoken                   423
Country                  194
Soul-RnB                 175
Blues                    110
Easy Listening            24
Name: count, dtype: int64


In [7]:
print(f'Nombre de labels non définis: {y.isna().sum()}')

Nombre de labels non définis: 56976


## II. Feature selection

In [16]:
features

feature,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,...,tonnetz,tonnetz,tonnetz,zcr,zcr,zcr,zcr,zcr,zcr,zcr
statistics,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,...,std,std,std,kurtosis,max,mean,median,min,skew,std
number,01,02,03,04,05,06,07,08,09,10,...,04,05,06,01,01,01,01,01,01,01
track_id,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
2,7.180653,5.230309,0.249321,1.347620,1.482478,0.531371,1.481593,2.691455,0.866868,1.341231,...,0.054125,0.012226,0.012111,5.758890,0.459473,0.085629,0.071289,0.000000,2.089872,0.061448
3,1.888963,0.760539,0.345297,2.295201,1.654031,0.067592,1.366848,1.054094,0.108103,0.619185,...,0.063831,0.014212,0.017740,2.824694,0.466309,0.084578,0.063965,0.000000,1.716724,0.069330
5,0.527563,-0.077654,-0.279610,0.685883,1.937570,0.880839,-0.923192,-0.927232,0.666617,1.038546,...,0.040730,0.012691,0.014759,6.808415,0.375000,0.053114,0.041504,0.000000,2.193303,0.044861
10,3.702245,-0.291193,2.196742,-0.234449,1.367364,0.998411,1.770694,1.604566,0.521217,1.982386,...,0.074358,0.017952,0.013921,21.434212,0.452148,0.077515,0.071777,0.000000,3.542325,0.040800
20,-0.193837,-0.198527,0.201546,0.258556,0.775204,0.084794,-0.289294,-0.816410,0.043851,-0.804761,...,0.095003,0.022492,0.021355,16.669037,0.469727,0.047225,0.040039,0.000977,3.189831,0.030993
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155316,-0.490129,0.463834,2.321970,-0.084352,1.662914,2.115189,-0.237794,5.695442,0.830353,1.951819,...,0.128410,0.022547,0.019816,4.448255,0.172852,0.028773,0.028320,0.003906,0.955388,0.012385
155317,-0.461559,-0.229601,-0.496632,-0.422033,0.130612,-0.263825,-0.628103,-0.082687,-0.229483,-0.492753,...,0.132964,0.023548,0.026527,3.270612,0.196289,0.031116,0.027832,0.002441,1.283060,0.019059
155318,0.552473,-0.110498,-0.532014,0.263131,-0.224011,-0.530972,1.713526,1.418444,1.325197,0.120333,...,0.108324,0.017540,0.020471,2.356727,0.212891,0.038450,0.037109,0.003418,0.828569,0.017904
155319,-0.176901,0.187208,-0.050664,0.368843,0.066005,-0.857354,-0.780860,0.626281,-0.630938,-0.787229,...,0.088311,0.018328,0.017936,6.188604,0.167480,0.041480,0.038086,0.004883,1.818740,0.020133


In [70]:
mfcc_df = features['mfcc']

## III. Classification with simple feed-forward MLP

In [71]:
import torch
from sklearn.preprocessing import LabelEncoder, StandardScaler

mfcc_df['genre_top'] = y
mfcc_df.dropna(inplace=True)

le = LabelEncoder()
y_encoded = le.fit_transform(mfcc_df['genre_top'])

mfcc_df.drop(columns=['genre_top'],inplace=True)

# mfcc_df shape: (num_tracks, num_columns)
scaler = StandardScaler()
scaled = scaler.fit_transform(mfcc_df.values)

num_tracks = len(mfcc_df)
num_columns = mfcc_df.shape[1]

# Choose number of statistics manually (or detect)
num_statistics = 7
num_numbers = num_columns // num_statistics  # compute dynamically

tensor = mfcc_df.values.reshape(num_tracks, num_statistics, num_numbers)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mfcc_df['genre_top'] = y
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mfcc_df.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mfcc_df.drop(columns=['genre_top'],inplace=True)


In [72]:
X = torch.tensor(tensor,dtype=torch.float32)
X = X.unsqueeze(1)

y = torch.tensor(y_encoded, dtype=torch.long)

print(X.shape,y.shape)
print("Classes:", le.classes_)

torch.Size([49598, 1, 7, 20]) torch.Size([49598])
Classes: ['Blues' 'Classical' 'Country' 'Easy Listening' 'Electronic'
 'Experimental' 'Folk' 'Hip-Hop' 'Instrumental' 'International' 'Jazz'
 'Old-Time / Historic' 'Pop' 'Rock' 'Soul-RnB' 'Spoken']


In [73]:
from torch.utils.data import TensorDataset, DataLoader, random_split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

all_train = TensorDataset(X_train,y_train)

num_train = int(0.8 * len(all_train))

trainset, valset = torch.utils.data.random_split(all_train, [num_train, len(all_train) - num_train])
trainloader = DataLoader(trainset, batch_size=32, shuffle=True)
valloader = DataLoader(valset, batch_size=32, shuffle=True)

In [74]:
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

In [75]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class ConvNet(nn.Module):
    def __init__(self, num_classes=16):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm2d(32)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm2d(64)
        self.pool = nn.MaxPool2d(2)
        self.dropout = nn.Dropout(0.3)
        
        # flattened size after conv+pool
        self.fc = nn.Linear(64*3*10, 128)
        self.out = nn.Linear(128, num_classes)
        
    def forward(self, x):
        x = F.relu(self.bn1(self.conv1(x)))
        x = F.relu(self.bn2(self.conv2(x)))
        x = self.pool(x)
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc(x))
        x = self.dropout(x)
        x = self.out(x)
        return x



In [76]:
# Training consists of gradient steps over mini batch of data
def train(model, trainloader, criterion, optimizer, epoch, num_epochs):
    # We enter train mode. This is useless for the linear model
    # but is important for layers such as dropout, batchnorm, ...
    model.train()

    loop = tqdm(trainloader)
    loop.set_description(f'Training Epoch [{epoch + 1}/{num_epochs}]')

    # We iterate over the mini batches of our data
    for inputs, targets in loop:

        # Erase any previously stored gradient
        optimizer.zero_grad()


        outputs = model(inputs) # Forwards stage (prediction with current weights)
        loss = criterion(outputs, targets) # loss evaluation

        loss.backward() # Back propagation (evaluate gradients)


        # Making gradient step on the batch (this function takes care of the gradient step for us)
        optimizer.step()

def validation(model, valloader, loss):
    # Do not compute gradient, since we do not need it for validation step
    with torch.no_grad():
        # We enter evaluation mode.
        model.eval()

        total = 0 # keep track of currently used samples
        running_loss = 0.0 # accumulated loss without averagind
        accuracy = 0.0 # accumulated accuracy without averagind (number of correct predictions)

        loop = tqdm(valloader) # This is for the progress bar
        loop.set_description('Validation in progress')


        # We again iterate over the batches of validation data. batch_size does not play any role here
        for inputs, targets in loop:
            # Run samples through our net
            outputs = model(inputs)

            # Total number of used samples
            total += inputs.shape[0]

            # Multiply loss by the batch size to erase averagind on the batch
            running_loss += inputs.shape[0] * loss(outputs, targets).item()

            # how many correct predictions
            accuracy += (outputs.argmax(dim=1) == targets).sum().item()

            # set nice progress meassage
            loop.set_postfix(val_loss=(running_loss / total), val_acc=(accuracy / total))       
        
        return running_loss / total, accuracy / total

In [80]:
# Net + training parameters
num_epochs = 100 # how many passes over the whole train data
lr = 0.003 # learning rate
momentum = 0.9 # momentum

net = ConvNet() # Our neural net
criterion = nn.CrossEntropyLoss() # Loss function to be optimized
optimizer = optim.Adam(net.parameters(), lr=lr) # Optimization algorithm
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, patience=2)

In [81]:
# num_epochs indicates the number of passes over the data

patience = 5
best_loss = float('inf')
wait = 0
best_model_state = None

for epoch in range(num_epochs):

    # makes one pass over the train data and updates weights
    train(net, trainloader, criterion, optimizer, epoch, num_epochs)

    # makes one pass over validation data and provides validation statistics
    val_loss, val_acc = validation(net, valloader, criterion)

    scheduler.step(val_loss)

    # check si amélioration
    if val_loss < best_loss:
        best_loss = val_loss
        wait = 0
        best_model_state = net.state_dict()   # on sauvegarde le meilleur modèle
    else:
        wait += 1

    # early stopping déclenché ?
    if wait >= patience:
        print(f"\nEarly stopping triggered at epoch {epoch+1}")
        break

# restaurer meilleur modèle
net.load_state_dict(best_model_state)
print("✔ Best model restored")

Training Epoch [1/100]: 100%|██████████| 992/992 [00:12<00:00, 81.99it/s] 
Validation in progress: 100%|██████████| 248/248 [00:01<00:00, 191.14it/s, val_acc=0.533, val_loss=1.45]
Training Epoch [2/100]: 100%|██████████| 992/992 [00:10<00:00, 91.51it/s]
Validation in progress: 100%|██████████| 248/248 [00:01<00:00, 178.34it/s, val_acc=0.559, val_loss=1.39]
Training Epoch [3/100]: 100%|██████████| 992/992 [00:10<00:00, 92.88it/s]
Validation in progress: 100%|██████████| 248/248 [00:01<00:00, 176.12it/s, val_acc=0.561, val_loss=1.37]
Training Epoch [4/100]: 100%|██████████| 992/992 [00:10<00:00, 92.30it/s]
Validation in progress: 100%|██████████| 248/248 [00:01<00:00, 187.31it/s, val_acc=0.569, val_loss=1.34]
Training Epoch [5/100]: 100%|██████████| 992/992 [00:10<00:00, 92.97it/s]
Validation in progress: 100%|██████████| 248/248 [00:01<00:00, 186.22it/s, val_acc=0.583, val_loss=1.31]
Training Epoch [6/100]: 100%|██████████| 992/992 [00:10<00:00, 91.82it/s]
Validation in progress: 100%|█


Early stopping triggered at epoch 24
✔ Best model restored





In [82]:
testset = TensorDataset(X_test,y_test)

testloader = torch.utils.data.DataLoader(testset, batch_size=32, shuffle=True)

test_loss, test_acc = validation(net, testloader, criterion)
print(f'Test accuracy: {test_acc} | Test loss: {test_loss}')

Validation in progress: 100%|██████████| 310/310 [00:02<00:00, 132.40it/s, val_acc=0.618, val_loss=1.22]

Test accuracy: 0.6180443548387097 | Test loss: 1.222052727976153



