In [26]:
import os
import pandas as pd

# Dataset root; the features/labels CSV is read from inside this folder.
audio_root_folder = './archive/data'
labels_csv = os.path.join(audio_root_folder, 'features_30_sec.csv')

# Read the table (first row is the header) and remove the entry for
# 'jazz.00054.wav' without mutating the frame in place.
# NOTE(review): presumably that recording is excluded because it cannot be
# decoded — confirm and record the reason here.
df = pd.read_csv(labels_csv, header=0)
is_excluded = df['filename'] == 'jazz.00054.wav'
df = df.drop(index=df.loc[is_excluded].index)
df.head(5)

Unnamed: 0,filename,length,chroma_stft_mean,chroma_stft_var,rms_mean,rms_var,spectral_centroid_mean,spectral_centroid_var,spectral_bandwidth_mean,spectral_bandwidth_var,...,mfcc16_var,mfcc17_mean,mfcc17_var,mfcc18_mean,mfcc18_var,mfcc19_mean,mfcc19_var,mfcc20_mean,mfcc20_var,label
0,blues.00000.wav,661794,0.350088,0.088757,0.130228,0.002827,1784.16585,129774.064525,2002.44906,85882.761315,...,52.42091,-1.690215,36.524071,-0.408979,41.597103,-2.303523,55.062923,1.221291,46.936035,blues
1,blues.00001.wav,661794,0.340914,0.09498,0.095948,0.002373,1530.176679,375850.073649,2039.036516,213843.755497,...,55.356403,-0.731125,60.314529,0.295073,48.120598,-0.283518,51.10619,0.531217,45.786282,blues
2,blues.00002.wav,661794,0.363637,0.085275,0.17557,0.002746,1552.811865,156467.643368,1747.702312,76254.192257,...,40.598766,-7.729093,47.639427,-1.816407,52.382141,-3.43972,46.63966,-2.231258,30.573025,blues
3,blues.00003.wav,661794,0.404785,0.093999,0.141093,0.006346,1070.106615,184355.942417,1596.412872,166441.494769,...,44.427753,-3.319597,50.206673,0.636965,37.31913,-0.619121,37.259739,-3.407448,31.949339,blues
4,blues.00004.wav,661794,0.308526,0.087841,0.091529,0.002303,1835.004266,343399.939274,1748.172116,88445.209036,...,86.099236,-5.454034,75.269707,-0.916874,53.613918,-4.404827,62.910812,-11.703234,55.19516,blues


In [27]:
import numpy as np
import torch

# Single seed reused everywhere; RANDOM_STATE is the alias handed to the
# `random_state=` arguments further down the notebook.
RANDOM_SEED = 42
RANDOM_STATE = RANDOM_SEED

# Seed NumPy's global generator, then PyTorch's default generator.
# The trailing semicolon keeps the returned Generator out of the cell output.
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED);

In [28]:
from sklearn.model_selection import train_test_split

# Inputs (audio file names) and targets (genre labels) from the features table.
filenames = df['filename']
labels = df['label']

# First split: 90 % training, 10 % held out.
# The held-out part is tiny compared with the number of classes, so the split
# is stratified on the label to keep every class represented in proportion
# (an unstratified draw can leave a class with very few held-out samples).
files_train, files_val_test, labels_train, labels_val_test = train_test_split(
    filenames,
    labels,
    test_size=0.1,
    random_state=RANDOM_STATE,
    stratify=labels,
)

# Second split: halve the held-out part into validation and test sets,
# again stratified so both evaluation sets have matching class balance.
files_val, files_test, labels_val, labels_test = train_test_split(
    files_val_test,
    labels_val_test,
    test_size=0.5,
    random_state=RANDOM_STATE,
    stratify=labels_val_test,
)

In [29]:
from audio_toolbox.dataset import AudioOTFDataset

# Settings shared by all three splits.
num_frames = 1290
label_encoding = 'Label'
scaling_strategy = None

# Per-split inputs: display name, file names and labels.
split_specs = {
    'train': ('Training set', files_train, labels_train),
    'val': ('Validation set', files_val, labels_val),
    'test': ('Testing set', files_test, labels_test),
}

# Build one AudioOTFDataset per split (train, then val, then test) with
# identical options; only the name, files and labels differ.
# NOTE(review): flatten_features=True is used here, while the model cell
# further down configures a CNN with 6 input channels — confirm which sample
# layout the model is supposed to receive.
datasets = {
    split: AudioOTFDataset(
        root_folder=audio_root_folder,
        filenames=split_files.tolist(),
        labels=split_labels.tolist(),
        num_frames=num_frames,
        scaling_strategy=scaling_strategy,
        name=display_name,
        label_encoding=label_encoding,
        flatten_features=True,
        shuffle=True,
        random_state=RANDOM_STATE
    )
    for split, (display_name, split_files, split_labels) in split_specs.items()
}

Loading audios for Training set: 100%|██████████| 899/899 [00:05<00:00, 163.80it/s]
Processing for Training set: 100%|██████████| 899/899 [02:12<00:00,  6.78it/s]
Loading audios for Validation set: 100%|██████████| 50/50 [00:00<00:00, 147.91it/s]
Processing for Validation set: 100%|██████████| 50/50 [00:08<00:00,  5.66it/s]
Loading audios for Testing set: 100%|██████████| 50/50 [00:00<00:00, 143.39it/s]
Processing for Testing set: 100%|██████████| 50/50 [00:06<00:00,  7.18it/s]


In [30]:
# Sample counts per split, in train / val / test order; shown as the cell output.
n_train, n_val, n_test = (len(datasets[split]) for split in ('train', 'val', 'test'))
n_train, n_val, n_test

(899, 50, 50)

In [31]:
# Print the training split's repr (its summary text).
print(f"{datasets['train']!r}")

Root folder: ./archive/data
Number of samples: 899
Shape of one sample: torch.Size([92880])
Number of classes: 10
Features:
	n_mfcc: 12
	n_chroma: 12
	n_derivatives: 2
Scaling strategy: None


In [32]:
# Print the validation split's repr (its summary text).
print(f"{datasets['val']!r}")

Root folder: ./archive/data
Number of samples: 50
Shape of one sample: torch.Size([92880])
Number of classes: 10
Features:
	n_mfcc: 12
	n_chroma: 12
	n_derivatives: 2
Scaling strategy: None


In [8]:
# Print the test split's repr (its summary text).
# NOTE(review): the recorded output of this cell comes from an earlier run
# (execution count 8) than the neighbouring cells — re-run to refresh it.
print(f"{datasets['test']!r}")

Root folder: ./archive/data
Number of samples: 50
Shape of one sample: torch.Size([6, 12, 1290])
Number of classes: 10
Features:
	n_mfcc: 12
	n_chroma: 12
	n_derivatives: 2
Scaling strategy: None


In [34]:
import os

# Persist the processed tensors so later sessions can skip feature extraction.
os.makedirs('./processed_data/dl_data/mlp_data', exist_ok=True)

# (split, attribute to save, destination file, progress message) — processed
# strictly in this order, one save followed by one message.
save_plan = [
    ('train', 'X', 'processed_data/dl_data/mlp_data/dl_modeling_train_data.pt', 'Train data saved'),
    ('train', 'labels', 'processed_data/dl_data/mlp_data/dl_modeling_train_label.pt', 'Train label saved'),
    ('val', 'X', 'processed_data/dl_data/mlp_data/dl_modeling_val_data.pt', 'Val data saved'),
    ('val', 'labels', 'processed_data/dl_data/mlp_data/dl_modeling_val_label.pt', 'Val label saved'),
    ('test', 'X', 'processed_data/dl_data/mlp_data/dl_modeling_test_data.pt', 'Test data saved'),
    ('test', 'labels', 'processed_data/dl_data/mlp_data/dl_modeling_test_label.pt', 'Test label saved'),
]

for split, attribute, destination, message in save_plan:
    torch.save(getattr(datasets[split], attribute), destination)
    print(message)

Train data saved
Train label saved
Val data saved
Val label saved
Test data saved
Test label saved


In [35]:
import torch

# Reload the feature tensors written by the save cell (train, val, test).
train_data, val_data, test_data = (
    torch.load(path)
    for path in (
        'processed_data/dl_data/mlp_data/dl_modeling_train_data.pt',
        'processed_data/dl_data/mlp_data/dl_modeling_val_data.pt',
        'processed_data/dl_data/mlp_data/dl_modeling_test_data.pt',
    )
)

# Reload the matching label tensors in the same split order.
train_label, val_label, test_label = (
    torch.load(path)
    for path in (
        'processed_data/dl_data/mlp_data/dl_modeling_train_label.pt',
        'processed_data/dl_data/mlp_data/dl_modeling_val_label.pt',
        'processed_data/dl_data/mlp_data/dl_modeling_test_label.pt',
    )
)

# Display every tensor's shape as a quick sanity check of the round trip.
tuple(
    tensor.shape
    for tensor in (train_data, train_label, val_data, val_label, test_data, test_label)
)

(torch.Size([899, 92880]),
 torch.Size([899]),
 torch.Size([50, 92880]),
 torch.Size([50]),
 torch.Size([50, 92880]),
 torch.Size([50]))

In [11]:
from torch.utils.data import TensorDataset

# Pair each split's features with its labels.
# Note: this rebinds `datasets`, which earlier held the AudioOTFDataset objects.
datasets = {
    split: TensorDataset(features, targets)
    for split, (features, targets) in (
        ('train', (train_data, train_label)),
        ('val', (val_data, val_label)),
        ('test', (test_data, test_label)),
    )
}

# Show the training tensor's shape.
# NOTE(review): the recorded output (899, 6, 12, 1290) differs from the shape
# recorded by the load cell (899, 92880); the cells were run out of order —
# re-run the notebook top to bottom to confirm the actual layout.
train_data.shape

torch.Size([899, 6, 12, 1290])

In [22]:
from audio_toolbox.models import CNNModel

# Basic dimensions: size of the tensor's second axis, number of target
# classes, and mini-batch size.
# NOTE(review): none of these three values is used in this cell — confirm they
# are consumed elsewhere or remove them.
input_size = train_data.shape[1]
output_size = 10
batch_size = 16

# Keyword arguments forwarded verbatim to CNNModel.
# NOTE(review): CNNModel is not visible here; verify that linear_in_dims[0]
# (21) matches the feature size produced by the last convolutional stage and
# that the model returns (batch, n_classes) logits.
model_config = dict(
    num_conv_layers=3,
    in_channels=[6, 32, 64],
    out_channels=[32, 64, 128],
    channel_widths=[8, 8, 8],
    num_post_cnn_fc_layers=2,
    linear_in_dims=[21, 64, 32],
    linear_out_dims=[64, 32, 10],
)
model = CNNModel(**model_config)

In [23]:
import torch.optim as optim
import torch.optim.lr_scheduler as lr_scheduler
from torch import nn

# Cross-entropy objective for the classification task.
loss_fn = nn.CrossEntropyLoss()

# Adam over all model parameters.
learning_rate = 1e-4  # Adjust the learning rate as needed
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)

# Multiply the learning rate by 0.5 after every 5 scheduler steps.
scheduler = lr_scheduler.StepLR(optimizer=optimizer, step_size=5, gamma=0.5)

In [24]:
from audio_toolbox.trainer import ModelTrainer

# Hand the split datasets, model, loss, optimizer and LR scheduler to the
# project's ModelTrainer (arguments are positional; order matters).
trainer = ModelTrainer(datasets, model, loss_fn, optimizer, scheduler)

In [25]:
# Options forwarded as keyword arguments to trainer.train().
trainer_config = dict(save=False, num_epochs=100)

# NOTE(review): the recorded run of this cell ends in
# "RuntimeError: only batches of spatial targets supported (3D tensors)".
# That message typically means the predictions reaching the loss have more
# than two dimensions while the targets are 1-D — check the model's output
# shape before re-running.
trainer.train(**trainer_config)

RuntimeError: only batches of spatial targets supported (3D tensors) but got targets of dimension: 1