Create dataset (Or directly load from saved numpy arrays)

In [1]:
import os
import pandas as pd

audio_root_folder = './archive/data'
labels_csv = os.path.join(audio_root_folder, 'features_30_sec.csv')
df = pd.read_csv(labels_csv, header=0)
df.drop(df.loc[df.filename == 'jazz.00054.wav'].index, inplace=True)
df.head(5)

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Unnamed: 0,filename,length,chroma_stft_mean,chroma_stft_var,rms_mean,rms_var,spectral_centroid_mean,spectral_centroid_var,spectral_bandwidth_mean,spectral_bandwidth_var,...,mfcc16_var,mfcc17_mean,mfcc17_var,mfcc18_mean,mfcc18_var,mfcc19_mean,mfcc19_var,mfcc20_mean,mfcc20_var,label
0,blues.00000.wav,661794,0.350088,0.088757,0.130228,0.002827,1784.16585,129774.064525,2002.44906,85882.761315,...,52.42091,-1.690215,36.524071,-0.408979,41.597103,-2.303523,55.062923,1.221291,46.936035,blues
1,blues.00001.wav,661794,0.340914,0.09498,0.095948,0.002373,1530.176679,375850.073649,2039.036516,213843.755497,...,55.356403,-0.731125,60.314529,0.295073,48.120598,-0.283518,51.10619,0.531217,45.786282,blues
2,blues.00002.wav,661794,0.363637,0.085275,0.17557,0.002746,1552.811865,156467.643368,1747.702312,76254.192257,...,40.598766,-7.729093,47.639427,-1.816407,52.382141,-3.43972,46.63966,-2.231258,30.573025,blues
3,blues.00003.wav,661794,0.404785,0.093999,0.141093,0.006346,1070.106615,184355.942417,1596.412872,166441.494769,...,44.427753,-3.319597,50.206673,0.636965,37.31913,-0.619121,37.259739,-3.407448,31.949339,blues
4,blues.00004.wav,661794,0.308526,0.087841,0.091529,0.002303,1835.004266,343399.939274,1748.172116,88445.209036,...,86.099236,-5.454034,75.269707,-0.916874,53.613918,-4.404827,62.910812,-11.703234,55.19516,blues


In [2]:
import numpy as np
import torch

RANDOM_SEED = RANDOM_STATE = 42

np.random.seed(RANDOM_SEED)
torch.random.manual_seed(RANDOM_SEED);

In [3]:
from sklearn.model_selection import train_test_split

filenames = df['filename']
labels = df['label']

files_train, files_test, labels_train, labels_test = train_test_split(
    filenames, labels, test_size=0.1, random_state=RANDOM_STATE
)

In [4]:
from audio_toolbox.dataset import AudioOTFDataset

num_frames = 1290
label_encoding = 'Label'
scaling_strategy = None

datasets = {
    'train':
        AudioOTFDataset(
            root_folder=audio_root_folder,
            filenames=files_train.tolist(),
            labels=labels_train.tolist(),
            num_frames=num_frames,
            scaling_strategy=scaling_strategy,
            name='Training set',
            label_encoding=label_encoding
        ),
    'test':
        AudioOTFDataset(
            root_folder=audio_root_folder,
            filenames=files_test.tolist(),
            labels=labels_test.tolist(),
            num_frames=num_frames,
            scaling_strategy=scaling_strategy,
            name='Testing set',
            label_encoding=label_encoding
        )
}

Loading audios for Training set: 100%|██████████| 899/899 [00:06<00:00, 129.31it/s]
Processing for Training set: 100%|██████████| 899/899 [03:09<00:00,  4.73it/s]
Loading audios for Testing set: 100%|██████████| 100/100 [00:00<00:00, 146.75it/s]
Processing for Testing set: 100%|██████████| 100/100 [00:15<00:00,  6.47it/s]


In [5]:
n_train, n_test = len(datasets['train']), len(datasets['test'])
n_train, n_test

(899, 100)

In [6]:
print(repr(datasets['train']))

Root folder: ./archive/data
Number of samples: 899
Shape of one sample: torch.Size([72, 1290])
Number of classes: 10
Features:
	n_mfcc: 12
	n_chroma: 12
	n_derivatives: 2
Scaling strategy: None


In [7]:
print(repr(datasets['test']))

Root folder: ./archive/data
Number of samples: 100
Shape of one sample: torch.Size([72, 1290])
Number of classes: 10
Features:
	n_mfcc: 12
	n_chroma: 12
	n_derivatives: 2
Scaling strategy: None


In [14]:
X_train = torch.stack([datasets['train'][i][0] for i in range(len(datasets['train']))]).cpu().numpy()
X_test = torch.stack([datasets['test'][i][0] for i in range(len(datasets['test']))]).cpu().numpy()
X_train.shape, X_test.shape

((899, 72, 1290), (100, 72, 1290))

In [15]:
y_train = datasets['train'].labels.cpu().numpy()
y_test = datasets['test'].labels.cpu().numpy()
y_train.shape, y_test.shape

((899,), (100,))

In [16]:
X_train_flat = X_train.reshape((X_train.shape[0], -1))
# y_train_labels = np.argmax(y_train, axis=1)

X_test_flat = X_test.reshape((X_test.shape[0], -1))
# y_test_labels = np.argmax(y_test, axis=1)

In [17]:
print("Flattened training samples and labels:", X_train_flat.shape, y_train.shape)
print("Flattened test samples and labels:", X_test_flat.shape, y_test.shape)

Flattened training samples and labels: (899, 92880) (899,)
Flattened test samples and labels: (100, 92880) (100,)


In [20]:
import os

# Save the processed arrays to save time
os.makedirs('./processed_data/ml_data', exist_ok=True)
np.save('processed_data/ml_modeling_train_data.npy', X_train_flat)
print('Train data saved')
np.save('processed_data/ml_modeling_train_label.npy', y_train)
print('Train label saved')
np.save('processed_data/ml_modeling_test_data.npy', X_test_flat)
print('Test data saved')
np.save('processed_data/ml_modeling_test_label.npy', y_test)
print('Test label saved')

Train data saved
Train label saved
Test data saved
Test label saved


Directly load saved numpy arrays

In [21]:
import numpy as np

X_train_flat = np.load('processed_data/ml_modeling_train_data.npy')
X_test_flat = np.load('processed_data/ml_modeling_test_data.npy')
y_train = np.load('processed_data/ml_modeling_train_label.npy')
y_test = np.load('processed_data/ml_modeling_test_label.npy')

print("Flattened training samples and labels:", X_train_flat.shape, y_train.shape)
print("Flattened test samples and labels:", X_test_flat.shape, y_test.shape)

Flattened training samples and labels: (899, 92880) (899,)
Flattened test samples and labels: (100, 92880) (100,)


In [22]:
import numpy as np
import torch

RANDOM_SEED = RANDOM_STATE = 42

np.random.seed(RANDOM_SEED)
torch.random.manual_seed(RANDOM_SEED);

Do PCA on flattened features

In [23]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Initialize the scaler and PCA
scaler = StandardScaler()
pca = PCA(n_components=0.9)  # Keep 90% of the variance

# Fit the scaler on the training set and transform all sets
X_train_scaled = scaler.fit_transform(X_train_flat)
X_test_scaled = scaler.transform(X_test_flat)

reduced_X_train_flat = pca.fit_transform(X_train_scaled)
reduced_X_test_flat = pca.transform(X_test_scaled)

In [24]:
print(f"Training samples: {X_train_flat.shape} -> {reduced_X_train_flat.shape}")
print(f"Test samples: {X_test_flat.shape} -> {reduced_X_test_flat.shape}")

Training samples: (899, 92880) -> (899, 662)
Test samples: (100, 92880) -> (100, 662)


In [25]:
from sklearn.linear_model import LogisticRegression
from audio_toolbox.metrics import calculate_acc, precision_recall

logistic_model = LogisticRegression(max_iter=1000, C=1e-3, random_state=RANDOM_STATE)
logistic_model.fit(reduced_X_train_flat, y_train)
print('Model fitting finished')

for x, y, split in zip([reduced_X_train_flat, reduced_X_test_flat],
                [y_train, y_test],
                ['Train', 'Test']):
    acc, correct, incorrect = calculate_acc(logistic_model, x, y)
    print(f'{split} accuracy: {acc:.4f}%, {len(incorrect)} mismatches out of {len(incorrect) + len(correct)} samples')
    conf_mat, precision, recall, f1 = precision_recall(logistic_model, x, y, return_each_class=False)
    print(f'(Averaged) {split} precision: {precision:.4f}, recall: {recall:.4f}, f1 score: {f1:.4f}')

Model fitting finished
Train accuracy: 99.8888%, 1 mismatches out of 899 samples
(Averaged) Train precision: 0.9989, recall: 0.9989, f1 score: 0.9989
Test accuracy: 52.0000%, 48 mismatches out of 100 samples
(Averaged) Test precision: 0.5200, recall: 0.5295, f1 score: 0.4889


In [26]:
from sklearn.svm import SVC

svm_classifier = SVC(C=1e-3, random_state=RANDOM_STATE)
svm_classifier.fit(reduced_X_train_flat, y_train)

print('Model fitting finished')

for x, y, split in zip([reduced_X_train_flat, reduced_X_test_flat],
                [y_train, y_test],
                ['Train', 'Test']):
    acc, correct, incorrect = calculate_acc(svm_classifier, x, y)
    print(f'{split} accuracy: {acc:.4f}%, {len(incorrect)} mismatches out of {len(incorrect) + len(correct)} samples')
    conf_mat, precision, recall, f1 = precision_recall(svm_classifier, x, y, return_each_class=False)
    print(f'(Averaged) {split} precision: {precision:.4f}, recall: {recall:.4f}, f1 score: {f1:.4f}')

Model fitting finished
Train accuracy: 10.5673%, 804 mismatches out of 899 samples
(Averaged) Train precision: 0.1057, recall: 0.1000, f1 score: 0.0191
Test accuracy: 5.0000%, 95 mismatches out of 100 samples
(Averaged) Test precision: 0.0500, recall: 0.1000, f1 score: 0.0095


In [27]:
ovo_svm = SVC(decision_function_shape='ovo', random_state=RANDOM_STATE)
ovo_svm.fit(reduced_X_train_flat, y_train)

print('Model fitting finished')

for x, y, split in zip([reduced_X_train_flat, reduced_X_test_flat],
                [y_train, y_test],
                ['Train', 'Test']):
    acc, correct, incorrect = calculate_acc(ovo_svm, x, y)
    print(f'{split} accuracy: {acc:.4f}%, {len(incorrect)} mismatches out of {len(incorrect) + len(correct)} samples')
    conf_mat, precision, recall, f1 = precision_recall(ovo_svm, x, y, return_each_class=False)
    print(f'(Averaged) {split} precision: {precision:.4f}, recall: {recall:.4f}, f1 score: {f1:.4f}')

Model fitting finished
Train accuracy: 98.5539%, 13 mismatches out of 899 samples
(Averaged) Train precision: 0.9861, recall: 0.9850, f1 score: 0.9854
Test accuracy: 32.0000%, 68 mismatches out of 100 samples
(Averaged) Test precision: 0.4814, recall: 0.3291, f1 score: 0.2068


In [28]:
from sklearn.ensemble import RandomForestClassifier

rand_forest_classifier = RandomForestClassifier(n_estimators=100,
                                                max_depth=4,
                                                random_state=RANDOM_STATE)
rand_forest_classifier.fit(reduced_X_train_flat, y_train)

print('Model fitting finished')

for x, y, split in zip([reduced_X_train_flat, reduced_X_test_flat],
                [y_train, y_test],
                ['Train', 'Test']):
    acc, correct, incorrect = calculate_acc(rand_forest_classifier, x, y)
    print(f'{split} accuracy: {acc:.4f}%, {len(incorrect)} mismatches out of {len(incorrect) + len(correct)} samples')
    conf_mat, precision, recall, f1 = precision_recall(rand_forest_classifier, x, y, return_each_class=False)
    print(f'(Averaged) {split} precision: {precision:.4f}, recall: {recall:.4f}, f1 score: {f1:.4f}')

Model fitting finished
Train accuracy: 82.4249%, 158 mismatches out of 899 samples
(Averaged) Train precision: 0.8579, recall: 0.8213, f1 score: 0.8194
Test accuracy: 25.0000%, 75 mismatches out of 100 samples
(Averaged) Test precision: 0.4264, recall: 0.2888, f1 score: 0.1812


In [29]:
from sklearn.naive_bayes import GaussianNB

gnb_classifier = GaussianNB()
gnb_classifier.fit(X_train_flat, y_train)

print('Model fitting finished')

for x, y, split in zip([X_train_flat, X_test_flat],
                [y_train, y_test],
                ['Train', 'Test']):
    acc, correct, incorrect = calculate_acc(gnb_classifier, x, y)
    print(f'{split} accuracy: {acc:.4f}%, {len(incorrect)} mismatches out of {len(incorrect) + len(correct)} samples')
    conf_mat, precision, recall, f1 = precision_recall(gnb_classifier, x, y, return_each_class=False)
    print(f'(Averaged) {split} precision: {precision:.4f}, recall: {recall:.4f}, f1 score: {f1:.4f}')

Model fitting finished
Train accuracy: 84.0934%, 143 mismatches out of 899 samples
(Averaged) Train precision: 0.8586, recall: 0.8423, f1 score: 0.8411
Test accuracy: 55.0000%, 45 mismatches out of 100 samples
(Averaged) Test precision: 0.4985, recall: 0.5341, f1 score: 0.4992


In [30]:
from sklearn.naive_bayes import GaussianNB

gnb_classifier = GaussianNB()
gnb_classifier.fit(reduced_X_train_flat, y_train)

print('Model fitting finished')

for x, y, split in zip([reduced_X_train_flat, reduced_X_test_flat],
                [y_train, y_test],
                ['Train', 'Test']):
    acc, correct, incorrect = calculate_acc(gnb_classifier, x, y)
    print(f'{split} accuracy: {acc:.4f}%, {len(incorrect)} mismatches out of {len(incorrect) + len(correct)} samples')
    conf_mat, precision, recall, f1 = precision_recall(gnb_classifier, x, y, return_each_class=False)
    print(f'(Averaged) {split} precision: {precision:.4f}, recall: {recall:.4f}, f1 score: {f1:.4f}')

Model fitting finished
Train accuracy: 58.2870%, 375 mismatches out of 899 samples
(Averaged) Train precision: 0.6660, recall: 0.5773, f1 score: 0.5611
Test accuracy: 9.0000%, 91 mismatches out of 100 samples
(Averaged) Test precision: 0.4126, recall: 0.1237, f1 score: 0.0539


In [31]:
from sklearn.ensemble import GradientBoostingClassifier

xgboost_classifier = GradientBoostingClassifier(subsample=0.8, max_depth=2, random_state=RANDOM_STATE)
xgboost_classifier.fit(reduced_X_train_flat, y_train)

print('Model fitting finished')

for x, y, split in zip([reduced_X_train_flat, reduced_X_test_flat],
                [y_train, y_test],
                ['Train', 'Test']):
    acc, correct, incorrect = calculate_acc(xgboost_classifier, x, y)
    print(f'{split} accuracy: {acc:.4f}%, {len(incorrect)} mismatches out of {len(incorrect) + len(correct)} samples')
    conf_mat, precision, recall, f1 = precision_recall(gnb_classifier, x, y, return_each_class=False)
    print(f'(Averaged) {split} precision: {precision:.4f}, recall: {recall:.4f}, f1 score: {f1:.4f}')

Model fitting finished
Train accuracy: 99.6663%, 3 mismatches out of 899 samples
(Averaged) Train precision: 0.6660, recall: 0.5773, f1 score: 0.5611
Test accuracy: 44.0000%, 56 mismatches out of 100 samples
(Averaged) Test precision: 0.4126, recall: 0.1237, f1 score: 0.0539
