In [14]:
import torch
import torch.nn as nn
import librosa
import dagshub
import mlflow
import subprocess
import json
import os
import numpy as np
from math import ceil
from pathlib import Path
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

In [15]:
dagshub.init(repo_owner="stephenjera", repo_name="Genre-Classification", mlflow=True)

In [16]:
SAMPLE_RATE = 22050
DURATION = 30  # length of audio files measured in seconds
NUM_SEGMENTS = 1
SAMPLES_PER_TRACK = SAMPLE_RATE * DURATION

In [17]:
data_directory = Path.cwd().parent / "data"
genres_dir = data_directory / "genres"

In [18]:
def save_mfcc(
    dataset_path: str | Path,
    json_path: str,
    samples_per_track: int,
    n_mfcc=13,
    n_fft=2048,
    hop_length=512,
    num_segments=5,
):
    """Creates a JSON file of the MFCCs for the dataset
    :param
    ----------
     dataset_path: path to the dataset folder
     json_path: name of JSON file to be created
     n_mfcc: number of MFCC coefficients to create
     n_fft:
     hop_length:
     num_segments:

    """
    # Create a dictionary to map semantic labels to numerical labels
    semantic_to_numeric = {}
    # dictionary to store data
    data = {
        "mappings": {},
        "mfcc": [],
        "labels": [],
    }
    num_samples_per_segment = int(samples_per_track / num_segments)
    expected_num_mfcc_vectors_per_segment = ceil(
        num_samples_per_segment / hop_length
    )  # round up always

    # Loop through all the data
    for i, (dirpath, _, filenames) in enumerate(os.walk(dataset_path)):
        # dirpath = current folder path
        # dirnames = subfolders in dirpath
        # filenames = all files in dirpath

        # ensure that we're not at the root level (Audio folder)
        if dirpath != str(dataset_path):
            # save the semantic label
            dirpath_components = dirpath.split(os.sep)
            semantic_label = dirpath_components[-1]
            # Subtract 1 to skip the root folder
            semantic_to_numeric[semantic_label] = i - 1
            print(f"\nProcessing {semantic_label}")

            # process files
            for filename in filenames:
                # load audio file
                file_path = Path(dirpath, filename)  # os.path.join(dirpath, filename)
                try:
                    signal, sr = librosa.load(
                        file_path  # , sr=SAMPLE_RATE, duration=DURATION
                    )
                except Exception as e:
                    print(f"Error loading {file_path}: {e}")
                    continue
                # process segments extracting mfcc and storing data
                for s in range(num_segments):
                    start_sample = num_samples_per_segment * s
                    finish_sample = start_sample + num_samples_per_segment

                    mfcc = librosa.feature.mfcc(
                        y=signal[start_sample:finish_sample],
                        sr=sr,
                        n_fft=n_fft,
                        n_mfcc=n_mfcc,
                        hop_length=hop_length,
                    )
                    mfcc = mfcc.T

                    # store mfcc for segment if it has expected length
                    if len(mfcc) == expected_num_mfcc_vectors_per_segment:
                        # can't save numpy arrays as json files
                        data["mfcc"].append(mfcc.tolist())
                        data["labels"].append(i - 1)
                        print(f"{file_path}, segment:{s+1}")
    data["mappings"] = semantic_to_numeric
    with open(json_path, "w") as fp:
        json.dump(data, fp, indent=4)

In [19]:
save_mfcc(genres_dir, "data.json",SAMPLES_PER_TRACK)


Processing blues
c:\Users\Stephen\Documents\GitHub_Repositories\Genre-Classification\data\genres\blues\blues.00000.wav, segment:1
c:\Users\Stephen\Documents\GitHub_Repositories\Genre-Classification\data\genres\blues\blues.00000.wav, segment:2
c:\Users\Stephen\Documents\GitHub_Repositories\Genre-Classification\data\genres\blues\blues.00000.wav, segment:3
c:\Users\Stephen\Documents\GitHub_Repositories\Genre-Classification\data\genres\blues\blues.00000.wav, segment:4
c:\Users\Stephen\Documents\GitHub_Repositories\Genre-Classification\data\genres\blues\blues.00000.wav, segment:5
c:\Users\Stephen\Documents\GitHub_Repositories\Genre-Classification\data\genres\blues\blues.00001.wav, segment:1
c:\Users\Stephen\Documents\GitHub_Repositories\Genre-Classification\data\genres\blues\blues.00001.wav, segment:2
c:\Users\Stephen\Documents\GitHub_Repositories\Genre-Classification\data\genres\blues\blues.00001.wav, segment:3
c:\Users\Stephen\Documents\GitHub_Repositories\Genre-Classification\data\genre

  signal, sr = librosa.load(
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


c:\Users\Stephen\Documents\GitHub_Repositories\Genre-Classification\data\genres\jazz\jazz.00056.wav, segment:1
c:\Users\Stephen\Documents\GitHub_Repositories\Genre-Classification\data\genres\jazz\jazz.00056.wav, segment:2
c:\Users\Stephen\Documents\GitHub_Repositories\Genre-Classification\data\genres\jazz\jazz.00056.wav, segment:3
c:\Users\Stephen\Documents\GitHub_Repositories\Genre-Classification\data\genres\jazz\jazz.00056.wav, segment:4
c:\Users\Stephen\Documents\GitHub_Repositories\Genre-Classification\data\genres\jazz\jazz.00056.wav, segment:5
c:\Users\Stephen\Documents\GitHub_Repositories\Genre-Classification\data\genres\jazz\jazz.00057.wav, segment:1
c:\Users\Stephen\Documents\GitHub_Repositories\Genre-Classification\data\genres\jazz\jazz.00057.wav, segment:2
c:\Users\Stephen\Documents\GitHub_Repositories\Genre-Classification\data\genres\jazz\jazz.00057.wav, segment:3
c:\Users\Stephen\Documents\GitHub_Repositories\Genre-Classification\data\genres\jazz\jazz.00057.wav, segment:4
c

In [20]:
def load_data(dataset_path):
    """
    Loads training dataset from json file.
        :param data_path (str): Path to json file containing data
        :return X (ndarray): Inputs
        :return y (ndarray): Targets
    """

    with open(dataset_path, "r") as fp:
        data = json.load(fp)

    # convert lists to numpy arrays
    X = np.array(data["mfcc"])
    # X = np.array(data["spectrogram"])
    y = np.array(data["labels"])
    mappings = data["mappings"]
    return X, y, mappings

In [21]:
mfccs, labels, _ = load_data("data.json")
mfccs = np.array(mfccs, dtype=np.float32)
labels = torch.tensor(labels, dtype=torch.long).clone().detach()

In [22]:
# Hyperparameters
num_classes = 10  # number of genres
input_size = 20  # number of MFCC coefficients
hidden_size = 128
num_layers = 2
batch_size = 64
num_epochs = 10
learning_rate = 1e-3

In [23]:
# LSTM model
class LSTMGenreModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        # x shape: (batch, seq_len, input_size)
        out, (hn, cn) = self.lstm(x)
        # out shape: (batch, seq_len, hidden_size)

        # Take the final output and classify
        out = self.fc(out[:, -1, :])
        # out shape: (batch, num_classes)
        return out


# Dataset
class MFCCDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        mfccs = self.X[idx]
        label = self.y[idx]
        return mfccs, label

In [24]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(
    mfccs, labels, test_size=0.2, random_state=42
)

# Create datasets and dataloaders
train_dataset = MFCCDataset(X_train, y_train)
test_dataset = MFCCDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Train model
model = LSTMGenreModel(input_size, hidden_size, num_layers, num_classes)

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

for epoch in range(num_epochs):
    for mfccs, labels in train_loader:
        optimizer.zero_grad()

        # Forward pass
        outputs = model(mfccs)

        # Calculate loss
        loss = criterion(outputs, labels)

        # Backward pass
        loss.backward()

        # Update weights
        optimizer.step()

In [25]:
# Evaluation
correct = 0
total = 0
with torch.no_grad():
    for mfccs, labels in test_loader:
        outputs = model(mfccs)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
accuracy = 100 * correct / total
print(f"Accuracy: {accuracy}")

Accuracy: 62.26226226226226


In [26]:
# Log parameters
mlflow.start_run()

mlflow.log_metric("accuracy", accuracy)
mlflow.log_param("SAMPLE_RATE", SAMPLE_RATE)
mlflow.log_param("DURATION", DURATION)
mlflow.log_param("NUM_SEGMENTS", NUM_SEGMENTS)
mlflow.log_param("SAMPLES_PER_TRACK", SAMPLES_PER_TRACK)
mlflow.log_param("num_classes", num_classes)
mlflow.log_param("input_size", input_size)
mlflow.log_param("hidden_size", hidden_size)
mlflow.log_param("num_layers", num_layers)
mlflow.log_param("batch_size", batch_size)
mlflow.log_param("num_epochs", num_epochs)
mlflow.log_param("learning_rate", learning_rate)
mlflow.log_param(
    "data_version",
    commit_hash := subprocess.check_output(
        ["git", "rev-parse", "HEAD"], universal_newlines=True
    ).strip(),
)

mlflow.end_run()