In [None]:
import os
import librosa
import librosa.display
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight

In [None]:
# Map each instrument name to the integer class label the classifier predicts.
instrument_to_label = {
    "violin": 0,
    "sax": 1,
    "trumpet": 2,
    "piano": 3,
    "organ": 4,
    "flute": 5,
    "clarinet": 6
}

# Base folder of the project; it must contain a "Training" and a "Testing" directory.
# To run this on another machine, set the INSTRUMENT_DATA_DIR environment variable to
# your own project folder instead of editing the code. The original author's location
# is kept only as the fallback default.
default_base_path = os.path.dirname(os.path.abspath(r"C:\Users\ryan3\Final Project Machine Learning\Training"))
base_path = os.environ.get("INSTRUMENT_DATA_DIR") or default_base_path
train_path = os.path.join(base_path, "Training")
test_path = os.path.join(base_path, "Testing")

# Fail early with a clear message when the expected folder layout is missing.
if not os.path.isdir(train_path) or not os.path.isdir(test_path):
    raise FileNotFoundError(f"Expected 'Training' and 'Testing' folders in {base_path}.")

X_train = []  # spectrograms, one per training clip
y_train = []  # integer labels, parallel to X_train

In [None]:
# Build the training set: one mel spectrogram (in dB) per .wav file, labelled by the
# instrument folder it was found in.
# Results are gathered in fresh local lists so this cell can be re-run safely even after
# X_train / y_train have already been converted to NumPy arrays by a previous run.
spectrograms = []
labels = []

for instrument, label in instrument_to_label.items():
    instrument_folder = os.path.join(train_path, instrument)
    if not os.path.isdir(instrument_folder):
        # A missing folder means a class with no samples; say so instead of skipping silently.
        print(f"Warning: no training folder for '{instrument}' at {instrument_folder}")
        continue

    # sorted() keeps the sample order deterministic; os.listdir order is arbitrary.
    for file in sorted(os.listdir(instrument_folder)):
        if not file.lower().endswith(".wav"):  # accept .wav and .WAV alike
            continue
        file_path = os.path.join(instrument_folder, file)
        try:
            # Load the audio resampled to 16 kHz and convert it to a 128-band mel spectrogram in dB.
            y, sr = librosa.load(file_path, sr=16000)
            mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128, fmax=8000)
            mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
        except Exception as e:
            # Best effort: report the unreadable file and keep going with the rest.
            print(f"Error processing {file_path}: {e}")
            continue
        spectrograms.append(mel_spec_db)
        labels.append(label)

if not spectrograms:
    raise RuntimeError(f"No .wav files could be loaded from {train_path}.")

# Stacking into one array only works when every clip produced the same number of frames.
spectrogram_shapes = {spec.shape for spec in spectrograms}
if len(spectrogram_shapes) > 1:
    raise ValueError(
        "Training clips produce spectrograms of different shapes "
        f"{sorted(spectrogram_shapes)}; all clips must have the same duration."
    )

# Convert the lists into arrays.
X_train = np.array(spectrograms)
y_train = np.array(labels)

print(f"Processed {len(X_train)} spectrograms and {len(y_train)} labels.")

Processed 525 spectrograms and 525 labels.


In [None]:
# Standardize each spectrogram on its own so it has mean 0 and standard deviation 1.
# NOTE: this cell overwrites X_train / y_train; re-run the loading cell before re-running it.
sample_mean = np.mean(X_train, axis=(1, 2), keepdims=True)
sample_std = np.std(X_train, axis=(1, 2), keepdims=True)
# A constant (e.g. silent) clip has std 0; divide by 1 instead so it becomes all zeros, not NaN.
sample_std = np.where(sample_std == 0, 1.0, sample_std)
X_train = (X_train - sample_mean) / sample_std

# Pad or truncate every spectrogram along the time axis to a fixed number of frames.
max_time_steps = 94
X_train = np.array([librosa.util.fix_length(x, size=max_time_steps, axis=1) for x in X_train])

# Stratified training/validation split; the fixed random_state makes the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, stratify=y_train, random_state=42
)

# Convert to PyTorch tensors https://pytorch.org/tutorials/beginner/blitz/tensor_tutorial.html
# Tensors are a specialized data structure that are very similar to arrays and matrices.
# In PyTorch, we use tensors to encode the inputs and outputs of a model, as well as the model's parameters.
# unsqueeze(1) inserts the single channel dimension the convolution layers expect,
# giving inputs of shape (samples, 1, mel bands, time steps).
X_train_tensor = torch.tensor(X_train).unsqueeze(1).float()
y_train_tensor = torch.tensor(y_train).long()
X_val_tensor = torch.tensor(X_val).unsqueeze(1).float()
y_val_tensor = torch.tensor(y_val).long()

# Data loaders: shuffle the training batches, keep validation order fixed.
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

print(f"Training data tensor shape: {X_train_tensor.shape}")
print(f"Validation data tensor shape: {X_val_tensor.shape}")

Training data tensor shape: torch.Size([420, 1, 128, 94])
Validation data tensor shape: torch.Size([105, 1, 128, 94])


In [None]:
# Model for classification of instruments
class CNN(nn.Module):
    """Small convolutional classifier for single-channel mel spectrograms.

    Three conv + ReLU + 2x2 max-pool stages are followed by dropout and two
    fully connected layers that produce one logit per class.

    Args:
        num_classes: Number of output classes (size of the logit vector).
        input_shape: (height, width) of one input spectrogram, i.e.
            (mel bands, time steps). Used to size the first fully connected
            layer at construction time. Defaults to (128, 94).
    """

    def __init__(self, num_classes, input_shape=(128, 94)):
        super().__init__()
        # Three convolutional layers; padding=1 with a 3x3 kernel keeps the spatial size.
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(2)  # halves height and width after every conv stage
        self.dropout = nn.Dropout(0.5)  # regularization
        # fc1 is created here, not lazily inside forward(), so that its weights are
        # already part of model.parameters() when the optimizer is built and are
        # present when a saved state_dict is loaded into a fresh model.
        self.fc1 = nn.Linear(self._flattened_size(input_shape), 256)
        self.fc2 = nn.Linear(256, num_classes)

    def _extract_features(self, x):
        """Run the three conv + ReLU + max-pool stages."""
        x = self.pool(torch.relu(self.conv1(x)))
        x = self.pool(torch.relu(self.conv2(x)))
        x = self.pool(torch.relu(self.conv3(x)))
        return x

    def _flattened_size(self, input_shape):
        """Return the number of features per sample after the conv stages."""
        with torch.no_grad():
            probe = torch.zeros(1, 1, *input_shape)
            return self._extract_features(probe).numel()

    def forward(self, x):
        """Map a batch shaped (N, 1, height, width) to logits shaped (N, num_classes)."""
        x = self._extract_features(x)
        x = torch.flatten(x, 1)  # flatten everything except the batch dimension
        x = self.dropout(x)
        x = torch.relu(self.fc1(x))
        return self.fc2(x)


In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = CNN(num_classes=len(instrument_to_label)).to(device)

# Dry run with a single sample BEFORE building the optimizer. Any layer that the model
# only creates during its first forward pass would otherwise be missing from
# model.parameters() and would never be trained. Harmless if all layers already exist.
model.eval()
with torch.no_grad():
    model(X_train_tensor[:1].to(device))

# Compute class weights so under-represented classes count more in the loss.
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)

# Loss function and optimizer.
criterion = nn.CrossEntropyLoss(weight=class_weights)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop with early stopping on the validation loss.
num_epochs = 20
best_val_loss = float('inf')
best_state = None  # copy of the weights from the epoch with the lowest validation loss
patience = 3  # epochs without improvement tolerated before stopping
patience_counter = 0

for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()  # clear gradients from the previous step
        outputs = model(X_batch)  # forward pass
        loss = criterion(outputs, y_batch)  # compute loss
        loss.backward()  # backward pass
        optimizer.step()  # update model weights
        epoch_loss += loss.item()

    # Validation pass: eval mode disables dropout, no_grad skips gradient tracking.
    model.eval()
    val_loss = 0
    correct = 0
    with torch.no_grad():
        for X_val_batch, y_val_batch in val_loader:
            X_val_batch, y_val_batch = X_val_batch.to(device), y_val_batch.to(device)
            val_outputs = model(X_val_batch)
            val_loss += criterion(val_outputs, y_val_batch).item()
            correct += (val_outputs.argmax(1) == y_val_batch).sum().item()

    val_loss /= len(val_loader)  # mean of the per-batch losses
    val_accuracy = correct / len(y_val)
    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {epoch_loss / len(train_loader):.4f}, "
          f"Validation Loss: {val_loss:.4f}, Accuracy: {val_accuracy:.4f}")

    # Early stopping: remember the best weights, stop after `patience` epochs without improvement.
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        patience_counter = 0
        # state_dict() returns references to the live tensors, so clone them.
        best_state = {name: tensor.detach().clone() for name, tensor in model.state_dict().items()}
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping triggered.")
            break

# Leave the model holding the best weights rather than those of the last epoch.
if best_state is not None:
    model.load_state_dict(best_state)
    print(f"Restored weights from the best epoch (validation loss {best_val_loss:.4f}).")

Epoch [1/20], Loss: 1.9552, Validation Loss: 1.9393, Accuracy: 0.1429
Epoch [2/20], Loss: 1.9453, Validation Loss: 1.9400, Accuracy: 0.1333
Epoch [3/20], Loss: 1.9361, Validation Loss: 1.9280, Accuracy: 0.2571
Epoch [4/20], Loss: 1.9207, Validation Loss: 1.8907, Accuracy: 0.3048
Epoch [5/20], Loss: 1.8843, Validation Loss: 1.8744, Accuracy: 0.2476
Epoch [6/20], Loss: 1.8506, Validation Loss: 1.8191, Accuracy: 0.3524
Epoch [7/20], Loss: 1.7620, Validation Loss: 1.7548, Accuracy: 0.3905
Epoch [8/20], Loss: 1.7293, Validation Loss: 1.7005, Accuracy: 0.3619
Epoch [9/20], Loss: 1.6884, Validation Loss: 1.6963, Accuracy: 0.3810
Epoch [10/20], Loss: 1.6563, Validation Loss: 1.6586, Accuracy: 0.4667
Epoch [11/20], Loss: 1.5854, Validation Loss: 1.6474, Accuracy: 0.3714
Epoch [12/20], Loss: 1.5333, Validation Loss: 1.6587, Accuracy: 0.4000
Epoch [13/20], Loss: 1.4332, Validation Loss: 1.6356, Accuracy: 0.3810
Epoch [14/20], Loss: 1.4496, Validation Loss: 1.5892, Accuracy: 0.4000
Epoch [15/20], 

In [None]:
# Persist only the learned parameters (the state dict) to a file in the working directory.
checkpoint_path = "trained_model.pth"
trained_weights = model.state_dict()
torch.save(trained_weights, checkpoint_path)
print("Model saved successfully")

Model saved successfully


In [None]:
# Only the audio file name is needed here; the file must be in .wav format inside the Testing folder.
test_file = os.path.join(test_path, "Lets go dodgers!.wav")

# Same feature extraction as for the training clips: 16 kHz audio -> 128-band mel spectrogram in dB.
y, sr = librosa.load(test_file, sr=16000)
mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128, fmax=8000)
mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)

# Standardize and resize the test spectrogram the same way as the training data.
spec_std = np.std(mel_spec_db)
if spec_std == 0:
    # A constant (e.g. silent) clip would otherwise divide by zero and produce NaN.
    spec_std = 1.0
mel_spec_db = (mel_spec_db - np.mean(mel_spec_db)) / spec_std
mel_spec_db = librosa.util.fix_length(mel_spec_db, size=max_time_steps, axis=1)
# Add batch and channel dimensions: (1, 1, mel bands, time steps).
X_test = torch.tensor(mel_spec_db).unsqueeze(0).unsqueeze(0).float().to(device)

# Inference must run in eval mode so Dropout is disabled and the prediction is deterministic.
model.eval()
with torch.no_grad():
    prediction = model(X_test)
    predicted_class = torch.argmax(prediction, dim=1).item()

# Map the predicted class index back to the instrument name.
label_to_instrument = {v: k for k, v in instrument_to_label.items()}
predicted_instrument = label_to_instrument.get(predicted_class, "Unknown")
print(f"Predicted instrument: {predicted_instrument}")

Predicted instrument: piano
