In [63]:
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

# Import resnet
from torchvision import models
import torchvision.transforms as transforms
from PIL import Image
import requests
import tqdm as tqdm

import json
import os
import sys
sys.path.append('../../functions')
import functions as f

In [6]:
# Set local path to the folder containing the .wav audio files
path = 'C:/Users/lucvo/VScode/Machine_learning/Audio_data/nsynth-valid.jsonwav/nsynth-valid/audio/'

In [44]:
# Load resnet18 model with pretrained weights
model = models.resnet18(pretrained=True)

# Print model architecture
print(model)



ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [58]:

def get_model():
    # Load resnet18 model with pretrained weights
    model = models.resnet18(pretrained=True)
    for param in model.parameters():
        param.require_grad = False
    model.avgpool = nn.AdaptiveAvgPool2d(output_size=(1, 1))
    model.fc = nn.Sequential(
      nn.Flatten(),
      nn.Linear(512, 128), # 512 for resnet18 or 2048 for resnet 50
      nn.ReLU(inplace=True),
      nn.Dropout(.2),
      nn.Linear(128, 6),
      nn.Sigmoid()
    )

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=1e-4)

    return model, optimizer

def loss_fn(y_pred, y_true):
    return torch.sum(torch.abs(y_pred - y_true))/y_pred.shape[0]


In [94]:
# Run this cell for spectrograms with 3 instruments
mixed_spectograms,  labels = f.generate_mixed_spectrograms(100, 3 , path = path)

# Split into training, validation and test (80/10/10)
X_train, X_val, X_test, y_train, y_val, y_test = f.split_data(mixed_spectograms, labels, 0.1, 0.1)

# Print the size of the training, validation and test sets
print(X_train.shape)
print(X_val.shape)
print(X_test.shape)

  0%|          | 0/100 [00:00<?, ?it/s]

100%|██████████| 100/100 [00:00<00:00, 159.31it/s]


(80, 1025, 126)
(10, 1025, 126)
(10, 1025, 126)


In [67]:
# Run this cell for spectrograms with varying number of instruments

# Create a new training set, with 1000 samples of each class
X1, y1 = f.generate_mixed_spectrograms(500, 1, path=path)
X2, y2 = f.generate_mixed_spectrograms(500, 2, path=path)
X3, y3 = f.generate_mixed_spectrograms(500, 3, path=path)
X4, y4 = f.generate_mixed_spectrograms(500, 4, path=path)
X5, y5 = f.generate_mixed_spectrograms(500, 5, path=path)
X6, y6 = f.generate_mixed_spectrograms(500, 6, path=path)

# Add the data to a single test array
X_train_multi = np.concatenate((X1, X2, X3, X4, X5, X6))
y_train_multi = np.concatenate((y1, y2, y3, y4, y5, y6))

# Shuffle the data using numpy
shuffled_indices = np.random.permutation(len(y_train_multi))
X_train_multi = X_train_multi[shuffled_indices]
y_train_multi = y_train_multi[shuffled_indices]

# Split into training, validation and test (80/10/10)
X_train_multi, X_val_multi, X_test_multi, y_train_multi, y_val_multi, y_test_multi = f.split_data(X_train_multi, y_train_multi, 0.1, 0.1)

# Print the shapes of the training, validation and test sets
print(X_train_multi.shape, y_train_multi.shape)
print(X_val_multi.shape, y_val_multi.shape)
print(X_test_multi.shape, y_test_multi.shape)


100%|██████████| 500/500 [00:04<00:00, 109.36it/s]
100%|██████████| 500/500 [00:06<00:00, 81.64it/s]
100%|██████████| 500/500 [00:07<00:00, 63.40it/s]
100%|██████████| 500/500 [00:08<00:00, 59.42it/s]
100%|██████████| 500/500 [00:09<00:00, 55.36it/s]
100%|██████████| 500/500 [00:09<00:00, 55.41it/s]


(2400, 1025, 126) (2400, 6)
(300, 1025, 126) (300, 6)
(300, 1025, 126) (300, 6)


In [68]:
import librosa
# Function to plot spectrogram using librosa

def spectrogram_to_rgb(spectrogram, eps=1e-6):
    # Min-max scale to fit inside 8-bit RGB
    img = 255 * (spectrogram - spectrogram.min()) / (spectrogram.max() - spectrogram.min())
    
    # Convert to uint8
    img = img.astype(np.uint8)
    
    # Stack the image to create a 3-channel image
    img = np.stack([img] * 3, axis=-1)
    
    return img



In [95]:
# Run this cell if you generated the spectrograms with 3 instruments

# Convert X_train_multi, X_val_multi and X_test_multi to RGB images
X_train_rgb = []
for i in tqdm.tqdm(range(X_train.shape[0])):
    rgb_img = spectrogram_to_rgb(X_train[i])
    rgb_img = np.moveaxis(rgb_img, 2, 0)
    X_train_rgb.append(rgb_img)
X_train_rgb = np.array(X_train_rgb)

X_val_rgb = []
for i in tqdm.tqdm(range(X_val.shape[0])):
    rgb_img = spectrogram_to_rgb(X_val[i])
    rgb_img = np.moveaxis(rgb_img, 2, 0)
    X_val_rgb.append(rgb_img)
X_val_rgb = np.array(X_val_rgb)

X_test_rgb = []
for i in tqdm.tqdm(range(X_test.shape[0])):
    rgb_img = spectrogram_to_rgb(X_test[i])
    rgb_img = np.moveaxis(rgb_img, 2, 0)
    X_test_rgb.append(rgb_img)
X_test_rgb = np.array(X_test_rgb)

# Print the shapes of the RGB images
print(X_train_rgb.shape)
print(X_val_rgb.shape)
print(X_test_rgb.shape)

100%|██████████| 80/80 [00:00<00:00, 1360.93it/s]
100%|██████████| 10/10 [00:00<00:00, 883.25it/s]
100%|██████████| 10/10 [00:00<00:00, 2789.32it/s]

(80, 3, 1025, 126)
(10, 3, 1025, 126)
(10, 3, 1025, 126)





In [76]:
# Run this cell if you generated the spectrograms with varying number of instruments

# Convert X_train_multi, X_val_multi and X_test_multi to RGB images
X_train_multi_rgb = []
for i in tqdm.tqdm(range(X_train_multi.shape[0])):
    rgb_img = spectrogram_to_rgb(X_train_multi[i])
    rgb_img = np.moveaxis(rgb_img, 2, 0)
    X_train_multi_rgb.append(rgb_img)
X_train_multi_rgb = np.array(X_train_multi_rgb)

X_val_multi_rgb = []
for i in tqdm.tqdm(range(X_val_multi.shape[0])):
    rgb_img = spectrogram_to_rgb(X_val_multi[i])
    rgb_img = np.moveaxis(rgb_img, 2, 0)
    X_val_multi_rgb.append(rgb_img)
X_val_multi_rgb = np.array(X_val_multi_rgb)

X_test_multi_rgb = []
for i in tqdm.tqdm(range(X_test_multi.shape[0])):
    rgb_img = spectrogram_to_rgb(X_test_multi[i])
    rgb_img = np.moveaxis(rgb_img, 2, 0)
    X_test_multi_rgb.append(rgb_img)
X_test_multi_rgb = np.array(X_test_multi_rgb)

# Print the shapes of the RGB images
print(X_train_multi_rgb.shape)
print(X_val_multi_rgb.shape)
print(X_test_multi_rgb.shape)

100%|██████████| 2400/2400 [00:02<00:00, 898.86it/s]
100%|██████████| 300/300 [00:00<00:00, 849.31it/s]
100%|██████████| 300/300 [00:00<00:00, 1201.53it/s]


(2400, 3, 1025, 126)
(300, 3, 1025, 126)
(300, 3, 1025, 126)


In [96]:
# Run this cell if you generated the spectrograms with 3 instruments

# Convert the numpy array to a torch tensor
X_train_rgb = torch.tensor(X_train_rgb)
y_train = torch.tensor(y_train)
X_val_rgb = torch.tensor(X_val_rgb)
y_val = torch.tensor(y_val)
X_test_rgb = torch.tensor(X_test_rgb)
y_test = torch.tensor(y_test)

# Scale the data to be between 0 and 1
def torch_min_max_normalization(X):
    X = (X - X.min()) / (X.max() - X.min())
    return X

X_train_rgb = torch_min_max_normalization(X_train_rgb)
X_val_rgb = torch_min_max_normalization(X_val_rgb)
X_test_gb = torch_min_max_normalization(X_test_rgb)

# Print the shapes of the torch tensors
print(X_train_rgb.shape)
print(X_val_rgb.shape)
print(X_test_rgb.shape)

torch.Size([80, 3, 1025, 126])
torch.Size([10, 3, 1025, 126])
torch.Size([10, 3, 1025, 126])


In [78]:
# Convert the numpy array to a torch tensor
X_train_multi_rgb = torch.tensor(X_train_multi_rgb)
y_train_multi = torch.tensor(y_train_multi)
X_val_multi_rgb = torch.tensor(X_val_multi_rgb)
y_val_multi = torch.tensor(y_val_multi)
X_test_multi_rgb = torch.tensor(X_test_multi_rgb)
y_test_multi = torch.tensor(y_test_multi)

# Scale the data to be between 0 and 1
def torch_min_max_normalization(X):
    X = (X - X.min()) / (X.max() - X.min())
    return X

X_train_multi_rgb = torch_min_max_normalization(X_train_multi_rgb)
X_val_multi_rgb = torch_min_max_normalization(X_val_multi_rgb)
X_test_multi_rgb = torch_min_max_normalization(X_test_multi_rgb)

# Print the shapes of the torch tensors
print(X_train_multi_rgb.shape)
print(X_val_multi_rgb.shape)
print(X_test_multi_rgb.shape)


  y_train_multi = torch.tensor(y_train_multi)
  y_val_multi = torch.tensor(y_val_multi)
  y_test_multi = torch.tensor(y_test_multi)


torch.Size([2400, 3, 1025, 126]) torch.Size([2400, 6])
torch.Size([300, 3, 1025, 126]) torch.Size([300, 6])
torch.Size([300, 3, 1025, 126]) torch.Size([300, 6])


In [99]:
# Define the model
model, optimizer = get_model()

def get_accuracy(y_pred, y_true):
    y_pred = y_pred.detach().numpy()
    y_true = y_true.detach().numpy()
    y_pred = np.round(y_pred)
    accuracy = np.mean(y_pred == y_true)
    return accuracy

# Function to train the model
def train_model(model, x_train, y_train, x_val, y_val, optimizer, loss_fn, num_epochs=100, patience = 5):
    train_loss = []
    train_acc = []
    val_loss = []
    val_acc = []

    for epoch in range(num_epochs):
        model.train()
        optimizer.zero_grad()
        y_pred = model(x_train)
        loss = loss_fn(y_pred, y_train)
        loss.backward()
        train_loss.append(loss.item())

        # Compute the accuracy
        accuracy = get_accuracy(y_pred, y_train)
        train_acc.append(accuracy)

        # Compute the validation loss and accuracy
        model.eval()
        with torch.no_grad():
            y_pred_val = model(x_val)
            val_loss.append(loss_fn(y_pred_val, y_val).item())
            val_accuracy = get_accuracy(y_pred_val, y_val)
            val_acc.append(val_accuracy)

        optimizer.step()

        # Convert y_pred to np array, round to 2 decimal places
        y_pred = y_pred.detach().numpy().round(2)
        y_labels = y_train.detach().numpy()
        print(f"Epoch {epoch + 1}/{num_epochs}, Train Loss: {loss.item():.4f}, Train Accuracy: {accuracy:.4f}, Val Loss: {val_loss[-1]:.4f}, Val Accuracy: {val_accuracy:.4f}")
        print(y_pred[0])
        print(y_labels[0])
        print("")

        # Include early stopping
        if epoch > patience:
            if val_loss[-patience] <= min(val_loss):
                print("Early stopping activated")
                break


    return model, train_loss, train_acc, val_loss, val_acc


# Train the model
trained_model, train_loss, train_acc, val_loss, val_acc = train_model(model, X_train_rgb, y_train, X_val_rgb, y_val, optimizer, loss_fn, num_epochs=100, patience = 5)




Epoch 1/100, Train Loss: 2.9955, Train Accuracy: 0.5188, Val Loss: 3.0320, Val Accuracy: 0.4000
[0.53 0.48 0.46 0.5  0.51 0.47]
[1. 1. 0. 0. 1. 0.]

Epoch 2/100, Train Loss: 2.8260, Train Accuracy: 0.7333, Val Loss: 3.0342, Val Accuracy: 0.3833
[0.51 0.51 0.41 0.44 0.52 0.45]
[1. 1. 0. 0. 1. 0.]

Epoch 3/100, Train Loss: 2.7043, Train Accuracy: 0.7729, Val Loss: 3.0237, Val Accuracy: 0.4167
[0.54 0.51 0.45 0.46 0.55 0.47]
[1. 1. 0. 0. 1. 0.]

Epoch 4/100, Train Loss: 2.5990, Train Accuracy: 0.8000, Val Loss: 3.0114, Val Accuracy: 0.5333
[0.55 0.49 0.43 0.4  0.56 0.43]
[1. 1. 0. 0. 1. 0.]

Epoch 5/100, Train Loss: 2.5169, Train Accuracy: 0.8250, Val Loss: 3.0113, Val Accuracy: 0.5167
[0.53 0.44 0.41 0.4  0.59 0.45]
[1. 1. 0. 0. 1. 0.]

Epoch 6/100, Train Loss: 2.4280, Train Accuracy: 0.8500, Val Loss: 3.0107, Val Accuracy: 0.5000
[0.56 0.48 0.48 0.33 0.6  0.41]
[1. 1. 0. 0. 1. 0.]

Epoch 7/100, Train Loss: 2.3312, Train Accuracy: 0.8542, Val Loss: 3.0066, Val Accuracy: 0.5333
[0.59 0.5 

KeyboardInterrupt: 

In [101]:
# Plot the training and validation loss as a function of the epoch
plt.plot(train_loss, label='train loss')
plt.plot(val_loss, label='validation loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()

# Plot the training and validation accuracy as a function of the epoch
plt.plot(train_acc, label='train accuracy')
plt.plot(val_acc, label='validation accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

# Evaluate the model on the test set
trained_model.eval()
with torch.no_grad():
    y_pred_test = trained_model(X_test_rgb)
    test_loss = loss_fn(y_pred_test, y_test)
    test_accuracy = get_accuracy(y_pred_test, y_test)
    print(f"Test Loss: {test_loss.item():.4f}, Test Accuracy: {test_accuracy:.4f}")


NameError: name 'train_loss' is not defined