In [1]:
import scipy.io.wavfile as wav
import scipy.signal as signal
from matplotlib import pyplot as plt
import os
import numpy as np

import torch
import torch.nn as nn
import torchvision.transforms as transforms
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import matplotlib.pyplot as plt
from time import time
from sklearn.model_selection import train_test_split
from torch.utils.data import Subset
import torch.nn.functional as F
import gc
from pathlib import Path

In [2]:
# Pick the compute device: CUDA when a GPU is visible to PyTorch, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Using device: {device}')

Using device: cuda


In [3]:
# Choose the test song.
# `path` is the class folder that holds the song; `name` is the file name without
# the ".wav" extension (the audio is read from path + "/" + name + ".wav").
path = "genres_user_test/classical"
name = "Nimrod"

In [4]:
# Custom Dataset class
class NumpyDataset(Dataset):
    """Dataset over the saved ``.npy`` clips of a single class folder.

    ``root_dir`` is expected to contain one sub-folder per class. Only the
    sub-folder matching ``class_path`` is scanned; its files are sorted into
    three parallel lists by file-name suffix:

    * ``*Z.npy`` -> ``self.Zsamples`` (the arrays served by ``__getitem__``)
    * ``*T.npy`` -> ``self.Tsamples`` (x-axis values used by ``spectrograms``)
    * ``*F.npy`` -> ``self.Fsamples`` (y-axis values used by ``spectrograms``)

    Each list entry is a ``(file_path, label)`` tuple, where ``label`` is the
    position of the class folder in the *sorted* listing of ``root_dir``.

    Args:
        root_dir: Directory containing the class sub-folders.
        transform: Optional callable applied to each sample tensor.
        class_path: Path of the class folder to load. When ``None`` the
            notebook-level ``path`` variable is used, which keeps the original
            two-argument call working unchanged.
    """

    def __init__(self, root_dir, transform=None, class_path=None):
        self.root_dir = root_dir
        self.transform = transform
        self.Zsamples = []
        self.Tsamples = []
        self.Fsamples = []

        # Normalise before comparing so that a trailing slash or "./" prefix
        # does not silently filter out every folder.
        wanted = os.path.normpath(path if class_path is None else class_path)

        buckets = (
            ("Z.npy", self.Zsamples),
            ("T.npy", self.Tsamples),
            ("F.npy", self.Fsamples),
        )

        # os.listdir returns entries in arbitrary order; sorting makes the
        # labels reproducible and keeps the Z/T/F lists aligned index-by-index.
        for label, class_dir in enumerate(sorted(os.listdir(root_dir))):
            candidate = os.path.join(root_dir, class_dir)
            if not os.path.isdir(candidate):
                continue
            if os.path.normpath(candidate) != wanted:
                continue
            for file_name in sorted(os.listdir(candidate)):
                for suffix, bucket in buckets:
                    if file_name.endswith(suffix):
                        bucket.append((os.path.join(candidate, file_name), label))
                        break

    def __len__(self):
        """Return the number of ``*Z.npy`` clips that were found."""
        return len(self.Zsamples)

    def __getitem__(self, idx):
        """Load clip ``idx`` and return ``(data, label)``.

        ``data`` is a float32 tensor whose first axis stacks four views of the
        stored array: real part, imaginary part, magnitude and phase angle.
        ``transform`` (if given) is applied to that tensor.
        """
        file_path, label = self.Zsamples[idx]
        spectrum = np.load(file_path)  # presumably the complex STFT saved by the preprocessing cell
        channels = np.stack(
            (np.real(spectrum), np.imag(spectrum), np.abs(spectrum), np.angle(spectrum)),
            axis=0,
        )
        data = torch.tensor(channels, dtype=torch.float32)  # Convert to PyTorch tensor

        if self.transform:
            data = self.transform(data)

        return data, label

    def spectrograms(self, idx):
        """Plot five views of clip ``idx`` (one subplot per row).

        Prints the shape of the Z array, then draws the log-magnitude, the
        phase angle, the log of the squared real part, the log of the squared
        imaginary part and the log of their sum, each with its own colorbar.
        """
        Zfile_path, _ = self.Zsamples[idx]
        Tfile_path, _ = self.Tsamples[idx]
        Ffile_path, _ = self.Fsamples[idx]
        Zdata = np.load(Zfile_path)  # Load numpy array
        Tdata = np.load(Tfile_path)  # Load numpy array
        Fdata = np.load(Ffile_path)  # Load numpy array
        print(Zdata.shape)

        # NOTE: np.log yields -inf (with a RuntimeWarning) wherever its argument is exactly 0.
        panels = (
            ("Spectrogram Magnitude", np.log(np.abs(Zdata))),
            ("Spectrogram Angle", np.angle(Zdata)),
            ("Spectrogram Real", np.log(np.square(np.real(Zdata)))),
            ("Spectrogram Imag", np.log(np.square(np.imag(Zdata)))),
            ("Spectrogram Imag + Real",
             np.log(np.square(np.imag(Zdata)) + np.square(np.real(Zdata)))),
        )

        # Create a 5x1 subplot grid, one row per panel
        fig, axes = plt.subplots(len(panels), 1, figsize=(10, 16))

        for ax, (title, values) in zip(axes, panels):
            mesh = ax.pcolormesh(Tdata, Fdata, values, cmap='gnuplot')
            # Each colorbar is built from the mesh drawn on the same axes
            # (the fifth panel previously reused the fourth panel's mesh).
            fig.colorbar(mesh, ax=ax)
            ax.set_title(title)

In [5]:
# Define the transformations applied to every sample tensor
# - CenterCrop: keeps the central 544 x 240 region of the last two dimensions
#   (after MusicNet's four 2x2 poolings this becomes 34 x 15, the size fc1 expects)
# - Normalize: subtracts a mean of 0 and divides by a std of 0.5
transform = transforms.Compose([
    transforms.CenterCrop(size=(544, 240)),
    transforms.Normalize(mean=(0,), std=(0.5,)),
])

In [6]:
# Define the model
class MusicNet(nn.Module):
    """Convolutional classifier: four conv-conv-pool stages, then four linear layers.

    The network takes a 4-channel input and produces 10 raw scores per sample.
    Every convolution is 3x3 with stride 1 and 'same' padding; every pooling
    layer is a 2x2 max-pool with stride 2. The first linear layer expects
    24 * 34 * 15 features, i.e. 24 channels of size 34 x 15 after the last pool.
    """

    def __init__(self):
        super().__init__()
        conv_cfg = dict(kernel_size=3, stride=1, padding='same')
        pool_cfg = dict(kernel_size=2, stride=2)

        # Stage 1: 4 -> 8 channels (grouped, one group per input channel), 8 -> 12, pool
        self.conv1 = nn.Conv2d(4, 8, groups=4, **conv_cfg)
        self.conv2 = nn.Conv2d(8, 12, **conv_cfg)
        self.pool1 = nn.MaxPool2d(**pool_cfg)

        # Stage 2: 12 -> 14 -> 16 channels, pool
        self.conv3 = nn.Conv2d(12, 14, **conv_cfg)
        self.conv4 = nn.Conv2d(14, 16, **conv_cfg)
        self.pool2 = nn.MaxPool2d(**pool_cfg)

        # Stage 3: 16 -> 18 -> 20 channels, pool
        self.conv5 = nn.Conv2d(16, 18, **conv_cfg)
        self.conv6 = nn.Conv2d(18, 20, **conv_cfg)
        self.pool3 = nn.MaxPool2d(**pool_cfg)

        # Stage 4: 20 -> 22 -> 24 channels, pool
        self.conv7 = nn.Conv2d(20, 22, **conv_cfg)
        self.conv8 = nn.Conv2d(22, 24, **conv_cfg)
        self.pool4 = nn.MaxPool2d(**pool_cfg)

        # Classifier head: 24*34*15 -> 4000 -> 1000 -> 100 -> 10
        self.fc1 = nn.Linear(24 * 34 * 15, 4000)
        self.fc2 = nn.Linear(4000, 1000)
        self.fc3 = nn.Linear(1000, 100)
        self.fc4 = nn.Linear(100, 10)

    def forward(self, x):
        """Return the 10 raw class scores for a batch ``x``.

        No activation is applied to the final layer, so the output is suitable
        for a loss such as CrossEntropyLoss that applies softmax itself.
        """
        stages = (
            (self.conv1, self.conv2, self.pool1),
            (self.conv3, self.conv4, self.pool2),
            (self.conv5, self.conv6, self.pool3),
            (self.conv7, self.conv8, self.pool4),
        )
        # Each stage: conv + ReLU, conv + ReLU, then 2x2 max-pool
        for first_conv, second_conv, pool in stages:
            x = F.relu(first_conv(x))
            x = F.relu(second_conv(x))
            x = pool(x)

        # Flatten to (batch, 24*34*15) before the fully connected layers
        x = x.view(x.shape[0], self.fc1.in_features)

        # Hidden linear layers with ReLU; the output layer stays linear
        for hidden in (self.fc1, self.fc2, self.fc3):
            x = F.relu(hidden(x))
        return self.fc4(x)

# Build the network, then move its parameters to the selected device
model = MusicNet()
model = model.to(device)

# Show the layer-by-layer architecture
print(model)

MusicNet(
  (conv1): Conv2d(4, 8, kernel_size=(3, 3), stride=(1, 1), padding=same, groups=4)
  (conv2): Conv2d(8, 12, kernel_size=(3, 3), stride=(1, 1), padding=same)
  (pool1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv3): Conv2d(12, 14, kernel_size=(3, 3), stride=(1, 1), padding=same)
  (conv4): Conv2d(14, 16, kernel_size=(3, 3), stride=(1, 1), padding=same)
  (pool2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv5): Conv2d(16, 18, kernel_size=(3, 3), stride=(1, 1), padding=same)
  (conv6): Conv2d(18, 20, kernel_size=(3, 3), stride=(1, 1), padding=same)
  (pool3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv7): Conv2d(20, 22, kernel_size=(3, 3), stride=(1, 1), padding=same)
  (conv8): Conv2d(22, 24, kernel_size=(3, 3), stride=(1, 1), padding=same)
  (pool4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=12240, out_

In [7]:
# Restore the trained weights into a freshly built network.
model_path = "MusicGenreClassifier.pth"
model = MusicNet().to(device)
# map_location remaps the stored tensors onto `device`, so a checkpoint that was
# saved from a GPU still loads when this notebook has fallen back to the CPU.
# NOTE(review): weights_only=False performs full unpickling, which can execute
# arbitrary code from the file -- only load checkpoints from a trusted source.
# If the file holds a plain state_dict, weights_only=True should work -- TODO confirm.
model.load_state_dict(torch.load(model_path, map_location=device, weights_only=False))

<All keys matched successfully>

In [8]:
# Before loading music, make sure it is at least 6 seconds long and sampled at 22050 Hz, e.g.
# ffmpeg -i foo.mp3 -vn -acodec pcm_s16le -ac 1 -ar 22050 -f wav foo.wav
num_divisions = 5
clip_length = 132301          # samples per clip (6 s at 22050 Hz, plus one sample)
expected_sample_rate = 22050  # rate the clip length above is based on

sample_rate, samples = wav.read(path + "/" + name + ".wav") # extract audio

# Enforce the documented precondition: a different rate would change both the clip
# duration and the number of STFT frequency bins that are saved.
if sample_rate != expected_sample_rate:
    raise ValueError(
        f"Expected audio sampled at {expected_sample_rate} Hz but got {sample_rate} Hz; "
        "resample the file first (see the ffmpeg command above)."
    )

# convert to mono
if len(samples.shape) > 1:
    # Do a mean of all channels and keep it in one channel
    samples = np.mean(samples, axis=1)

# pad so divisible by num_divisions
# (this always appends between 1 and num_divisions zeros)
samples = np.append(samples, np.zeros(num_divisions - (len(samples) % num_divisions)))

# Split into consecutive, non-overlapping clips of clip_length samples.
# A clip is kept only when its end index is strictly below len(samples);
# any shorter remainder at the end is dropped.
clip_samples = [samples[end - clip_length: end]
                for end in range(clip_length, len(samples), clip_length)]

if not clip_samples:
    raise ValueError(
        f"Audio is too short: got {len(samples)} samples after padding, "
        f"need more than {clip_length} for a single clip."
    )

# The STFT configuration is identical for every clip, so build it once.
nperseg = sample_rate // 20           # make 20Hz minimum sampled frequency
SFT = signal.ShortTimeFFT.from_window(win_param='tukey',
                                      fs=sample_rate,
                                      nperseg=nperseg,
                                      noverlap=nperseg // 2,  # 50% overlap
                                      fft_mode='onesided',
                                      scale_to='magnitude',
                                      phase_shift=None,
                                      symmetric_win=True)
f = SFT.f

# perform STFT on each clip and save the result (Z), time axis (T) and frequency axis (F)
for i, sample in enumerate(clip_samples):
    Zxx = SFT.stft(sample)
    t = SFT.t(len(sample))
    clip_prefix = path + "/" + name + "_" + str(i)
    np.save(clip_prefix + "_Z.npy", Zxx)
    np.save(clip_prefix + "_T.npy", t)
    np.save(clip_prefix + "_F.npy", f)
    
    

In [9]:
# Genre names; a sample's integer label / prediction is used as the index into this list
genres = [
    "blues",
    "classical",
    "country",
    "disco",
    "hiphop",
    "jazz",
    "metal",
    "pop",
    "reggae",
    "rock",
]

In [10]:
# Testing loop
def music_classifier(model, music_loader, class_names=None):
    """Classify every sample in ``music_loader`` and print a per-sample report.

    Prints a header, one ``real : guess`` line per sample and finally the
    overall accuracy. If the loader yields no samples, a short notice is
    printed in place of the accuracy (which would otherwise divide by zero).

    Args:
        model: Network to evaluate; it is switched to eval mode.
        music_loader: Iterable of ``(inputs, labels)`` batches.
        class_names: Sequence mapping an integer label to a display name.
            When ``None`` the notebook-level ``genres`` list is used.

    Returns:
        None. All results are printed.
    """
    names = genres if class_names is None else class_names

    # Run on the device that holds the model's weights, not on a global variable,
    # so inputs and weights can never end up on different devices.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device('cpu')

    model.eval()  # Set model to evaluation mode
    correct = 0
    total = 0
    print("Real Genre : Guess")

    with torch.no_grad():
        for images, labels in music_loader:
            images, labels = images.to(run_device), labels.to(run_device)
            # Index of the highest score along the class dimension
            predicted = model(images).argmax(dim=1)

            # Print the real and predicted genre for each sample in the batch
            for label, pred in zip(labels.tolist(), predicted.tolist()):
                print(f"{names[label]:10} : {names[pred]:10}")

            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    if total == 0:
        print("No samples to classify; accuracy is undefined.")
        return

    accuracy = 100 * correct / total
    print(f'Test Accuracy of the model on the test dataset: {accuracy:.2f}%')

In [11]:
# Build the dataset for the chosen song and classify it one clip per batch, in file order
root_dir = "genres_user_test/"
music_dataset = NumpyDataset(root_dir, transform)

# Number of clips found for the song
print(len(music_dataset))

music_loader = DataLoader(music_dataset, batch_size=1, shuffle=False)

music_classifier(model, music_loader)

28
Real Genre : Guess
classical  : classical 
classical  : classical 
classical  : classical 
classical  : classical 
classical  : classical 
classical  : classical 
classical  : classical 
classical  : classical 
classical  : classical 
classical  : classical 
classical  : classical 
classical  : classical 
classical  : classical 
classical  : classical 
classical  : classical 
classical  : classical 
classical  : classical 
classical  : classical 
classical  : classical 
classical  : classical 
classical  : classical 
classical  : classical 
classical  : classical 
classical  : classical 
classical  : classical 
classical  : jazz      
classical  : classical 
classical  : classical 
Test Accuracy of the model on the test dataset: 96.43%
