### Imports

In [1]:
import scipy.io.wavfile as wav
import scipy.signal as signal
from matplotlib import pyplot as plt
import os
import numpy as np

import torch
import torch.nn as nn
import torchvision.transforms as transforms
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import matplotlib.pyplot as plt
from time import time
from sklearn.model_selection import train_test_split
from torch.utils.data import Subset
import torch.nn.functional as F

### Generate Images

In [2]:
path = "genres_original"
genres = ["blues", "classical", "country", "disco", "hiphop", "jazz", "metal", "pop", "reggae", "rock"]
save_path = "genres_images"
max_per_genre = 280


def pad_to_multiple(samples, multiple=3):
    """Zero-pad a 1-D sample array so that its length is a multiple of ``multiple``.

    Nothing is appended when the length is already a multiple. The result is
    always a new floating-point array, because the padding is built with
    ``np.zeros`` and joined with ``np.append``.

    Args:
        samples: 1-D array of audio samples.
        multiple: Required divisor of the output length.

    Returns:
        A float array of length ``ceil(len(samples) / multiple) * multiple``.
    """
    pad = (-len(samples)) % multiple
    return np.append(samples, np.zeros(pad))


def generate_spectrograms(src_root=path, dst_root=save_path, genre_names=genres,
                          limit=max_per_genre, clips_per_file=3,
                          skip_files=("jazz.00054.wav",)):
    """Convert the WAV files of each genre into STFT arrays saved as ``.npy`` files.

    Every audio file is mixed down to mono, zero-padded, and cut into
    ``clips_per_file`` equal clips. For clip ``k`` of a genre, three files are
    written to ``<dst_root>/<genre>/``: ``<genre>_<k>_Z.npy`` (complex STFT),
    ``<genre>_<k>_T.npy`` (slice times) and ``<genre>_<k>_F.npy`` (frequencies).

    Args:
        src_root: Directory holding one sub-directory of WAV files per genre.
        dst_root: Directory receiving one sub-directory of ``.npy`` files per genre.
        genre_names: Genre sub-directory names to process.
        limit: Maximum number of audio files converted per genre.
        clips_per_file: Number of equal-length clips cut from each audio file.
        skip_files: File names that are never read.
            NOTE(review): "jazz.00054.wav" is presumably unreadable - confirm.
    """
    for genre in genre_names:
        genre_src = os.path.join(src_root, genre)
        genre_dst = os.path.join(dst_root, genre)
        # np.save does not create missing directories.
        os.makedirs(genre_dst, exist_ok=True)

        # Sorted so that the clip numbering is the same on every run and platform.
        files = sorted(f for f in os.listdir(genre_src)
                       if os.path.isfile(os.path.join(genre_src, f)))
        print("Generating " + genre)

        n = 0  # number of files converted so far for this genre
        for file in files:
            if n >= limit:
                break
            if file in skip_files:
                continue

            sample_rate, samples = wav.read(os.path.join(genre_src, file))

            # Convert to mono by averaging all channels.
            if samples.ndim > 1:
                samples = np.mean(samples, axis=1)

            # np.split needs a length divisible by the number of clips.
            samples = pad_to_multiple(samples, clips_per_file)

            # The transform only depends on the sample rate, so build it once per file.
            # A window of 1/20 s gives a 20 Hz frequency resolution; windows overlap by half.
            nperseg = sample_rate // 20
            SFT = signal.ShortTimeFFT.from_window(win_param='tukey',
                                                  fs=sample_rate,
                                                  nperseg=nperseg,
                                                  noverlap=nperseg // 2,
                                                  fft_mode='onesided',
                                                  scale_to='magnitude',
                                                  phase_shift=None,
                                                  symmetric_win=True)

            for i, clip in enumerate(np.split(samples, clips_per_file, axis=0)):
                stem = os.path.join(genre_dst, genre + "_" + str(n * clips_per_file + i))
                np.save(stem + "_Z.npy", SFT.stft(clip))
                np.save(stem + "_T.npy", SFT.t(len(clip)))
                np.save(stem + "_F.npy", SFT.f)
            n += 1


# Generation is slow and only needed once; change the condition to True to (re)build the arrays.
if (False):
    generate_spectrograms()
    
    

### Run CNN

In [3]:
# Select the compute device. The notebook is pinned to the CPU; to use a GPU when one
# is present, build the device from 'cuda' if torch.cuda.is_available() else 'cpu'.
device = torch.device("cpu")
print(f'Using device: {device}')

Using device: cpu


In [4]:
# Custom Dataset class
class NumpyDataset(Dataset):
    """Dataset of STFT arrays stored as ``.npy`` files, one sub-directory per class.

    Each clip is described by three files sharing a common stem: ``<stem>Z.npy``
    (complex STFT), ``<stem>T.npy`` (time axis) and ``<stem>F.npy`` (frequency axis).
    Only the Z file is used for training; T and F are used for plotting.

    Class labels are the positions of the sub-directory names in sorted order, so
    the mapping is the same on every run and platform. Entries of ``root_dir``
    that are not directories are ignored and do not consume a label.

    Attributes:
        root_dir: Directory that was scanned.
        transform: Optional callable applied to each tensor in ``__getitem__``.
        classes: Sorted class sub-directory names; ``classes[label]`` is the class name.
        Zsamples, Tsamples, Fsamples: Lists of ``(file_path, label)`` tuples. The three
            lists are index-aligned: entry ``i`` of each refers to the same clip. T and F
            paths are derived from the Z file name and are not checked for existence.
    """

    def __init__(self, root_dir, transform=None):
        """Scan ``root_dir`` and record the file paths and label of every clip.

        Args:
            root_dir: Directory containing one sub-directory per class.
            transform: Optional callable applied to the tensor of each sample.
        """
        self.root_dir = root_dir
        self.transform = transform
        self.Zsamples = []
        self.Tsamples = []
        self.Fsamples = []

        # Only directories are classes; sorting makes the label mapping deterministic.
        self.classes = sorted(
            entry for entry in os.listdir(root_dir)
            if os.path.isdir(os.path.join(root_dir, entry))
        )

        z_suffix = "Z.npy"
        for label, class_dir in enumerate(self.classes):
            class_path = os.path.join(root_dir, class_dir)
            for file_name in sorted(os.listdir(class_path)):
                if not file_name.endswith(z_suffix):
                    continue
                # Derive the companion paths from the Z file so that all three lists
                # stay aligned regardless of directory listing order.
                stem = file_name[:-len(z_suffix)]
                self.Zsamples.append((os.path.join(class_path, file_name), label))
                self.Tsamples.append((os.path.join(class_path, stem + "T.npy"), label))
                self.Fsamples.append((os.path.join(class_path, stem + "F.npy"), label))

    def __len__(self):
        """Return the number of clips (one per Z file)."""
        return len(self.Zsamples)

    def __getitem__(self, idx):
        """Load clip ``idx`` and return ``(tensor, label)``.

        The stored array is reduced to its magnitude, given a leading axis of
        size 1, converted to a float32 tensor, and passed through ``transform``
        when one was supplied.
        """
        file_path, label = self.Zsamples[idx]
        data = np.load(file_path)  # Load numpy array
        data = np.expand_dims(data, axis=0)  # add a leading (channel) axis
        data = np.abs(data)  # take magnitude only
        data = torch.tensor(data, dtype=torch.float32)  # Convert to PyTorch tensor

        if self.transform:
            data = self.transform(data)

        return data, label

    def spectrograms(self, idx):
        """Plot the log-magnitude and the phase angle of clip ``idx``.

        Draws two stacked panels using the clip's T and F arrays as axes.

        Raises:
            FileNotFoundError: If the T or F companion file of the clip is missing.
        """
        Zdata = np.load(self.Zsamples[idx][0])
        Tdata = np.load(self.Tsamples[idx][0])
        Fdata = np.load(self.Fsamples[idx][0])

        # Create a 2x1 subplot grid (two panels stacked vertically)
        fig, axes = plt.subplots(2, 1, figsize=(10, 16))

        # First subplot: log of the magnitude
        c1 = axes[0].pcolormesh(Tdata, Fdata, np.log(np.abs(Zdata)), cmap='gnuplot')
        fig.colorbar(c1, ax=axes[0])
        axes[0].set_title("Spectrogram Magnitude")

        # Second subplot: phase angle
        c2 = axes[1].pcolormesh(Tdata, Fdata, np.angle(Zdata), cmap='gnuplot')
        fig.colorbar(c2, ax=axes[1])
        axes[1].set_title("Spectrogram Angle")

In [5]:
# Preprocessing applied to every spectrogram tensor returned by the dataset:
# - CenterCrop: keeps the central 552 x 400 region so that all inputs share one size
#   (the three 2x2 max-pools in MusicNet then reduce it to 69 x 50).
#   NOTE(review): presumably 552 frequency bins by 400 time slices - confirm axis order.
# - Normalize: subtracts 0.5 and divides by 0.5 on the single channel.
transform = transforms.Compose(
    [
        transforms.CenterCrop(size=(552, 400)),
        transforms.Normalize(mean=(0.5,), std=(0.5,)),
    ]
)

In [6]:
root_dir = "genres_images/"
dataset = NumpyDataset(root_dir=root_dir, transform=transform)

# 80/20 train/test split. A seeded generator makes the partition (and therefore the
# reported test accuracy) reproducible across kernel restarts.
# NOTE(review): the split is done per clip, and each audio file contributes three clips,
# so clips of the same recording can land in both subsets - consider splitting per file.
total_count = len(dataset)
train_count = int(0.8 * total_count)
test_count = total_count - train_count
split_generator = torch.Generator().manual_seed(42)
train_dataset, test_dataset = torch.utils.data.random_split(
    dataset, (train_count, test_count), generator=split_generator
)
print(len(train_dataset))
print(len(test_dataset))

6720
1680


In [7]:
#dataset[0][0].shape

In [8]:
#dataset.spectrograms(0)

In [9]:
#dataset.spectrograms(1)

In [10]:
# Wrap the train/test subsets in DataLoaders that yield mini-batches of 64 samples.
# Only the training data is reshuffled each epoch; the test order stays fixed.
root_dir = "genres_images/"

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

print('Data loaders created successfully.')

Data loaders created successfully.


In [11]:
# Define the model
class MusicNet(nn.Module):
    """CNN classifier: three (conv, conv, max-pool) stages followed by four linear layers.

    All convolutions are 3x3 with stride 1 and 'same' padding, so only the 2x2
    max-pools change the spatial size (halving it three times). The first linear
    layer expects 24*69*50 features, i.e. a single-channel input whose spatial
    size pools down to 69 x 50 (for example 552 x 400). The output has 10 logits.
    """

    def __init__(self):
        super().__init__()

        # Stage 1: 1 -> 4 -> 8 channels, then 2x2 max-pool
        self.conv1 = nn.Conv2d(1, 4, kernel_size=3, stride=1, padding='same')
        self.conv2 = nn.Conv2d(4, 8, kernel_size=3, stride=1, padding='same')
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)

        # Stage 2: 8 -> 12 -> 16 channels, then 2x2 max-pool
        self.conv3 = nn.Conv2d(8, 12, kernel_size=3, stride=1, padding='same')
        self.conv4 = nn.Conv2d(12, 16, kernel_size=3, stride=1, padding='same')
        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)

        # Stage 3: 16 -> 20 -> 24 channels, then 2x2 max-pool
        self.conv5 = nn.Conv2d(16, 20, kernel_size=3, stride=1, padding='same')
        self.conv6 = nn.Conv2d(20, 24, kernel_size=3, stride=1, padding='same')
        self.pool3 = nn.MaxPool2d(kernel_size=2, stride=2)

        # Classifier head: 82800 -> 5000 -> 1000 -> 100 -> 10
        self.fc1 = nn.Linear(24 * 69 * 50, 5000)
        self.fc2 = nn.Linear(5000, 1000)
        self.fc3 = nn.Linear(1000, 100)
        self.fc4 = nn.Linear(100, 10)

    def forward(self, x):
        """Return the 10 class logits for a batch ``x``.

        No activation is applied to the output; a loss that works on raw
        logits (such as ``nn.CrossEntropyLoss``) is expected downstream.
        """
        conv_stages = (
            (self.conv1, self.conv2, self.pool1),
            (self.conv3, self.conv4, self.pool2),
            (self.conv5, self.conv6, self.pool3),
        )
        for first_conv, second_conv, pool in conv_stages:
            x = F.relu(first_conv(x))
            x = F.relu(second_conv(x))
            x = pool(x)

        # Flatten to (batch, features) before the fully connected layers
        x = x.view(x.shape[0], 24 * 69 * 50)

        for hidden in (self.fc1, self.fc2, self.fc3):
            x = F.relu(hidden(x))

        return self.fc4(x)

# Build the network and place its parameters on the selected device
model = MusicNet()
model = model.to(device)

# Show the layer-by-layer architecture
print(model)

MusicNet(
  (conv1): Conv2d(1, 4, kernel_size=(3, 3), stride=(1, 1), padding=same)
  (conv2): Conv2d(4, 8, kernel_size=(3, 3), stride=(1, 1), padding=same)
  (pool1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv3): Conv2d(8, 12, kernel_size=(3, 3), stride=(1, 1), padding=same)
  (conv4): Conv2d(12, 16, kernel_size=(3, 3), stride=(1, 1), padding=same)
  (pool2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv5): Conv2d(16, 20, kernel_size=(3, 3), stride=(1, 1), padding=same)
  (conv6): Conv2d(20, 24, kernel_size=(3, 3), stride=(1, 1), padding=same)
  (pool3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=82800, out_features=5000, bias=True)
  (fc2): Linear(in_features=5000, out_features=1000, bias=True)
  (fc3): Linear(in_features=1000, out_features=100, bias=True)
  (fc4): Linear(in_features=100, out_features=10, bias=True)
)


In [12]:
# Loss: cross-entropy on raw logits (the softmax is applied inside the loss).
criterion = nn.CrossEntropyLoss()
# Optimizer: Adam over all model parameters with a learning rate of 1e-3.
optimizer = optim.Adam(params=model.parameters(), lr=0.001)

print('Model, loss function, and optimizer are set up.')

Model, loss function, and optimizer are set up.


In [13]:
# Training loop
def train_model(model, train_loader, criterion, optimizer, epochs=5):
    """Train ``model`` for ``epochs`` passes over ``train_loader``.

    After each epoch the mean batch loss is printed; the total wall-clock time
    is printed at the end. Nothing is returned; ``model`` is updated in place
    and left in training mode.

    Args:
        model: Network to train.
        train_loader: Iterable yielding ``(inputs, labels)`` tensor batches.
        criterion: Callable ``criterion(outputs, labels)`` returning a scalar loss tensor.
        optimizer: Optimizer bound to ``model``'s parameters.
        epochs: Number of passes over ``train_loader``.
    """
    time_start = time()
    model.train()  # Set model to training mode

    # Send batches to the device the model actually lives on instead of relying on a
    # notebook-level global that could be out of sync with the model.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    for epoch in range(epochs):
        total_loss = 0.0
        batch_count = 0  # counted here so loaders without __len__ also work
        for images, labels in train_loader:
            images, labels = images.to(target_device), labels.to(target_device)

            # Forward pass
            outputs = model(images)
            loss = criterion(outputs, labels)

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            batch_count += 1

        # TODO: early stopping (e.g. on a validation loss) would be checked here.
        # An empty loader yields no batches; report nan rather than dividing by zero.
        avg_loss = total_loss / batch_count if batch_count else float('nan')
        print(f'Epoch [{epoch+1}/{epochs}], Loss: {avg_loss:.4f}')

    time_stop = time()
    time_elapsed = time_stop - time_start
    print(f'elapsed time {round(time_elapsed,1)} sec.')

# NOTE(review): the saved output below reports 6 epochs, so it comes from an earlier
# run with a different `epochs` value - re-run this cell to refresh it.
train_model(
    model=model,
    train_loader=train_loader,
    criterion=criterion,
    optimizer=optimizer,
    epochs=10,
)

Epoch [1/6], Loss: 2.4145
Epoch [2/6], Loss: 1.7026
Epoch [3/6], Loss: 1.3647
Epoch [4/6], Loss: 1.1233
Epoch [5/6], Loss: 0.9354
Epoch [6/6], Loss: 0.6977
elapsed time 5514.4 sec.


In [14]:
# Testing loop
def test_model(model, test_loader):
    model.eval()  # Set model to evaluation mode
    correct = 0
    total = 0

    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = 100 * correct / total
    print(f'Test Accuracy of the model on the test dataset: {accuracy:.2f}%')

# Evaluate the trained network on the held-out split
test_model(model=model, test_loader=test_loader)

Test Accuracy of the model on the test dataset: 56.01%
