In [2]:
import torch
import librosa
import os
import matplotlib.pyplot as plt
import csv
import pandas as pd

dataset_name = "HeartsOnFire-v.1.0.0"

df = pd.read_csv(os.path.join(dataset_name, "master.csv"))
train_df = pd.DataFrame()
valid_df = pd.DataFrame()

for idx, row in df.iterrows():
    onsets_path = row['onsets']
    onsets_df = pd.read_csv(onsets_path)
    train_onsets = onsets_df[onsets_df['split'] == 'training']
    valid_onsets = onsets_df[onsets_df['split'] == 'validation']
    
    # Append to respective dataframes
    train_df = pd.concat([train_df, train_onsets], ignore_index=True)
    valid_df = pd.concat([valid_df, valid_onsets], ignore_index=True)

train_df

Unnamed: 0,onset_time,name,changed,kick,snare,hihat,tom,crash,ride,click,uncertain,other,split
0,4.806531,Pantera - Walk,True,True,False,False,False,False,False,False,False,False,training
1,5.143220,Pantera - Walk,True,True,False,False,False,False,False,False,False,False,training
2,5.479909,Pantera - Walk,True,True,False,False,False,False,False,False,False,False,training
3,5.816599,Pantera - Walk,True,True,False,False,False,False,False,False,False,False,training
4,5.979138,Pantera - Walk,True,True,False,False,False,False,False,False,False,False,training
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1179,102.562540,Toehider - I Have Little To No Memory of These...,True,False,True,False,False,False,False,False,False,False,training
1180,102.887619,Toehider - I Have Little To No Memory of These...,True,True,False,False,False,False,False,False,False,False,training
1181,102.945669,Toehider - I Have Little To No Memory of These...,True,True,False,False,False,False,False,False,False,False,training
1182,103.026939,Toehider - I Have Little To No Memory of These...,True,True,False,False,False,False,False,False,False,False,training


In [9]:
import random
import torch
import torch.nn as nn
import torch.optim as optim
import torchaudio
from torch.utils.data import DataLoader
from torch.utils.data.dataset import Dataset
from torchvision import transforms
import numpy as np

# Define the dataset class
class DrumDataset(Dataset):
    def __init__(self, df, master_csv_path, transform, window_size=8192):
        self.df = df
        self.master_df = pd.read_csv(master_csv_path)
        self.window_size = window_size
        self.transform = transform
        self.audio_cache = {}

        for i, row in self.master_df.iterrows():
            audio_path = row['drums']
            audio, sr = torchaudio.load(audio_path, format="mp3")
            if audio.shape[0] == 2:
                audio = torch.mean(audio, dim=0, keepdim=True)
            self.audio_cache[row['name']] = (audio, sr)
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        # Load the onset time, label, and track name for the given index
        row = self.df.iloc[idx]
        onset_time = row['onset_time']
        labels = row[['kick', 'snare', 'hihat', 'tom', 'crash', 'ride', 'click']].astype(int).values.flatten()
        labels = torch.tensor(labels).float()
        track_name = row['name']

        audio = self.audio_cache[track_name][0]
        sr = self.audio_cache[track_name][1]

        onset_window = audio[:, int(onset_time*sr)-self.window_size//2:int(onset_time*sr)+self.window_size//2]
        spec = self.transform(onset_window[0])
        return spec, labels

transform = transforms.Compose([
    # transforms.RandomApply([
    #     torchaudio.transforms.PitchShift(sample_rate=44100, n_steps=random.uniform(-1, 1)),
    #     torchaudio.transforms.TimeStretch(hop_length=64, fixed_rate=random.uniform(0.8, 1.2)),
    # ], p=0.5),
    transforms.Lambda(lambda x: torch.stack([
            torchaudio.transforms.MelSpectrogram(
                sample_rate=44100,
                n_fft=2048,
                hop_length=64,
                n_mels=128
            )(x),
            torchaudio.transforms.Spectrogram(
                hop_length=64,
                n_fft=2048)(x)
        ], dim=0)),
    transforms.Lambda(lambda x: torch.stack([
            x[1],
            x[0],
            torchaudio.transforms.AmplitudeToDB()(x[0])
        ], dim=0))
])

#valid_dataset = DrumDataset(valid_df, "HeartsOnFire-v.1.0.0/master.csv", transform)
train_dataset = DrumDataset(train_df, "HeartsOnFire-v.1.0.0/master.csv", transform)

In [10]:
a, x = train_dataset.__getitem__(8)
specgram = a[0].squeeze(0).numpy()

# Display the spectrogram using matplotlib
plt.figure(figsize=(10, 4))
plt.imshow(specgram, cmap='Spectral_r', origin='lower')
plt.xlabel('Time')
plt.ylabel('Frequency')
plt.colorbar()
plt.show()

RuntimeError: stack expects each tensor to be equal size, but got [128, 129] at entry 0 and [1025, 129] at entry 1

In [12]:
# Define the neural network architecture
import torch.nn as nn

class DrumCNN(nn.Module):
    def __init__(self):
        super(DrumCNN, self).__init__()
        
        # Convolutional Layers
        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1)
        self.bn1 = nn.BatchNorm2d(16)
        self.relu1 = nn.ReLU()
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)
        
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1)
        self.bn2 = nn.BatchNorm2d(32)
        self.relu2 = nn.ReLU()
        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)
        
        self.conv3 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.bn3 = nn.BatchNorm2d(64)
        self.relu3 = nn.ReLU()
        self.pool3 = nn.MaxPool2d(kernel_size=2, stride=2)
        
        self.conv4 = nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1)
        self.bn4 = nn.BatchNorm2d(128)
        self.relu4 = nn.ReLU()
        self.pool4 = nn.MaxPool2d(kernel_size=2, stride=2)

        # Fully Connected Layers
        self.fc1 = nn.Linear(128 * 8 * 8, 512)
        self.bn5 = nn.BatchNorm1d(512)
        self.relu5 = nn.ReLU()
        self.dropout1 = nn.Dropout(p=0.5)
        
        self.fc2 = nn.Linear(512, 256)
        self.bn6 = nn.BatchNorm1d(256)
        self.relu6 = nn.ReLU()
        self.dropout2 = nn.Dropout(p=0.5)
        
        self.fc3 = nn.Linear(256, 7)
        self.softmax = nn.Softmax(dim=1)
    
    def forward(self, x):
        # Convolutional Layers
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu1(x)
        x = self.pool1(x)
        
        x = self.conv2(x)
        x = self.bn2(x)
        x = self.relu2(x)
        x = self.pool2(x)
        
        x = self.conv3(x)
        x = self.bn3(x)
        x = self.relu3(x)
        x = self.pool3(x)
        
        x = self.conv4(x)
        x = self.bn4(x)
        x = self.relu4(x)
        x = self.pool4(x)
        
        # Flatten
        x = x.view(x.size(0), -1)
        
        # Fully Connected Layers
        x = self.fc1(x)
        x = self.bn5(x)
        x = self.relu5(x)
        x = self.dropout1(x)
        
        x = self.fc2(x)
        x = self.bn6(x)
        x = self.relu6(x)
        x = self.dropout2(x)
        
        x = self.fc3(x)
        x = self.softmax(x)
        
        return x
    
x = x.view(x.size(0), -1)

def accuracy(outputs, labels, threshold=0.5):
    with torch.no_grad():
        preds = (outputs > threshold).float()
        correct = (preds == labels).sum(dim=1).eq(labels.shape[1]).sum().item()
        total = labels.shape[0]
        return 100 * correct / total

# Create data loaders for training and validation sets
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)

# Initialize the neural network and optimizer
device = torch.device('cuda')
model = DrumCNN().to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=2, factor=0.1)

# Define the loss function
criterion = nn.CrossEntropyLoss()

# Train the model
n_epochs = 10
for epoch in range(n_epochs):
    train_loss = 0.0
    train_acc = 0.0
    valid_loss = 0.0
    valid_acc = 0.0

    model.train()
    for i, (inputs, labels) in enumerate(train_loader):
        inputs = inputs.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item() * inputs.size(0)
        train_acc += accuracy(outputs, labels) * inputs.size(0)
    train_loss /= len(train_loader.dataset)
    train_acc /= len(train_loader.dataset)
    # Evaluate the model on the validation set
    model.eval()
    with torch.no_grad():
        for i, (inputs, labels) in enumerate(valid_loader):
            inputs = inputs.to(device)
            labels = labels.to(device).float()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            valid_loss += loss.item() * inputs.size(0)
            valid_acc += accuracy(outputs, labels) * inputs.size(0)
        valid_loss /= len(valid_loader.dataset)
        valid_acc /= len(valid_loader.dataset)
    
    # Update the learning rate based on the validation loss
    scheduler.step(valid_loss)
    
    # Print the training and validation loss and accuracy
    print(f'Epoch {epoch+1}/{n_epochs}, Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}%, Valid Loss: {valid_loss:.4f}, Valid Acc: {valid_acc:.2f}%')

RuntimeError: mat1 and mat2 shapes cannot be multiplied (32x32768 and 2048x256)