In [1]:
import os
import torch
import torchaudio
import numpy as np
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import librosa
import random
from scipy.io import wavfile
from scipy.fft import fft, fftfreq

In [2]:
# Run on the GPU when PyTorch can see one; otherwise stay on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cpu')

In [3]:
class AudioDataset(Dataset):
    """Dataset of fixed-length audio segments returned as FFT magnitude spectra.

    Every sub-directory of ``root_dir`` is treated as one class (genre). Each
    ``.wav`` file contributes ``SEGMENTS_PER_FILE`` consecutive segments of
    ``segment_length`` samples. Audio beyond the last segment is ignored, and
    segments that run past the end of a file are zero-padded.
    """

    # Number of consecutive segments every file contributes to the dataset.
    SEGMENTS_PER_FILE = 6
    # Maximum number of ``.wav`` files collected from a single genre folder.
    FILE_LIMIT = 289

    def __init__(self, root_dir, sample_rate=16000, segment_length=5*16000, exclude_files=None, train_ratio=0.2):
        """
        Args:
            root_dir (str): Root directory with genre folders containing audio files.
            sample_rate (int): Target sample rate for audio files.
            segment_length (int): Length of each audio segment in samples
                (the default, 5 * 16000, is 5 seconds at the default sample rate).
            exclude_files (list[str] | None): File names (not full paths) to skip
                in every genre folder.
            train_ratio (float): Accepted for backward compatibility only; it is
                not used and no train/test split is performed here.
        """
        self.root_dir = root_dir
        self.sample_rate = sample_rate
        self.segment_length = segment_length
        self.exclude_files = exclude_files if exclude_files else []

        # Only directories are genres. Stray files in root_dir (README, .DS_Store, ...)
        # would otherwise be listed as classes and crash the scan below.
        self.classes = sorted(
            entry for entry in os.listdir(root_dir)
            if os.path.isdir(os.path.join(root_dir, entry))
        )

        # Collect file paths and corresponding labels
        file_paths = []
        labels = []
        for label_idx, genre in enumerate(self.classes):
            genre_folder = os.path.join(root_dir, genre)
            collected = 0
            for file_name in os.listdir(genre_folder):
                if collected == self.FILE_LIMIT:
                    break
                if file_name.endswith(".wav") and file_name not in self.exclude_files:
                    file_paths.append(os.path.join(genre_folder, file_name))
                    labels.append(label_idx)
                    collected += 1

        # Shuffle paths and labels together so they stay aligned.
        combined = list(zip(file_paths, labels))
        random.shuffle(combined)
        if combined:
            self.file_paths, self.labels = zip(*combined)
        else:
            # zip(*[]) yields nothing to unpack; an empty dataset is valid, not an error.
            self.file_paths, self.labels = (), ()

    def __len__(self):
        """Return the number of segments: files times ``SEGMENTS_PER_FILE``."""
        return len(self.file_paths) * self.SEGMENTS_PER_FILE

    def load_audio_with_fallback(self, file_path):
        """
        Load an audio file with torchaudio and resample it to ``self.sample_rate``.

        NOTE: despite the name, no librosa fallback is implemented; any error
        raised by ``torchaudio.load`` propagates to the caller.

        Returns:
            The waveform tensor returned by ``torchaudio.load``, resampled if
            the file's sample rate differs from ``self.sample_rate``.
        """
        waveform, sr = torchaudio.load(file_path)
        if sr != self.sample_rate:
            waveform = torchaudio.transforms.Resample(orig_freq=sr, new_freq=self.sample_rate)(waveform)
        return waveform

    def __getitem__(self, idx):
        """
        Get one audio segment as an FFT magnitude spectrum, plus its label.

        Args:
            idx (int): Index of the audio segment to retrieve.

        Returns:
            tuple: ``(spectrum, label)`` where ``spectrum`` is a float32 tensor of
            shape ``(1, segment_length // 2)`` and ``label`` is the class index.
            Returns ``None`` if the loader returned ``None``.
        """
        # Map the flat index onto (file, segment-within-file).
        file_idx, segment_idx = divmod(idx, self.SEGMENTS_PER_FILE)

        # Load the waveform
        audio_path = self.file_paths[file_idx]
        waveform = self.load_audio_with_fallback(audio_path)
        if waveform is None:
            # The loader above never returns None; this guard only matters if
            # load_audio_with_fallback is overridden or replaced.
            print(f"Skipping {audio_path} due to load failure.")
            return None

        # Ensure mono audio
        if waveform.size(0) > 1:
            waveform = waveform.mean(dim=0, keepdim=True)

        # Slice out this segment; slicing past the end yields a short (or empty) tensor.
        start = segment_idx * self.segment_length
        end = start + self.segment_length
        waveform_segment = waveform[:, start:end]

        # If segment is shorter than segment_length, pad with zeros
        num_samples = waveform_segment.size(1)
        if num_samples < self.segment_length:
            padding = self.segment_length - num_samples
            waveform_segment = torch.nn.functional.pad(waveform_segment, (0, padding))

        # Magnitude spectrum; only the first half of the bins is kept.
        waveform_np = waveform_segment.numpy()
        fourier_transform = np.abs(fft(waveform_np[0]))[:self.segment_length // 2]

        # Retrieve label
        label = self.labels[file_idx]

        # unsqueeze adds the channel dimension expected by Conv1d.
        return torch.tensor(fourier_transform, dtype=torch.float32).unsqueeze(0), label

In [4]:
# Define another 1D CNN model for Fourier-transformed audio data
class AudioCNN(nn.Module):
    """1D CNN that classifies a single-channel spectrum into ``num_classes`` classes.

    The forward pass uses four conv/ReLU/max-pool stages followed by four fully
    connected layers. ``conv5``/``pool5`` are registered as sub-modules but are
    not part of the forward pass (see the note in ``__init__``).

    Args:
        input_length (int): Number of values along the last axis of each input.
        num_classes (int): Size of the output layer (default 10).
    """

    def __init__(self, input_length=5*16000 // 2, num_classes=10):
        super(AudioCNN, self).__init__()

        # First convolutional layer and pooling
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=8, kernel_size=3, stride=1, padding='same')
        self.pool1 = nn.MaxPool1d(kernel_size=2, stride=2)

        # Second convolutional layer and pooling
        self.conv2 = nn.Conv1d(in_channels=8, out_channels=16, kernel_size=3, stride=1, padding='same')
        self.pool2 = nn.MaxPool1d(kernel_size=2, stride=2)

        # Third convolutional layer and pooling
        self.conv3 = nn.Conv1d(in_channels=16, out_channels=32, kernel_size=3, stride=1, padding='same')
        self.pool3 = nn.MaxPool1d(kernel_size=2, stride=2)

        # Fourth convolutional layer and pooling
        self.conv4 = nn.Conv1d(in_channels=32, out_channels=64, kernel_size=3, stride=1, padding='same')
        self.pool4 = nn.MaxPool1d(kernel_size=2, stride=2)

        # Fifth convolutional layer and pooling.
        # NOTE: forward() has never applied this stage. It stays registered so the
        # module's state_dict keeps its conv5.* keys and strict load_state_dict of
        # existing checkpoints continues to work. Wiring it into forward() would
        # change what fc1 receives (presumably invalidating trained weights).
        self.conv5 = nn.Conv1d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding='same')
        self.pool5 = nn.MaxPool1d(kernel_size=2, stride=2)

        # Size fc1 from the exact feature path that forward() uses. The previous
        # sizing pass also ran conv5/pool5, which only matched forward() when the
        # length after pool4 was even (true for the default input_length).
        with torch.no_grad():
            dummy_input = torch.zeros(1, 1, input_length)  # Batch size 1, 1 channel, input length
            out = self._extract_features(dummy_input)
            self.flattened_size = out.shape[1] * out.shape[2]  # Channels * Width

        # Define fully connected layers using the computed flattened size
        self.fc1 = nn.Linear(in_features=self.flattened_size, out_features=3200)
        self.fc2 = nn.Linear(in_features=3200, out_features=1600)
        self.fc3 = nn.Linear(in_features=1600, out_features=800)
        self.fc4 = nn.Linear(in_features=800, out_features=num_classes)

    def _extract_features(self, x):
        """Apply the four conv -> ReLU -> max-pool stages used by the forward pass."""
        x = self.pool1(F.relu(self.conv1(x)))
        x = self.pool2(F.relu(self.conv2(x)))
        x = self.pool3(F.relu(self.conv3(x)))
        x = self.pool4(F.relu(self.conv4(x)))
        return x

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)`` for input ``(batch, 1, input_length)``."""
        # Pass through convolutional and pooling layers
        x = self._extract_features(x)

        # Flatten the tensor to match the input of the fully connected layer
        x = x.view(x.size(0), -1)  # Flatten all dimensions except batch
        x = F.relu(self.fc1(x))    # Pass through first fully connected layer
        x = F.relu(self.fc2(x))    # Pass through second fully connected layer
        x = F.relu(self.fc3(x))    # Pass through third fully connected layer
        x = self.fc4(x)            # Output layer (raw logits)
        return x

In [5]:
# Build a fresh network and load the trained weights into it.
model_path = "MusicGenreClassifier1D_M4.pth"
model = AudioCNN().to(device)
# map_location remaps tensors saved on another device (e.g. a checkpoint written
# on a CUDA machine and opened on a CPU-only one) onto `device`.
# NOTE(security): weights_only=False lets torch.load unpickle arbitrary Python
# objects. Only load checkpoints from a trusted source; if the file holds a plain
# state_dict, weights_only=True is the safer setting.
model.load_state_dict(torch.load(model_path, map_location=device, weights_only=False))
print(model)

AudioCNN(
  (conv1): Conv1d(1, 8, kernel_size=(3,), stride=(1,), padding=same)
  (pool1): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv1d(8, 16, kernel_size=(3,), stride=(1,), padding=same)
  (pool2): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv3): Conv1d(16, 32, kernel_size=(3,), stride=(1,), padding=same)
  (pool3): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv4): Conv1d(32, 64, kernel_size=(3,), stride=(1,), padding=same)
  (pool4): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv5): Conv1d(64, 128, kernel_size=(3,), stride=(1,), padding=same)
  (pool5): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=160000, out_features=3200, bias=True)
  (fc2): Linear(in_features=3200, out_features=1600, bias=True)
  (fc3): Linear(in_features=1600, out_features=800, bias=True)
  (fc4): Linea

In [7]:
# Evaluation loop to calculate accuracy
def evaluate_single_file(model, file_path, label, classes, device, segment_length=5*16000, sample_rate=16000):
    """Classify every segment of one audio file and print per-segment results.

    The file is loaded with torchaudio, resampled to ``sample_rate`` if needed,
    mixed down to mono, and cut into consecutive ``segment_length``-sample
    segments (the last one zero-padded). Each segment's FFT magnitude spectrum
    (first half of the bins) is fed to ``model``; the predicted class name is
    printed per segment, followed by the share of segments matching ``label``.

    Args:
        model: Network called on tensors of shape ``(1, 1, segment_length // 2)``.
        file_path (str): Path of the audio file to evaluate.
        label (str): True class name; must be an element of ``classes``.
        classes (list[str]): Class names, indexed by the model's output index.
        device: Device the input tensors are moved to before the forward pass.
        segment_length (int): Segment size in samples.
        sample_rate (int): Sample rate the audio is resampled to.

    Returns:
        None. Results are printed.

    Raises:
        ValueError: If ``label`` is not in ``classes``.
    """
    model.eval()  # Set the model to evaluation mode
    print(f"Evaluating file: {file_path}, True Label: {label}")

    label_index = classes.index(label)

    # Load the file directly. Previously a whole AudioDataset was built over a
    # hard-coded "genres_test/" directory just to borrow its loader, which
    # scanned that tree on every call and failed if it did not exist.
    waveform, sr = torchaudio.load(file_path)
    if sr != sample_rate:
        waveform = torchaudio.transforms.Resample(orig_freq=sr, new_freq=sample_rate)(waveform)

    # Ensure mono
    if waveform.size(0) > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # Number of segments, rounded up so a trailing partial segment is included.
    num_segments = (waveform.size(1) + segment_length - 1) // segment_length
    if num_segments == 0:
        # No samples means no predictions; avoid dividing by zero below.
        print("No audio samples found in file; accuracy is undefined.")
        return

    correct = 0

    with torch.no_grad():  # Disable gradient calculation
        for segment_idx in range(num_segments):
            start = segment_idx * segment_length
            end = start + segment_length
            waveform_segment = waveform[:, start:end]

            # Pad if necessary
            if waveform_segment.size(1) < segment_length:
                padding = segment_length - waveform_segment.size(1)
                waveform_segment = torch.nn.functional.pad(waveform_segment, (0, padding))

            # Magnitude spectrum; only the first half of the bins is kept.
            waveform_np = waveform_segment.numpy()
            fourier_transform = np.abs(fft(waveform_np[0]))[:segment_length // 2]

            # Add batch and channel dimensions: (1, 1, segment_length // 2).
            input_tensor = torch.tensor(fourier_transform, dtype=torch.float32).unsqueeze(0).unsqueeze(0).to(device)

            # Forward pass
            output = model(input_tensor)
            _, predicted = torch.max(output.data, 1)
            predicted_index = predicted.item()

            # Check and update correctness
            if predicted_index == label_index:
                correct += 1

            # Print prediction for the segment
            print(f"Segment {segment_idx + 1}: Predicted Genre: {classes[predicted_index]}")

    # Calculate and display accuracy
    accuracy = 100 * correct / num_segments
    print(f"Accuracy: {accuracy:.2f}%")

In [8]:
# Song under test and its ground-truth genre.
path, label = "genres_test/rock/RockOrBust.wav", "rock"

# Reference list of genre names; the evaluation call below takes its class
# names from the dataset instead.
genres = [
    "blues", "classical", "country", "disco", "hiphop",
    "jazz", "metal", "pop", "reggae", "rock",
]

# Dataset (and a batch-size-1 loader) over the test folder; nothing is excluded.
root_dir = "genres_test/"
music_dataset = AudioDataset(root_dir, exclude_files=[])
music_loader = DataLoader(music_dataset, batch_size=1, shuffle=False)

# Classify every segment of the chosen song and print the per-segment results.
evaluate_single_file(model, path, label, music_dataset.classes, device)

Evaluating file: genres_test/rock/RockOrBust.wav, True Label: rock
Segment 1: Predicted Genre: metal
Segment 2: Predicted Genre: metal
Segment 3: Predicted Genre: rock
Segment 4: Predicted Genre: rock
Segment 5: Predicted Genre: metal
Segment 6: Predicted Genre: metal
Segment 7: Predicted Genre: metal
Segment 8: Predicted Genre: metal
Segment 9: Predicted Genre: metal
Segment 10: Predicted Genre: metal
Segment 11: Predicted Genre: rock
Segment 12: Predicted Genre: rock
Segment 13: Predicted Genre: metal
Segment 14: Predicted Genre: metal
Segment 15: Predicted Genre: rock
Segment 16: Predicted Genre: metal
Segment 17: Predicted Genre: metal
Segment 18: Predicted Genre: metal
Segment 19: Predicted Genre: metal
Segment 20: Predicted Genre: rock
Segment 21: Predicted Genre: rock
Segment 22: Predicted Genre: metal
Segment 23: Predicted Genre: metal
Segment 24: Predicted Genre: metal
Segment 25: Predicted Genre: metal
Segment 26: Predicted Genre: metal
Segment 27: Predicted Genre: metal
Segm