In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
import pickle
import os
import numpy as np
import pandas as pd

In [2]:
class MFCCDataset(Dataset):
    """Dataset of pre-computed MFCC feature files listed in a metadata CSV.

    The CSV must provide the columns ``fold``, ``class`` and ``mfcc_path``.
    Each ``mfcc_path`` points to a pickle file containing a dict whose
    ``'features'`` entry is array-like (a NumPy array in the usual case).
    """

    def __init__(self, metadata_path, fold_numbers, transform=None):
        """
        Args:
            metadata_path (string): Path to the metadata CSV file
            fold_numbers (list): List of fold numbers to include (e.g., [1,2,3] for training)
            transform (callable, optional): Optional transform to be applied
                to the feature tensor returned by ``__getitem__``
        """
        metadata = pd.read_csv(metadata_path)

        # Build the label mapping from the FULL metadata, before filtering by
        # fold. Deriving it from the filtered rows would give datasets built
        # from different folds different class -> index mappings whenever a
        # fold happens to miss a class, silently corrupting labels.
        self.classes = sorted(metadata['class'].unique())
        self.class_to_idx = {cls: idx for idx, cls in enumerate(self.classes)}

        self.metadata = metadata[metadata['fold'].isin(fold_numbers)]
        self.transform = transform

    def __len__(self):
        """Return the number of metadata rows in the selected folds."""
        return len(self.metadata)

    def __getitem__(self, idx):
        """Load one sample.

        Args:
            idx (int or torch.Tensor): Positional index into the filtered metadata.

        Returns:
            tuple: ``(features, label)`` where ``features`` is a float32 tensor
            with a leading channel dimension added and ``label`` is a 0-dim
            ``torch.long`` tensor holding the class index.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        # Positional lookup: the filtered frame keeps its original index labels.
        row = self.metadata.iloc[idx]

        # NOTE(security): pickle.load can execute arbitrary code. Only use this
        # dataset with feature files produced by a trusted preprocessing step.
        with open(row['mfcc_path'], 'rb') as f:
            mfcc_data = pickle.load(f)

        # Convert to float32 regardless of the dtype the features were stored in.
        features = torch.as_tensor(mfcc_data['features'], dtype=torch.float32)

        # Add channel dimension (PyTorch expects channels first).
        # Assumes 2-D features (n_mfcc, time_steps) -> (1, n_mfcc, time_steps).
        features = features.unsqueeze(0)

        # Create the label directly with the dtype CrossEntropyLoss expects.
        label = torch.tensor(self.class_to_idx[row['class']], dtype=torch.long)

        if self.transform:
            features = self.transform(features)

        return features, label


In [3]:
def get_data_loaders(batch_size=32, metadata_path='processed_data/mfcc_metadata.csv',
                     num_workers=4, pin_memory=None):
    """Build the train/validation/test loaders from the MFCC metadata file.

    Folds 1-8 are used for training, fold 9 for validation and fold 10 for
    testing. Only the training loader shuffles.

    Args:
        batch_size (int): Batch size used by all three loaders.
        metadata_path (str): Path to the metadata CSV created during
            preprocessing.
        num_workers (int): Worker processes per loader. Use 0 to load in the
            main process (handy for debugging or restricted environments).
        pin_memory (bool, optional): Whether loaders pin host memory. When
            ``None`` (default) it is enabled only if CUDA is available, which
            avoids the PyTorch warning on CPU-only machines.

    Returns:
        tuple: ``(train_loader, val_loader, test_loader, classes)`` where
        ``classes`` is the class list of the training dataset.
    """
    if pin_memory is None:
        pin_memory = torch.cuda.is_available()

    # Define which folds to use for each set
    train_folds = [1, 2, 3, 4, 5, 6, 7, 8]  # ~80% of data if folds are equally sized
    val_folds = [9]                          # ~10%
    test_folds = [10]                        # ~10%

    # Create datasets
    train_dataset = MFCCDataset(metadata_path, train_folds)
    val_dataset = MFCCDataset(metadata_path, val_folds)
    test_dataset = MFCCDataset(metadata_path, test_folds)

    # Options shared by all loaders; only the shuffle flag differs.
    loader_kwargs = dict(batch_size=batch_size, num_workers=num_workers,
                         pin_memory=pin_memory)

    train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
    val_loader = DataLoader(val_dataset, shuffle=False, **loader_kwargs)
    test_loader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)

    return train_loader, val_loader, test_loader, train_dataset.classes

In [4]:
import torch.nn as nn
import torch.nn.functional as F

class AudioCNN(nn.Module):
    """Compact 2-D CNN classifier: three conv stages, global pooling, small MLP head.

    The shape comments below are for a single-channel input of spatial size
    (40, 87); other spatial sizes work as well because the features are
    globally average-pooled before the classifier.
    """

    @staticmethod
    def _conv_stage(in_channels, out_channels):
        """Return a 3x3 same-padded conv -> batch norm -> ReLU -> 2x2 max-pool stage."""
        return nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )

    def __init__(self, num_classes):
        """
        Args:
            num_classes (int): Number of output logits produced by the classifier.
        """
        super().__init__()

        # Each stage halves both spatial dimensions (rounding down).
        self.conv1 = self._conv_stage(1, 16)    # (1, 40, 87)  -> (16, 20, 43)
        self.conv2 = self._conv_stage(16, 32)   # (16, 20, 43) -> (32, 10, 21)
        self.conv3 = self._conv_stage(32, 64)   # (32, 10, 21) -> (64, 5, 10)

        # Collapse whatever spatial extent remains to a single value per channel.
        self.global_avg_pool = nn.AdaptiveAvgPool2d((1, 1))  # -> (64, 1, 1)

        # Small classifier head with dropout between the two linear layers.
        self.fc_out = nn.Sequential(
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(32, num_classes),
        )

    def forward(self, x):
        """Map a batch of shape (batch, 1, H, W) to logits of shape (batch, num_classes)."""
        for stage in (self.conv1, self.conv2, self.conv3):
            x = stage(x)
        pooled = self.global_avg_pool(x)
        flat = pooled.flatten(start_dim=1)  # (batch, 64)
        return self.fc_out(flat)

In [5]:
def _run_epoch(model, loader, criterion, device, optimizer=None):
    """Run one pass over ``loader``: train if ``optimizer`` is given, otherwise evaluate.

    Returns:
        tuple: ``(mean_loss, accuracy_percent)`` where the loss is averaged
        per sample, so a short final batch is not over-weighted.

    Raises:
        ValueError: If ``loader`` yields no samples.
    """
    training = optimizer is not None
    model.train(training)  # train(False) is equivalent to eval()

    loss_sum = 0.0
    correct = 0
    total = 0

    with torch.set_grad_enabled(training):
        for inputs, labels in loader:
            inputs, labels = inputs.to(device), labels.to(device)

            if training:
                optimizer.zero_grad()

            outputs = model(inputs)
            loss = criterion(outputs, labels)

            if training:
                loss.backward()
                optimizer.step()

            # criterion returns the batch mean; scale back to a per-sample sum.
            n_samples = labels.size(0)
            loss_sum += loss.item() * n_samples
            correct += (outputs.argmax(dim=1) == labels).sum().item()
            total += n_samples

    if total == 0:
        raise ValueError('Data loader yielded no samples; cannot compute loss/accuracy')

    return loss_sum / total, 100 * correct / total


def train_model(train_loader, val_loader, model, device, num_epochs=30,
                lr=0.001, checkpoint_path='best_model.pth'):
    """Train ``model`` with Adam and cross-entropy, checkpointing the best epoch.

    After each epoch the model is evaluated on ``val_loader``; the learning
    rate is reduced when the validation loss plateaus, and the model's
    ``state_dict`` is written to ``checkpoint_path`` whenever the validation
    loss improves.

    Args:
        train_loader: Iterable of ``(inputs, labels)`` training batches.
        val_loader: Iterable of ``(inputs, labels)`` validation batches.
        model (nn.Module): Model to optimise; expected to already be on ``device``.
        device (torch.device): Device each batch is moved to.
        num_epochs (int): Number of passes over ``train_loader``.
        lr (float): Initial Adam learning rate.
        checkpoint_path (str): File the best ``state_dict`` is saved to.

    Returns:
        dict: Per-epoch lists under the keys ``'train_loss'``, ``'train_acc'``,
        ``'val_loss'`` and ``'val_acc'`` (accuracies in percent).

    Raises:
        ValueError: If either loader yields no samples.
    """
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=3)

    best_val_loss = float('inf')
    history = {'train_loss': [], 'train_acc': [], 'val_loss': [], 'val_acc': []}

    for epoch in range(num_epochs):
        train_loss, train_acc = _run_epoch(model, train_loader, criterion, device, optimizer)
        val_loss, val_acc = _run_epoch(model, val_loader, criterion, device)

        history['train_loss'].append(train_loss)
        history['train_acc'].append(train_acc)
        history['val_loss'].append(val_loss)
        history['val_acc'].append(val_acc)

        # Update learning rate
        scheduler.step(val_loss)

        # Print statistics
        print(f'Epoch {epoch+1}/{num_epochs}:')
        print(f'Train Loss: {train_loss:.4f}, Acc: {train_acc:.2f}%')
        print(f'Val Loss: {val_loss:.4f}, Acc: {val_acc:.2f}%')

        # Save best model (a NaN validation loss never compares as an improvement)
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), checkpoint_path)
            print('Saved new best model')

    print('Training complete')
    return history

In [6]:
def main():
    """Train an AudioCNN on the MFCC dataset and report test-set accuracy.

    Builds the data loaders, trains for 50 epochs (which writes the best
    weights to ``best_model.pth``), reloads those weights and prints the
    accuracy on the test loader.

    Raises:
        ValueError: If the test loader yields no samples.
    """
    # Set device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Training on device: {device}")

    # Get data loaders
    train_loader, val_loader, test_loader, classes = get_data_loaders(batch_size=32)
    print(f'Found {len(classes)} classes: {classes}')

    # Initialize model
    model = AudioCNN(num_classes=len(classes)).to(device)

    # Train model
    train_model(train_loader, val_loader, model, device, num_epochs=50)

    # Evaluate on test set using the best checkpoint. map_location keeps the
    # load working when the checkpoint was written on a different device
    # (e.g. saved on GPU, evaluated on a CPU-only machine).
    model.load_state_dict(torch.load('best_model.pth', map_location=device))
    model.eval()

    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            predicted = model(inputs).argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    if total == 0:
        raise ValueError('Test loader yielded no samples; cannot compute accuracy')

    print(f'Test Accuracy: {100 * correct / total:.2f}%')

In [7]:
# Run the full pipeline: build loaders, train, then evaluate on the test set.
main()

Training on device: cuda
Found 10 classes: ['air_conditioner', 'car_horn', 'children_playing', 'dog_bark', 'drilling', 'engine_idling', 'gun_shot', 'jackhammer', 'siren', 'street_music']
Epoch 1/50:
Train Loss: 2.0252, Acc: 25.54%
Val Loss: 1.7606, Acc: 39.34%
Saved new best model
Epoch 2/50:
Train Loss: 1.6351, Acc: 40.12%
Val Loss: 1.5177, Acc: 43.63%
Saved new best model
Epoch 3/50:
Train Loss: 1.4680, Acc: 46.65%
Val Loss: 1.4896, Acc: 42.16%
Saved new best model
Epoch 4/50:
Train Loss: 1.3498, Acc: 50.95%
Val Loss: 1.3395, Acc: 54.04%
Saved new best model
Epoch 5/50:
Train Loss: 1.2537, Acc: 54.09%
Val Loss: 1.4046, Acc: 56.74%
Epoch 6/50:
Train Loss: 1.2043, Acc: 56.35%
Val Loss: 1.3644, Acc: 56.00%
Epoch 7/50:
Train Loss: 1.1542, Acc: 58.68%
Val Loss: 1.5663, Acc: 45.71%
Epoch 8/50:
Train Loss: 1.1249, Acc: 60.25%
Val Loss: 1.2713, Acc: 60.17%
Saved new best model
Epoch 9/50:
Train Loss: 1.0754, Acc: 62.06%
Val Loss: 1.4727, Acc: 53.19%
Epoch 10/50:
Train Loss: 1.0460, Acc: 62.3