In [3]:
import os
import torch
import librosa
import numpy as np
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

# Dataset class to handle audio loading and feature extraction
class AudioDataset(Dataset):
    def __init__(self, file_list, label_dict, sr=16000, max_len=16000):
        """
        Args:
            file_list: List of file paths to audio files.
            label_dict: Dictionary mapping file names to labels.
            sr: Sampling rate.
            max_len: Maximum length of audio in samples (padding/truncating length).
        """
        self.file_list = file_list
        self.label_dict = label_dict
        self.sr = sr
        self.max_len = max_len

    def __len__(self):
        return len(self.file_list)

    def __getitem__(self, idx):
        # Get audio file path
        path = self.file_list[idx]
        # Get the corresponding label from label dictionary
        label = self.label_dict[os.path.basename(path)]
        
        # Load the audio file using librosa
        try:
            y, _ = librosa.load(path, sr=self.sr)
        except Exception as e:
            print(f"Error loading {path}: {e}")
            y = np.zeros(self.max_len)  # If error, return a zero-filled array

        # Truncate or pad the audio signal
        if len(y) > self.max_len:
            y = y[:self.max_len]
        else:
            y = np.pad(y, (0, self.max_len - len(y)))

        # Extract MFCC features (13 MFCCs by default)
        mfcc = librosa.feature.mfcc(y=y, sr=self.sr, n_mfcc=13)
        mfcc = torch.tensor(mfcc, dtype=torch.float32)
        
        return mfcc.T, torch.tensor(label, dtype=torch.long)  # Transpose for time x features

# Custom collate function to pad feature sequences
def collate_fn(batch):
    features, labels = zip(*batch)
    features = [torch.tensor(f) for f in features]
    features_padded = pad_sequence(features, batch_first=True)  # Shape: [batch_size, max_seq_len, features]
    labels = torch.stack(labels)  # Stack labels into a tensor
    return features_padded, labels

# Example file list and label dictionary (you will provide these)
file_list = ["/path/to/audio1.wav", "/path/to/audio2.wav"]  # Replace with actual paths
label_dict = {"audio1.wav": 0, "audio2.wav": 1}  # Example labels

# Create the dataset
dataset = AudioDataset(file_list=file_list, label_dict=label_dict, sr=16000, max_len=16000)

# DataLoader to batch the data
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)

# Define a simple neural network model
class SimpleModel(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(SimpleModel, self).__init__()
        self.lstm = nn.LSTM(input_dim, 128, batch_first=True)
        self.fc = nn.Linear(128, output_dim)
    
    def forward(self, x):
        lstm_out, _ = self.lstm(x)
        x = lstm_out[:, -1, :]  # Take the last time step output
        x = self.fc(x)
        return x

# Initialize the model, criterion, and optimizer
model = SimpleModel(input_dim=13, output_dim=2)  # 13 MFCCs as input, 2 output classes
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train the model
def train(model, dataloader, criterion, optimizer, device):
    model.to(device)
    model.train()
    total_loss = 0
    
    # Iterate over batches
    for x, y in tqdm(dataloader, desc="Training"):
        x, y = x.to(device), y.to(device)
        
        # Zero gradients, forward pass, calculate loss, and backward pass
        optimizer.zero_grad()
        outputs = model(x)
        loss = criterion(outputs, y)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    avg_loss = total_loss / len(dataloader)
    print(f"Training Loss: {avg_loss:.4f}")

# Run training
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train(model, dataloader, criterion, optimizer, device)


  y, _ = librosa.load(path, sr=self.sr)
  features = [torch.tensor(f) for f in features]
Training: 100%|██████████████████████████████████████████| 1/1 [00:00<00:00, 129.55it/s]

Error loading /path/to/audio2.wav: [Errno 2] No such file or directory: '/path/to/audio2.wav'
Error loading /path/to/audio1.wav: [Errno 2] No such file or directory: '/path/to/audio1.wav'
Training Loss: 0.7057





In [4]:
!pip install soundfile audioread




In [5]:
import os
import torch
import librosa
import numpy as np
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

# Dataset class to handle audio loading and feature extraction
class AudioDataset(Dataset):
    def __init__(self, file_list, label_dict, sr=16000, max_len=16000):
        """
        Args:
            file_list: List of file paths to audio files.
            label_dict: Dictionary mapping file names to labels.
            sr: Sampling rate.
            max_len: Maximum length of audio in samples (padding/truncating length).
        """
        self.file_list = file_list
        self.label_dict = label_dict
        self.sr = sr
        self.max_len = max_len

    def __len__(self):
        return len(self.file_list)

    def __getitem__(self, idx):
        # Get audio file path
        path = self.file_list[idx]
        # Get the corresponding label from label dictionary
        label = self.label_dict[os.path.basename(path)]
        
        # Load the audio file using librosa
        try:
            y, _ = librosa.load(path, sr=self.sr)
        except Exception as e:
            print(f"Error loading {path}: {e}")
            y = np.zeros(self.max_len)  # If error, return a zero-filled array

        # Truncate or pad the audio signal
        if len(y) > self.max_len:
            y = y[:self.max_len]
        else:
            y = np.pad(y, (0, self.max_len - len(y)))

        # Extract MFCC features (13 MFCCs by default)
        mfcc = librosa.feature.mfcc(y=y, sr=self.sr, n_mfcc=13)
        mfcc = torch.tensor(mfcc, dtype=torch.float32)
        
        return mfcc.T, torch.tensor(label, dtype=torch.long)  # Transpose for time x features

# Custom collate function to pad feature sequences
def collate_fn(batch):
    features, labels = zip(*batch)
    features = [f.detach().clone() for f in features]  # Fix for the tensor warning
    features_padded = pad_sequence(features, batch_first=True)  # Shape: [batch_size, max_seq_len, features]
    labels = torch.stack(labels)  # Stack labels into a tensor
    return features_padded, labels

# Example file list and label dictionary (replace these with actual files)
file_list = ["./data/audio1.wav", "./data/audio2.wav"]  # Replace with actual paths
label_dict = {"audio1.wav": 0, "audio2.wav": 1}  # Example labels

# Create the dataset
dataset = AudioDataset(file_list=file_list, label_dict=label_dict, sr=16000, max_len=16000)

# DataLoader to batch the data
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)

# Define a simple neural network model
class SimpleModel(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(SimpleModel, self).__init__()
        self.lstm = nn.LSTM(input_dim, 128, batch_first=True)
        self.fc = nn.Linear(128, output_dim)
    
    def forward(self, x):
        lstm_out, _ = self.lstm(x)
        x = lstm_out[:, -1, :]  # Take the last time step output
        x = self.fc(x)
        return x

# Initialize the model, criterion, and optimizer
model = SimpleModel(input_dim=13, output_dim=2)  # 13 MFCCs as input, 2 output classes
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train the model
def train(model, dataloader, criterion, optimizer, device):
    model.to(device)
    model.train()
    total_loss = 0
    
    # Iterate over batches
    for x, y in tqdm(dataloader, desc="Training"):
        x, y = x.to(device), y.to(device)
        
        # Zero gradients, forward pass, calculate loss, and backward pass
        optimizer.zero_grad()
        outputs = model(x)
        loss = criterion(outputs, y)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    avg_loss = total_loss / len(dataloader)
    print(f"Training Loss: {avg_loss:.4f}")

# Run training
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train(model, dataloader, criterion, optimizer, device)


  y, _ = librosa.load(path, sr=self.sr)
Training: 100%|███████████████████████████████████████████| 1/1 [00:00<00:00, 35.68it/s]

Error loading ./data/audio1.wav: [Errno 2] No such file or directory: './data/audio1.wav'
Error loading ./data/audio2.wav: [Errno 2] No such file or directory: './data/audio2.wav'
Training Loss: 0.7692



