In [1]:
import os
import torch
import torchaudio
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

class ASVspoofDataset(Dataset):
    def __init__(self, protocol_file, flac_dir, sample_size=16000, device='cpu'):
        self.protocol_file = protocol_file
        self.flac_dir = flac_dir
        self.sample_size = sample_size
        self.device = device
        self.data = self._load_protocol()

        # Define MFCC transform using torchaudio
        self.mfcc_transform = torchaudio.transforms.MFCC(
            sample_rate=16000,
            n_mfcc=13,
            melkwargs={'n_fft': 400, 'hop_length': 160, 'n_mels': 40}
        ).to(self.device)

    def _load_protocol(self):
        with open(self.protocol_file, 'r') as f:
            lines = f.readlines()

        data = []
        for line in lines:
            parts = line.strip().split()
            if len(parts) < 5:
                continue
            file_name = parts[1]
            label = 0 if parts[4] == 'bonafide' else 1
            data.append((file_name, label))
        return data

    def _pad_audio(self, audio):
        if audio.size(1) < self.sample_size:
            pad_size = self.sample_size - audio.size(1)
            audio = torch.nn.functional.pad(audio, (0, pad_size))
        else:
            audio = audio[:, :self.sample_size]
        return audio

    def __getitem__(self, idx):
        file_name, label = self.data[idx]
        path = os.path.join(self.flac_dir, f"{file_name}.flac")

        try:
            waveform, sample_rate = torchaudio.load(path)
        except Exception as e:
            print(f"Error loading {path}: {e}")
            return torch.zeros((13, self.sample_size // 160)), -1

        waveform = self._pad_audio(waveform)
        waveform = waveform.to(self.device)

        mfcc = self.mfcc_transform(waveform)
        mfcc = mfcc.squeeze(0)

        return mfcc, label

    def __len__(self):
        return len(self.data)

class SimpleDNN(nn.Module):
    def __init__(self, input_dim, hidden_dim=128, output_dim=2):
        super(SimpleDNN, self).__init__()
        self.classifier = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim)
        )

    def forward(self, x):
        return self.classifier(x)

def train(model, dataloader, criterion, optimizer, device):
    model.train()
    total_loss = 0
    for x, y in tqdm(dataloader, desc="Training"):
        x = x.to(device)
        y = y.to(device)

        x = x.mean(dim=-1)  # Average over time dimension
        outputs = model(x)
        loss = criterion(outputs, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Training loss: {total_loss / len(dataloader):.4f}")

# Configs
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
protocol_path = "/home/rben10/team_labs/ASVSpoof19/LA/ASVspoof2019_LA_cm_protocols/ASVspoof2019.LA.cm.train.trn.txt"
flac_dir = "/home/rben10/team_labs/ASVSpoof19/LA/ASVspoof2019_LA_train/flac"
sample_size = 16000  # 1 second of audio

# Load data
dataset = ASVspoofDataset(protocol_path, flac_dir, sample_size=sample_size, device=device)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# Build and train model
model = SimpleDNN(input_dim=13).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

train(model, dataloader, criterion, optimizer, device)


/pytorch/aten/src/ATen/native/cuda/Loss.cu:242: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [0,0,0] Assertion `t >= 0 && t < n_classes` failed.
/pytorch/aten/src/ATen/native/cuda/Loss.cu:242: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [1,0,0] Assertion `t >= 0 && t < n_classes` failed.
/pytorch/aten/src/ATen/native/cuda/Loss.cu:242: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [2,0,0] Assertion `t >= 0 && t < n_classes` failed.
/pytorch/aten/src/ATen/native/cuda/Loss.cu:242: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [3,0,0] Assertion `t >= 0 && t < n_classes` failed.
/pytorch/aten/src/ATen/native/cuda/Loss.cu:242: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [4,0,0] Assertion `t >= 0 && t < n_classes` failed.
/pytorch/aten/src/ATen/native/cuda/Loss.cu:242: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [5,0,0] Assertion `t >= 0 && t < n_classes` failed.
/pytorch/aten/sr

Error loading /home/rben10/team_labs/ASVSpoof19/LA/ASVspoof2019_LA_train/flac/LA_T_8869771.flac: Error loading audio file: failed to open file /home/rben10/team_labs/ASVSpoof19/LA/ASVspoof2019_LA_train/flac/LA_T_8869771.flac
Error loading /home/rben10/team_labs/ASVSpoof19/LA/ASVspoof2019_LA_train/flac/LA_T_5968863.flac: Error loading audio file: failed to open file /home/rben10/team_labs/ASVSpoof19/LA/ASVspoof2019_LA_train/flac/LA_T_5968863.flac
Error loading /home/rben10/team_labs/ASVSpoof19/LA/ASVspoof2019_LA_train/flac/LA_T_8697779.flac: Error loading audio file: failed to open file /home/rben10/team_labs/ASVSpoof19/LA/ASVspoof2019_LA_train/flac/LA_T_8697779.flac
Error loading /home/rben10/team_labs/ASVSpoof19/LA/ASVspoof2019_LA_train/flac/LA_T_7574942.flac: Error loading audio file: failed to open file /home/rben10/team_labs/ASVSpoof19/LA/ASVspoof2019_LA_train/flac/LA_T_7574942.flac
Error loading /home/rben10/team_labs/ASVSpoof19/LA/ASVspoof2019_LA_train/flac/LA_T_3101729.flac: Err

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [2]:
import os
import torch
import torchaudio
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

class ASVspoofDataset(Dataset):
    def __init__(self, protocol_file, flac_dir, sample_size=16000, device='cpu'):
        """
        Args:
            protocol_file (str): Path to the protocol file containing file names and labels.
            flac_dir (str): Directory where FLAC audio files are stored.
            sample_size (int): Target sample size for the audio (default: 16000 for 1 second).
            device (str): The device to store the data ('cpu' or 'cuda').
        """
        self.protocol_file = protocol_file
        self.flac_dir = flac_dir
        self.sample_size = sample_size
        self.device = device
        self.data = self._load_protocol()

        # Define MFCC transform using torchaudio
        self.mfcc_transform = torchaudio.transforms.MFCC(
            sample_rate=16000,
            n_mfcc=13,
            melkwargs={'n_fft': 400, 'hop_length': 160, 'n_mels': 40}
        ).to(self.device)

    def _load_protocol(self):
        """
        Load the protocol file containing file names and labels.
        """
        with open(self.protocol_file, 'r') as f:
            lines = f.readlines()

        data = []
        for line in lines:
            parts = line.strip().split()
            if len(parts) < 5:
                continue
            file_name = parts[1]
            label = 0 if parts[4] == 'bonafide' else 1
            data.append((file_name, label))
        return data

    def _pad_audio(self, audio):
        """
        Pad or truncate the audio to the target sample size.
        """
        if audio.size(1) < self.sample_size:
            pad_size = self.sample_size - audio.size(1)
            audio = torch.nn.functional.pad(audio, (0, pad_size))
        else:
            audio = audio[:, :self.sample_size]
        return audio

    def __getitem__(self, idx):
        """
        Fetch the waveform and corresponding label for a given index.
        """
        file_name, label = self.data[idx]
        path = os.path.join(self.flac_dir, f"{file_name}.flac")

        try:
            waveform, sample_rate = torchaudio.load(path)
        except Exception as e:
            print(f"Error loading {path}: {e}")
            # Return a tensor of zeros (representing failed load)
            return torch.zeros((13, self.sample_size // 160)), -1

        waveform = self._pad_audio(waveform)
        waveform = waveform.to(self.device)

        mfcc = self.mfcc_transform(waveform)
        mfcc = mfcc.squeeze(0)  # Remove channel dimension if it's 1

        return mfcc, label

    def __len__(self):
        """
        Return the total number of samples in the dataset.
        """
        return len(self.data)


class SimpleDNN(nn.Module):
    def __init__(self, input_dim, hidden_dim=128, output_dim=2):
        """
        Simple feed-forward neural network for classification.
        
        Args:
            input_dim (int): The input dimension for the first layer (13 for MFCC features).
            hidden_dim (int): Number of neurons in the hidden layer.
            output_dim (int): Number of output classes (2: bonafide, spoof).
        """
        super(SimpleDNN, self).__init__()
        self.classifier = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim)
        )

    def forward(self, x):
        """
        Forward pass of the model.
        """
        return self.classifier(x)


def train(model, dataloader, criterion, optimizer, device):
    """
    Training loop.
    
    Args:
        model (nn.Module): The model to train.
        dataloader (DataLoader): DataLoader providing batches of data.
        criterion (nn.Module): The loss function.
        optimizer (torch.optim.Optimizer): The optimizer.
        device (str): The device ('cpu' or 'cuda').
    """
    model.train()
    total_loss = 0
    correct_preds = 0
    total_preds = 0

    # Loop through the training data
    for x, y in tqdm(dataloader, desc="Training"):
        x = x.to(device)
        y = y.to(device)

        # Average over time (dim=-1)
        x = x.mean(dim=-1)  # MFCC features averaged across time

        # Forward pass
        outputs = model(x)
        loss = criterion(outputs, y)

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Loss and accuracy calculation
        total_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        correct_preds += (predicted == y).sum().item()
        total_preds += y.size(0)

    # Print loss and accuracy
    avg_loss = total_loss / len(dataloader)
    accuracy = 100 * correct_preds / total_preds
    print(f"Training loss: {avg_loss:.4f}, Accuracy: {accuracy:.2f}%")

# Configs
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
protocol_path = "/home/rben10/team_labs/ASVSpoof19/LA/ASVspoof2019_LA_cm_protocols/ASVspoof2019.LA.cm.train.trn.txt"
flac_dir = "/home/rben10/team_labs/ASVSpoof19/LA/ASVspoof2019_LA_train/flac"
sample_size = 16000  # 1 second of audio

# Load data
dataset = ASVspoofDataset(protocol_path, flac_dir, sample_size=sample_size, device=device)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, num_workers=4)

# Build and train model
model = SimpleDNN(input_dim=13).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train the model
train(model, dataloader, criterion, optimizer, device)


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [3]:
import os
import torch
import torchaudio
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

class ASVspoofDataset(Dataset):
    def __init__(self, protocol_file, flac_dir, sample_size=16000, device='cpu'):
        """
        Dataset class for loading the ASVspoof dataset.

        Args:
            protocol_file (str): Path to the protocol file.
            flac_dir (str): Directory containing the FLAC audio files.
            sample_size (int): The fixed size to pad or truncate the audio.
            device (str): The device to run the model on ('cpu' or 'cuda').
        """
        self.protocol_file = protocol_file
        self.flac_dir = flac_dir
        self.sample_size = sample_size
        self.device = device
        self.data = self._load_protocol()

        # Define MFCC transform using torchaudio
        self.mfcc_transform = torchaudio.transforms.MFCC(
            sample_rate=16000,
            n_mfcc=13,
            melkwargs={'n_fft': 400, 'hop_length': 160, 'n_mels': 40}
        ).to(self.device)

    def _load_protocol(self):
        """
        Load the protocol file and extract the filenames and labels.

        Returns:
            data (list): List of tuples (filename, label)
        """
        with open(self.protocol_file, 'r') as f:
            lines = f.readlines()

        data = []
        for line in lines:
            parts = line.strip().split()
            if len(parts) < 5:
                continue
            file_name = parts[1]
            label = 0 if parts[4] == 'bonafide' else 1  # bonafide -> 0, spoof -> 1
            data.append((file_name, label))
        return data

    def _pad_audio(self, audio):
        """
        Pad or truncate the audio to a fixed sample size.

        Args:
            audio (Tensor): Audio waveform tensor.

        Returns:
            Tensor: Padded or truncated audio tensor.
        """
        if audio.size(1) < self.sample_size:
            pad_size = self.sample_size - audio.size(1)
            audio = torch.nn.functional.pad(audio, (0, pad_size))
        else:
            audio = audio[:, :self.sample_size]
        return audio

    def __getitem__(self, idx):
        """
        Fetch the waveform and corresponding label for a given index.

        Args:
            idx (int): Index of the item to fetch.

        Returns:
            Tuple: (MFCC features, label) or None if there's an error.
        """
        file_name, label = self.data[idx]
        path = os.path.join(self.flac_dir, f"{file_name}.flac")

        try:
            waveform, sample_rate = torchaudio.load(path)
        except Exception as e:
            print(f"Error loading {path}: {e}")
            # Skip invalid files by returning None
            return None

        waveform = self._pad_audio(waveform)
        waveform = waveform.to(self.device)

        # Apply MFCC transform
        mfcc = self.mfcc_transform(waveform)
        mfcc = mfcc.squeeze(0)  # Remove the channel dimension if it's 1

        return mfcc, label

    def __len__(self):
        """
        Return the number of samples in the dataset.

        Returns:
            int: Number of samples in the dataset.
        """
        return len(self.data)


class SimpleDNN(nn.Module):
    def __init__(self, input_dim, hidden_dim=128, output_dim=2):
        """
        Simple fully connected neural network for classification.

        Args:
            input_dim (int): The dimension of the input features.
            hidden_dim (int): The number of units in the hidden layer.
            output_dim (int): The number of output classes.
        """
        super(SimpleDNN, self).__init__()
        self.classifier = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim)
        )

    def forward(self, x):
        """
        Forward pass through the network.

        Args:
            x (Tensor): Input features.

        Returns:
            Tensor: Predicted class probabilities.
        """
        return self.classifier(x)


def train(model, dataloader, criterion, optimizer, device):
    """
    Training loop.

    Args:
        model (nn.Module): The model to train.
        dataloader (DataLoader): DataLoader providing batches of data.
        criterion (nn.Module): The loss function.
        optimizer (torch.optim.Optimizer): The optimizer.
        device (str): The device ('cpu' or 'cuda').
    """
    model.train()
    total_loss = 0
    correct_preds = 0
    total_preds = 0

    # Loop through the training data
    for x, y in tqdm(dataloader, desc="Training"):
        if x is None:  # Skip invalid data points
            continue

        x = x.to(device)
        y = y.to(device)

        # Average over time (dim=-1)
        x = x.mean(dim=-1)  # MFCC features averaged across time

        # Forward pass
        outputs = model(x)
        loss = criterion(outputs, y)

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Loss and accuracy calculation
        total_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        correct_preds += (predicted == y).sum().item()
        total_preds += y.size(0)

    # Print loss and accuracy
    avg_loss = total_loss / len(dataloader)
    accuracy = 100 * correct_preds / total_preds
    print(f"Training loss: {avg_loss:.4f}, Accuracy: {accuracy:.2f}%")


# Configs
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
protocol_path = "/home/rben10/team_labs/ASVSpoof19/LA/ASVspoof2019_LA_cm_protocols/ASVspoof2019.LA.cm.train.trn.txt"
flac_dir = "/home/rben10/team_labs/ASVSpoof19/LA/ASVspoof2019_LA_train/flac"
sample_size = 16000  # 1 second of audio

# Load data
dataset = ASVspoofDataset(protocol_path, flac_dir, sample_size=sample_size, device=device)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# Build and train model
model = SimpleDNN(input_dim=13).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

train(model, dataloader, criterion, optimizer, device)


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
