In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
import os
import librosa
import numpy as np

In [93]:
import zipfile
from pathlib import Path
import requests

def download_and_extract_zip(url: str, destination: str, remove_source: bool = True) -> Path:
    """Download a zip archive and unpack it under the local ``data/`` folder.

    The download is skipped when ``data/<destination>`` already exists.

    Args:
        url: Address of the zip archive. The last path component of the URL is
            used as the temporary archive file name inside ``data/``.
        destination: Sub-folder of ``data/`` that represents this dataset.
        remove_source: Delete the downloaded archive after extraction.

    Returns:
        ``Path("data") / destination``.

    Raises:
        requests.HTTPError: The server answered with an error status.
        zipfile.BadZipFile: The downloaded payload is not a valid zip archive.
    """
    data_folder = Path('data/')
    destination_path = data_folder / destination

    # Test the dataset's own folder rather than the shared `data/` parent:
    # otherwise any unrelated content in `data/` suppresses the download and
    # the function returns a path that does not exist.
    if destination_path.is_dir():
        print(f"Data folder {destination_path} already exists.")
        return destination_path

    print(f"[INFO] Downloading data from {url} to {data_folder}")
    data_folder.mkdir(parents=True, exist_ok=True)
    archive_path = data_folder / Path(url).name

    # Fetch first and fail loudly on HTTP errors, so an error page is never
    # written to disk and later misreported as a corrupt zip.
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    archive_path.write_bytes(response.content)

    try:
        # NOTE(review): the archive is unpacked into `data/`, not into
        # `data/<destination>`; this presumably relies on the zip containing
        # its own top-level folder - confirm against the archive layout.
        with zipfile.ZipFile(archive_path, "r") as zip_ref:
            zip_ref.extractall(data_folder)
    finally:
        if remove_source:
            os.remove(archive_path)

    # Created only after a successful extraction so that a failed run is not
    # mistaken for a completed download on the next call.
    destination_path.mkdir(parents=True, exist_ok=True)

    return destination_path

In [94]:
try:
    from datasets import load_dataset
except:
    %pip -q install datasets
    from datasets import load_dataset

ds = load_dataset("danavery/urbansound8K")

In [95]:
# Keep the split names around and report which ones the dataset ships with.
available_splits = ds.keys()
print("Available splits:", available_splits)

Available splits: dict_keys(['train'])


In [96]:
# Show the column schema of the 'train' split (the only split reported above).
ds['train'].features

{'audio': Audio(sampling_rate=None, mono=True, decode=True, id=None),
 'slice_file_name': Value(dtype='string', id=None),
 'fsID': Value(dtype='int64', id=None),
 'start': Value(dtype='float64', id=None),
 'end': Value(dtype='float64', id=None),
 'salience': Value(dtype='int64', id=None),
 'fold': Value(dtype='int64', id=None),
 'classID': Value(dtype='int64', id=None),
 'class': Value(dtype='string', id=None)}

In [109]:
# Build train / validation / test datasets from the single 'train' split.
# Preferred: the predefined `fold` column (folds 1-8 train, 9 validation, 10 test).
# Fallback: a seeded random 80 / 10 / 10 split when no `fold` column is present.
if "train" in ds.keys():
    full_dataset = ds["train"]
    print(f"Full dataset size: {len(full_dataset)}")

    if "fold" in full_dataset.features:
        print("Fold column exists. Splitting dataset into predefined folds.")

        # `input_columns` hands the predicate only the fold value. Passing the
        # whole row instead makes `filter` materialise every column, which
        # decodes each audio clip once per filter call.
        train_dataset = full_dataset.filter(
            lambda fold: fold <= 8, input_columns=["fold"]
        )
        val_dataset = full_dataset.filter(
            lambda fold: fold == 9, input_columns=["fold"]
        )
        test_dataset = full_dataset.filter(
            lambda fold: fold == 10, input_columns=["fold"]
        )

        print(f"Train dataset size: {len(train_dataset)}")
        print(f"Validation dataset size: {len(val_dataset)}")
        print(f"Test dataset size: {len(test_dataset)}")

        # The three fold ranges should partition the dataset exactly.
        if len(train_dataset) + len(val_dataset) + len(test_dataset) == len(
            full_dataset
        ):
            print("Data split successfully.")
        else:
            print(
                "ERROR: Sum of split doesn't match the full dataset. Check the folds again."
            )

    else:
        print("ERROR: 'fold' column not found. Performing a random split.")

        # Split full dataset
        # Training : 80%
        # Temp Test: 20%
        train_test_split = full_dataset.train_test_split(test_size=0.2, seed=42)
        train_dataset = train_test_split["train"]
        test_temp_dataset = train_test_split["test"]

        # Split temp test dataset
        # val_dataset: 50%
        # test_dataset: 50%
        val_test_split = test_temp_dataset.train_test_split(test_size=0.5, seed=42)
        val_dataset = val_test_split["train"]
        test_dataset = val_test_split["test"]

        print(f"Train dataset size: {len(train_dataset)}")
        print(f"Validation dataset size: {len(val_dataset)}")
        print(f"Test dataset size: {len(test_dataset)}")

else:
    print("ERROR: 'train' split not found. Please check the dataset.")
    # Later cells test these names against None before using them.
    train_dataset = None
    val_dataset = None
    test_dataset = None
    print("The dataset has ", ds.keys())

Full dataset size: 8732
Fold column exists. Splitting dataset into predefined folds.
Train dataset size: 7079
Validation dataset size: 816
Test dataset size: 837
Data split successfully.


In [110]:
# Report the size of each split, one per line.
print(
    f"Train dataset size: {len(train_dataset)}",
    f"Validation dataset size: {len(val_dataset)}",
    f"Test dataset size: {len(test_dataset)}",
    sep="\n",
)

Train dataset size: 7079
Validation dataset size: 816
Test dataset size: 837


In [111]:
# Peek at the first training record: decoded audio plus its metadata columns.
train_dataset[0]

{'audio': {'path': '100032-3-0-0.wav',
  'array': array([-0.00454712, -0.00483704, -0.00460815, ..., -0.00065613,
         -0.00048828,  0.        ], shape=(14004,)),
  'sampling_rate': 44100},
 'slice_file_name': '100032-3-0-0.wav',
 'fsID': 100032,
 'start': 0.0,
 'end': 0.317551,
 'salience': 1,
 'fold': 5,
 'classID': 3,
 'class': 'dog_bark'}

In [112]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import torchaudio.transforms as T
import torchvision.transforms as TV
import numpy as np

# 1. Preprocessing parameters

# Mel-spectrogram geometry: number of mel bins, FFT window size, hop between frames.
N_MELS, N_FFT, HOP_LENGTH = 128, 2048, 512

# Sample rate the spectrogram transform is configured for.
TARGET_SAMPLE_RATE = 16_000

# Image-side settings used by the resize and normalisation transforms.
VIT_INPUT_SIZE = (224, 224)
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]

# SpecAugment mask widths.
# NOTE(review): the masking transforms built in the next cell use the literals
# 30 and 40 instead of these two values - confirm which pair is intended.
FREQ_MASK_PARAM = 80
TIME_MASK_PARAM = 100



In [113]:
# --- Waveform -> log-mel spectrogram ---------------------------------------
# Power mel spectrogram configured for audio at TARGET_SAMPLE_RATE.
mel_spectrogram_transform = T.MelSpectrogram(
    n_mels=N_MELS,
    n_fft=N_FFT,
    hop_length=HOP_LENGTH,
    sample_rate=TARGET_SAMPLE_RATE,
    power=2.0,
)

# Converts the power spectrogram to decibels with top_db=80.
amplitude_to_db_transform = T.AmplitudeToDB(top_db=80.0, stype="power")

# --- SpecAugment (training only) ---------------------------------------------
# NOTE(review): FREQ_MASK_PARAM / TIME_MASK_PARAM are defined in the config
# cell as 80 / 100 but are not used here - confirm which values are intended.
time_mask_transform = T.TimeMasking(time_mask_param=40)
freq_mask_transform = T.FrequencyMasking(freq_mask_param=30)


# Adapter that makes single-channel spectrograms compatible with vision models
# expecting three input channels.
class HandleChannels(nn.Module):
    """Give a spectrogram a leading channel axis and expand one channel to three.

    A 2-D input ``[n_mels, time_steps]`` first gains a leading axis. If the
    leading axis then has size 1 the data is repeated three times along it,
    yielding ``[3, n_mels, time_steps]``. Inputs whose leading axis is not 1
    are returned unchanged.
    """

    def forward(self, spec: torch.Tensor) -> torch.Tensor:
        # Promote [n_mels, time_steps] to [1, n_mels, time_steps].
        with_channel = spec.unsqueeze(0) if spec.ndim == 2 else spec

        # Anything that is not single-channel passes straight through.
        if with_channel.shape[0] != 1:
            return with_channel

        # Copy the single channel three times (mimicking RGB).
        return with_channel.repeat(3, 1, 1)


handle_channels_transform = HandleChannels()

# Image-side transforms: resize to the model input size, then normalise.
# NOTE(review): the values entering Normalize come from AmplitudeToDB (decibels),
# while IMAGENET_MEAN / IMAGENET_STD are presumably meant for [0, 1] images -
# confirm that no rescaling step is missing.
resize_transform = TV.Resize(VIT_INPUT_SIZE, antialias=True)
normalize_transform = TV.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)

# Stages shared by both pipelines, in execution order.
_spectrogram_stages = [mel_spectrogram_transform, amplitude_to_db_transform]
_image_stages = [handle_channels_transform, resize_transform, normalize_transform]

# Evaluation: spectrogram -> image, no augmentation.
eval_transforms = TV.Compose([*_spectrogram_stages, *_image_stages])

# Training: same as evaluation, with masking applied while the data is still
# laid out as (..., freq, time), i.e. before the image-side stages.
training_transforms = TV.Compose(
    [
        *_spectrogram_stages,
        freq_mask_transform,
        time_mask_transform,
        *_image_stages,
    ]
)

In [114]:
# Re-check the raw record: its sampling_rate (44100 in the output) differs from
# TARGET_SAMPLE_RATE, so the audio must be resampled before the mel transform.
train_dataset[0]

{'audio': {'path': '100032-3-0-0.wav',
  'array': array([-0.00454712, -0.00483704, -0.00460815, ..., -0.00065613,
         -0.00048828,  0.        ], shape=(14004,)),
  'sampling_rate': 44100},
 'slice_file_name': '100032-3-0-0.wav',
 'fsID': 100032,
 'start': 0.0,
 'end': 0.317551,
 'salience': 1,
 'fold': 5,
 'classID': 3,
 'class': 'dog_bark'}

In [115]:
# Convert the decoded audio array to a tensor; as the output shows, the result
# is float64, which is why preprocessing casts with .float().
torch.from_numpy(train_dataset[0]["audio"]["array"])

tensor([-0.0045, -0.0048, -0.0046,  ..., -0.0007, -0.0005,  0.0000],
       dtype=torch.float64)

In [116]:
def preprocess_data(is_training: bool):
    """Create a transform that turns raw audio examples into spectrogram images.

    Args:
        is_training: When True the returned function applies
            ``training_transforms`` (which include the masking steps);
            otherwise it applies ``eval_transforms``.

    Returns:
        A function taking ``sample``, which is either

        * a single example, where ``sample["audio"]`` is a dict holding
          ``"array"`` and ``"sampling_rate"``, or
        * a batch, where ``sample["audio"]`` is a list of such dicts,

        and returning a copy of ``sample`` with two added keys:
        ``"pixel_values"`` (one tensor, or a list of tensors for a batch) and
        ``"label"`` (taken from ``"classID"`` when present, else ``"label"``).
        It raises ``TypeError`` for any other ``sample["audio"]`` layout.
    """
    processor = training_transforms if is_training else eval_transforms

    # One resampler per source rate, reused across calls instead of building
    # a new T.Resample for every clip.
    resamplers = {}

    def _audio_to_pixel_values(audio_dict):
        """Resample, down-mix and run the transform pipeline on one clip."""
        waveform = torch.as_tensor(audio_dict["array"], dtype=torch.float32)
        sample_rate = audio_dict["sampling_rate"]

        # The mel transform is configured for TARGET_SAMPLE_RATE, so every
        # clip has to be brought to that rate first.
        if sample_rate != TARGET_SAMPLE_RATE:
            if sample_rate not in resamplers:
                resamplers[sample_rate] = T.Resample(
                    orig_freq=sample_rate, new_freq=TARGET_SAMPLE_RATE
                )
            waveform = resamplers[sample_rate](waveform)

        # Average multi-channel audio down to mono.
        if waveform.ndim > 1 and waveform.shape[0] > 1:
            waveform = torch.mean(waveform, dim=0)

        # Guard against a 0-D tensor; the mel transform needs a time axis.
        if waveform.ndim == 0:
            waveform = waveform.unsqueeze(0)

        return processor(waveform)

    def preprocess_audio_to_spectrogram(sample):
        audio_data = sample["audio"]

        # Single example.
        if isinstance(audio_data, dict) and "array" in audio_data:
            label = (
                torch.tensor(sample["classID"], dtype=torch.long)
                if "classID" in sample
                else sample["label"]
            )
            result = dict(sample)  # copy all original keys
            result["pixel_values"] = _audio_to_pixel_values(audio_data)
            result["label"] = label
            return result

        # Batch: list of audio dicts (an empty batch yields empty outputs).
        if isinstance(audio_data, list) and all(
            isinstance(item, dict) and "array" in item for item in audio_data
        ):
            pixel_values = [_audio_to_pixel_values(item) for item in audio_data]

            if "classID" in sample and isinstance(sample["classID"], list):
                labels = [sample["classID"][i] for i in range(len(audio_data))]
            elif "label" in sample and isinstance(sample["label"], list):
                labels = [sample["label"][i] for i in range(len(audio_data))]
            else:
                # Fallback: look for a label on the audio dict itself,
                # using -1 when none is present.
                labels = [
                    item.get("classID", item.get("label", -1)) for item in audio_data
                ]

            result = dict(sample)  # copy all original keys
            result["pixel_values"] = pixel_values
            result["label"] = labels
            return result

        raise TypeError(f"Unexpected type for sample['audio']: {type(audio_data)}")

    return preprocess_audio_to_spectrogram

In [117]:
# Build the on-the-fly transforms: augmentation for training, plain pipeline
# for validation and test.
train_transform_fn = preprocess_data(is_training=True)
val_transform_fn = preprocess_data(is_training=False)

print("Applying transforms to datasets...")
try:
    # A split may be undefined or None if the split cell failed.
    train_exists = "train_dataset" in locals() and train_dataset is not None
    val_exists = "val_dataset" in locals() and val_dataset is not None
    test_exists = "test_dataset" in locals() and test_dataset is not None

    if train_exists:
        # Index the dataset only after confirming it exists; doing so earlier
        # raises when the train split is missing and, via the except below,
        # prevents the validation / test transforms from being attached.
        print("Sample before transform:", train_dataset[0])
        train_dataset.set_transform(train_transform_fn)
    if val_exists:
        val_dataset.set_transform(val_transform_fn)
    if test_exists:
        test_dataset.set_transform(val_transform_fn)

    if train_exists or val_exists or test_exists:
        print("Transforms applied successfully.")
    else:
        print("No datasets found to apply transforms.")

    # Smoke-test the training pipeline on one example.
    if train_exists:
        processed_sample = train_dataset[0]
        print(f"Processed sample shape: {processed_sample['pixel_values'].shape}")
        print(f"Processed sample label: {processed_sample['label']}")
        print(f"Processed spectrogram type: {processed_sample['pixel_values'].dtype}")

except Exception as e:
    print(f"Error applying transforms: {e}")

Applying transforms to datasets...
Sample before transform: {'audio': {'path': '100032-3-0-0.wav', 'array': array([-0.00454712, -0.00483704, -0.00460815, ..., -0.00065613,
       -0.00048828,  0.        ], shape=(14004,)), 'sampling_rate': 44100}, 'slice_file_name': '100032-3-0-0.wav', 'fsID': 100032, 'start': 0.0, 'end': 0.317551, 'salience': 1, 'fold': 5, 'classID': 3, 'class': 'dog_bark'}
Transforms applied successfully.
Transform called with sample['audio'] type: <class 'list'>
Processed sample shape: torch.Size([3, 224, 224])
Processed sample label: 3
Processed spectrogram type: torch.float32


In [118]:
# Inspect the dataset object and the first record after the transform is attached.
print(f"Type of train_dataset: {type(train_dataset)}")
first = train_dataset[0]
print(f"Type of train_dataset[0]: {type(first)}")
print(f"train_dataset[0]: {first}")

Type of train_dataset: <class 'datasets.arrow_dataset.Dataset'>
Transform called with sample['audio'] type: <class 'list'>
Type of train_dataset[0]: <class 'dict'>
train_dataset[0]: {'audio': {'path': '100032-3-0-0.wav', 'array': array([-0.00454712, -0.00483704, -0.00460815, ..., -0.00065613,
       -0.00048828,  0.        ], shape=(14004,)), 'sampling_rate': 44100}, 'slice_file_name': '100032-3-0-0.wav', 'fsID': 100032, 'start': 0.0, 'end': 0.317551, 'salience': 1, 'fold': 5, 'classID': 3, 'class': 'dog_bark', 'pixel_values': tensor([[[-2.1179, -2.1179, -2.1179,  ..., -2.1179, -2.1179, -2.1179],
         [-2.1179, -2.1179, -2.1179,  ..., -2.1179, -2.1179, -2.1179],
         [-2.1179, -2.1179, -2.1179,  ..., -2.1179, -2.1179, -2.1179],
         ...,
         [-2.1179, -2.1179, -2.1179,  ..., -2.1179, -2.1179, -2.1179],
         [-2.1179, -2.1179, -2.1179,  ..., -2.1179, -2.1179, -2.1179],
         [-2.1179, -2.1179, -2.1179,  ..., -2.1179, -2.1179, -2.1179]],

        [[-2.0357, -2.035