In [1]:
import os
import sys

import pandas as pd
from sklearn.preprocessing import LabelEncoder
import torch

sys.path.append("../utils")
sys.path.append("../models/preprocessing")

from extract_PaSST_features import get_embeddings
from dcase2025_dataset import DCASE2025Dataset
from torch_utils import load_wave, evaluate_torch
from mel_features import LogMelSpectrogram

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Directory that the "filename" column of meta.csv is joined onto to locate audio.
# NOTE(review): this is a cwd-relative path under "Descargas/TAU/", whereas the
# metadata below lives under "~/Descargas/...audio.1/" — confirm both refer to
# the same copy of the dataset.
folder_path = "../../../Descargas/TAU/TAU-urban-acoustic-scenes-2022-mobile-development/"

# Single source of truth for the metadata directory (previously repeated in
# each of the three CSV paths). The resulting path strings are unchanged.
meta_dir = "~/Descargas/TAU-urban-acoustic-scenes-2022-mobile-development.audio.1/TAU-urban-acoustic-scenes-2022-mobile-development"
meta_path = f"{meta_dir}/meta.csv"
train_path = f"{meta_dir}/split25.csv"
val_path = f"{meta_dir}/test.csv"

# All three files are tab-separated.
meta = pd.read_csv(meta_path, sep="\t")
train_meta = pd.read_csv(train_path, sep="\t")
val_meta = pd.read_csv(val_path, sep="\t")

# Encode the textual scene label as an integer class id. The encoder is fitted
# on the full metadata so both splits share the same label -> id mapping.
encoder = LabelEncoder()
meta["scene"] = encoder.fit_transform(meta["scene_label"])


def _load_split(split_meta, split_path):
    """Return the rows of ``meta`` listed in ``split_meta`` with a "wave" column added.

    Args:
        split_meta: DataFrame with a "filename" column naming the files of the split.
        split_path: Path the split was read from; only used in the error message.

    Returns:
        A freshly indexed DataFrame holding the matching ``meta`` rows plus a
        "wave" column containing ``load_wave(os.path.join(folder_path, filename))``
        for every row.

    Raises:
        ValueError: If no row of ``meta`` matches the split. Without this check
            the failure would only surface later, when the waves are stacked.
    """
    split = meta[meta["filename"].isin(split_meta["filename"])].reset_index(drop=True)
    if split.empty:
        raise ValueError(f"no rows of {meta_path} match the filenames listed in {split_path}")
    split["wave"] = split["filename"].map(
        lambda name: load_wave(os.path.join(folder_path, name))
    )
    return split


train = _load_split(train_meta, train_path)
val = _load_split(val_meta, val_path)

In [3]:
# Inspect the integer scene ids that the LabelEncoder wrote into the metadata.
meta["scene"]

0         0
1         0
2         0
3         0
4         0
         ..
230345    9
230346    9
230347    9
230348    9
230349    9
Name: scene, Length: 230350, dtype: int64

In [4]:
# --- training split -------------------------------------------------------
# Per-file wave tensors stacked along a new leading axis.
w_train = torch.stack(list(train["wave"]))
# Integer scene ids (from the label-encoded "scene" column).
s_train = torch.tensor(train["scene"].tolist())
# File names and source labels stay as plain Python lists.
f_train = train["filename"].tolist()
d_train = train["source_label"].tolist()

# --- validation split -----------------------------------------------------
w_val = torch.stack(list(val["wave"]))
s_val = torch.tensor(val["scene"].tolist())
f_val = val["filename"].tolist()
d_val = val["source_label"].tolist()

In [5]:
# Peek at the stacked training waveforms (torch prints a truncated summary).
w_train

tensor([[[-0.0850, -0.0975, -0.0918,  ...,  0.0755,  0.0822,  0.0827]],

        [[ 0.0830,  0.0859,  0.0828,  ...,  0.0752,  0.0583,  0.0499]],

        [[ 0.0490,  0.0503,  0.0595,  ...,  0.0669,  0.0722,  0.0682]],

        ...,

        [[ 0.1003,  0.1057,  0.1105,  ..., -0.0191, -0.0156, -0.0119]],

        [[-0.0073, -0.0018,  0.0045,  ..., -0.0459, -0.0459, -0.0468]],

        [[-0.0479, -0.0485, -0.0482,  ..., -0.0259, -0.0246, -0.0240]]])

In [None]:
# Log-mel feature extractor, built with its default arguments.
mel_extractor = LogMelSpectrogram()

# Apply the extractor to the stacked waveforms of each split.
m_train = evaluate_torch(mel_extractor, w_train)
m_val = evaluate_torch(mel_extractor, w_val)

# Remove only the singleton axis at dim 1 before computing embeddings.
# A bare .squeeze() drops *every* size-1 axis, so a split containing a single
# clip would also lose its leading (batch) axis. For splits with more than one
# clip the result is identical to the previous behaviour.
# Assumes the waves are laid out as (clips, 1, samples), as the printed
# w_train suggests — TODO confirm against load_wave.
e_train = get_embeddings(w_train.squeeze(1))
e_val = get_embeddings(w_val.squeeze(1))



 Loading PASST TRAINED ON AUDISET 


PaSST(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(1, 768, kernel_size=(16, 16), stride=(10, 10))
    (norm): Identity()
  )
  (pos_drop): Dropout(p=0.0, inplace=False)
  (blocks): Sequential(
    (0): Block(
      (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=768, out_features=2304, bias=True)
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=768, out_features=768, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (drop_path): Identity()
      (norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=768, out_features=3072, bias=True)
        (act): GELU(approximate='none')
        (fc2): Linear(in_features=3072, out_features=768, bias=True)
        (drop): Dropout(p=0.0, inplace=False)
      )
    )
    (1): Block(
      (norm1): LayerNorm((768,), ep

Note: you can still call torch.view_as_real on the complex output to recover the old return format. (Triggered internally at /pytorch/aten/src/ATen/native/SpectralOps.cpp:873.)
  return _VF.stft(  # type: ignore[attr-defined]
  with torch.cuda.amp.autocast(enabled=False):


 after 12 atten blocks x torch.Size([64, 158, 768])
forward_features torch.Size([64, 768])
head torch.Size([64, 527])


In [None]:
# Bundle everything gathered for the training split into one dataset object.
train_ds = DCASE2025Dataset(
    filenames=f_train, waves=w_train, mels=m_train,
    embeddings=e_train, devices=d_train, scenes=s_train,
)

# Same bundle for the validation split.
val_ds = DCASE2025Dataset(
    filenames=f_val, waves=w_val, mels=m_val,
    embeddings=e_val, devices=d_val, scenes=s_val,
)

In [None]:
# Persist both datasets next to the notebook (paths are cwd-relative).
for split_ds, out_file in ((train_ds, "train.pt"), (val_ds, "val.pt")):
    split_ds.save(out_file)