## Descargar el dataset UrbanSound8K desde Kaggle

In [7]:
!pip install kaggle
import os

!mkdir ~/.kaggle # crear carpeta en root
# Cargar archivos desde el equipo local
if not os.path.exists("/root/.kaggle/kaggle.json"):
  from google.colab import files
  files.upload();
  !cp kaggle.json ~/.kaggle/ #copiar token hacia la carpeta creada
  !chmod 600 ~/.kaggle/kaggle.json # asigna el permiso necesario

if not os.path.exists("/content/urbansound8k"):
  import kaggle
  kaggle.api.authenticate()
  kaggle.api.dataset_download_files('chrisfilo/urbansound8k', path='/content/urbansound8k', unzip=True)

## VGGish

In [8]:
!pip install resampy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting resampy
  Downloading resampy-0.4.2-py3-none-any.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m78.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: resampy
Successfully installed resampy-0.4.2


In [9]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
import torchaudio
import tensorflow_hub as hub
from IPython.display import Audio

In [23]:
# Read the UrbanSound8K metadata table and derive each clip's length from its
# `start` and `end` columns.
full_data = pd.read_csv('/content/urbansound8k/UrbanSound8K.csv')
full_data['duration'] = full_data['end'].sub(full_data['start'])

# Keep only clips whose duration is at least 4 (same unit as start/end,
# presumably seconds), then preview the first rows.
is_long_enough = full_data['duration'].ge(4)
selected_data = full_data.loc[is_long_enough]
selected_data.head(10)

Unnamed: 0,slice_file_name,fsID,start,end,salience,fold,classID,class,duration
1,100263-2-0-117.wav,100263,58.5,62.5,1,5,2,children_playing,4.0
2,100263-2-0-121.wav,100263,60.5,64.5,1,5,2,children_playing,4.0
3,100263-2-0-126.wav,100263,63.0,67.0,1,5,2,children_playing,4.0
4,100263-2-0-137.wav,100263,68.5,72.5,1,5,2,children_playing,4.0
5,100263-2-0-143.wav,100263,71.5,75.5,1,5,2,children_playing,4.0
6,100263-2-0-161.wav,100263,80.5,84.5,1,5,2,children_playing,4.0
7,100263-2-0-3.wav,100263,1.5,5.5,1,5,2,children_playing,4.0
8,100263-2-0-36.wav,100263,18.0,22.0,1,5,2,children_playing,4.0
14,100652-3-0-0.wav,100652,0.0,4.0,1,2,3,dog_bark,4.0
15,100652-3-0-1.wav,100652,0.5,4.5,1,2,3,dog_bark,4.0


In [11]:
# Fetch the VGGish model through torch.hub and switch it to evaluation mode.
# eval() is the last expression, so the cell displays the returned module.
model = torch.hub.load(repo_or_dir='harritaylor/torchvggish', model='vggish')
model.eval()

Downloading: "https://github.com/harritaylor/torchvggish/zipball/master" to /root/.cache/torch/hub/master.zip
Downloading: "https://github.com/harritaylor/torchvggish/releases/download/v0.1/vggish-10086976.pth" to /root/.cache/torch/hub/checkpoints/vggish-10086976.pth
100%|██████████| 275M/275M [00:16<00:00, 17.8MB/s]
Downloading: "https://github.com/harritaylor/torchvggish/releases/download/v0.1/vggish_pca_params-970ea276.pth" to /root/.cache/torch/hub/checkpoints/vggish_pca_params-970ea276.pth
100%|██████████| 177k/177k [00:00<00:00, 25.1MB/s]


VGGish(
  (features): Sequential(
    (0): Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (4): ReLU(inplace=True)
    (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU(inplace=True)
    (8): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): ReLU(inplace=True)
    (10): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (11): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (12): ReLU(inplace=True)
    (13): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (14): ReLU(inplace=True)
    (15): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False

In [12]:
# Number of clips that passed the duration filter.
len(selected_data)

7077

In [17]:
# Extract one VGGish embedding per selected clip.
#
# Results are collected in Python lists and assembled once after the loop.
# Growing a tensor/array with torch.cat / np.append on every iteration
# re-copies everything accumulated so far, which is quadratic in the clip count.
embeddings = []
class_ids = []

# Inference only: no_grad avoids building an autograd graph for each forward pass.
with torch.no_grad():
  # total= gives tqdm a real progress bar and ETA (iterrows() has no length).
  for idx, row in tqdm(selected_data.iterrows(), total=len(selected_data)):
    path_file = f"/content/urbansound8k/fold{row['fold']}/{row['slice_file_name']}"
    vggish_emb = model(path_file)  # the model is called with the wav file path
    # view(4, 128) raises if a clip does not yield exactly 4 x 128 values,
    # the same shape check the original view(1, 4, 128) performed.
    embeddings.append(vggish_emb.detach().cpu().view(4, 128))
    class_ids.append(row["classID"])

# Final shapes: (n_clips, 4, 128) and (n_clips,). An empty selection still
# produces correctly shaped empty containers.
extracted_features = torch.stack(embeddings) if embeddings else torch.empty((0, 4, 128))
# float64 matches the dtype the original np.empty(0)/np.append accumulation produced.
labels = np.array(class_ids, dtype=np.float64)


7077it [22:49,  5.17it/s]


In [18]:
# Shape of the stacked embeddings: (clips, 4, 128).
extracted_features.size()

torch.Size([7077, 4, 128])

In [19]:
# One label per clip.
np.shape(labels)

(7077,)

In [20]:
# Collapse each clip's (4, 128) embedding into a single row of 512 values,
# then attach the class id as one extra trailing column.
flat_embeddings = extracted_features.flatten(start_dim=1).numpy()
label_column = labels[:, np.newaxis]
extracted_features_fl = np.concatenate((flat_embeddings, label_column), axis=1)

In [21]:
# Rows are clips; columns are the flattened embedding values plus the label.
np.shape(extracted_features_fl)

(7077, 513)

In [22]:
import h5py

# Write the combined feature/label matrix to an HDF5 file as dataset "data".
# The context manager closes the file even if the write fails.
# Shape and dtype are taken from the array itself, so no separate shape argument
# is passed, and the dataset handle is not kept since it is unusable once the file closes.
with h5py.File("vggish_features_labels.hdf5", "w") as f:
    f.create_dataset("data", data=extracted_features_fl)