In [1]:
import pandas as pd
import numpy as np
import json
import zipfile
from tqdm import tqdm

class Dataset:
    """In-memory dataset of ``log_mfb`` feature arrays keyed by clip id (``ytid``).

    The annotation table (CSV) is indexed by its ``ytid`` column and must provide
    the columns ``log_mfb_path`` (member name inside the zip archive) and
    ``plausible_superclass`` (class name looked up in ``class_associations``).

    After loading, ``self.data`` maps each ytid to ``(features_dict, label)`` where
    ``features_dict`` is ``{'log_mfb': <array returned by np.load>}`` and ``label``
    is the integer associated with the clip's class name.
    """

    # Static shape set on every 'log_mfb' tensor produced by `tf_dataloader`:
    # unknown first dimension, 64 entries in the second.
    _log_mfb_shape = (None, 64)

    def __init__(self, annotations_path, dataset_path, ytids_path,
                 class_associations=(('speech',0),('music',1),('noise',2)),
                 transform_fn = None,
                 load_data=True):
        """Read the annotation table and the id list, optionally loading all features.

        Args:
            annotations_path: CSV file readable by ``pd.read_csv`` with a ``ytid`` column.
            dataset_path: zip archive holding the ``.npy`` members named in the table.
            ytids_path: JSON file containing the sequence of clip ids to use.
            class_associations: ``(class_name, label)`` pairs or an equivalent mapping.
            transform_fn: optional callable applied to the features dict in
                ``__getitem__``; its return value replaces the dict.
            load_data: when true, ``load_data()`` is called immediately.
        """
        self.annotations = pd.read_csv(annotations_path).set_index('ytid')
        self.dataset_path = dataset_path
        with open(ytids_path, 'r') as f:
            self.ytids = json.load(f)
        self.class_associations = dict(class_associations)
        # Derived from the normalised dict so mappings and one-shot iterables work too.
        self.reverse_class_associations = {
            label: name for name, label in self.class_associations.items()
        }
        self.transform_fn = transform_fn

        if load_data:
            self.load_data()

    def load_data(self):
        """Read every clip listed in ``self.ytids`` from the archive into ``self.data``."""
        self.data = {}
        with zipfile.ZipFile(self.dataset_path, 'r') as zf:
            for ytid in tqdm(self.ytids, desc='Loading data'):
                data = {}
                with zf.open(self.annotations.loc[ytid,'log_mfb_path']) as f:
                    data['log_mfb'] = np.load(f)

                label = self.class_associations[
                    self.annotations.loc[ytid,'plausible_superclass']]
                self.data[ytid] = (data, label)

    def __getitem__(self, index):
        """Return ``(features, label)`` for the ``index``-th id of ``self.ytids``.

        The returned dict is a fresh shallow copy that additionally carries the
        ``'ytid'`` key; ``transform_fn`` (if set) is applied to it.
        """
        ytid = self.ytids[index]
        data_without_ytid, label = self.data[ytid]
        data = {'ytid':ytid}
        data.update(data_without_ytid)
        if self.transform_fn is not None:
            data = self.transform_fn(data)
        return data, label

    def __len__(self):
        """Number of addressable items.

        Uses ``self.ytids`` because that is what ``__getitem__`` indexes; this also
        works before ``load_data()`` has been called.
        """
        return len(self.ytids)

    @property
    def tf_dataloader(self):
        """Lazily built, cached ``tf.function`` mapping a ytid tensor to ``(features, label)``.

        TensorFlow is imported on first access only, so the class can be used
        without TensorFlow as long as this property is not touched.
        """
        cached = getattr(self, '_tf_dataloader', None)
        if cached is not None:
            return cached

        import tensorflow as tf

        _log_mfb_shape = self._log_mfb_shape

        def _load_data(ytid):
            # Runs as ordinary Python inside tf.py_function. `self.data` is looked
            # up on every call so a later `load_data()` is picked up as well.
            data, label = self.data[ytid.numpy().decode('utf-8')]
            return data['log_mfb'], label

        @tf.function
        def _load_data_tf(ytid):
            log_mfb, label = tf.py_function(_load_data, [ytid], [tf.float32, tf.int32])
            # py_function outputs carry no static shape; restore what is known.
            log_mfb.set_shape(_log_mfb_shape)
            label.set_shape(())
            return (
                {
                    'ytid':ytid,
                    'log_mfb': log_mfb
                },
                label
            )

        self._tf_dataloader = _load_data_tf
        return self._tf_dataloader

    def get_shuffled_tf_dataset(self, ytids=None):
        """Return a ``tf.data.Dataset`` over ``ytids`` (default: all ids), shuffled.

        The shuffle buffer size equals the number of ids.
        """
        # Local import: the module itself never imports TensorFlow globally.
        import tensorflow as tf

        if ytids is None:
            ytids = self.ytids
        return tf.data.Dataset.from_tensor_slices(ytids).shuffle(len(ytids)).map(self.tf_dataloader)

    def get_unshuffled_tf_dataset(self, ytids=None):
        """Return a ``tf.data.Dataset`` over ``ytids`` (default: all ids) in the given order."""
        # Local import: the module itself never imports TensorFlow globally.
        import tensorflow as tf

        if ytids is None:
            ytids = self.ytids
        return tf.data.Dataset.from_tensor_slices(ytids).map(self.tf_dataloader)
    

In [2]:
# Input locations: annotation table, feature archive and list of clip ids
# (presumably the training split, judging by the file names).
annotations_path = '../data/train_test_splits/train_dataset.csv.zip'
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
dataset_path = r'G:\datasets\audioset-derived.zip'
ytids_path = '../data/train_test_splits/train_ytids.json'

# With the default load_data=True, every listed clip is read into memory here.
dataset = Dataset(annotations_path, dataset_path, ytids_path)

Loading data: 100%|██████████| 7841/7841 [00:13<00:00, 573.61it/s]


In [4]:
# Raw cached entry for one clip: a (features dict, integer label) tuple.
dataset.data['--BfvyPmVMo']

({'log_mfb': array([[-12.657357 , -11.910873 , -12.428922 , ..., -10.195707 ,
          -10.186556 , -10.161953 ],
         [-10.5030575, -10.368566 , -10.445654 , ..., -11.005456 ,
          -11.021426 , -11.006858 ],
         [-10.477143 , -10.238664 , -10.129091 , ..., -11.818546 ,
          -11.803687 , -11.785731 ],
         ...,
         [-11.223254 , -10.959818 , -10.6100025, ..., -11.41233  ,
          -11.402614 , -11.389905 ],
         [-11.532803 , -11.399852 , -10.256641 , ..., -12.986043 ,
          -12.9783125, -12.954549 ],
         [-11.877029 , -11.344776 , -10.755081 , ..., -11.223252 ,
          -11.207252 , -11.183005 ]], dtype=float32)},
 2)

In [5]:
# Index access goes through Dataset.__getitem__, which adds the 'ytid' key to the features.
dataset[0]

({'ytid': '-OhudZ743CE',
  'log_mfb': array([[-14.89137  , -12.728969 , -12.116203 , ...,  -7.0896773,
           -7.196857 ,  -8.67631  ],
         [-10.76857  , -10.449691 , -10.842775 , ...,  -6.947009 ,
           -7.0724125,  -8.099452 ],
         [ -9.930698 ,  -7.9328694,  -7.560336 , ...,  -7.804921 ,
           -8.327517 ,  -8.520834 ],
         ...,
         [-12.839853 , -11.126363 ,  -9.748734 , ...,  -8.104726 ,
           -7.857449 ,  -7.464834 ],
         [-10.298605 ,  -8.610804 ,  -7.6734467, ...,  -8.199435 ,
           -9.86635  ,  -9.537319 ],
         [-11.363624 ,  -9.043201 ,  -7.110422 , ...,  -8.247557 ,
          -10.945918 , -10.742892 ]], dtype=float32)},
 0)

# TensorFlow Experiments

In [6]:
import tensorflow as tf
# Shuffled tf.data pipeline over all clip ids held by `dataset`.
train_dataset = dataset.get_shuffled_tf_dataset()

# Pull a single element to inspect its structure.
next(iter(train_dataset))

({'ytid': <tf.Tensor: shape=(), dtype=string, numpy=b'VzEqM3da16E'>,
  'log_mfb': <tf.Tensor: shape=(1001, 64), dtype=float32, numpy=
  array([[ -6.1097007,  -3.9416633,  -4.371738 , ...,  -3.0839028,
           -3.7548316,  -3.8276918],
         [ -5.5286193,  -4.6864185,  -4.7393837, ...,  -3.04174  ,
           -3.7088096,  -4.014519 ],
         [ -5.9782786,  -6.1311736,  -8.099994 , ...,  -3.5454993,
           -4.1681085,  -4.6266875],
         ...,
         [-10.78713  ,  -7.548249 ,  -6.2613487, ...,  -4.201949 ,
           -4.4311595,  -4.84781  ],
         [ -8.842784 ,  -6.6146364,  -6.610764 , ...,  -4.6556497,
           -4.9842944,  -5.371841 ],
         [ -8.800042 ,  -8.195217 ,  -6.326751 , ...,  -4.674864 ,
           -5.120435 ,  -5.6490145]], dtype=float32)>},
 <tf.Tensor: shape=(), dtype=int32, numpy=1>)

In [6]:
def _dataset_timer():
    """Run one full pass over the global `train_dataset`, discarding every element.

    Takes no arguments and returns nothing; it exists so the pass can be timed.
    """
    for _ in train_dataset:
        continue

%timeit _dataset_timer()

2.88 s ± 35.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [7]:
class RandomWindow:
    """Cut a random fixed-length window along axis 0 of selected features (TensorFlow ops).

    Intended for use with ``tf.data.Dataset.map`` on ``(features_dict, label)`` elements.
    """

    def __init__(self, window_size, features, pad_value):
        """Store the configuration.

        Args:
            window_size: number of rows each returned window has.
            features: keys of the features dict to window; the first one
                determines the length used for padding and sampling.
            pad_value: constant used to pad features shorter than ``window_size``.
        """
        self.window_size = window_size
        self.features = features
        self.pad_value = pad_value

    def __call__(self, data, label):
        """Return ``(windowed_features, label)``; the input dict is not modified.

        Features shorter than ``window_size`` are padded at the front. The result
        gains a ``'window'`` entry holding ``[start, end]`` of the chosen slice,
        expressed in the coordinates of the (possibly padded) feature.
        """
        windowed = data.copy()

        reference = windowed[self.features[0]]
        n_rows = tf.shape(reference)[0]

        # Zero when the feature is already long enough.
        n_pad = tf.maximum(0, self.window_size - n_rows)
        for name in self.features:
            trailing_axes = data[name].shape.rank - 1
            paddings = [[n_pad, 0]] + [[0, 0]] * trailing_axes
            windowed[name] = tf.pad(windowed[name], paddings,
                                    constant_values=self.pad_value)

        n_rows = n_rows + n_pad

        # Upper bound is exclusive, hence the +1 to allow the last valid start.
        start = tf.random.uniform((), 0, n_rows - self.window_size + 1, dtype=tf.int32)
        stop = start + self.window_size

        windowed['window'] = tf.stack([start, stop], axis=0)

        for name in self.features:
            windowed[name] = windowed[name][start:stop]
        return windowed, label
    
    

In [8]:
# Windows of 200 rows taken from 'log_mfb'; shorter clips are padded with -16.
random_window = RandomWindow(window_size=200, features=['log_mfb'], pad_value=-16.)

# Apply the windowing to every (features, label) element of the pipeline.
window_dataset = train_dataset.map(random_window)

# Inspect one windowed element, including the reported 'window' bounds.
next(iter(window_dataset))

({'ytid': <tf.Tensor: shape=(), dtype=string, numpy=b'B-Id6gGDi9E'>,
  'log_mfb': <tf.Tensor: shape=(200, 64), dtype=float32, numpy=
  array([[-10.730423 ,  -9.304807 ,  -9.301556 , ...,  -7.3564925,
           -7.431468 ,  -8.186854 ],
         [ -9.251912 ,  -9.2857   ,  -8.550494 , ...,  -7.7066913,
           -7.7682257,  -7.9623384],
         [-10.390761 , -10.216873 ,  -9.0816965, ...,  -7.919459 ,
           -8.197783 ,  -7.7523375],
         ...,
         [-12.2998705,  -8.875496 ,  -7.289873 , ...,  -9.314437 ,
           -9.722735 ,  -9.7799015],
         [ -8.90485  ,  -7.589591 ,  -6.9767375, ..., -10.43234  ,
          -10.424762 , -10.047817 ],
         [-10.229837 ,  -8.087737 ,  -7.2587786, ...,  -9.223938 ,
          -10.249124 , -10.478591 ]], dtype=float32)>,
  'window': <tf.Tensor: shape=(2,), dtype=int32, numpy=array([682, 882])>},
 <tf.Tensor: shape=(), dtype=int32, numpy=1>)

In [9]:
# Cross-check: slicing the cached array with the 'window' bounds reported by the
# previous cell should reproduce the windowed 'log_mfb' shown there.
dataset.data['B-Id6gGDi9E'][0]['log_mfb'][682:882]

array([[-10.730423 ,  -9.304807 ,  -9.301556 , ...,  -7.3564925,
         -7.431468 ,  -8.186854 ],
       [ -9.251912 ,  -9.2857   ,  -8.550494 , ...,  -7.7066913,
         -7.7682257,  -7.9623384],
       [-10.390761 , -10.216873 ,  -9.0816965, ...,  -7.919459 ,
         -8.197783 ,  -7.7523375],
       ...,
       [-12.2998705,  -8.875496 ,  -7.289873 , ...,  -9.314437 ,
         -9.722735 ,  -9.7799015],
       [ -8.90485  ,  -7.589591 ,  -6.9767375, ..., -10.43234  ,
        -10.424762 , -10.047817 ],
       [-10.229837 ,  -8.087737 ,  -7.2587786, ...,  -9.223938 ,
        -10.249124 , -10.478591 ]], dtype=float32)

In [10]:
# Every window has the same length, so elements can be stacked into batches of 32.
window_dataset_batched = window_dataset.batch(32)

# Inspect the first batch.
next(iter(window_dataset_batched))

({'ytid': <tf.Tensor: shape=(32,), dtype=string, numpy=
  array([b'-N5vYXYJGn0', b'Ec0svHZBxyk', b'-MqTlBT-AHQ', b'8XUVAOMzjFY',
         b'DVY5qqxHQyA', b'QHVUSqKkrMc', b'QJ37xmg5Kd0', b'-9srU9TwFnw',
         b'QM-rGoklsqE', b'-1wZ7uIA-Qs', b'ebhTLAnD8nQ', b'0RZRFj7zDnQ',
         b'JtPcsFkrbSU', b'CmTBVTlArrI', b'BMFmzBBDH_k', b'QRSM9qkPywg',
         b'cQna9Zyq37M', b'QFke_rVztPY', b'4TFCpuwtSdU', b'STe3TUVpJoc',
         b'-KVHAAfS0qQ', b'--1nnAww5MY', b'-Z1ZSWDouUU', b'P4aTFrJws40',
         b'9dx9IhSqwC4', b'MqQNO5Uo8HM', b'BL6DESnl79Y', b'35scjU1vIGw',
         b'-1XMHlpbDc8', b'Cxg_hn2cLhc', b'QP9ethxU03c', b'XzRxO8n3WRE'],
        dtype=object)>,
  'log_mfb': <tf.Tensor: shape=(32, 200, 64), dtype=float32, numpy=
  array([[[-10.944441 , -10.382028 , -11.201226 , ...,  -6.805329 ,
            -7.679354 ,  -8.771717 ],
          [-11.16768  , -10.558443 , -11.261365 , ...,  -6.806487 ,
            -7.7153053,  -8.765092 ],
          [-11.196216 , -10.603569 , -11.02641  , ..., 

# Torch Experiments

In [1]:

import pandas as pd
import numpy as np
import json
import zipfile
from tqdm import tqdm

class Dataset:
    """In-memory, index-addressable dataset of ``log_mfb`` arrays keyed by clip id (``ytid``).

    The annotation table (CSV) is indexed by its ``ytid`` column and must provide
    the columns ``log_mfb_path`` (member name inside the zip archive) and
    ``plausible_superclass`` (class name looked up in ``class_associations``).

    ``self.data`` maps each ytid to a dict ``{'log_mfb': <array>, 'label': <int>}``.
    """

    # Kept from the original definition; not referenced by any method of this class.
    _log_mfb_shape = (None, 64)

    def __init__(self, annotations_path, dataset_path, ytids_path,
                 class_associations=(('speech',0),('music',1),('noise',2)),
                 transform_fn = None,
                 data = None,
                 load_data=True):
        """Read the annotation table and the id list, then obtain the feature data.

        Args:
            annotations_path: CSV file readable by ``pd.read_csv`` with a ``ytid`` column.
            dataset_path: zip archive holding the ``.npy`` members named in the table.
            ytids_path: JSON file containing the sequence of clip ids to use.
            class_associations: ``(class_name, label)`` pairs or an equivalent mapping.
            transform_fn: optional callable applied to each sample dict in
                ``__getitem__``; its return value is what ``__getitem__`` returns.
            data: optional pre-built ``{ytid: sample_dict}`` mapping. When given it
                is used as ``self.data`` and the archive is not read.
            load_data: when true and ``data`` is not given, ``load_data()`` is
                called immediately.
        """
        self.annotations = pd.read_csv(annotations_path).set_index('ytid')
        self.dataset_path = dataset_path
        with open(ytids_path, 'r') as f:
            self.ytids = json.load(f)
        self.class_associations = dict(class_associations)
        # Derived from the normalised dict so mappings and one-shot iterables work too.
        self.reverse_class_associations = {
            label: name for name, label in self.class_associations.items()
        }
        self.transform_fn = transform_fn

        if data is not None:
            # Previously this argument was accepted but silently ignored.
            self.data = data
        elif load_data:
            self.load_data()

    def load_data(self):
        """Read every clip listed in ``self.ytids`` from the archive into ``self.data``."""
        self.data = {}
        with zipfile.ZipFile(self.dataset_path, 'r') as zf:
            for ytid in tqdm(self.ytids, desc='Loading data'):
                data = {}
                with zf.open(self.annotations.loc[ytid,'log_mfb_path']) as f:
                    data['log_mfb'] = np.load(f)

                superclass = self.annotations.loc[ytid,'plausible_superclass']
                data['label'] = self.class_associations[superclass]
                self.data[ytid] = data

    def __getitem__(self, index):
        """Return the sample dict for the ``index``-th id of ``self.ytids``.

        The result is a fresh shallow copy of the stored dict with an added
        ``'ytid'`` key; ``transform_fn`` (if set) is applied to it.
        """
        ytid = self.ytids[index]
        data_without_ytid = self.data[ytid]
        data = {'ytid':ytid}
        data.update(data_without_ytid)
        if self.transform_fn is not None:
            data = self.transform_fn(data)
        return data

    def __len__(self):
        """Number of addressable items.

        Uses ``self.ytids`` because that is what ``__getitem__`` indexes; this also
        works before any data has been loaded.
        """
        return len(self.ytids)
    
    

class RandomWindow:
    """Cut a random fixed-length window along axis 0 of selected array features (NumPy)."""

    def __init__(self, window_size, features, pad_value):
        """Store the configuration.

        Args:
            window_size: number of rows each returned window has.
            features: keys of the sample dict to window; the first one
                determines the length used for padding and sampling.
            pad_value: constant used to pad features shorter than ``window_size``.
        """
        self.window_size = window_size
        self.features = features
        self.pad_value = pad_value

    def __call__(self, data):
        """Return a shallow copy of ``data`` with windowed features and a ``'window'`` entry.

        * Shorter than ``window_size``: each feature is padded at the front with
          ``pad_value`` and ``'window'`` is ``[0, padded_length]`` (int32).
        * Otherwise: a start row is drawn with ``np.random.randint`` and each
          feature is sliced to ``[start, start + window_size)``; ``'window'``
          holds those two bounds.
        """
        sample = data.copy()
        n_rows = sample[self.features[0]].shape[0]
        shortfall = max(0, self.window_size - n_rows)

        if shortfall > 0:
            # Too short: pad only, no random draw is made in this branch.
            for name in self.features:
                trailing_axes = len(data[name].shape) - 1
                pad_width = [[shortfall, 0]] + [[0, 0] for _ in range(trailing_axes)]
                sample[name] = np.pad(sample[name], pad_width,
                                      constant_values=self.pad_value)
            sample['window'] = np.array([0, n_rows + shortfall], dtype=np.int32)
            return sample

        # High bound is exclusive, hence the +1 to allow the last valid start.
        start = np.random.randint(0, n_rows - self.window_size + 1, dtype=np.int32)
        stop = start + self.window_size
        sample['window'] = np.stack([start, stop], axis=0)

        for name in self.features:
            sample[name] = sample[name][start:stop]
        return sample


In [2]:
# Input locations: annotation table, feature archive and list of clip ids
# (presumably the training split, judging by the file names).
annotations_path = '../data/train_test_splits/train_dataset.csv.zip'
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
dataset_path = r'G:\datasets\audioset-derived.zip'
ytids_path = '../data/train_test_splits/train_ytids.json'

# The window transform is applied per item inside Dataset.__getitem__.
random_window = RandomWindow(window_size=200, features=['log_mfb'], pad_value=-16.)
dataset = Dataset(annotations_path, dataset_path, ytids_path, transform_fn=random_window)

Loading data: 100%|██████████| 7841/7841 [00:10<00:00, 779.44it/s]


In [22]:
# One transformed sample: features, label, and the randomly chosen 'window' bounds.
dataset[0]

{'ytid': '-OhudZ743CE',
 'log_mfb': array([[ -9.714898 ,  -7.8681226,  -6.9284725, ...,  -9.486728 ,
         -10.693724 , -11.042415 ],
        [ -9.737786 ,  -9.259062 ,  -7.601204 , ...,  -8.273821 ,
          -8.8921795,  -9.107322 ],
        [ -7.815244 ,  -7.790437 ,  -7.3319097, ...,  -8.832484 ,
          -9.531226 , -10.547821 ],
        ...,
        [-11.421468 , -11.281325 , -11.325998 , ...,  -5.6270795,
          -6.1960373,  -6.6504436],
        [-13.191524 , -13.413879 , -13.653507 , ...,  -6.8725953,
          -7.4126644,  -7.3682575],
        [-10.307065 ,  -9.902206 ,  -9.737846 , ...,  -7.8437257,
          -8.271171 ,  -8.361786 ]], dtype=float32),
 'label': 0,
 'window': array([349, 549])}

In [54]:
# The transform can be replaced after construction; each access draws a new random window.
dataset.transform_fn = RandomWindow(window_size=200, features=['log_mfb'], pad_value=-16.)
dataset[0]

{'ytid': '-OhudZ743CE',
 'log_mfb': array([[-11.416741 , -10.347622 ,  -9.73281  , ..., -11.6811   ,
         -11.643856 , -11.639836 ],
        [-12.532125 , -10.655686 ,  -9.6775465, ..., -11.618555 ,
         -11.044423 , -11.50934  ],
        [-13.42306  , -10.803362 ,  -9.418505 , ..., -11.20941  ,
         -10.838674 , -11.4871435],
        ...,
        [-12.997514 , -11.296269 , -10.689694 , ..., -10.445326 ,
          -9.451973 , -10.520681 ],
        [-11.977348 , -10.564443 , -10.187405 , ..., -10.548757 ,
          -9.502732 , -10.760075 ],
        [-11.956531 , -10.832023 , -10.045336 , ..., -10.259729 ,
          -9.270515 , -10.563932 ]], dtype=float32),
 'label': 0,
 'window': array([189, 389])}

In [58]:
from torch.utils.data import DataLoader

# Batches of 32 samples drawn in shuffled order from the map-style `dataset`.
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, pin_memory=True)

# Print only the first batch, then stop iterating.
for batch in dataloader:
    print(batch)
    break

{'ytid': ['3OUNEL8XaR0', '3FJFcgaa0oE', 'FGoDfNZezh0', '9BHvpWP2V9Y', '4M-Q1DGFQsk', '-0ZEUItqz3E', 'MTB-P5Bt3_Q', '-Iyr8rHMMMY', '-OtGsSBS9xg', '4BO6EbCTKwM', 'AoIHTerxdas', '-5JJzt0-PxM', '-JGJ3Y_GiEc', 'E3wQrPffYuc', '-K0BFxTJzNw', '-MIiDF56GoA', 'VfEJHqtsuIo', '4oI3MUQn-M0', 'kj1hFWgc98s', '-26I2EZZpNo', 'g1HyZDXSyI8', 'QOcSskkipGY', 'MLmmVEDpWls', 'kIEEDYowZhA', '9EsNtRXnYbE', 'QSx5MMn1vJQ', '-35kWQH9FVg', 'CT6AQ9fhe4s', '-All_IT36WQ', 'EO5G5HDH090', 'NtInDkT7GHQ', 'EabhwDpC30s'], 'log_mfb': tensor([[[-11.7831, -11.8407, -12.4903,  ..., -12.1648, -12.7539, -14.2098],
         [-10.5526, -10.7212, -11.3150,  ..., -11.7971, -12.2629, -12.5421],
         [-11.2378, -13.0192, -13.3352,  ..., -12.4883, -13.2087, -14.8138],
         ...,
         [-10.1961,  -9.8067,  -9.3711,  ...,  -8.2074,  -9.8707, -10.6815],
         [ -8.9263,  -8.9588,  -8.3211,  ...,  -8.5484,  -9.8192, -10.4672],
         [ -9.9688,  -9.3545,  -8.5890,  ...,  -8.7075,  -9.8859, -10.3209]],

        [[ -5.7137, 