In [1]:
import pandas as pd
import numpy as np
import json
import zipfile
from tqdm import tqdm

class Dataset:
    """In-memory collection of ``log_mfb`` feature arrays keyed by ``ytid``.

    The annotation CSV is indexed by its ``ytid`` column and supplies, per
    clip, the path of the ``.npy`` feature file inside the zip archive
    (``log_mfb_path``) and a class name (``plausible_superclass``).  The JSON
    file at ``ytids_path`` lists the ytids that belong to this dataset.

    Parameters
    ----------
    annotations_path : path of the annotation CSV (read with ``pd.read_csv``).
    dataset_path : path of the zip archive holding the feature files.
    ytids_path : path of a JSON file containing the ytids to use.
    class_associations : iterable of ``(class_name, label)`` pairs, or a
        mapping from class name to integer label.
    load_data : when True (default) all features are read during construction.
    """
    # Static shape assigned to every ``log_mfb`` tensor: free first axis, 64 columns.
    _log_mfb_shape = (None, 64)

    def __init__(self, annotations_path, dataset_path, ytids_path,
                 class_associations=(('speech',0),('music',1),('noise',2)),
                 load_data=True):
        self.annotations = pd.read_csv(annotations_path).set_index('ytid')
        self.dataset_path = dataset_path
        with open(ytids_path, 'r') as f:
            self.ytids = json.load(f)
        self.class_associations = dict(class_associations)
        # Invert the normalised dict rather than the raw argument, so that a
        # mapping passed as ``class_associations`` works as well as pairs.
        self.reverse_class_associations = {
            label: name for name, label in self.class_associations.items()}

        if load_data:
            self.load_data()

    def load_data(self):
        """Read every listed clip from the archive into ``self.data``.

        ``self.data`` maps ``ytid -> ({'log_mfb': ndarray}, label)`` where the
        label is looked up through ``class_associations``.
        """
        self.data = {}
        with zipfile.ZipFile(self.dataset_path, 'r') as zf:
            for ytid in tqdm(self.ytids, desc='Loading data'):
                data = {}
                with zf.open(self.annotations.loc[ytid,'log_mfb_path']) as f:
                    data['log_mfb'] = np.load(f)

                self.data[ytid] = (data, self.class_associations[
                                       self.annotations.loc[ytid,'plausible_superclass']])

    def __getitem__(self, index):
        """Return ``(features, label)`` for the ``index``-th entry of ``self.ytids``.

        The returned feature dict is a fresh dict that additionally carries
        the ``'ytid'`` key; the cached entry is not modified.
        """
        ytid = self.ytids[index]
        data_without_ytid, label = self.data[ytid]
        data = {'ytid':ytid}
        data.update(data_without_ytid)
        return data, label

    def __len__(self):
        """Number of addressable items.

        Counts ``self.ytids`` because that is the sequence ``__getitem__``
        indexes; it therefore also works before ``load_data`` has been called.
        """
        return len(self.ytids)

    @property
    def tf_dataloader(self):
        """Function mapping a scalar string ytid tensor to ``(features, label)``.

        Built lazily on first access (TensorFlow is imported only then) and
        cached on the instance as ``_tf_dataloader``.
        """
        try:
            return self._tf_dataloader
        except AttributeError:
            import tensorflow as tf

            _data = self.data
            _log_mfb_shape = self._log_mfb_shape
            def _load_data(ytid):
                # Runs eagerly inside tf.py_function: the tensor holds bytes.
                data, label = _data[ytid.numpy().decode('utf-8')]
                log_mfb = data['log_mfb']
                return log_mfb, label

            @tf.function
            def _load_data_tf(ytid):
                log_mfb, label = tf.py_function(_load_data, [ytid], [tf.float32, tf.int32])
                # py_function outputs carry no static shape; restore it here.
                log_mfb.set_shape(_log_mfb_shape)
                label.set_shape(())
                return (
                    {
                        'ytid':ytid,
                        'log_mfb': log_mfb
                    },
                    label
                )
            self._tf_dataloader = _load_data_tf
            return self._tf_dataloader

    def get_shuffled_tf_dataset(self, ytids=None):
        """Return a ``tf.data.Dataset`` over ``ytids`` (default: all), fully shuffled.

        The shuffle buffer spans the whole id list.
        """
        # Imported locally, like in ``tf_dataloader``, so the method does not
        # depend on some other cell having imported TensorFlow globally.
        import tensorflow as tf

        if ytids is None:
            ytids = self.ytids
        return tf.data.Dataset.from_tensor_slices(ytids).shuffle(len(ytids)).map(self.tf_dataloader)

    def get_unshuffled_tf_dataset(self, ytids=None):
        """Return a ``tf.data.Dataset`` over ``ytids`` (default: all) in list order."""
        # Local import for the same reason as in ``get_shuffled_tf_dataset``.
        import tensorflow as tf

        if ytids is None:
            ytids = self.ytids
        return tf.data.Dataset.from_tensor_slices(ytids).map(self.tf_dataloader)
    

In [2]:
# Inputs for the Dataset: annotation table, feature archive, and the JSON list of ytids.
annotations_path = 'train_test_splits/train_dataset.csv.zip'
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
dataset_path = r'G:\datasets\audioset-derived.zip'
ytids_path = 'train_test_splits/train_ytids.json'

# load_data defaults to True, so every listed clip is read into memory here.
dataset = Dataset(annotations_path, dataset_path, ytids_path)

Loading data: 100%|██████████| 7841/7841 [00:09<00:00, 784.73it/s]


In [3]:
# Raw cache entry for one ytid: a (feature dict, integer label) tuple.
dataset.data['--BfvyPmVMo']

({'log_mfb': array([[-12.657357 , -11.910873 , -12.428922 , ..., -10.195707 ,
          -10.186556 , -10.161953 ],
         [-10.5030575, -10.368566 , -10.445654 , ..., -11.005456 ,
          -11.021426 , -11.006858 ],
         [-10.477143 , -10.238664 , -10.129091 , ..., -11.818546 ,
          -11.803687 , -11.785731 ],
         ...,
         [-11.223254 , -10.959818 , -10.6100025, ..., -11.41233  ,
          -11.402614 , -11.389905 ],
         [-11.532803 , -11.399852 , -10.256641 , ..., -12.986043 ,
          -12.9783125, -12.954549 ],
         [-11.877029 , -11.344776 , -10.755081 , ..., -11.223252 ,
          -11.207252 , -11.183005 ]], dtype=float32)},
 2)

In [4]:
# Positional access follows the order of dataset.ytids and adds the 'ytid' key to the feature dict.
dataset[0]

({'ytid': '-OhudZ743CE',
  'log_mfb': array([[-14.89137  , -12.728969 , -12.116203 , ...,  -7.0896773,
           -7.196857 ,  -8.67631  ],
         [-10.76857  , -10.449691 , -10.842775 , ...,  -6.947009 ,
           -7.0724125,  -8.099452 ],
         [ -9.930698 ,  -7.9328694,  -7.560336 , ...,  -7.804921 ,
           -8.327517 ,  -8.520834 ],
         ...,
         [-12.839853 , -11.126363 ,  -9.748734 , ...,  -8.104726 ,
           -7.857449 ,  -7.464834 ],
         [-10.298605 ,  -8.610804 ,  -7.6734467, ...,  -8.199435 ,
           -9.86635  ,  -9.537319 ],
         [-11.363624 ,  -9.043201 ,  -7.110422 , ...,  -8.247557 ,
          -10.945918 , -10.742892 ]], dtype=float32)},
 0)

In [5]:
import tensorflow as tf
# tf.data pipeline over all ytids, shuffled with a buffer covering the whole list.
train_dataset = dataset.get_shuffled_tf_dataset()

# Pull a single element to inspect the (features, label) structure.
next(iter(train_dataset))

({'ytid': <tf.Tensor: shape=(), dtype=string, numpy=b'Wk9rfiU9wvg'>,
  'log_mfb': <tf.Tensor: shape=(891, 64), dtype=float32, numpy=
  array([[-4.9784822, -3.838155 , -5.765702 , ..., -3.0246427, -2.0652351,
          -2.985682 ],
         [-5.0928216, -4.1160746, -6.061986 , ..., -2.8219302, -2.1946497,
          -2.9094625],
         [-5.120795 , -4.250414 , -6.435459 , ..., -2.6672957, -2.7416472,
          -2.9809673],
         ...,
         [-7.645954 , -6.160941 , -6.194725 , ..., -1.5784578, -2.2365365,
          -3.4383805],
         [-6.043252 , -5.870902 , -5.2462025, ..., -1.4150183, -1.833616 ,
          -2.6609511],
         [-5.5630746, -5.4060726, -4.8932343, ..., -1.5111382, -2.098419 ,
          -2.5488796]], dtype=float32)>},
 <tf.Tensor: shape=(), dtype=int32, numpy=1>)

In [6]:
def _dataset_timer():
    """Iterate over the global ``train_dataset`` once, discarding every element."""
    for _ in train_dataset:
        continue

# Measure how long one complete pass over train_dataset takes.
%timeit _dataset_timer()

2.88 s ± 35.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [26]:
class RandomWindow:
    """Map function that crops a random fixed-length window from each example.

    Intended for ``tf.data.Dataset.map``; it receives ``(data, label)`` where
    ``data`` is a dict of tensors and returns a new dict (the input dict is
    not modified) together with the unchanged label.

    Parameters
    ----------
    window_size : length of the window along the first axis.
    features : keys of ``data`` to pad and crop.  The length of the first
        listed feature determines padding and window position for all of them.
    pad_value : constant used to pad sequences shorter than ``window_size``.
    """
    def __init__(self, window_size, features, pad_value):
        self.window_size = window_size
        self.features = features
        self.pad_value = pad_value

    def __call__(self, data, label):
        """Pad (at the front) if needed, then slice the same random window from every feature.

        Adds a ``'window'`` entry holding ``[start, end)`` indices.  These
        indices refer to the sequence *after* padding, so for inputs shorter
        than ``window_size`` they are offset by the pad length relative to the
        original array.
        """
        data_out = data.copy()

        data_len = tf.shape(data_out[self.features[0]])[0]

        # Only sequences shorter than the window get padded; padding goes
        # before the data on the first axis, other axes are left untouched.
        pad_size = tf.maximum(0, self.window_size - data_len)
        for feature in self.features:
            data_out[feature] = tf.pad(data_out[feature], [[pad_size,0]]+[[0, 0]]*(data[feature].shape.rank-1),
                                       constant_values=self.pad_value)

        data_len = data_len + pad_size

        # maxval is exclusive, hence the +1 so the last valid start is reachable.
        start_idx = tf.random.uniform((), 0, data_len-self.window_size+1, dtype=tf.int32)
        end_idx = start_idx + self.window_size

        window = tf.stack([start_idx, end_idx], axis=0)
        data_out['window'] = window

        for feature in self.features:
            cropped = data_out[feature][start_idx:end_idx]
            # The slice bounds are dynamic, so the static first dimension is
            # lost even though it always equals window_size.  Record it so the
            # dataset's element_spec exposes a fixed length downstream.
            if isinstance(self.window_size, int):
                cropped.set_shape([self.window_size] + cropped.shape.as_list()[1:])
            data_out[feature] = cropped
        return data_out, label
    
    

In [27]:
# Crop each clip's 'log_mfb' to 200 rows along the first axis; shorter clips are front-padded with -16.
random_window = RandomWindow(window_size=200, features=['log_mfb'], pad_value=-16.)

window_dataset = train_dataset.map(random_window)

# Peek at one windowed example; it also carries the chosen 'window' indices.
next(iter(window_dataset))

({'ytid': <tf.Tensor: shape=(), dtype=string, numpy=b'Pef6g19i5iI'>,
  'log_mfb': <tf.Tensor: shape=(200, 64), dtype=float32, numpy=
  array([[-6.844574 , -5.051794 , -5.6503344, ..., -4.3814936, -4.917168 ,
          -5.85741  ],
         [-7.329627 , -5.132898 , -5.916989 , ..., -4.578798 , -5.0970182,
          -5.8394337],
         [-7.484717 , -5.678336 , -5.8584743, ..., -5.2393026, -5.5340085,
          -6.196355 ],
         ...,
         [-7.4337907, -5.723109 , -6.7144737, ..., -5.6403174, -5.819802 ,
          -6.449281 ],
         [-7.0909367, -5.585424 , -6.1685767, ..., -5.3362393, -5.8842244,
          -6.804651 ],
         [-6.9376855, -6.548622 , -6.2792115, ..., -5.288309 , -6.1700425,
          -6.9629073]], dtype=float32)>,
  'window': <tf.Tensor: shape=(2,), dtype=int32, numpy=array([145, 345])>},
 <tf.Tensor: shape=(), dtype=int32, numpy=2>)

In [29]:
# Sanity check: slice the cached array with the 'window' indices printed for this ytid (145:345)
# and compare against the windowed 'log_mfb' shown by the previous cell.
dataset.data['Pef6g19i5iI'][0]['log_mfb'][145:345]

array([[-6.844574 , -5.051794 , -5.6503344, ..., -4.3814936, -4.917168 ,
        -5.85741  ],
       [-7.329627 , -5.132898 , -5.916989 , ..., -4.578798 , -5.0970182,
        -5.8394337],
       [-7.484717 , -5.678336 , -5.8584743, ..., -5.2393026, -5.5340085,
        -6.196355 ],
       ...,
       [-7.4337907, -5.723109 , -6.7144737, ..., -5.6403174, -5.819802 ,
        -6.449281 ],
       [-7.0909367, -5.585424 , -6.1685767, ..., -5.3362393, -5.8842244,
        -6.804651 ],
       [-6.9376855, -6.548622 , -6.2792115, ..., -5.288309 , -6.1700425,
        -6.9629073]], dtype=float32)