In [None]:
#| default_exp data.block
#| default_cls_lvl 3

In [None]:
#| export
from fastai.data.all import *
from tsfast.data.core import *
from tsfast.data.split import ParentSplitter, ApplyToDict, PercentageSplitter
from tsfast.data.transforms import SeqNoiseInjection, Normalize,SeqSlice

## 5. Dataloaders Creation
A `Datasets` object combines all implemented components at the item level.

In [None]:
#| export
def pad_sequence(batch,sorting = False):
    '''Pad the variable-length tensors of a batch to a common length.

    Intended as `before_batch` hook of a dataloader (still quite slow).
    `batch` is a list of sample tuples; the result is a list of tuples again,
    one per sample, where every entry has been padded along its first axis.
    With `sorting=True` the samples are ordered longest-first, judged by the
    length of the first tensor of each sample.
    '''
    if sorting:
        samples = sorted(batch, key=lambda smp: smp[0].shape[0], reverse=True)
    else:
        samples = batch

    # Pad position by position: one stacked (sample, time, ...) tensor per tuple entry.
    padded_fields = []
    for pos in range(len(batch[0])):
        field = [smp[pos] for smp in samples]
        padded_fields.append(torch.nn.utils.rnn.pad_sequence(field, batch_first=True))

    # Split the stacked tensors back into per-sample tuples.
    # retain_types restores the tensor subclasses, which is needed to decode back to source items.
    padded_list = []
    for idx in range(len(batch)):
        sample = tuple([fld[idx] for fld in padded_fields])
        padded_list.append(retain_types(sample, batch[0]))

    return padded_list

### 5.1 Low-Level with Transforms

In [None]:
from nbdev.config import get_config
from tsfast.data.core import CreateDict, ValidClmContains,DfHDFCreateWindows

In [None]:
# Locate the WienerHammerstein test data relative to the nbdev project root
# and collect every .hdf5 file below it (recursively).
project_root = get_config().config_file.parent
f_path = project_root / 'test_data/WienerHammerstein'
hdf_files = get_files(f_path,extensions='.hdf5',recurse=True)
# Source-item pipeline applied to the file list.
# NOTE(review): presumably ValidClmContains flags files whose path contains 'valid' and
# DfHDFCreateWindows cuts windows of 100+1 samples with step 10 over column 'u' - confirm in tsfast.data.core.
tfm_src = CreateDict([ValidClmContains(['valid']),DfHDFCreateWindows(win_sz=100+1,stp_sz=10,clm='u')])
src_dicts = tfm_src(hdf_files)

In [None]:
# `tfm_src` and `src_dicts` are created in the previous cell; rebuilding them here
# would only repeat the same scan over the HDF5 files.

# One transform pipeline per tuple entry: input reads columns u and y, target reads column y.
# The two SeqSlice calls use opposite bounds (l_slc=1 vs. r_slc=-1).
# NOTE(review): presumably each trims one sample so both streams end up with length 100 - confirm in SeqSlice.
tfms=[  [HDF2Sequence(['u','y']),SeqSlice(l_slc=1),toTensorSequencesInput],
        [HDF2Sequence(['y']),SeqSlice(r_slc=-1),toTensorSequencesOutput]]
# Split on the file path of every source dict.
splits = PercentageSplitter()([x['path'] for x in src_dicts])
dsrc = Datasets(src_dicts,tfms=tfms,splits=splits)

In [None]:
# %%timeit
# dsrc[0]

In [None]:
# pad_sequence runs on the item list before collation; noise injection and
# normalization run afterwards on the collated batch.
db = dsrc.dataloaders(
    bs=128,
    after_batch=[SeqNoiseInjection(std=[1.1,0.01]), Normalize(axes=[0,1])],
    before_batch=pad_sequence,
)
db.one_batch()[0].shape

torch.Size([128, 100, 2])

### 5.2 Mid-Level with Datablock API

In [None]:
#| export
class SequenceBlock(TransformBlock):
    '''`TransformBlock` for sequences: one extraction type-transform, batch-level `Normalize(axes=[0,1])` and optional padding.'''
    def __init__(self, seq_extract,padding=False):
        # pad_sequence is only registered as `before_batch` hook when padding is requested
        loader_kwargs = {'before_batch': pad_sequence} if padding else {}
        super().__init__(type_tfms=[seq_extract],
                         batch_tfms=[Normalize(axes=[0,1])],
                         dls_kwargs=loader_kwargs)

    @classmethod
    @delegates(HDF2Sequence, keep=True)
    def from_hdf(cls, clm_names, seq_cls=TensorSequencesInput,padding=False, **kwargs):
        '''Create a block that extracts `clm_names` with `HDF2Sequence`; extra `kwargs` are forwarded to it.'''
        extractor = HDF2Sequence(clm_names,to_cls=seq_cls,**kwargs)
        return cls(extractor, padding)

    @classmethod
    def from_numpy(cls, seq_cls=TensorSequencesInput,padding=False, **kwargs):
        '''Create a block that converts items with `ToTensor(enc=seq_cls)`.

        NOTE(review): `kwargs` are accepted but not used here - confirm whether that is intended.
        '''
        extractor = ToTensor(enc=seq_cls)
        return cls(extractor, padding)

In [None]:
# Input block: columns u and y with padding enabled; target block: column y.
# Items are generated from the file list by tfm_src.
seq = DataBlock(
    blocks=(
        SequenceBlock.from_hdf(['u','y'],TensorSequencesInput,padding=True,cached=None),
        SequenceBlock.from_hdf(['y'],TensorSequencesOutput,cached=None),
    ),
    get_items=tfm_src,
    splitter=ApplyToDict(ParentSplitter()),
)

In [None]:
# Build the DataLoaders from the HDF5 file list; the DataBlock's `get_items` (tfm_src) receives it as source.
dls = seq.dataloaders(hdf_files)

In [None]:
#| export
class ScalarNormalize(DisplayedTransform):
    '''Standardize `TensorScalarsInput` batches with given statistics or with statistics taken from one batch.'''
    def __init__(self, mean=None, std=None, axes=(0,)):
        store_attr()

    @classmethod
    def from_stats(cls, mean, std, dim=1, ndim=4, cuda=True):
        '''Create the transform from known `mean`/`std`, broadcast with `broadcast_vec`.

        NOTE(review): the defaults `dim=1, ndim=4` look inherited from image normalization -
        confirm they fit the scalar batches this transform is applied to.
        '''
        stats = broadcast_vec(dim, ndim, mean, std, cuda=cuda)
        return cls(*stats)

    def setups(self, dl:DataLoader):
        '''If `mean` or `std` is missing, compute both from the first `TensorScalarsInput` of one batch of `dl`.'''
        if self.mean is not None and self.std is not None: return
        for item in dl.one_batch():
            if not isinstance(item,TensorScalarsInput): continue
            batch_mean = item.mean(self.axes, keepdim=True)
            batch_std = item.std(self.axes, keepdim=True)+1e-7  # epsilon keeps the later division finite
            self.mean,self.std = batch_mean,batch_std
            return

    def _stats_to(self, device):
        '''Move the stored statistics to `device` if they live on a different one.'''
        if device != self.mean.device:
            self.mean = self.mean.to(device)
            self.std = self.std.to(device)

    def encodes(self, x:TensorScalarsInput):
        self._stats_to(x.device)
        return (x-self.mean) / self.std

    def decodes(self, x:TensorScalarsInput):
        self._stats_to(x.device)
        return (x*self.std + self.mean)

class ScalarBlock(TransformBlock):
    '''`TransformBlock` for scalar values: one extraction type-transform plus `ScalarNormalize` on batch level.'''
    def __init__(self, scl_extract):
        super().__init__(type_tfms=[scl_extract],
                         batch_tfms=[ScalarNormalize()])

    @classmethod
    @delegates(HDF_Attrs2Scalars, keep=True)
    def from_hdf_attrs(cls, clm_names, scl_cls=TensorScalarsInput, **kwargs):
        '''Create a block that extracts `clm_names` with `HDF_Attrs2Scalars`; extra `kwargs` are forwarded to it.'''
        extractor = HDF_Attrs2Scalars(clm_names,to_cls=scl_cls,**kwargs)
        return cls(extractor)

    @classmethod
    @delegates(HDF_DS2Scalars, keep=True)
    def from_hdf_ds(cls, clm_names, scl_cls=TensorScalarsInput, **kwargs):
        '''Create a block that extracts `clm_names` with `HDF_DS2Scalars`; extra `kwargs` are forwarded to it.'''
        extractor = HDF_DS2Scalars(clm_names,to_cls=scl_cls,**kwargs)
        return cls(extractor)

In [None]:
# Sequence-to-scalar setup: input block reads column u as a sequence,
# target block reads column y through ScalarBlock.from_hdf_ds.
seq = DataBlock(
    blocks=(
        SequenceBlock.from_hdf(['u'],TensorSequencesInput),
        ScalarBlock.from_hdf_ds(['y'],TensorScalarsOutput),
    ),
    get_items=tfm_src,
    splitter=ApplyToDict(ParentSplitter()),
)

In [None]:
# Build the DataLoaders for the sequence-to-scalar DataBlock from the HDF5 file list.
dls = seq.dataloaders(hdf_files)

In [None]:
#| include: false
# Run nbdev's export step for this project.
import nbdev
nbdev.nbdev_export()