# 1. Setting up the data: 

In [27]:
import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader



# Location of the serialized training split, relative to the notebook's working directory.
train_dataset_path = "../dataset/classification/FingerMovements/train.pt"
# NOTE(review): torch.load unpickles the file contents, so only load .pt files from a trusted source.
train_data = torch.load(train_dataset_path)
# Echo the loaded object; the cell output shows a dict that holds (at least) a 'samples' tensor.
train_data

{'samples': tensor([[[ 41.8000,  55.2000,  -8.6000,  ...,  16.9000,  42.2000,  13.0000],
          [ 44.8000,  53.8000,  -3.6000,  ...,  24.5000,  35.0000,  26.6000],
          [ 47.1000,  59.9000,  14.4000,  ...,  24.5000,  41.7000,  52.5000],
          ...,
          [ 69.8000,  17.5000,  23.3000,  ...,  51.9000,  51.5000,  -3.5000],
          [ 72.6000,  28.0000,  35.9000,  ...,  59.6000,  58.5000,  -3.2000],
          [ 76.1000,  12.1000,  23.2000,  ...,  57.3000,  46.9000,  -2.6000]],
 
         [[132.0000,  99.7000,  18.7000,  ..., -11.2000, -21.7000, -41.3000],
          [137.0000,  95.8000,   0.8000,  ..., -11.6000, -21.1000, -40.2000],
          [148.0000,  99.5000,  20.0000,  ..., -13.8000, -23.6000, -33.3000],
          ...,
          [ 78.1000, -39.9000,   0.9000,  ..., -31.9000, -57.8000, -51.1000],
          [ 90.5000, -10.2000,  31.6000,  ..., -19.9000, -44.6000, -45.4000],
          [ 85.7000, -37.4000,   7.3000,  ..., -25.7000, -45.6000, -42.5000]],
 
         [[ 69.40

## Basic Dataset with samples + labels: 

In [16]:
class Dataset_Classification(Dataset):
    """Supervised dataset of (sample, label) pairs with per-channel standardisation.

    Args:
        dataset: mapping with a "samples" tensor, indexed here as (N, T, C), and a
            "labels" tensor indexed by sample, (N,).

    Attributes:
        x: standardised samples, same shape as the input samples.
        y: labels, unchanged.
        means: per-channel mean computed over the sample and time axes.
        stds: per-channel divisor actually used for standardisation. Channels whose
            standard deviation is zero (constant channel) or undefined use 1 instead,
            so they become all zeros rather than NaN/inf.
    """

    def __init__(self, dataset):
        super().__init__()

        self.x = dataset["samples"]  # (N, T, C)
        self.y = dataset["labels"]  # (N,)

        # Statistics are pooled over samples and time steps, leaving one value per channel.
        self.means = self.x.mean(dim=[0, 1])
        raw_stds = self.x.std(dim=[0, 1])

        # A zero (or NaN) std would poison the whole channel with NaN/inf after division;
        # the comparison is False for NaN as well, so both cases fall back to 1.
        self.stds = torch.where(raw_stds > 0, raw_stds, torch.ones_like(raw_stds))

        # Normalize
        self.x = (self.x - self.means) / self.stds

    def __getitem__(self, index):
        """Return the standardised sample and its label at `index`."""
        return self.x[index], self.y[index]

    def __len__(self):
        """Number of samples (size of the first axis of `x`)."""
        return self.x.shape[0]
    

# Wrap the loaded dict in the normalising dataset and inspect the first (sample, label) pair.
train_dataset = Dataset_Classification(train_data)
train_dataset[0]

(tensor([[-9.5397e-01,  6.1665e-01, -3.7655e-01,  ...,  1.2540e-03,
           8.8399e-01,  1.4750e-01],
         [-8.3868e-01,  5.8382e-01, -2.3852e-01,  ...,  2.3525e-01,
           6.7116e-01,  5.3996e-01],
         [-7.5029e-01,  7.2689e-01,  2.5842e-01,  ...,  2.3525e-01,
           8.6921e-01,  1.2874e+00],
         ...,
         [ 1.2205e-01, -2.6755e-01,  5.0412e-01,  ...,  1.0788e+00,
           1.1589e+00, -3.2864e-01],
         [ 2.2965e-01, -2.1287e-02,  8.5198e-01,  ...,  1.3159e+00,
           1.3658e+00, -3.1999e-01],
         [ 3.6416e-01, -3.9420e-01,  5.0136e-01,  ...,  1.2451e+00,
           1.0229e+00, -3.0267e-01]]),
 tensor(0.))

## Setting up an Imputation (Noise-Masking) Dataset:

In [24]:
def geom_noise_mask_single(L, lm, masking_ratio):
    """
    Randomly create a boolean mask of length `L`, consisting of subsequences of average length lm, masking with 0s a `masking_ratio`
    proportion of the sequence L. The length of masking subsequences and intervals follow a geometric distribution.
    Args:
        L: length of mask and sequence to be masked
        lm: average length of masking subsequences (streaks of 0s)
        masking_ratio: proportion of L to be masked. Values >= 1 mask the entire sequence.

    Returns:
        (L,) boolean numpy array intended to mask ('drop') with 0s a sequence of length L
    """
    # With masking_ratio == 1 the unmasked-run stop probability below divides by zero
    # (and is negative beyond 1), so the fully-masked case is answered directly.
    if masking_ratio >= 1:
        return np.zeros(L, dtype=bool)

    keep_mask = np.ones(L, dtype=bool)
    p_m = 1 / lm  # probability of each masking sequence stopping. parameter of geometric distribution.
    p_u = p_m * masking_ratio / (1 - masking_ratio)  # probability of each unmasked sequence stopping. parameter of geometric distribution.
    p = [p_m, p_u]  # indexed by the current state

    # Start in state 0 with masking_ratio probability
    state = int(np.random.rand() > masking_ratio)  # state 0 means masking, 1 means not masking
    for i in range(L):
        keep_mask[i] = state  # here it happens that state and masking value corresponding to state are identical
        if np.random.rand() < p[state]:
            state = 1 - state  # flip between the masking and non-masking states

    return keep_mask

def noise_mask(X, masking_ratio, lm=3, mode='separate', distribution='geometric', exclude_feats=None):
    """
    Creates a random boolean mask of the same shape as X, with 0s at places where a feature should be masked.
    Args:
        X: (seq_length, feat_dim) numpy array of features corresponding to a single sample
        masking_ratio: proportion of seq_length to be masked. At each time step, will also be the proportion of
            feat_dim that will be masked on average
        lm: average length of masking subsequences (streaks of 0s). Used only when `distribution` is 'geometric'.
        mode: whether each variable should be masked separately ('separate'), or all variables at a certain positions
            should be masked concurrently (any other value, e.g. 'concurrent')
        distribution: 'geometric' samples each mask sequence from a Markov chain (and thus is stateful), resulting in
            geometric distributions of masked sequences of a desired mean length `lm`; any other value samples each
            mask element independently at random
        exclude_feats: iterable of indices corresponding to features to be excluded from masking (i.e. to remain all 1s).
            Honoured for every `mode` / `distribution` combination.

    Returns:
        boolean numpy array with the same shape as X, with 0s at places where a feature should be masked
    """
    seq_len, feat_dim = X.shape[0], X.shape[1]
    excluded = set(exclude_feats) if exclude_feats is not None else set()

    if distribution == 'geometric':  # stateful (Markov chain)
        if mode == 'separate':  # each variable (feature) is independent
            mask = np.ones(X.shape, dtype=bool)
            for m in range(feat_dim):  # feature dimension
                # excluded columns are skipped entirely, so they keep their initial all-True value
                if m not in excluded:
                    mask[:, m] = geom_noise_mask_single(seq_len, lm, masking_ratio)  # time dimension
            return mask
        # replicate across feature dimension (mask all variables at the same positions concurrently)
        mask = np.tile(np.expand_dims(geom_noise_mask_single(seq_len, lm, masking_ratio), 1), feat_dim)
    else:  # each position is independent Bernoulli with p = 1 - masking_ratio
        keep_probs = (1 - masking_ratio, masking_ratio)
        if mode == 'separate':
            mask = np.random.choice(np.array([True, False]), size=X.shape, replace=True, p=keep_probs)
        else:
            mask = np.tile(np.random.choice(np.array([True, False]), size=(seq_len, 1), replace=True,
                                            p=keep_probs), feat_dim)

    # The branches that reach this point sample every column, so excluded features are
    # restored afterwards. Only indices that name an existing column have any effect.
    kept_cols = [m for m in range(feat_dim) if m in excluded]
    if kept_cols:
        mask[:, kept_cols] = True

    return mask

class ImputationDataset(Dataset):
    """Dynamically computes missingness (noise) mask for each sample.

    Args:
        data: mapping with a "samples" tensor, indexed here as (num_samples, seq_length, feat_dim),
            and a "labels" tensor indexed by sample.
        mean_mask_length: average length of masked streaks, forwarded to `noise_mask` as `lm`.
        masking_ratio: proportion of values to mask, forwarded to `noise_mask`.
        mode: forwarded to `noise_mask` ('separate' masks each feature independently).
        distribution: forwarded to `noise_mask` ('geometric' yields streaks of masked values).
        exclude_feats: feature indices forwarded to `noise_mask` as features not to be masked.

    Attributes:
        data: samples standardised per feature (statistics pooled over samples and time steps).
        labels: labels, unchanged.
        means: per-feature mean.
        stds: per-feature divisor actually used; features with zero or undefined standard
            deviation use 1, so they become all zeros rather than NaN/inf.
    """

    def __init__(self, data, mean_mask_length=3, masking_ratio=0.15,
                 mode='separate', distribution='geometric', exclude_feats=None):
        super(ImputationDataset, self).__init__()

        self.data = data["samples"]
        self.labels = data["labels"]

        # Normalise features:
        self.means = self.data.mean(dim=[0, 1])
        raw_stds = self.data.std(dim=[0, 1])

        # A constant feature has std 0 and would turn into NaN/inf after division;
        # the comparison is False for NaN as well, so both cases fall back to 1.
        self.stds = torch.where(raw_stds > 0, raw_stds, torch.ones_like(raw_stds))

        # Normalize
        self.data = (self.data - self.means) / self.stds

        self.masking_ratio = masking_ratio
        self.mean_mask_length = mean_mask_length
        self.mode = mode
        self.distribution = distribution
        self.exclude_feats = exclude_feats

    def __getitem__(self, ind):
        """
        For a given integer index, returns the corresponding (seq_length, feat_dim) array and a noise mask of same shape
        Args:
            ind: integer index of sample in dataset
        Returns:
            X: (seq_length, feat_dim) tensor of the multivariate time series corresponding to a sample
            mask: (seq_length, feat_dim) boolean tensor: 0s mask and predict, 1s: unaffected input
            label: entry of `labels` for this sample
        """

        X = self.data[ind].numpy()  # noise_mask works on numpy arrays
        label = self.labels[ind]

        # A fresh mask is drawn on every access, so the same sample is masked differently each time.
        mask = noise_mask(X, self.masking_ratio, self.mean_mask_length, self.mode, self.distribution,
                          self.exclude_feats)  # (seq_length, feat_dim) boolean array

        return torch.from_numpy(X), torch.from_numpy(mask), label

    def update(self):
        """Make the task harder: lengthen masked streaks by 1 (capped at 20) and raise the ratio by 0.05 (capped at 1)."""
        self.mean_mask_length = min(20, self.mean_mask_length + 1)
        self.masking_ratio = min(1, self.masking_ratio + 0.05)

    def __len__(self):
        """Number of samples (size of the first axis of `data`)."""
        return self.data.shape[0]
    


# Build the imputation dataset with its default masking settings and draw one item.
imputation_dataset = ImputationDataset(train_data)
sample, mask, label = imputation_dataset[0]
# Show the first time step next to its mask row (False marks a value to be masked).
sample[0], mask[0]

(tensor([-9.5397e-01,  6.1665e-01, -3.7655e-01, -2.6380e-01,  1.1476e+00,
          9.5103e-01,  5.6980e-01,  9.6900e-01,  1.0436e+00, -4.6374e-01,
          3.5370e-01,  8.2001e-01,  9.7326e-01,  3.7221e-01, -3.8268e-01,
          5.4273e-01,  8.8523e-01,  4.3582e-01,  7.7776e-02,  1.4664e+00,
          1.2042e+00,  8.0010e-01,  8.1634e-01,  4.8460e-01,  5.7211e-01,
          1.2540e-03,  8.8399e-01,  1.4750e-01]),
 tensor([ True,  True,  True,  True,  True,  True,  True, False, False,  True,
          True,  True,  True, False, False,  True,  True, False,  True,  True,
          True,  True,  True,  True,  True,  True,  True, False]))

## Setting up the collating function + DataLoader:

In [68]:
def compensate_masking(X, mask):
    """
    Compensate feature vectors after masking values, in a way that the matrix product W @ X would not be affected on average.
    If p is the proportion of unmasked (active) elements, X' = X / p = X * feat_dim/num_active
    Args:
        X: (batch_size, seq_length, feat_dim) torch tensor
        mask: (batch_size, seq_length, feat_dim) torch tensor: 0s means mask and predict, 1s: unaffected (active) input
    Returns:
        (batch_size, seq_length, feat_dim) compensated features
    """

    # number of unmasked elements of feature vector for each time step
    num_active = torch.sum(mask, dim=-1).unsqueeze(-1)  # (batch_size, seq_length, 1)
    # to avoid division by 0, set the minimum to 1. clamp works in place of building a
    # separate tensor of ones, so it stays on the mask's device and keeps its dtype.
    num_active = num_active.clamp(min=1)  # (batch_size, seq_length, 1)
    return X.shape[-1] * X / num_active


def padding_mask(lengths, max_len=None):
    """
    Used to mask padded positions: creates a (batch_size, max_len) boolean mask from a tensor of sequence lengths,
    where 1 means keep element at this position (time step)
    Args:
        lengths: 1D integer tensor with the length of each sequence in the batch
        max_len: number of columns of the mask; defaults to the longest length in `lengths`
            (0 for an empty batch)
    Returns:
        (batch_size, max_len) boolean tensor, True where position < length
    """
    batch_size = lengths.numel()
    if max_len is None:
        # Tensor has no `max_val` method; `max()` gives the longest sequence in the batch.
        max_len = int(lengths.max()) if batch_size > 0 else 0
    return (torch.arange(0, max_len, device=lengths.device)
            .type_as(lengths)
            .repeat(batch_size, 1)  # one row of positions 0..max_len-1 per sequence
            .lt(lengths.unsqueeze(1)))


def collate_unsuperv(data, max_len=None, mask_compensation=False):
    """Build mini-batch tensors from a list of (X, mask, label) tuples and apply the masks to the input.
    Args:
        data: len(batch_size) list of tuples (X, mask, label).
            - X: torch tensor of shape (seq_length, feat_dim); variable seq_length.
            - mask: boolean torch tensor of shape (seq_length, feat_dim); variable seq_length.
            - label: label of the sample, passed through untouched.
        max_len: global fixed sequence length. Used for architectures requiring fixed length input,
            where the batch length cannot vary dynamically. Longer sequences are clipped, shorter are padded with 0s.
            Defaults to the longest sequence in the batch.
        mask_compensation: if True, rescale the masked input with `compensate_masking`.
    Returns:
        X: (batch_size, padded_length, feat_dim) torch tensor of masked features (input)
        targets: (batch_size, padded_length, feat_dim) torch tensor of unmasked features (output)
        target_masks: (batch_size, padded_length, feat_dim) boolean torch tensor
            1 indicates masked values to be predicted, 0 indicates values to ignore (unaffected input or padding)
        padding_masks: (batch_size, padded_length) boolean tensor, 1 means keep vector at this position, 0 ignore (padding)
        labels: tuple with the label of each sample, in batch order
    """

    batch_size = len(data)
    features, masks, labels = zip(*data)

    # Stack and pad features and masks (convert 2D to 3D tensors, i.e. add batch dimension)
    lengths = [X.shape[0] for X in features]  # original sequence length for each time series
    if max_len is None:
        max_len = max(lengths)
    # Number of time steps of each sequence that survive clipping to max_len.
    kept_lengths = [min(length, max_len) for length in lengths]

    X = torch.zeros(batch_size, max_len, features[0].shape[-1])  # (batch_size, padded_length, feat_dim)
    target_masks = torch.zeros_like(X,
                                    dtype=torch.bool)  # (batch_size, padded_length, feat_dim) masks related to objective
    for i, end in enumerate(kept_lengths):
        X[i, :end, :] = features[i][:end, :]
        target_masks[i, :end, :] = masks[i][:end, :]

    targets = X.clone()
    X = X * target_masks  # mask input
    if mask_compensation:
        X = compensate_masking(X, target_masks)

    # Clipped lengths give the same mask as the raw ones (positions only run up to max_len),
    # and int64 avoids the 32767 ceiling of an int16 length tensor.
    padding_masks = padding_mask(torch.tensor(kept_lengths, dtype=torch.int64),
                                 max_len=max_len)  # (batch_size, padded_length) boolean tensor, "1" means keep
    target_masks = ~target_masks  # inverse logic: 0 now means ignore, 1 means predict
    return X, targets, target_masks, padding_masks, labels



# Setting up the dataloader:
batch_size = 32
# Fixed sequence length handed to the collate function as `max_len`
# (per its docstring: longer sequences are clipped, shorter ones zero-padded).
max_seq_len = 50

train_loader = DataLoader(
    dataset=imputation_dataset,
    shuffle=True,
    batch_size=batch_size,
    # The lambda only binds max_len; the DataLoader passes it the list of dataset items.
    collate_fn=lambda x : collate_unsuperv(x, max_len=max_seq_len)
)


# Pull a single batch to check the whole pipeline end to end.
next(iter(train_loader))




(tensor([[[ 0.6677,  1.2546,  0.0000,  ...,  0.3584,  0.0000,  0.0436],
          [ 0.0000,  1.1842,  0.0000,  ...,  0.2876,  0.0000, -0.1007],
          [ 0.0000,  1.3554,  0.5538,  ...,  0.2260,  0.3992, -0.1007],
          ...,
          [ 0.0144, -0.6780,  1.1474,  ..., -0.0000, -0.2038, -0.1440],
          [ 0.3526, -0.2113,  1.6636,  ...,  0.0598, -0.1211, -0.1036],
          [ 0.0529, -0.5724,  0.7885,  ...,  0.1090, -0.2156, -0.4729]],
 
         [[ 0.0000,  0.3258,  0.5814,  ...,  1.2574,  0.1568,  0.0000],
          [ 1.6554,  0.5228,  1.4207,  ...,  1.0111,  0.3726,  0.0000],
          [ 1.8475,  0.6190,  0.7636,  ...,  1.1096,  0.3962,  0.0000],
          ...,
          [ 0.6370, -0.5982,  2.5112,  ...,  1.1866,  0.0000,  1.0103],
          [ 0.6216, -1.1846,  0.0000,  ...,  1.0696,  0.0740,  1.0363],
          [ 0.6293, -0.0000,  0.0000,  ...,  0.9495, -0.1536,  0.9901]],
 
         [[-0.3391,  1.5759, -0.0000,  ..., -0.0000,  0.2632, -0.5912],
          [ 0.4525,  1.9043,

# This should work. The DataLoader returns inputs, targets, target_masks, padding_masks, and labels.