# Data
> Classes and functions used to create PyTorch `Dataset`s and `DataLoader`s.

In [None]:
# default_exp data

In [None]:
# export
import torch
from torch.utils.data import Dataset, DataLoader
from fastai.data_block import DataBunch, DatasetType

import numpy as np

In [None]:
# hide
%load_ext autoreload
%autoreload 2

In [None]:
# export
class Data(Dataset):
    """
    Create pyTorch Dataset
    parameters:
    - t: time elapsed
    - e: (death) event observed. 1 if observed, 0 otherwise.
    - b: breakpoints where the hazard is different to previous segment of time
      (list, numpy array or None). They are looked up with `np.searchsorted`,
      which expects them sorted in ascending order.
    - x: features (optional), indexable by sample.

    Each item is a tuple `(inputs, e)`. `inputs` is `(time,)`,
    `(time, section)`, `(time, x)` or `(time, section, x)` depending on
    which of `b` and `x` were supplied; `section` is the 0-dim index of the
    breakpoint segment that `time` falls into.
    """
    def __init__(self, t, e, b=None, x=None):
        super().__init__()
        assert b is None or isinstance(b, (np.ndarray, list)), \
            "Breakpoints need to be a list, numpy array or None"
        self.x, self.t, self.e, self.b = x, t, e, b

    def __len__(self):
        return len(self.t)

    def __getitem__(self, i):
        time = torch.Tensor([self.t[i]])
        e = torch.Tensor([self.e[i]])

        if self.b is None:
            x_ = (time,)
        else:
            # default side='left': a time equal to a breakpoint stays in the
            # earlier segment
            t_section = torch.LongTensor([np.searchsorted(self.b, self.t[i])])
            x_ = (time, t_section.squeeze())

        if self.x is not None:
            # Go through numpy rather than the legacy `torch.Tensor(...)`
            # constructor: a scalar row (1-D `x`) would otherwise be read as
            # a tensor *size* or rejected. The result is always a float32
            # vector that owns its memory.
            row = np.atleast_1d(np.asarray(self.x[i], dtype=np.float32))
            x_ = x_ + (torch.tensor(row),)

        return x_, e

In [None]:
# hide
# Smoke test: random features, no breakpoints.
np.random.seed(42)
N = 100
D = 3
p = 0.1
bs = 64

x = np.random.randn(N, D)
t = np.arange(N)
e = np.random.binomial(1, p, N)

data = Data(t, e, x=x)
batch = next(iter(DataLoader(data, bs)))
# batch[-1] is the event tensor; its length is the number of samples drawn
assert len(batch[-1]) == bs, (f"length of batch {len(batch[-1])} is different "
                              f"to intended batch size {bs}")
[b.shape for b in batch[0]], batch[1].shape

([torch.Size([64, 1]), torch.Size([64, 3])], torch.Size([64, 1]))

In [None]:
# hide
# Smoke test: same t / e / x as the previous cell, now with breakpoints.
N = 100
D = 3
p = 0.1
bs = 64
breakpoints = [10, 50]

data = Data(t, e, breakpoints, x)
batch2 = next(iter(DataLoader(data, bs)))
# batch2[-1] is the event tensor; its length is the number of samples drawn
assert len(batch2[-1]) == bs, (f"length of batch {len(batch2[-1])} is different "
                               f"to intended batch size {bs}")
print([b.shape for b in batch2[0]], batch2[1].shape)

# adding breakpoints must not alter the time tensor itself
assert torch.all(batch[0][0] == batch2[0][0]), ("Discrepancy between batch "
                                                "with breakpoints and without")

[torch.Size([64, 1]), torch.Size([64]), torch.Size([64, 3])] torch.Size([64, 1])


In [None]:
# export
class DataFrame(Data):
    """
    Build a `Data` dataset directly from a dataframe.
    parameters:
    - df: dataframe. Must have `t` (time) and `e` (event) columns; every
      other column, if any, is used as a feature.
    - b: breakpoints of time (optional)
    """
    def __init__(self, df, b=None):
        times, events = df['t'].values, df['e'].values
        covariates = df.drop(['t', 'e'], axis=1).values
        # nothing left besides t / e means there are no features at all
        has_covariates = covariates.shape[1] != 0
        super().__init__(times, events, b,
                         covariates if has_covariates else None)

In [None]:
# hide
# Smoke test: build the dataset from a pandas dataframe that holds only the
# `t` and `e` columns (no features, no breakpoints).
import pandas as pd

df = pd.DataFrame({'t': t, 'e': e})
# NOTE: despite its name, `df2` is the `DataFrame` dataset wrapper, not a pandas frame
df2 = DataFrame(df)
# displayed as the cell output: item 1 carries just the time and the event
df2[1]

((tensor([1.]),), tensor([0.]))

In [None]:
# hide
# Smoke test: append the feature matrix `x` as extra columns so the
# dataset wrapper picks them up as features.
new_df = pd.concat([df, pd.DataFrame(x)], axis=1)
df3 = DataFrame(new_df)
# displayed as the cell output: item 1 now also carries the feature row
df3[1]

((tensor([1.]), tensor([ 1.5230, -0.2342, -0.2341])), tensor([0.]))

In [None]:
# hide
# Smoke test: same frame with features, plus the `breakpoints` list defined
# in an earlier cell.
new_df = pd.concat([df, pd.DataFrame(x)], axis=1)
df3 = DataFrame(new_df, breakpoints)
# displayed as the cell output: item 1 gains a segment index between the
# time and the feature row
df3[1]

((tensor([1.]), tensor(0), tensor([ 1.5230, -0.2342, -0.2341])), tensor([0.]))

In [None]:
# export
def create_db(df, b=None, train_p=0.8, bs=128, test_ds=False, test_p=0.2):
    """
    Take dataframe and split into train, val and (optionally) test sets,
    and convert to Fastai databunch

    parameters:
    - df: pandas dataframe (left unmodified)
    - b: breakpoints of time (optional)
    - train_p: proportion of the rows that remain after the test split
      which go to training; the rest become the validation set
    - bs: batch size
    - test_ds: whether to split off a test set
    - test_p: proportion of the whole dataframe, taken from its end,
      held out as the test set (only used when `test_ds` is truthy)

    returns:
    - the databunch, or `(databunch, test dataloader)` when `test_ds` is set
    """
    # Re-index a copy rather than mutating the caller's dataframe in place.
    df = df.reset_index(drop=True)

    test_dl = None
    if test_ds:
        # The test rows are carved off first, before the train / val split.
        cutoff = int((1 - test_p) * len(df))
        # torch's DataLoader takes `batch_size`; it has no `bs` keyword.
        test_dl = DataLoader(DataFrame(df.iloc[cutoff:], b), batch_size=bs)
        df = df.iloc[:cutoff]

    train_len = int(train_p * len(df))
    train_ds = DataFrame(df.iloc[:train_len], b)
    val_ds = DataFrame(df.iloc[train_len:], b)

    # Never ask for batches larger than the dataset they are drawn from.
    bs = min(bs, len(train_ds))
    val_bs = min(bs, len(val_ds))
    train_db = DataBunch.create(train_ds, val_ds, bs=bs, val_bs=val_bs)

    if test_dl is not None:
        return train_db, test_dl
    return train_db

In [None]:
# hide
# nbdev export step: converts the project's notebooks (see the
# "Converted ..." lines printed below).
from nbdev.export import *
notebook2script()

Converted Cox_Proportional_Hazard.ipynb.
Converted KaplanMeier.ipynb.
Converted Survival Analysis Theory.ipynb.
Converted data.ipynb.
Converted index.ipynb.
Converted utils.ipynb.
