In [None]:
# default_exp data

# Data
> Functions used to create PyTorch `Dataset`s and `DataLoader`s.

In [None]:
# export
import torch
from torch.utils.data import Dataset, DataLoader
from fastai.data_block import DataBunch, DatasetType

import numpy as np

In [None]:
# hide
# Automatically reload edited modules before each cell runs (interactive development aid).
%load_ext autoreload
%autoreload 2

In [None]:
# export
def get_breakpoints(df, percentiles:list=[20, 40, 60, 80]):
    """
    Gives the times at which death events occur at given percentile
    parameters:
    df - must contain columns 't' (time) and 'e' (death event)
    percentiles - list of percentages at which breakpoints occur (do not include 0 and 100)

    returns:
    breakpoints - list starting at 0, followed by the requested percentiles of the
                  observed event times, and ending at the largest time in df
    widths - list with the length of each interval between consecutive breakpoints

    raises:
    ValueError - if df contains no observed death event (no row with e == 1)
    """
    event_times = df.loc[df['e']==1, 't'].values
    # Percentiles of an empty sample are undefined; fail with a clear message
    # rather than an opaque numpy error or NaN breakpoints.
    if len(event_times) == 0:
        raise ValueError("No death events (e == 1) found; cannot compute breakpoints")

    inner = np.percentile(event_times, percentiles)
    # Pad with 0 and the maximum time (censored rows included) so the
    # breakpoints span the whole observation window.
    breakpoints = [0] + inner.tolist() + [df['t'].max()]

    widths = np.diff(breakpoints).tolist()
    return breakpoints, widths

In [None]:
# hide
import pandas as pd

# Rossi recidivism dataset from lifelines; expose its columns under the
# 't' (time) / 'e' (event) names used throughout this notebook.
url = "https://raw.githubusercontent.com/CamDavidsonPilon/lifelines/master/lifelines/datasets/rossi.csv"
df = pd.read_csv(url).rename(columns={'week':'t', 'arrest':'e'})

[14.600000000000001, 24.0, 35.0, 43.400000000000006]

In [None]:
# export
class TestData(Dataset):
    """
    Create pyTorch Dataset for samples without an event label
    parameters:
    - t: time elapsed
    - b: (optional) breakpoints where the hazard is different to previous segment of time.
    Either a list or a numpy array.
    **Must include 0 as first element and the maximum time as last element**
    - x: (optional) features

    Each item is a tuple starting with the time tensor, followed by the index of
    the time section (only when breakpoints are given) and the feature tensor
    (only when features are given).
    """
    def __init__(self, t, b:list=None, x=None):
        super().__init__()
        assert b is None or isinstance(b, (list, np.ndarray)), \
            "Breakpoints need to be a list"
        self.t, self.b, self.x = t, b, x
        # Compare against None explicitly: the truth value of a numpy array with
        # more than one element is ambiguous and raises a ValueError.
        if b is not None:
            # keep only the interior breakpoints (drop the leading 0 and the final max time)
            self.b = b[1:-1]

    def __len__(self):
        return len(self.t)

    def __getitem__(self, i):
        time = torch.Tensor([self.t[i]])
        sample = (time,)

        if self.b is not None:
            # index of the time section that t[i] falls into
            t_section = torch.LongTensor([np.searchsorted(self.b, self.t[i])])
            sample = sample + (t_section.squeeze(),)

        if self.x is not None:
            sample = sample + (torch.Tensor(self.x[i]),)

        return sample

In [None]:
# export
class Data(TestData):
    """
    PyTorch Dataset that also carries the event indicator of every sample.
    parameters:
    - t: time elapsed
    - e: (death) event observed. 1 if observed, 0 otherwise.
    - b: (optional) breakpoints where the hazard is different to previous segment of time.
    - x: (optional) features

    Each item is a pair: the input tuple built by the parent class and a
    one-element tensor holding the event indicator.
    """
    def __init__(self, t, e, b=None, x=None):
        super().__init__(t, b, x)
        self.e = e

    def __getitem__(self, i):
        # inputs come from the parent class; the event indicator is the target
        return super().__getitem__(i), torch.Tensor([self.e[i]])

In [None]:
# hide
# Synthetic data shared by the test cells below: N samples with D features,
# event probability p, loaded in batches of size bs.
np.random.seed(42)
N = 100
D = 3
p = 0.1
bs = 64

x = np.random.randn(N, D)
t = np.arange(N)
e = np.random.binomial(1, p, N)

data = Data(t, e, x=x)
batch = next(iter(DataLoader(data, bs)))
# report the length that is actually being checked (the event tensor of the batch)
assert len(batch[-1]) == bs, (f"length of batch {len(batch[-1])} is different "
                              f"to intended batch size {bs}")
[b.shape for b in batch[0]], batch[1].shape

([torch.Size([64, 1]), torch.Size([64, 3])], torch.Size([64, 1]))

In [None]:
# hide
N = 100
D = 3
p = 0.1
bs = 64
# Breakpoints must start at 0 and end at the maximum time: the dataset strips
# the first and last element and keeps only the interior ones (10 and 50 here).
breakpoints = [0, 10, 50, int(t.max())]

data = Data(t, e, breakpoints, x)
batch2 = next(iter(DataLoader(data, bs)))
# report the length that is actually being checked (the event tensor of the batch)
assert len(batch2[-1]) == bs, (f"length of batch {len(batch2[-1])} is different "
                               f"to intended batch size {bs}")
print([b.shape for b in batch2[0]], batch2[1].shape)

assert torch.all(batch[0][0] == batch2[0][0]), ("Discrepancy between batch "
                                                "with breakpoints and without")
# the first batch holds times on both sides of the interior breakpoints,
# so more than one time section must show up
assert batch2[0][1].max() > 0, "Breakpoints did not split the times into sections"

[torch.Size([64, 1]), torch.Size([64]), torch.Size([64, 3])] torch.Size([64, 1])


In [None]:
# export
class TestDataFrame(TestData):
    """
    Wrapper around TestData Class that takes in a dataframe instead
    parameters:
    - df: dataframe. **Must have a t (time) column**. An e (event) column is
      ignored if present; every other column is used as a feature.
    - b: breakpoints of time (optional)
    """
    def __init__(self, df, b=None):
        t = df['t'].values
        # Drop by label rather than via set arithmetic: a set has no defined
        # order, so the feature columns could come out in a different order than
        # in the dataframe. errors='ignore' tolerates a missing 'e' column.
        x = df.drop(columns=['t', 'e'], errors='ignore').values
        if x.shape[1] == 0:
            # no feature columns at all
            x = None
        super().__init__(t, b, x)

In [None]:
# export
class DataFrame(Data):
    """
    Dataframe-based front end for the Data class.
    parameters:
    - df: dataframe. **Must have t (time) and e (event) columns, other cols optional.
    - b: breakpoints of time (optional)

    Every column other than t and e is passed on as a feature.
    """
    def __init__(self, df, b=None):
        features = df.drop(['t', 'e'], axis=1).values
        # a frame holding only t and e has no features to pass on
        has_features = features.shape[1] != 0
        super().__init__(df['t'].values, df['e'].values, b,
                         features if has_features else None)

In [None]:
# hide
# testing with pandas dataframe
import pandas as pd

# Only 't' and 'e' columns: no features, so a sample is ((time,), event).
df = pd.DataFrame({'t': t, 'e': e})
df2 = DataFrame(df)
df2[1]

((tensor([1.]),), tensor([0.]))

In [None]:
# hide
# testing with x
# The extra columns taken from `x` are passed to the dataset as features,
# so a sample becomes ((time, features), event).
new_df = pd.concat([df, pd.DataFrame(x)], axis=1)
df3 = DataFrame(new_df)
df3[1]

((tensor([1.]), tensor([ 1.5230, -0.2342, -0.2341])), tensor([0.]))

In [None]:
# hide
# testing with breakpoints
# `breakpoints` is defined in an earlier test cell; with breakpoints a sample
# becomes ((time, time-section index, features), event).
new_df = pd.concat([df, pd.DataFrame(x)], axis=1)
df3 = DataFrame(new_df, breakpoints)
df3[1]

((tensor([1.]), tensor(0), tensor([ 1.5230, -0.2342, -0.2341])), tensor([0.]))

Create iterable data loaders / a fastai `DataBunch` from the datasets defined above:

In [None]:
# export
def create_db(df, b=None, train_p=0.8, bs=128):
    """
    Take dataframe, split it into a training and a validation part
    and convert to Fastai databunch

    The first `train_p` fraction of the rows (in their current order, no
    shuffling before the split) is used for training, the rest for validation.
    The dataframe passed in is left unmodified.

    parameters:
    - df: pandas dataframe
    - b: breakpoints of time (optional)
    - train_p: training percentage
    - bs: batch size

    returns:
    - fastai DataBunch holding the training and validation dataloaders
    """
    # Re-index a new frame instead of resetting in place, so the caller's
    # dataframe is not mutated as a side effect.
    df = df.reset_index(drop=True)

    train_len = int(train_p*len(df))
    train_ds = DataFrame(df.iloc[:train_len], b)
    val_ds = DataFrame(df.iloc[train_len:], b)

    # only the training batches are shuffled
    train_dl = DataLoader(train_ds, bs, shuffle=True, drop_last=False)
    val_dl = DataLoader(val_ds, bs, drop_last=False)
    db = DataBunch(train_dl, val_dl)

    return db

def create_test_dl(df, b=None, bs=128):
    """
    Take dataframe and return a pytorch dataloader.
    The dataframe passed in is left unmodified and the row order is preserved
    (no shuffling).

    parameters:
    - df: pandas dataframe
    - b: breakpoints of time (optional)
    - bs: batch size

    returns:
    - pytorch DataLoader over the rows of df
    """
    # Re-index a new frame instead of resetting in place, so the caller's
    # dataframe is not mutated as a side effect.
    df = df.reset_index(drop=True)
    test_ds = TestDataFrame(df, b)
    test_dl = DataLoader(test_ds, bs, shuffle=False, drop_last=False)
    return test_dl

In [None]:
# hide
# Import only the exporter that is needed rather than a wildcard import,
# which would spill every public nbdev.export name into the notebook namespace.
from nbdev.export import notebook2script
# Convert the project's notebooks into python modules.
notebook2script()

Converted 00_index.ipynb.
Converted 10_SAT.ipynb.
Converted 20_KaplanMeier.ipynb.
Converted 50_hazard.ipynb.
Converted 55_hazard.PiecewiseHazard.ipynb.
Converted 59_hazard.Cox.ipynb.
Converted 60_AFT_models.ipynb.
Converted 65_AFT_error_distributions.ipynb.
Converted 80_data.ipynb.
Converted 90_model.ipynb.
Converted 95_Losses.ipynb.
