In [None]:
# default_exp data

# Data
> Functions used to create PyTorch `Dataset`s and `DataLoader`s.

In [None]:
# export
from typing import Optional, Tuple, Union

import numpy as np
import torch
from fastai.data_block import DataBunch, DatasetType
from pandas import DataFrame
from sklearn.preprocessing import MaxAbsScaler, StandardScaler
from torch.utils.data import DataLoader, Dataset

In [None]:
# hide
# Development convenience: re-import edited modules automatically before each cell runs.
%load_ext autoreload
%autoreload 2

In [None]:
# hide
import pandas as pd

# Example survival data (lifelines' rossi.csv), with the duration and event columns
# renamed to the `t` / `e` names the dataset classes in this notebook read.
url = "https://raw.githubusercontent.com/CamDavidsonPilon/lifelines/master/lifelines/datasets/rossi.csv"
df = pd.read_csv(url).rename(columns={'week':'t', 'arrest':'e'})

In [None]:
import sklearn.preprocessing
# Sanity check that the imported scaler is sklearn's MaxAbsScaler. Uses the public
# `sklearn.preprocessing` path: the private `_data` module is an implementation
# detail that may move between sklearn releases.
isinstance(MaxAbsScaler(), sklearn.preprocessing.MaxAbsScaler)

True

In [None]:
# export
class TestData(Dataset):
    """
    Create pyTorch Dataset (inputs only, no event labels).
    Each item is a tuple `(time[, time_section][, features])`.
    parameters:
    - t: time elapsed
    - b: (optional) breakpoints where the hazard is different to previous segment of time. 
    **Must include 0 as first element and the maximum time as last element**
    - x: (optional) features
    - t_scaler: (optional) fitted scaler applied to `t` and `b`. When omitted a
    new `MaxAbsScaler` is fitted on `t`.
    - x_scaler: (optional) fitted scaler applied to `x`. When omitted (and `x` is
    given) a new `StandardScaler` is fitted on `x`.
    attributes:
    - t_scale: the scaler used for time
    - x_scale: the scaler used for features (`None` when there are no features
    and no `x_scaler` was passed)
    """
    def __init__(self, t:np.array, b:Optional[np.array]=None, x:Optional[np.array]=None, 
                 t_scaler:MaxAbsScaler=None, x_scaler:StandardScaler=None) -> None:
        super().__init__()
        self.t, self.b, self.x = t, b, x
        # Always define x_scale so callers can forward it even for feature-less data.
        self.x_scale = x_scaler
        if len(t.shape) == 1:
            self.t = t[:,None]

        # `is not None` rather than truthiness: an estimator object should never be
        # mistaken for "not provided".
        if t_scaler is not None:
            self.t_scale = t_scaler
            self.t = self.t_scale.transform(self.t)
        else:
            self.t_scale = MaxAbsScaler()
            self.t = self.t_scale.fit_transform(self.t)

        if b is not None:
            self.b = self._scale_breakpoints(b)

        if x is not None:
            if len(x.shape) == 1:
                self.x = x[:,None]
            if x_scaler is not None:
                self.x = self.x_scale.transform(self.x)
            else:
                self.x_scale = StandardScaler()
                self.x = self.x_scale.fit_transform(self.x)

    def _scale_breakpoints(self, b:np.array) -> np.array:
        """Drop the first/last breakpoint and scale the interior ones like `t`; returns a 1-D array."""
        interior = np.asarray(b)[1:-1]
        if interior.size == 0:
            # Only the two end points were given: a single time section, nothing to scale
            # (the scaler would reject an empty array).
            return np.empty(0)
        if len(interior.shape) == 1:
            interior = interior[:,None]
        # reshape(-1) instead of squeeze(): with exactly one interior breakpoint squeeze()
        # yields a 0-d array, which np.searchsorted cannot search.
        return self.t_scale.transform(interior).reshape(-1)

    def __len__(self) -> int:
        return len(self.t)

    def __getitem__(self, i:int) -> Tuple:
        time = torch.Tensor(self.t[i])

        if self.b is None:
            x_ = (time,)
        else:
            # index of the time section (between consecutive breakpoints) containing t[i]
            t_section = torch.LongTensor([np.searchsorted(self.b, self.t[i])])
            x_ = (time, t_section.squeeze())

        if self.x is not None:
            x = torch.Tensor(self.x[i])
            x_ = x_ + (x,)

        return x_

In [None]:
# export
class Data(TestData):
    """
    Labelled pyTorch Dataset: the inputs produced by `TestData` paired with the event flag.
    Each item is `(inputs, event)`.
    parameters:
    - t: time elapsed
    - e: (death) event observed. 1 if observed, 0 otherwise.
    - b: (optional) breakpoints where the hazard is different to previous segment of time.
    - x: (optional) features
    - t_scaler, x_scaler: (optional) scalers forwarded unchanged to `TestData`
    """
    def __init__(self, t:np.array, e:np.array, b:Optional[np.array]=None, x:Optional[np.array]=None,
                t_scaler:MaxAbsScaler=None, x_scaler:StandardScaler=None) -> None:
        super().__init__(t, b, x, t_scaler, x_scaler)
        # keep events as a column so that indexing a row yields a length-1 vector
        self.e = e[:,None] if len(e.shape) == 1 else e

    def __getitem__(self, i) -> Tuple:
        inputs = super().__getitem__(i)
        event = torch.Tensor(self.e[i])
        return inputs, event

In [None]:
# hide
np.random.seed(42)
N = 100
D = 3
p = 0.1
bs = 64

x = np.random.randn(N, D)
t = np.arange(N)
e = np.random.binomial(1, p, N)

data = Data(t, e, x=x)
batch = next(iter(DataLoader(data, bs)))
# batch is (inputs, events): report the length that is actually being checked
assert len(batch[-1]) == bs, (f"length of batch {len(batch[-1])} is different "
                              f"to intended batch size {bs}")
[b.shape for b in batch[0]], batch[1].shape

([torch.Size([64, 1]), torch.Size([64, 3])], torch.Size([64, 1]))

In [None]:
# hide
breakpoints = np.array([0, 10, 50, N-1])

data = Data(t, e, breakpoints, x)
batch2 = next(iter(DataLoader(data, bs)))
# batch2 is (inputs, events): report the length that is actually being checked
assert len(batch2[-1]) == bs, (f"length of batch {len(batch2[-1])} is different "
                               f"to intended batch size {bs}")
print([b.shape for b in batch2[0]], batch2[1].shape)

# adding breakpoints must not change the scaled times themselves
assert torch.all(batch[0][0] == batch2[0][0]), ("Discrepancy between batch "
                                                "with breakpoints and without")

[torch.Size([64, 1]), torch.Size([64]), torch.Size([64, 3])] torch.Size([64, 1])


In [None]:
# export
class TestDataFrame(TestData):
    """
    Wrapper around the TestData class that takes in a dataframe instead
    parameters:
    - df: dataframe. **Must have a t (time) column**; an e (event) column is
    ignored if present, and every other column is used as a feature.
    - b: breakpoints of time (optional)
    - t_scaler: (optional) fitted scaler for time, forwarded to `TestData`
    - x_scaler: (optional) fitted scaler for features, forwarded to `TestData`
    """
    def __init__(self, df:DataFrame, b:Optional[np.array]=None,
                 t_scaler:MaxAbsScaler=None, x_scaler:StandardScaler=None) -> None:
        t = df['t'].values
        # Drop the non-feature columns instead of taking a set difference: a set has no
        # defined order, so the feature columns could come out in a different order
        # from the one the scaler/model saw at training time. errors='ignore' because
        # test data need not carry an 'e' column.
        x = df.drop(columns=['t', 'e'], errors='ignore').values
        if x.shape[1] == 0:
            x = None
        super().__init__(t, b, x, t_scaler, x_scaler)

In [None]:
# export
class DataFrame(Data):
    """
    Build a `Data` dataset straight from a dataframe.
    parameters:
    - df: dataframe. **Must have t (time) and e (event) columns**; all other
    columns (if any) are used as features, in their existing order.
    - b: breakpoints of time (optional)
    - t_scaler, x_scaler: (optional) scalers forwarded unchanged to `Data`
    """
    def __init__(self, df:DataFrame, b:Optional[np.array]=None,
                t_scaler:MaxAbsScaler=None, x_scaler:StandardScaler=None) -> None:
        times, events = df['t'].values, df['e'].values
        features = df.drop(['t', 'e'], axis=1).values
        # a frame holding only t and e has no feature columns -> pass None
        has_features = features.shape[1] != 0
        super().__init__(times, events, b, features if has_features else None, t_scaler, x_scaler)

In [None]:
# hide
# smoke test: dataset built from a pandas frame that holds only `t` and `e`
import pandas as pd

df = pd.DataFrame(dict(t=t, e=e))
events_only_ds = DataFrame(df)
events_only_ds[1]

((tensor([0.0101]),), tensor([0.]))

In [None]:
# hide
# smoke test: feature columns appended next to `t` and `e`
feature_cols = pd.DataFrame(x)
new_df = pd.concat([df, feature_cols], axis=1)
with_features_ds = DataFrame(new_df)
with_features_ds[1]

((tensor([0.0101]), tensor([ 1.7440, -0.0523, -0.2790])), tensor([0.]))

In [None]:
# hide
# smoke test: features plus breakpoints (item gains a time-section index)
feature_cols = pd.DataFrame(x)
new_df = pd.concat([df, feature_cols], axis=1)
with_breakpoints_ds = DataFrame(new_df, breakpoints)
with_breakpoints_ds[1]

((tensor([0.0101]), tensor(0), tensor([ 1.7440, -0.0523, -0.2790])),
 tensor([0.]))

Create iterable data loaders / a fastai `DataBunch` from the datasets above:

In [None]:
# export
def create_db(df, b:Optional[np.array]=None, train_p:float=0.8,
              bs:int=128) -> Union[DataBunch, Tuple[DataBunch, MaxAbsScaler]]:
    """
    Take dataframe, split it into train and validation sets (first `train_p`
    share of the rows is train, the rest validation; rows are not shuffled before
    the split) and convert to Fastai databunch.

    parameters:
    - df: pandas dataframe with `t` and `e` columns; other columns are features.
      Note: its index is reset **in place**.
    - b: breakpoints of time (optional)
    - train_p: training percentage (fraction between 0 and 1)
    - bs: batch size

    returns:
    - the databunch when `b` is None, otherwise a tuple of
      (databunch, time scaler fitted on the training set)
    """
    df.reset_index(drop=True, inplace=True)

    train_len = int(train_p*len(df))

    train_ds = DataFrame(df.iloc[:train_len], b)
    # Validation data reuses the scalers fitted on the training set. The feature scaler
    # attribute is not set when the frame has no feature columns, so fall back to None
    # instead of raising AttributeError.
    x_scaler = getattr(train_ds, 'x_scale', None)
    val_ds = DataFrame(df.iloc[train_len:], b, train_ds.t_scale, x_scaler)

    train_dl = DataLoader(train_ds, bs, shuffle=True, drop_last=False)
    val_dl = DataLoader(val_ds, bs, drop_last=False)
    db = DataBunch(train_dl, val_dl)

    if b is None:
        return db
    else:
        return db, train_ds.t_scale

def create_test_dl(df, t_scaler:MaxAbsScaler, b:Optional[np.array]=None, bs:int=128,
                  x_scaler:StandardScaler=None) -> DataLoader:
    """
    Take dataframe and return a pytorch dataloader (unshuffled, keeps the last
    partial batch).
    parameters:
    - df: pandas dataframe. Note: its index is reset **in place**.
    - t_scaler: fitted time scaler (e.g. the one fitted on the training data)
    - b: breakpoints of time (optional)
    - bs: batch size
    - x_scaler: (optional) fitted feature scaler; when omitted and `df` has feature
      columns, a new scaler is fitted on this dataframe's own features.
    returns:
    - `DataLoader` over a `TestDataFrame` built from `df`
    """
    df.reset_index(drop=True, inplace=True)
    test_ds = TestDataFrame(df, b, t_scaler, x_scaler)
    test_dl = DataLoader(test_ds, bs, shuffle=False, drop_last=False)
    return test_dl

In [None]:
# export
def get_breakpoints(df:DataFrame, percentiles:Union[list, tuple]=(20, 40, 60, 80)) -> np.ndarray:
    """
    Gives the times at which death events occur at given percentile
    parameters:
    df - must contain columns 't' (time) and 'e' (death event)
    percentiles - percentage(s) at which breakpoints occur (do not include 0 and 100);
                  a list/tuple, or a single number
    returns:
    array `[0, <percentile times of observed events>, max(t)]`
    raises:
    ValueError if no row has `e == 1` (percentiles of observed events are undefined)
    """
    event_times = df.loc[df['e']==1, 't'].values
    if len(event_times) == 0:
        raise ValueError("Cannot compute breakpoints: no observed events (no rows with e == 1).")
    # atleast_1d so that a single scalar percentile is handled like a one-element list
    interior = np.atleast_1d(np.percentile(event_times, percentiles))
    breakpoints = np.array([0] + interior.tolist() + [df['t'].max()])

    return breakpoints

In [None]:
# hide
# Export the `# export` cells of the project's notebooks to the library modules.
# Import the one name used instead of `*` so the notebook namespace is not polluted.
from nbdev.export import notebook2script
notebook2script()

Converted 00_index.ipynb.
Converted 10_SAT.ipynb.
Converted 20_KaplanMeier.ipynb.
Converted 50_hazard.ipynb.
Converted 55_hazard.PiecewiseHazard.ipynb.
Converted 59_hazard.Cox.ipynb.
Converted 60_AFT_models.ipynb.
Converted 65_AFT_error_distributions.ipynb.
Converted 80_data.ipynb.
Converted 90_model.ipynb.
Converted 95_Losses.ipynb.
