In [None]:
import os
import nbimporter

# Re-anchor the working directory at the "survival_analysis" project folder so
# the relative "data_and_preprocessing/..." paths used further down resolve the
# same way regardless of which sub-folder the kernel was started in.
# `root` is everything in the current path that precedes the folder name.
root = os.getcwd().partition("survival_analysis")[0]
os.chdir(f"{root}survival_analysis")

In [None]:
import pickle
import warnings
import numpy as np
import pandas as pd
import lifelines.datasets as ds
from lifelines import KaplanMeierFitter
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_california_housing

# Basic dataset functionality

In [None]:
class DataNormalization:
    """Normalize the feature columns of a train/valid/test split.

    Every column other than ``duration`` and ``event_observed`` is treated as
    a feature. Statistics are computed on the training frame only and are then
    applied to all three frames. The frames handed to the constructor are never
    modified: every normalization works on copies.
    """

    def __init__(self, df_train, df_valid, df_test):
        """Store the three frames after checking they carry the survival columns.

        Raises:
            AssertionError: if any frame lacks ``duration`` or ``event_observed``.
        """
        self.df_train = df_train
        self.df_valid = df_valid
        self.df_test  = df_test
        self.pre_normalization_checks(df_train, df_valid, df_test)

    def pre_normalization_check(self, df):
        """Assert that ``df`` has the ``duration`` and ``event_observed`` columns."""
        assert "duration" in df.columns, df.columns
        assert "event_observed" in df.columns, df.columns

    def pre_normalization_checks(self, df_train, df_valid, df_test):
        """Run ``pre_normalization_check`` on each of the three frames."""
        self.pre_normalization_check(df_train)
        self.pre_normalization_check(df_valid)
        self.pre_normalization_check(df_test)

    def feature_names(self):
        """Return the training columns minus ``duration`` and ``event_observed``."""
        names = list(self.df_train.columns)
        names.remove("duration")
        names.remove("event_observed")
        return names

    def normalization_none(self):
        """Return untouched copies of the train, valid and test frames."""
        return self.df_train.copy(), self.df_valid.copy(), self.df_test.copy()

    def _normalize_mean_std(self, df, means, stds):
        """Standardize the feature columns of ``df`` in place and return it.

        Args:
            df: frame whose feature columns are overwritten.
            means: per-column means, indexable by feature name.
            stds: per-column standard deviations, indexable by feature name.

        Returns:
            The same ``df`` object, with float feature columns.
        """
        feature_names = self.feature_names()
        if not feature_names:
            # Nothing to normalize; only the survival columns are present.
            return df

        means = np.asarray(means.loc[feature_names], dtype=float)
        stds = np.asarray(stds.loc[feature_names], dtype=float)
        # A feature that is constant in the training set has std == 0, which
        # would turn the column into NaN/inf. Dividing by 1 instead keeps the
        # centred value (0 wherever the column equals its training mean).
        stds = np.where(stds == 0, 1.0, stds)

        data = df[feature_names].to_numpy(dtype=float)
        # Replace the columns rather than writing through ``.loc``: writing
        # floats into integer columns via ``.loc`` relies on an implicit
        # upcast that pandas has deprecated.
        df[feature_names] = (data - means) / stds
        return df

    def normalize_mean_std(self):
        """Return copies of the three frames standardized with training statistics.

        The mean and the (sample) standard deviation are taken from the
        training features only; ``duration`` and ``event_observed`` are left
        as they are.
        """
        df_train, df_valid, df_test = self.normalization_none()

        train_features = df_train[self.feature_names()].astype(float)
        means = train_features.mean()
        stds  = train_features.std()

        df_train = self._normalize_mean_std(df_train, means, stds)
        df_valid = self._normalize_mean_std(df_valid, means, stds)
        df_test  = self._normalize_mean_std(df_test,  means, stds)
        return df_train, df_valid, df_test

    def __call__(self, normalization):
        """Dispatch on the normalization name.

        Args:
            normalization: ``None`` for plain copies, ``"mean_std"`` for
                standardization.

        Returns:
            Tuple ``(df_train, df_valid, df_test)`` of new frames.

        Raises:
            ValueError: for any other value of ``normalization``.
        """
        if normalization is None:
            return self.normalization_none()

        if normalization == "mean_std":
            return self.normalize_mean_std()

        raise ValueError(f"Bad argument: {normalization=}")

In [None]:
class KaplanMeierDatasetAdjuster:
    """Oversample not-observed ("alive") rows to follow a Kaplan-Meier estimate.

    A Kaplan-Meier curve is fitted once on the un-normalized, un-truncated
    training frame produced by ``df_generator``. For a given horizon the
    adjuster compares the curve's survival value with the share of alive rows
    in the horizon-truncated training frame and appends resampled alive rows
    to the frame it is called with.
    """

    def __init__(self, df_generator):
        """Keep the generator and fit the Kaplan-Meier curve on its training data."""
        self.df_generator = df_generator
        self.kmf = self.get_KaplanMeier(df_generator)

    def get_KaplanMeier(self, df_generator):
        """Fit and return a ``KaplanMeierFitter`` on the raw training frame."""
        train_frame = df_generator(horizon=None, normalization=None)["train"]

        fitter = KaplanMeierFitter()
        fitter.fit(
            durations=train_frame.duration,
            event_observed=train_frame.event_observed,
            label="train",
        )
        return fitter

    def get_kmf_alive_fraction(self, horizon):
        """Return the fitted survival function evaluated at ``horizon`` as a scalar."""
        return self.kmf.survival_function_at_times(horizon).item()

    def get_alive_in_train_dataset_fraction(self, horizon):
        """Return the share of training rows without an observed event at ``horizon``."""
        train_frame = self.df_generator(horizon=horizon, normalization=None)["train"]
        return 1 - train_frame.event_observed.mean()

    def get_increase_factor_for_n_of_alive(self, horizon):
        """Return the factor by which the number of alive rows should grow.

        With ``r`` the Kaplan-Meier survival value and ``a`` the alive share
        of the truncated training frame, the factor is
        ``r / (1 - r) * (1 - a) / a``. When ``r`` is exactly 0 or 1 the
        factor is 1 (and ``a`` is asserted to be 0 or 1 as well).
        """
        kmf_alive_fraction = self.get_kmf_alive_fraction(horizon)
        alive_in_train_dataset_fraction = self.get_alive_in_train_dataset_fraction(horizon)

        if alive_in_train_dataset_fraction == 0:
            warnings.warn(f"No one alive for {horizon=}.")

        curve_is_degenerate = kmf_alive_fraction in (0, 1)
        if curve_is_degenerate:
            assert alive_in_train_dataset_fraction in {0, 1}, f"{kmf_alive_fraction=} but {alive_in_train_dataset_fraction=}"
            return 1

        # Evaluated strictly left to right, exactly like the closed formula.
        kaplan_meier_odds = kmf_alive_fraction / (1 - kmf_alive_fraction)
        return kaplan_meier_odds * (1 - alive_in_train_dataset_fraction) / alive_in_train_dataset_fraction

    def __call__(self, df, horizon):
        """Return a copy of ``df`` with extra alive rows appended.

        Alive rows (``event_observed`` False) are sampled with replacement,
        using a fixed random state, in a fraction of ``factor - 1`` (never
        below 0). If the factor is infinite a warning is issued and the plain
        copy is returned. The result has a fresh 0..n-1 index.

        Raises:
            AssertionError: if the factor is not greater than 0.9.
        """
        df = df.copy()
        increase_factor_for_n_of_alive = self.get_increase_factor_for_n_of_alive(horizon)
        assert 0.9 < increase_factor_for_n_of_alive, f"{increase_factor_for_n_of_alive=}"

        if increase_factor_for_n_of_alive == float("inf"):
            warnings.warn(f"For {horizon=} we have {increase_factor_for_n_of_alive=}!")
            return df

        still_alive = ~df.event_observed
        extra_fraction = np.clip(increase_factor_for_n_of_alive - 1, 0, np.inf)
        extra_rows = (
            df[still_alive]
            .sample(frac=extra_fraction, replace=True, random_state=0)
            .copy()
        )

        new_df = pd.concat([df, extra_rows], ignore_index=True)
        assert len(new_df) == len(df) + len(extra_rows)
        return new_df

In [None]:
class DfsGenerator:
    """Base class that serves train/valid/test survival frames for one dataset.

    Subclasses provide ``name``, ``get_columns_to_drop`` and ``get_basic_dfs``.
    Calling an instance returns a dict with the keys ``"train"``, ``"valid"``
    and ``"test"``; results are pickled to ``CACHE_DIR`` and re-used on later
    calls with the same arguments.
    """

    # Directory (relative to the current working directory) for cached results.
    CACHE_DIR = "data_and_preprocessing/cache"

    def __init__(self, train_size=.5, seed=1, dfs_ssids=None, samples=None):
        """Load the split, drop unwanted columns and validate the column layout.

        All arguments are forwarded unchanged to ``get_basic_dfs``.
        """
        self.df_train, self.df_valid, self.df_test = self.get_basic_dfs(
            train_size=train_size,
            seed=seed,
            dfs_ssids=dfs_ssids,
            samples=samples
        )

        self.df_train, self.df_valid, self.df_test = self.remove_unwanted_columns(self.df_train, self.df_valid, self.df_test)

        self.df_train.event_observed = self.df_train.event_observed.astype(bool)
        self.df_valid.event_observed = self.df_valid.event_observed.astype(bool)
        self.df_test.event_observed  = self.df_test.event_observed.astype(bool)

        self._max_horizon = self._get_max_horizon()

        self.check_that_column_names_are_valid()

    def remove_unwanted_columns(self, df_train, df_valid, df_test):
        """Drop, from each frame, the columns ``get_columns_to_drop`` names for it."""
        df_train = df_train.drop(columns=self.get_columns_to_drop(df_train))
        df_valid = df_valid.drop(columns=self.get_columns_to_drop(df_valid))
        df_test  =  df_test.drop(columns=self.get_columns_to_drop(df_test))
        return df_train, df_valid, df_test

    def _get_max_horizon(self):
        """Return the 0.9 quantile of the training durations."""
        return self.df_train.duration.quantile(.9)

    @property
    def max_horizon(self):
        """Horizon above which ``modify_for_horizon`` warns about scarce data."""
        return self._max_horizon

    def check_that_column_names_are_valid(self, df=None):
        """Assert that the stored frames (and optionally ``df``) share one layout.

        ``df`` may either have exactly the training columns, or the training
        columns without ``duration`` and ``event_observed``.

        Raises:
            AssertionError: on any mismatch.
        """
        self.check_df(self.df_train)
        self.check_df(self.df_valid)
        self.check_df(self.df_test)

        if df is not None:
            if len(self.df_train.columns) == len(df.columns):
                self.check_df(df)
            else:
                assert len(self.df_train.columns) - 2 == len(df.columns), "len(df_train.columns) - 2 != len(df.columns)"

        assert (self.df_train.columns == self.df_valid.columns).all(), "df_train.colums != df_valid.colums"
        assert (self.df_train.columns == self.df_test.columns).all(), "df_train.colums != df_test.colums"

        if df is not None:
            if len(self.df_train.columns) == len(df.columns):
                assert (self.df_train.columns == df.columns).all(), "df_train.colums != df.colums"
            else:
                # Feature-only frame: compare against the training columns
                # with the two survival columns removed.
                columns = list(self.df_train.columns)
                columns.remove("duration")
                columns.remove("event_observed")
                assert (df.columns == columns).all(), "df.columns != columns"

    def check_df(self, df):
        """Assert that ``df`` has the ``duration`` and ``event_observed`` columns."""
        assert "duration" in df.columns, df.columns
        assert "event_observed" in df.columns, df.columns

    @property
    def name(self):
        """Short dataset identifier used in cache file names (abstract)."""
        raise NotImplementedError

    def get_columns_to_drop(self, df=None):
        """Return the column names to drop from ``df`` (abstract).

        ``remove_unwanted_columns`` always passes the frame; the default only
        keeps argument-less calls working.
        """
        raise NotImplementedError

    def get_basic_dfs(self, train_size, seed, dfs_ssids=None, samples=None):
        """Return the raw ``(df_train, df_valid, df_test)`` tuple (abstract)."""
        raise NotImplementedError

    def modify_for_horizon(self, df, horizon, ε=1e-4):
        """Return a copy of ``df`` truncated at ``horizon``.

        Rows that dropped out (no event) at or before the horizon are removed.
        For the rest, ``event_observed`` becomes "event seen at or before the
        horizon" and ``duration`` is clipped to ``[ε, horizon]``. With
        ``horizon=None`` a plain copy is returned.
        """
        df = df.copy()

        if horizon is None:
            return df

        if horizon > self.max_horizon:
            warnings.warn(f"Careful, for {horizon=} little or no data is available. See, {self.max_horizon=}.")

        saw_death    = (df.duration <= horizon) &  df.event_observed
        saw_drop_out = (df.duration <= horizon) & ~df.event_observed
        rows_to_keep = ~saw_drop_out

        df.event_observed = saw_death
        df.duration = np.clip(df.duration, ε, horizon)
        df = df[rows_to_keep]
        return df

    def adjust_alive_death_ratios_based_on_kaplan_maier(self, dfs, horizon):
        """Run a ``KaplanMeierDatasetAdjuster`` over every frame in ``dfs``.

        The dict is updated in place and also returned.
        """
        adjuster = KaplanMeierDatasetAdjuster(self)

        for name, df in dfs.items():
            dfs[name] = adjuster(df, horizon)
        return dfs

    def _get_caching_name(self, horizon, normalization, adjust):
        """Return the cache file name for the given call arguments.

        NOTE(review): the name depends only on ``self.name`` and the three
        arguments, so generators of the same dataset built with a different
        ``train_size`` or ``seed`` share cache files.
        """
        return f"CACHE_{self.name}_{horizon}_{normalization}_{adjust}.pickle"

    def _cache(self, dfs, horizon, normalization, adjust):
        """Pickle ``dfs`` into the cache directory, creating it if needed."""
        name = self._get_caching_name(horizon, normalization, adjust)
        os.makedirs(self.CACHE_DIR, exist_ok=True)
        with open(f"{self.CACHE_DIR}/{name}", 'wb') as cache_file:
            pickle.dump(dfs, cache_file)

    def _load_cached_version(self, horizon, normalization, adjust):
        """Return the cached result for these arguments, or ``None`` on a miss.

        A missing or unreadable cache file counts as a miss.
        """
        name = self._get_caching_name(horizon, normalization, adjust)

        try:
            # NOTE: unpickling executes code from the file; the cache
            # directory must only ever contain files written by ``_cache``.
            with open(f"{self.CACHE_DIR}/{name}", 'rb') as cache_file:
                return pickle.load(cache_file)
        except Exception:
            # Best-effort cache: any failure means "recompute". Unlike a bare
            # ``except`` this lets KeyboardInterrupt/SystemExit propagate.
            return None

    def get_split(self, df, train_size, seed):
        """Split ``df`` into train and two equally sized valid/test parts.

        ``train_size`` is the training fraction; the remainder is halved. The
        same ``seed`` drives both splits.
        """
        test_size = 1 - train_size
        df_train, df_valid_and_test = train_test_split(df, test_size=test_size, random_state=seed)
        df_valid, df_test = train_test_split(df_valid_and_test, test_size=.5, random_state=seed)
        return df_train, df_valid, df_test

    def __call__(self, horizon, normalization="mean_std", adjust=False):
        """Return the ``{"train", "valid", "test"}`` frames for the given setup.

        Args:
            horizon: truncation horizon, or ``None`` for no truncation.
            normalization: passed to ``DataNormalization`` (``None`` or
                ``"mean_std"``).
            adjust: if true, oversample alive rows via
                ``adjust_alive_death_ratios_based_on_kaplan_maier``.

        Returns:
            Dict of DataFrames, taken from the cache when available.
        """
        dfs = self._load_cached_version(horizon, normalization, adjust)
        if dfs is not None:
            return dfs

        data_normalization = DataNormalization(
            self.df_train,
            self.df_valid,
            self.df_test
        )
        df_train, df_valid, df_test = data_normalization(normalization=normalization)

        # Durations are floored at ε before (and again during) truncation.
        ε = 1e-4
        df_train.duration = np.clip(df_train.duration, ε, np.inf)
        df_valid.duration = np.clip(df_valid.duration, ε, np.inf)
        df_test.duration  = np.clip(df_test.duration,  ε, np.inf)

        df_train = self.modify_for_horizon(df=df_train, horizon=horizon, ε=ε)
        df_valid = self.modify_for_horizon(df=df_valid, horizon=horizon, ε=ε)
        df_test  = self.modify_for_horizon(df=df_test,  horizon=horizon, ε=ε)
        dfs = {"train": df_train, "valid": df_valid, "test": df_test}

        if adjust:
            dfs = self.adjust_alive_death_ratios_based_on_kaplan_maier(dfs, horizon)

        # Validate before caching so a malformed result is never persisted
        # and then served from the cache without any check.
        for df in dfs.values():
            self.check_that_column_names_are_valid(df)

        self._cache(dfs, horizon, normalization, adjust)
        return dfs

# Define datasets

In [None]:
class Gbsg2Generator(DfsGenerator):
    """Generator for the GBSG2 dataset loaded through ``lifelines.datasets``."""

    def __init__(self, train_size=.5, seed=122, dfs_ssids=None, samples=None):
        """Forward all arguments to ``DfsGenerator`` (default seed: 122)."""
        super().__init__(
            train_size=train_size,
            seed=seed,
            dfs_ssids=dfs_ssids,
            samples=samples,
        )

    @property
    def name(self):
        """Dataset identifier used in cache file names."""
        return "gbsg2"

    def get_columns_to_drop(self, df_train):
        """No columns are dropped for this dataset."""
        return []

    def get_df_from_lifeline(self):
        """Load GBSG2 and return it as an all-numeric survival frame.

        ``cens``/``time`` become ``event_observed``/``duration``; the yes/no
        and Pre/Post columns become 0/1 floats; the roman tumour grade becomes
        1/2/3; the remaining covariates are cast to float; ``duration`` is
        divided by 100.
        """
        data = ds.load_gbsg2().rename(columns={"cens": "event_observed", "time": "duration"})
        data["event_observed"] = data["event_observed"].astype(bool)

        # Binary string columns -> 1.0 for the named level, 0.0 otherwise.
        for column, positive_level in (("horTh", "yes"), ("menostat", "Post")):
            data[column] = (data[column] == positive_level).astype(float)

        # Roman grade labels -> numbers, substituted one label at a time.
        grade = data["tgrade"]
        for roman, number in (("I", 1), ("II", 2), ("III", 3)):
            grade = grade.replace(roman, number)
        data["tgrade"] = grade.astype(float)

        for column in ("age", "tsize", "pnodes", "progrec", "estrec"):
            data[column] = data[column].astype(float)

        data["duration"] = data["duration"] / 100
        return data

    def get_basic_dfs(self, train_size, seed, dfs_ssids=None, samples=None):
        """Return the seeded train/valid/test split of the prepared frame.

        ``dfs_ssids`` and ``samples`` are accepted but not used here.
        """
        prepared = self.get_df_from_lifeline()
        train_part, valid_part, test_part = self.get_split(df=prepared, train_size=train_size, seed=seed)
        return train_part, valid_part, test_part

In [None]:
class RecurGenerator(DfsGenerator):
    """Generator for the ``recur`` dataset loaded through ``lifelines.datasets``."""

    def __init__(self, train_size=.5, seed=114, dfs_ssids=None, samples=None):
        """Forward all arguments to ``DfsGenerator`` (default seed: 114)."""
        super().__init__(
            train_size=train_size,
            seed=seed,
            dfs_ssids=dfs_ssids,
            samples=samples,
        )

    @property
    def name(self):
        """Dataset identifier used in cache file names."""
        return "recur"

    def get_columns_to_drop(self, df_train):
        """No columns are dropped for this dataset."""
        return []

    def get_df_from_lifeline(self):
        """Load the dataset and return it as a survival frame.

        ``CENSOR``/``TIME1`` become ``event_observed`` (as bool) and
        ``duration`` (divided by 10); the ``ID`` column is removed.
        """
        return (
            ds.load_recur()
            .rename(columns={"CENSOR": "event_observed", "TIME1": "duration"})
            .assign(event_observed=lambda frame: frame["event_observed"].astype(bool))
            .drop(columns=["ID"])
            .assign(duration=lambda frame: frame["duration"] / 10)
        )

    def get_basic_dfs(self, train_size, seed, dfs_ssids=None, samples=None):
        """Return the seeded train/valid/test split of the prepared frame.

        ``dfs_ssids`` and ``samples`` are accepted but not used here.
        """
        prepared = self.get_df_from_lifeline()
        train_part, valid_part, test_part = self.get_split(df=prepared, train_size=train_size, seed=seed)
        return train_part, valid_part, test_part

In [None]:
class LymphGenerator(DfsGenerator):
    """Generator for the lymph-node dataset loaded through ``lifelines.datasets``."""

    def __init__(self, train_size=.5, seed=139, dfs_ssids=None, samples=None):
        """Forward all arguments to ``DfsGenerator`` (default seed: 139)."""
        super().__init__(
            train_size=train_size,
            seed=seed,
            dfs_ssids=dfs_ssids,
            samples=samples,
        )

    @property
    def name(self):
        """Dataset identifier used in cache file names."""
        return "lymph"

    def get_columns_to_drop(self, df_train):
        """No columns are dropped for this dataset."""
        return []

    def get_df_from_lifeline(self):
        """Load the dataset and return it as a survival frame.

        The index is reset into a column, the date/recurrence/id columns are
        removed, and ``censdead``/``survtime`` become ``event_observed`` (as
        bool) and ``duration`` (divided by 100).
        """
        unused_columns = ["rectime", "diagdateb", "recdate", "deathdate", "censrec", "id"]
        return (
            ds.load_lymph_node()
            .reset_index()
            .drop(columns=unused_columns)
            .rename(columns={"censdead": "event_observed", "survtime": "duration"})
            .assign(
                event_observed=lambda frame: frame["event_observed"].astype(bool),
                duration=lambda frame: frame["duration"] / 100,
            )
        )

    def get_basic_dfs(self, train_size, seed, dfs_ssids=None, samples=None):
        """Return the seeded train/valid/test split of the prepared frame.

        ``dfs_ssids`` and ``samples`` are accepted but not used here.
        """
        prepared = self.get_df_from_lifeline()
        train_part, valid_part, test_part = self.get_split(df=prepared, train_size=train_size, seed=seed)
        return train_part, valid_part, test_part

In [None]:
class CaliforniaHousingGenerator(DfsGenerator):
    """Generator that casts the California housing regression data as survival data.

    The regression target is used as ``duration`` and every row is marked as
    an observed event.
    """

    def __init__(self, train_size=.80, seed=881, dfs_ssids=None, samples=None):
        """Forward all arguments to ``DfsGenerator`` (defaults: 80% train, seed 881)."""
        super().__init__(
            train_size=train_size,
            seed=seed,
            dfs_ssids=dfs_ssids,
            samples=samples,
        )

    @property
    def name(self):
        """Dataset identifier used in cache file names."""
        return "california"

    def get_columns_to_drop(self, df_train):
        """No columns are dropped for this dataset."""
        return []

    def _get_california_df(self):
        """Fetch the data and return features plus the two survival columns.

        Feature columns keep the default integer labels of the DataFrame
        built from the feature matrix.
        """
        features, targets = fetch_california_housing(return_X_y=True)
        return pd.DataFrame(features).assign(duration=targets, event_observed=True)

    def get_basic_dfs(self, train_size, seed, dfs_ssids=None, samples=None):
        """Return the seeded train/valid/test split of the prepared frame.

        ``dfs_ssids`` and ``samples`` are accepted but not used here.
        """
        prepared = self._get_california_df()
        train_part, valid_part, test_part = self.get_split(df=prepared, train_size=train_size, seed=seed)
        return train_part, valid_part, test_part

# Create datasets

In [None]:
%%time
# Build the generator first, so a failure while loading the data never leaves
# a truncated pickle behind, then write it through a context manager so the
# file handle is flushed and closed deterministically.
gbsg2_generator = Gbsg2Generator()
with open("data_and_preprocessing/df_generator_gbsg2.pickle", "wb") as generator_file:
    pickle.dump(gbsg2_generator, generator_file)

In [None]:
%%time
# Build the generator first, so a failure while loading the data never leaves
# a truncated pickle behind, then write it through a context manager so the
# file handle is flushed and closed deterministically.
recur_generator = RecurGenerator()
with open("data_and_preprocessing/df_generator_recur.pickle", "wb") as generator_file:
    pickle.dump(recur_generator, generator_file)

In [None]:
%%time
# Build the generator first, so a failure while loading the data never leaves
# a truncated pickle behind, then write it through a context manager so the
# file handle is flushed and closed deterministically.
lymph_generator = LymphGenerator()
with open("data_and_preprocessing/df_generator_lymph.pickle", "wb") as generator_file:
    pickle.dump(lymph_generator, generator_file)

In [None]:
%%time
# Build the generator first, so a failure while fetching the data never leaves
# a truncated pickle behind, then write it through a context manager so the
# file handle is flushed and closed deterministically.
california_generator = CaliforniaHousingGenerator()
with open("data_and_preprocessing/df_generator_california.pickle", "wb") as generator_file:
    pickle.dump(california_generator, generator_file)