In [10]:
import pandas as pd
from lifelines import CoxPHFitter
from sklearn.model_selection import train_test_split
import numpy as np

### First Nonlinear Dataset

In [11]:
# Load the first simulated dataset; the first two CSV columns become a
# two-level row index.
df = pd.read_csv(
    "data/nonlinear_sim_916.csv",
    index_col=[0, 1],
)

In [12]:
# Drop the columns listed below, then collapse the remaining columns to one
# row per outer-index key by averaging within each level-0 group.
df_sub = df.drop(
    columns=[
        "treated",
        "control",
        "hazard",
        "q",
        "survival_prob",
        "survives",
        "censored",
        "corrected_survival",
        "critical",
        "first_failure",
    ]
)
df_flat = df_sub.groupby(level=0).mean()

In [13]:
# `corrected_survival` grouped by the outer index level; reused for both targets.
survival_by_key = df.groupby(level=0)["corrected_survival"]

# Duration column: sum of `corrected_survival` within each group.
df_flat["total_hours"] = survival_by_key.sum()
# Event column: 1 when the group's minimum `corrected_survival` is exactly 0, else 0.
df_flat["uncensored"] = survival_by_key.min().eq(0).astype(int)

In [None]:
# Penalizer sweep on a single train / validation / test split.
#
# The cell is self-contained: it defines its own seed and its own result
# container, so it runs on a fresh kernel and never appends to the
# `ci_scores` / `maes` lists that the repeated-split evaluation averages.
sweep_seed = 71  # fixed so the split is reproducible

# test_size=0.7 keeps 30% of the rows for training; the held-out 70% is then
# halved into validation and test sets.
# NOTE(review): if a 70% training share was intended, this should be
# train_size=0.7 -- confirm.
train, rest = train_test_split(df_flat, test_size=0.7, random_state=sweep_seed)
val, test = train_test_split(rest, test_size=0.5, random_state=sweep_seed)
# NOTE(review): `val` is not used below; the penalizers are compared on `test`.
# Presumably model selection should use `val` instead -- confirm.

sweep_rows = []
for lam in [0, .01, .1]:
    cph = CoxPHFitter(penalizer=lam)
    cph.fit(train, duration_col="total_hours", event_col="uncensored")
    ci = cph.score(test, scoring_method="concordance_index")

    y_hat = cph.predict_expectation(test)
    residual = test["total_hours"] - y_hat
    # Absolute error summed over rows flagged uncensored.
    a = np.abs(residual[test["uncensored"].astype(bool)]).sum()
    # Positive residuals (under-predictions) summed over every test row.
    # NOTE(review): this term is not restricted to censored rows, so an
    # uncensored under-prediction contributes to both `a` and `b` -- confirm
    # that this double counting is intended.
    b = np.maximum(np.zeros(test.shape[0]), residual).sum()

    sweep_rows.append(
        {
            "penalizer": lam,
            "concordance_index": ci,
            "mae": (a + b) / test.shape[0],
        }
    )

# One row per penalizer value; shown as the cell output.
pd.DataFrame(sweep_rows)

In [14]:
# Repeated hold-out evaluation: for each seed, fit an unpenalized Cox PH model
# on an 80/20 split and record the test concordance index and an absolute-error
# score based on the predicted expectation.
ci_scores, maes = [], []
for seed in range(71, 78):
    train, test = train_test_split(df_flat, test_size=0.2, random_state=seed)

    cph = CoxPHFitter()
    cph.fit(train, duration_col="total_hours", event_col="uncensored")
    ci_scores.append(cph.score(test, scoring_method="concordance_index"))

    y_hat = cph.predict_expectation(test)
    residual = test["total_hours"] - y_hat
    is_uncensored = test["uncensored"].astype(bool)
    n_test = len(test)

    # Absolute error summed over rows flagged uncensored.
    abs_err_uncensored = np.abs(residual[is_uncensored]).sum()
    # Positive residuals (under-predictions) summed over every test row.
    # NOTE(review): this term is not restricted to censored rows, so an
    # uncensored under-prediction is counted in both terms -- confirm intended.
    underprediction = np.maximum(np.zeros(n_test), residual).sum()

    maes.append((abs_err_uncensored + underprediction) / n_test)

In [15]:
# Average of the collected concordance-index scores.
np.asarray(ci_scores).mean()

0.7265341222269136

In [16]:
# Average of the collected absolute-error scores.
np.asarray(maes).mean()

16.477938545676974

### Second Nonlinear Dataset

In [4]:
# Load the second simulated dataset (rebinds `df`); the first two CSV columns
# become a two-level row index.
# NOTE(review): this path is rooted at "../data/" while the first dataset is
# read from "data/" -- confirm both resolve from the same working directory.
df = pd.read_csv(
    "../data/nonlinear_sim2.csv",
    index_col=[0, 1],
)

In [19]:
# Drop the columns listed below, then collapse the remaining columns to one
# row per outer-index key by averaging within each level-0 group.
df_sub = df.drop(
    columns=[
        "treated",
        "control",
        "hazard",
        "q",
        "survival_prob",
        "survives",
        "censored",
        "corrected_survival",
        "critical",
        "first_failure",
    ]
)
df_flat = df_sub.groupby(level=0).mean()

In [20]:
# `corrected_survival` grouped by the outer index level; reused for both targets.
survival_by_key = df.groupby(level=0)["corrected_survival"]

# Duration column: sum of `corrected_survival` within each group.
df_flat["total_hours"] = survival_by_key.sum()
# Event column: 1 when the group's minimum `corrected_survival` is exactly 0, else 0.
df_flat["uncensored"] = survival_by_key.min().eq(0).astype(int)

In [21]:
# Repeated hold-out evaluation: for each seed, fit an unpenalized Cox PH model
# on an 80/20 split and record the test concordance index and an absolute-error
# score based on the predicted expectation.
ci_scores, maes = [], []
for seed in range(71, 78):
    train, test = train_test_split(df_flat, test_size=0.2, random_state=seed)

    cph = CoxPHFitter()
    cph.fit(train, duration_col="total_hours", event_col="uncensored")
    ci_scores.append(cph.score(test, scoring_method="concordance_index"))

    y_hat = cph.predict_expectation(test)
    residual = test["total_hours"] - y_hat
    is_uncensored = test["uncensored"].astype(bool)
    n_test = len(test)

    # Absolute error summed over rows flagged uncensored.
    abs_err_uncensored = np.abs(residual[is_uncensored]).sum()
    # Positive residuals (under-predictions) summed over every test row.
    # NOTE(review): this term is not restricted to censored rows, so an
    # uncensored under-prediction is counted in both terms -- confirm intended.
    underprediction = np.maximum(np.zeros(n_test), residual).sum()

    maes.append((abs_err_uncensored + underprediction) / n_test)

In [22]:
# Average of the collected concordance-index scores.
np.asarray(ci_scores).mean()

0.770620437376633

In [23]:
# Average of the collected absolute-error scores.
np.asarray(maes).mean()

13.581765611710567