In [1]:
import numpy as np
import os
import pandas as pd
import pysurvival
import rpy2.robjects as robjects
import rpy2.robjects.packages as rpackages

from pysurvival.utils.display import correlation_matrix
from pysurvival.models.survival_forest import RandomSurvivalForestModel
from pysurvival.utils.metrics import concordance_index

from sklearn.model_selection import StratifiedKFold

In [None]:
# Install the R package survAUC (imported by gh_c_index) only when it is not
# already present in the R library, so re-running this cell -- or running the
# whole notebook from a fresh kernel -- does not hit CRAN every time.
utils = rpackages.importr('utils')
if not rpackages.isinstalled("survAUC"):
    # ind=1 selects the first entry in R's CRAN mirror list
    utils.chooseCRANmirror(ind=1)
    utils.install_packages("survAUC")

In [None]:
def gh_c_index(risk_pred):
    """
    Calculate Gonen and Heller's concordance index using survAUC::GHCI from R (via rpy2).

    Args:
        risk_pred: np.ndarray, torch.Tensor, or other numeric array-like
            (list, pandas Series, ...) -- risk score predictions from model.
            The values are flattened to a 1-D float vector before being passed to R.

    Returns:
        First element of the vector returned by survAUC::GHCI (expected to be
        the single concordance value).

    Raises:
        ValueError: if any prediction is NaN.

    NOTE(review): Gonen & Heller's estimator is defined for proportional hazards
    regression (see source below); confirm that the scores passed in here are a
    valid input for it before interpreting the result.

    Source: Gonen, M. and G. Heller (2005).
    Concordance probability and discriminatory power in proportional hazards regression.
    Biometrika 92, 965–970.
    """

    # Tensor-like inputs (anything exposing .detach()) are detached and copied to
    # host memory first; every other input is converted directly by np.asarray, so
    # plain lists and pandas objects no longer fail on a missing .detach().
    if hasattr(risk_pred, "detach"):
        risk_pred = risk_pred.detach().cpu().numpy()
    risk_pred = np.asarray(risk_pred, dtype=float).ravel()

    # check for NaNs in a single vectorized pass
    if np.isnan(risk_pred).any():
        raise ValueError("NaNs detected in inputs, please correct or drop.")

    # Use Gonen and Heller's c-index via the survAUC library in R
    survAUC = rpackages.importr('survAUC')

    # R numeric vector built from the flattened predictions
    R_risk_pred = robjects.vectors.FloatVector(risk_pred)
    R_cind = survAUC.GHCI(R_risk_pred)

    # The R result is a vector; return its first (only) value
    return list(R_cind)[0]

In [16]:
def train_survival_model(X, t, e, num_trees, max_depth, min_node_size, seed):
    """
    Build a Random Survival Forest with the requested settings and fit it to the data.

    Args:
        X: array -- input features, one sample per row
        t: array -- time label for each row of X (time of event or of censoring)
        e: array -- event label for each row of X (1=event, 0=censoring)
        num_trees: int -- number of trees in the forest, passed to the model constructor
        max_depth: int -- maximum number of levels allowed in a tree, passed to fit
        min_node_size: int -- minimum number of samples required at a leaf node, passed to fit
        seed: int -- random seed passed to fit

    Returns:
        RandomSurvivalForestModel -- the model instance after fit has been called on it
    """

    forest = RandomSurvivalForestModel(num_trees=num_trees)

    # Options forwarded to fit. max_features, num_threads, sample_size_pct and
    # save_memory are fixed here rather than exposed as arguments.
    # importance_mode is not passed, so whatever the library defaults to applies.
    # TODO: need to find out what importance mode is
    fit_options = {
        "max_features": 'all',
        "max_depth": max_depth,
        "min_node_size": min_node_size,
        "num_threads": -1,
        "sample_size_pct": 0.63,
        "seed": seed,
        "save_memory": False,
    }
    forest.fit(X, t, e, **fit_options)

    return forest

In [None]:
def gridsearch(X, t, e):
    """
    Placeholder for a grid search over Random Survival Forest hyperparameters.

    Not implemented yet: the arguments are accepted but unused and no search is run.

    Args:
        X: array -- input features, one sample per row
        t: array -- time label for each row of X (time of event or of censoring)
        e: array -- event label for each row of X (1=event, 0=censoring)

    Returns:
        None
    """
    # TODO: implement the hyperparameter search
    return None

# Main Script

In [19]:
# Folder holding the feature-selection spreadsheets
data_folder = "/Data/FeatureSelection/"

# Load the liver train and test tables
train_liver_data = pd.read_excel(os.path.join(data_folder, "train_liver_feats_and_labels.xlsx"))
test_liver_data = pd.read_excel(os.path.join(data_folder, "test_liver_feats_and_labels.xlsx"))

# Training split: every column except the ID, label and cancer-type columns is a
# feature; HDFS_Time is used as the time label and HDFS_Code as the event label
X_liver = train_liver_data.drop(columns=["ScoutID", "HDFS_Time", "HDFS_Code", "Cancer_Type"])
t_liver = train_liver_data["HDFS_Time"]
e_liver = train_liver_data["HDFS_Code"]

# Test split, built the same way
XT_liver = test_liver_data.drop(columns=["ScoutID", "HDFS_Time", "HDFS_Code", "Cancer_Type"])
tT_liver = test_liver_data["HDFS_Time"]
eT_liver = test_liver_data["HDFS_Code"]

# Fit on the training split, then score the held-out test split
liver_rsf = train_survival_model(
    X_liver, t_liver, e_liver,
    num_trees=10, max_depth=10, min_node_size=10, seed=16,
)
risk = liver_rsf.predict_risk(XT_liver)
h_c_ind = concordance_index(liver_rsf, XT_liver, tT_liver, eT_liver)
gh_c_ind = gh_c_index(risk)

# NOTE(review): the two indices printed below are computed by different routines
# (pysurvival's concordance_index vs. gh_c_index on the predicted risk) -- confirm
# they are comparable before reading them side by side.
print("Harrel's C-index: ", h_c_ind)
print("GH C-index: ", gh_c_ind)

Harrel's C-index:  0.5629333691857613
GH C-index:  0.9954262190633624


In [20]:
# Folder holding the feature-selection spreadsheets
data_folder = "/Data/FeatureSelection/"

# Load the tumor train and test tables
train_tumor_data = pd.read_excel(os.path.join(data_folder, "train_tumor_feats_and_labels.xlsx"))
test_tumor_data = pd.read_excel(os.path.join(data_folder, "test_tumor_feats_and_labels.xlsx"))

# Training split: every column except the ID, label and cancer-type columns is a
# feature; HDFS_Time is used as the time label and HDFS_Code as the event label
X_tumor = train_tumor_data.drop(columns=["ScoutID", "HDFS_Time", "HDFS_Code", "Cancer_Type"])
t_tumor = train_tumor_data["HDFS_Time"]
e_tumor = train_tumor_data["HDFS_Code"]

# Test split, built the same way
XT_tumor = test_tumor_data.drop(columns=["ScoutID", "HDFS_Time", "HDFS_Code", "Cancer_Type"])
tT_tumor = test_tumor_data["HDFS_Time"]
eT_tumor = test_tumor_data["HDFS_Code"]

# Fit on the training split, then score the held-out test split
tumor_rsf = train_survival_model(
    X_tumor, t_tumor, e_tumor,
    num_trees=10, max_depth=10, min_node_size=10, seed=16,
)
risk = tumor_rsf.predict_risk(XT_tumor)
h_c_ind = concordance_index(tumor_rsf, XT_tumor, tT_tumor, eT_tumor)
gh_c_ind = gh_c_index(risk)

# NOTE(review): the two indices printed below are computed by different routines
# (pysurvival's concordance_index vs. gh_c_index on the predicted risk) -- confirm
# they are comparable before reading them side by side.
print("Harrel's C-index: ", h_c_ind)
print("GH C-index: ", gh_c_ind)

Harrel's C-index:  0.5290137975751902
GH C-index:  0.9933289922791773


# Other Code

In [None]:
# correlation_matrix(X_liver, figure_size=(30,15), text_fontsize=10)