In [1]:
import numpy as np
import os
import pandas as pd
import pysurvival
# import rpy2.robjects as robjects
# import rpy2.robjects.packages as rpackages

from pysurvival.utils.display import correlation_matrix
from pysurvival.models.survival_forest import RandomSurvivalForestModel
from pysurvival.utils.metrics import concordance_index

from sklearn.model_selection import StratifiedKFold



ModuleNotFoundError: No module named 'rpy2'

In [3]:
def train_survival_model(X, t, e, num_trees, max_depth, min_node_size, seed):
    """
    Build a Random Survival Forest and fit it to the supplied survival data.

    Args:
        X: array -- feature matrix, one sample per row
        t: array -- for each row of X, the time at which the event of interest or censoring occurred
        e: array -- for each row of X, the event indicator (1=event, 0=censoring)
        num_trees: int -- number of trees in the forest, passed to the model constructor
        max_depth: int -- maximum number of levels allowed in a tree, forwarded to fit
        min_node_size: int -- minimum number of samples required at a leaf node, forwarded to fit
        seed: int -- random seed forwarded to fit

    Returns:
        forest: RandomSurvivalForestModel -- the model after being fit to (X, t, e)
    """

    # Fit settings the caller cannot change; they are pinned to these values on every run.
    # NOTE(review): confirm against the pysurvival docs which of these equal the library defaults.
    # TODO: need to find out what importance mode is
    pinned_fit_options = dict(
        max_features='all',
        num_threads=-1,
        sample_size_pct=0.63,
        importance_mode='normalized_permutation',
        save_memory=False,
    )

    # Only the tree count is a constructor argument; everything else goes to fit
    forest = RandomSurvivalForestModel(num_trees=num_trees)
    forest.fit(
        X, t, e,
        max_depth=max_depth,
        min_node_size=min_node_size,
        seed=seed,
        **pinned_fit_options,
    )

    return forest

In [None]:
def gridsearch(X, t, e):
    """
    Placeholder for a grid search over Random Survival Forest hyperparameters.

    Not implemented yet: the arguments are accepted but unused, no model is
    trained, and the function always returns None.

    Args:
        X: array -- input features, one sample per row
        t: array -- time labels for X, when the event of interest or censoring occurred
        e: array -- event labels for X (1=event, 0=censoring)

    Returns:
        None
    """
    # TODO: implement the hyperparameter search; until then this is a no-op.
    return

# Main Script

In [16]:
# Folder holding the feature-selection spreadsheets. Defaults to the original
# local path; set the HDFS_DATA_FOLDER environment variable to run the notebook
# on another machine without editing this cell.
data_folder = os.environ.get(
    "HDFS_DATA_FOLDER",
    "/Users/katyscott/Desktop/HDFS_Project/Data/FeatureSelection/",
)

# Columns removed from the feature matrix: the time/event labels
# (HDFS_Time, HDFS_Code) plus ScoutID and Cancer_Type.
NON_FEATURE_COLUMNS = ["ScoutID", "HDFS_Time", "HDFS_Code", "Cancer_Type"]

train_liver_data = pd.read_excel(os.path.join(data_folder, "train_liver_feats_and_labels.xlsx"))
test_liver_data = pd.read_excel(os.path.join(data_folder, "test_liver_feats_and_labels.xlsx"))

# Training split: features, time labels, event labels
X_liver = train_liver_data.drop(labels=NON_FEATURE_COLUMNS, axis=1)
t_liver = train_liver_data["HDFS_Time"]
e_liver = train_liver_data["HDFS_Code"]

# Test split, same layout as the training split
XT_liver = test_liver_data.drop(labels=NON_FEATURE_COLUMNS, axis=1)
tT_liver = test_liver_data["HDFS_Time"]
eT_liver = test_liver_data["HDFS_Code"]

# Fit on the training split; hyperparameters are passed by keyword so each
# value is self-describing.
liver_rsf = train_survival_model(
    X_liver, t_liver, e_liver,
    num_trees=100, max_depth=10, min_node_size=10, seed=16,
)

# Evaluate on the held-out test split
risk = liver_rsf.predict_risk(XT_liver).tolist()
c_ind = concordance_index(liver_rsf, XT_liver, tT_liver, eT_liver)

# Last expression of the cell so the concordance index is displayed
c_ind

0.5671744712675267

# Other Code

In [None]:
# correlation_matrix(X_liver, figure_size=(30,15), text_fontsize=10)