In [4]:
import pandas as pd
import numpy as np
from sksurv.ensemble import RandomSurvivalForest
from sksurv.util import Surv
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

# Assuming your data is in a DataFrame called 'df'
# Replace this with your actual data loading step
# For example: df = pd.read_csv('your_data.csv')
# For this example, I'll simulate loading your document structure

df = pd.read_csv('./data/equity-post-HCT-survival-predictions/train.csv')
# Step 1: Prepare the survival outcome (structured array for scikit-survival)
y = Surv.from_arrays(event=df['efs'], time=df['efs_time'])

# Step 2: Select features (excluding target variables 'efs' and 'efs_time')
feature_cols = [col for col in df.columns if col not in ['ID', 'efs', 'efs_time']]
X = df[feature_cols]

# Step 3: Preprocess the data
# Separate categorical and numerical columns
categorical_cols = X.select_dtypes(include=['object']).columns
numerical_cols = X.select_dtypes(include=['float64', 'int64']).columns

# Impute missing values
# For categorical: use most frequent value
cat_imputer = SimpleImputer(strategy='most_frequent')
X_cat = pd.DataFrame(cat_imputer.fit_transform(X[categorical_cols]), 
                     columns=categorical_cols)

# For numerical: use median (if any numerical features are added)
num_imputer = SimpleImputer(strategy='median')
X_num = pd.DataFrame(num_imputer.fit_transform(X[numerical_cols]), 
                     columns=numerical_cols) if not numerical_cols.empty else pd.DataFrame()

# Apply one-hot encoding to categorical variables
X_cat_encoded = pd.get_dummies(X_cat, columns=categorical_cols, drop_first=True)

# Combine processed categorical and numerical data
X_processed = pd.concat([X_cat_encoded, X_num], axis=1) if not X_num.empty else X_cat_encoded

# Step 4: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42)

# Step 5: Train the Random Survival Forest
rsf = RandomSurvivalForest(n_estimators=100,  # Number of trees
                           min_samples_split=10,
                           min_samples_leaf=15,
                           max_depth=None,    # Allow trees to grow fully if desired
                           random_state=42)
rsf.fit(X_train, y_train)

# Step 6: Evaluate the model (Concordance Index)
c_index = rsf.score(X_test, y_test)
print(f"Concordance Index on test set: {c_index:.3f}")

# Optional: Predict survival function for test data
surv_funcs = rsf.predict_survival_function(X_test)
for i, surv_func in enumerate(surv_funcs[:5]):  # Show first 5 for brevity
    print(f"Patient {i+1} survival probabilities at time points:")
    print(surv_func(rsf.event_times_[:5]))  # First 5 time points

Concordance Index on test set: 0.662
Patient 1 survival probabilities at time points:


AttributeError: 'RandomSurvivalForest' object has no attribute 'event_times_'