In [3]:
import numpy as np
import pandas as pd
import os
import csv
import datetime
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf
from sklearn.ensemble import RandomForestRegressor
import shap
from scipy.stats import mstats
from scipy.stats.mstats import winsorize
from scipy import stats
from pyampute.ampute import MultivariateAmputation
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from lifelines import CoxPHFitter, WeibullFitter, WeibullAFTFitter
from datetime import datetime, date, timedelta
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from os.path import isfile, join
from sklearn.metrics import mean_absolute_error, roc_auc_score, precision_score, recall_score, accuracy_score
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error
from statsmodels.gam.tests.test_penalized import df_autos
from statsmodels.stats.outliers_influence import variance_inflation_factor
from scipy.stats import norm
from scipy.spatial import distance

from sklearn.preprocessing import OrdinalEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, KFold
from sklearn.model_selection import StratifiedKFold
from time import time
from sksurv.ensemble import RandomSurvivalForest
from sksurv.functions import StepFunction
from sksurv.linear_model import CoxPHSurvivalAnalysis, CoxnetSurvivalAnalysis
from sksurv.metrics import (
    concordance_index_censored,
    concordance_index_ipcw,
    cumulative_dynamic_auc,
    integrated_brier_score)
from sksurv.metrics import brier_score
from sksurv.nonparametric import kaplan_meier_estimator
from sksurv.preprocessing import OneHotEncoder, encode_categorical
from sksurv.util import Surv
from lifelines.utils import concordance_index
from lifelines import KaplanMeierFitter
from data import load_dataset 
from sklearn.metrics import make_scorer
from sksurv.metrics import concordance_index_censored
from sklearn.pipeline import Pipeline
from sklearn.isotonic import IsotonicRegression

import warnings 
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides pandas / scikit-learn warnings raised by the modelling
# code further down — confirm that a blanket filter is really intended.
warnings.filterwarnings('ignore')

# Project locations. Every path is derived from a single drive letter.
# NOTE(review): absolute, machine-specific paths — the notebook only runs where this
# drive is mapped; consider making the root configurable.
drive = 'H'
main_path = drive + ':/Shared drives/CKD_Progression/data/CKD_COHORT_Jan2010_Mar2024_v3.csv'
data_path = drive + ':/Shared drives/CKD_Progression/data/'
docs_path = drive + ':/Shared drives/CKD_Progression/docs/'
save_path = drive + ':/Shared drives/CKD_Progression/save/'
resu_path = drive + ':/Shared drives/CKD_Progression/result/'
covariates_path = docs_path + 'covariates.csv'
removecols_path = docs_path + 'remove_columns.csv'

# `load_dataset` is imported from the local `data` module and returns three objects here.
# `long_df` is the frame the modelling cell filters by its 'pathway' column.
covariates, order_covariates, long_df = load_dataset(get_columns = True)

def concordance_scorer(y_true, y_pred):
    """Concordance index of ``y_pred`` against observed survival outcomes.

    NOTE: a second ``concordance_scorer`` taking ``(estimator, X, y)`` is defined
    right after this one in the same cell and replaces this binding, so this
    metric-style variant is only reachable if that later definition is renamed.

    Parameters
    ----------
    y_true : mapping or structured array
        Must expose a ``'status'`` field (event indicator) and a ``'time'`` field,
        the same field names every other helper in this file reads.
    y_pred : array-like
        Predicted values handed to ``concordance_index_censored`` as its estimate.

    Returns
    -------
    The first element of the tuple returned by ``concordance_index_censored``.
    """
    # The event indicator is stored under 'status' throughout this file
    # (see Surv.from_dataframe('status', 'time', ...)); 'event' does not exist.
    events = y_true['status']
    times  = y_true['time']
    return concordance_index_censored(events, times, y_pred)[0]

def concordance_scorer(estimator, X, y):
    """Score a fitted estimator by concordance index.

    Calls ``estimator.predict(X)`` and compares the result with the ``'status'``
    and ``'time'`` fields of ``y`` via ``concordance_index_censored``; only the
    first element of that function's result is returned.
    """
    predicted = estimator.predict(X)
    c_index, *_ = concordance_index_censored(y['status'], y['time'], predicted)
    return c_index

def brier_score_scorer(estimator, X, y, time_point = 1):
    """Brier score of a fitted survival estimator at a single time point.

    The previous body passed ``y['status']`` / ``y['time']`` and a ``time_point``
    keyword to ``brier_score``. That does not match how the function is called
    elsewhere in this notebook (``brier_score(train, test, estimate, times)``
    returning a ``(times, scores)`` pair), and it handed over the raw step
    functions instead of survival probabilities.

    Parameters
    ----------
    estimator : fitted model exposing ``predict_survival_function(X)``.
    X : feature matrix forwarded to the estimator.
    y : structured survival array for ``X``. It is used both as the reference
        sample and as the evaluation sample, because a scorer only receives one ``y``.
    time_point : time at which each predicted survival function is evaluated
        (defaults to 1, the value the original call used).

    Returns
    -------
    The Brier score at ``time_point`` (lower is better).
    """
    surv_funcs = estimator.predict_survival_function(X)
    # One survival probability per sample at the requested time.
    estimate = np.asarray([fn(time_point) for fn in surv_funcs])
    _, scores = brier_score(y, y, estimate, [time_point])
    return scores[0]

def calculate_vif(X):
    """Variance inflation factor for every column of ``X``.

    Returns a two-column DataFrame: ``'variable'`` holds the column labels of
    ``X`` and ``'VIF'`` the value ``variance_inflation_factor`` reports for the
    column at the same position.
    """
    n_columns = len(X.columns)
    factors = [variance_inflation_factor(X.values, position) for position in range(n_columns)]
    return pd.DataFrame({'variable': X.columns, 'VIF': factors})

def calculate_nll_loss(model, X, y_true, eps = 1e-7):
    """Average negative log-likelihood of the observed outcomes.

    For each sample the predicted survival function is evaluated at the sample's
    own time. An event contributes ``-log(1 - S(t))`` and a censored sample
    contributes ``-log(S(t))``.

    Parameters
    ----------
    model : fitted model exposing ``predict_survival_function(X)``.
    X : feature matrix forwarded to the model.
    y_true : mapping or structured array with ``'time'`` and ``'status'`` fields,
        indexable by sample position.
    eps : probabilities are clipped to ``[eps, 1 - eps]`` before the logarithm.
        Without this a survival probability of exactly 0 or 1 makes the whole
        loss infinite.

    Returns
    -------
    The summed loss divided by the number of predicted survival functions.
    Samples with a non-positive time add nothing to the sum but still count
    in the denominator.
    """
    survival_funcs = model.predict_survival_function(X)
    total = 0.0
    for i, fn in enumerate(survival_funcs):
        time_at_event = y_true['time'][i]
        if time_at_event <= 0:
            continue
        # Keep the probability strictly inside (0, 1) so np.log stays finite.
        prob_of_survival = np.clip(fn(time_at_event), eps, 1 - eps)
        if y_true['status'][i]:
            total -= np.log(1 - prob_of_survival)
        else:
            total -= np.log(prob_of_survival)
    return total / len(survival_funcs)

def calculate_median_survival_time(surv_func):
    """Return the first ``surv_func.x`` whose paired ``surv_func.y`` is <= 0.5.

    ``x`` and ``y`` are walked in parallel. When no value of ``y`` reaches 0.5
    or below, the last entry of ``x`` is returned instead.
    """
    crossing = next(
        (t for t, prob in zip(surv_func.x, surv_func.y) if prob <= 0.5),
        None,
    )
    if crossing is None:
        return surv_func.x[-1]
    return crossing

def calculate_cindex(model, X, y_true):
    """Concordance between observed outcomes and predicted median survival times.

    Each predicted survival function is reduced to one number with
    ``calculate_median_survival_time``; those numbers are passed to
    ``concordance_index_censored`` together with the ``'status'`` field of
    ``y_true`` (cast to bool) and its ``'time'`` field. The first element of the
    result is returned.

    NOTE(review): ``concordance_index_censored`` is given survival *times* as its
    estimate here, whereas the modelling cell passes it the forest's risk score —
    presumably the two run in opposite directions, which would make this value
    1 - C. ``permutation_importance_rsf`` reports ``permuted - baseline`` and may
    rely on that; confirm before changing either side.
    """
    step_functions = model.predict_survival_function(X)
    observed_times = y_true['time']
    observed_events = y_true['status']
    median_times = np.array([calculate_median_survival_time(step_fn) for step_fn in step_functions])
    result = concordance_index_censored(observed_events.astype(bool), observed_times, median_times)
    return result[0]

def permutation_importance_rsf(model, X, y, metric_func, n_repeats = 2, random_state = None):
    """Permutation importance of every column of ``X`` under ``metric_func``.

    Parameters
    ----------
    model, y : passed through unchanged to ``metric_func``.
    X : two-dimensional array indexed as ``X[:, column]``; it is copied, never modified.
    metric_func : callable invoked as ``metric_func(model, X, y)``.
    n_repeats : number of shuffles per column.
    random_state : seed for the ``numpy.random.RandomState`` that draws the shuffles.

    Returns
    -------
    ``(mean, std)`` over the repeats of ``permuted_metric - baseline_metric``,
    one entry per column. A positive mean therefore means the metric went *up*
    when the column was shuffled.
    """
    rng = np.random.RandomState(random_state)
    reference = metric_func(model, X, y)
    n_features = X.shape[1]
    deltas = np.zeros((n_repeats, n_features))

    for column in tqdm(range(n_features)):
        # Fresh copy per column so only this column differs from the original data.
        shuffled = X.copy()
        for repeat in range(n_repeats):
            shuffled[:, column] = rng.permutation(X[:, column])
            deltas[repeat, column] = metric_func(model, shuffled, y) - reference

    return deltas.mean(axis = 0), deltas.std(axis = 0)

# `concordance_scorer` and `brier_score_scorer` as defined earlier in this cell already
# take (estimator, X, y), which is the callable form scikit-learn accepts directly for
# `scoring=`. `make_scorer` is meant for plain metrics called as metric(y_true, y_pred);
# wrapping these helpers with it produced scorers that call them with two arguments and
# raise TypeError on first use. The concordance helper is therefore left as it is
# (a higher concordance index is already better).
#
# The Brier score is an error measure (lower is better), so the name used for model
# selection is rebound to its negation instead of being flagged greater_is_better.
_raw_brier_score_scorer = brier_score_scorer

def brier_score_scorer(estimator, X, y):
    """Negated Brier score, so that larger values mean a better model."""
    return -_raw_brier_score_scorer(estimator, X, y)

In [13]:
# Results log for the random-survival-forest runs. The header row is written only
# when the file does not exist yet; every run appends one row per pathway.
log_file = resu_path + 'modeling/randomsurvivalforest/rsf_results_log_significance.csv'
get_variance_inflation, get_interaction = False, False
# Make sure the target folder exists, otherwise open(..., 'w') fails on a fresh checkout.
os.makedirs(os.path.dirname(log_file), exist_ok = True)
if not os.path.exists(log_file):
    with open(log_file, mode = 'w', newline = '') as file:
        writer = csv.writer(file)
        writer.writerow(['pathway', 'best_params', 'fit_predict_time',  'covariates', 'num_covariates',
                         'survival_auc', 'survival_mauc', 'c_index', 'brier'])

# Pathways present in the data, minus the ones that are not modelled here.
# Filtering (instead of list.remove) keeps the original order and does not raise
# when one of the excluded pathways is absent from `long_df`.
excluded_pathways = {'CKD3A_to_DEAD', 'CKD3A_to_CKD4', 'CKD3A_to_CKD5A', 'CKD3A_to_CKD5B',
                     'CKD3B_to_CKD5A', 'CKD3B_to_CKD5B', 'CKD4_to_CKD5B'}
pathways = [p for p in long_df['pathway'].unique().tolist() if p not in excluded_pathways]

scaler = MinMaxScaler()
# Covariates whose VIF is at or above this value are dropped before modelling.
multicollinear_threshold = 10
# Per-pathway modelling loop: pick the covariates flagged in the univariate workbook,
# drop collinear ones by VIF, tune and fit a RandomSurvivalForest on a 70/30 split,
# evaluate on the hold-out part and append the metrics to `log_file`.
for pathway in tqdm(pathways):
    print(f'Processing pathway: {pathway}')
    # Only a CKD source with a CVD (and not CKD) target uses the 'CKD_to_CVD' sheet;
    # every other pathway falls back to 'CKD_to_CKD'.
    stages = pathway.split('_')
    if 'CKD' in stages[0] and 'CKD' not in stages[2] and 'CVD' in stages[2]:
        sheet_pathway = 'CKD_to_CVD'
    else:
        sheet_pathway = 'CKD_to_CKD'

    # Copy: interaction columns may be added below and must not write into a slice of long_df.
    transition_df = long_df[long_df['pathway'] == pathway].copy()

    univariate  = pd.read_excel(resu_path + 'univariate/univariate_ML.xlsx', sheet_name = pathway)
    interaction = pd.read_excel(resu_path + 'univariate/univariate_interaction.xlsx', sheet_name = sheet_pathway)
    multivariate_covariates = univariate[univariate['included'] == 1]['variable'].tolist()
    multivariate_primary    = univariate[univariate['included'] == 1]['variable'].tolist()

    if get_variance_inflation:
        if get_interaction:
            # Replace each interacting pair of predictors by their product column.
            multivariate_covariates_set = set(multivariate_covariates)
            univariate_predictors = list(set(interaction['predictor1'].unique().tolist() + interaction['predictor2'].unique().tolist()))
            for index, row in interaction.iterrows():
                interaction_name = row['variable']
                predictor1 = row['predictor1']
                predictor2 = row['predictor2']

                transition_df[interaction_name] = transition_df[predictor1] * transition_df[predictor2]
                multivariate_covariates_set.discard(predictor1)
                multivariate_covariates_set.discard(predictor2)
            multivariate_covariates_set.update(interaction['variable'].tolist())
            multivariate_covariates = list(multivariate_covariates_set)

        multivariate_covariates = list(set(multivariate_primary + multivariate_covariates))

    # VIF screening on min-max scaled covariates; anything at or above the threshold is dropped.
    X = transition_df[multivariate_covariates]
    X_scaled = scaler.fit_transform(X)
    X_scaled_df = pd.DataFrame(X_scaled, columns = multivariate_covariates)
    vif_result = calculate_vif(X_scaled_df)
    vif_result.to_csv(resu_path + f'modeling/randomsurvivalforest/variance_inflation/{pathway}_TODEL.csv')
    multicollinear = vif_result[vif_result['VIF'] >= multicollinear_threshold]['variable'].tolist()
    multivariate_covariates = vif_result[vif_result['VIF'] < multicollinear_threshold]['variable'].tolist()
    print(f'There are {len(multicollinear)} covariates with high multicollinearity.')

    independent_df = transition_df[multivariate_covariates]
    # Copy before assigning columns so the writes land in this frame, not in a slice.
    groundtruth_df = transition_df[['status', 'time']].copy()
    groundtruth_df['status'] = groundtruth_df['status'].astype(bool)
    groundtruth_df['time'] = pd.to_numeric(groundtruth_df['time'])

    # Inverse-frequency weights per class. Counting explicitly avoids value_counts()[0] / [1],
    # which on a boolean index is positional (most frequent class first) and can swap the
    # two weights; it also fails when only one class is present.
    n_samples  = len(groundtruth_df)
    n_events   = int(groundtruth_df['status'].sum())
    n_censored = n_samples - n_events
    censored_weight = n_samples / n_censored if n_censored else 0.0
    event_weight    = n_samples / n_events   if n_events   else 0.0

    groundtruth_df['sample_weight'] = np.where(groundtruth_df['status'], event_weight, censored_weight)

    X, y = independent_df, Surv.from_dataframe('status', 'time', groundtruth_df)
    X_train, X_test, y_train, y_test, sample_weight_train, sample_weight_test = train_test_split(X, y, groundtruth_df['sample_weight'].to_numpy(), test_size = 0.30, random_state = 42)
    # Fit the scaler on the training part only and apply the same transform to the test part.
    # Previously only X_train was scaled, so the forest was evaluated on unscaled features.
    # Both stay DataFrames because the importance step below needs X_test.values / .columns.
    X_train = pd.DataFrame(scaler.fit_transform(X_train), columns = X_train.columns, index = X_train.index)
    X_test  = pd.DataFrame(scaler.transform(X_test),      columns = X_test.columns,  index = X_test.index)

    param_grid = {
        'n_estimators': [100, 300, 500],
        'max_depth': [3, 5, None],
        'min_samples_split': [2, 3],
        'min_samples_leaf': [1, 2],
        'max_features': ['sqrt', 0.5, 0.75],
        'bootstrap': [True]}

    start = time()
    cv = StratifiedKFold(n_splits = 5, shuffle = True,  random_state = 42)
    # Stratify the folds on the event flag; the structured (status, time) array is not a class label.
    cv_splits = list(cv.split(X_train, y_train['status']))
    random_forest = RandomSurvivalForest(n_jobs = -1,   random_state = 42)
    # NOTE(review): `pipeline` and `ipcw_weights` are built but not used anywhere below in this cell.
    pipeline = Pipeline([('scaler', scaler), ('rsf', random_forest)])

    # Kaplan-Meier fit with the event indicator flipped (1 - status);
    # `ipcw_weights` is the reciprocal of that curve at each training time.
    kmf = KaplanMeierFitter()
    kmf.fit(y_train['time'], event_observed = 1 - y_train['status'])
    ipcw_weights = 1 / kmf.survival_function_at_times(y_train['time']).values.flatten()

    # scoring = None uses the estimator's own score method (concordance index for the forest).
    # 'neg_log_loss' needs predict_proba, which the survival forest does not provide, so every
    # candidate would have been scored as NaN. random_state makes the sampled candidates repeatable.
    survival_forest = RandomizedSearchCV(random_forest,
                                         param_grid,
                                         cv = cv_splits,
                                         n_iter = 2,
                                         scoring = None,
                                         random_state = 42)

    survival_forest.fit(X_train, y_train, sample_weight = sample_weight_train)
    best_survival = survival_forest.best_estimator_
    fit_predict_time = np.round(time() - start, 3)

    # Hold-out evaluation: time-dependent AUC on a 12-unit grid, C-index on the risk score,
    # and the Brier score of the survival probability at `period_max`.
    period_max = 100
    times = np.arange(y_test['time'].min(), period_max, 12)
    risk_score = best_survival.predict(X_test)
    surv_preds = best_survival.predict_survival_function(X_test)
    prediction = [fn(period_max) for fn in surv_preds]

    survival_auc, survival_mauc = cumulative_dynamic_auc(y_train, y_test,  risk_score, times)
    c_index = concordance_index_censored(y_test['status'], y_test['time'], risk_score)[0]
    _, brier_score_val = brier_score(y_train, y_test, prediction, period_max)

    with open(log_file, mode = 'a', newline = '') as file:
        writer = csv.writer(file)
        writer.writerow([pathway,
                         best_survival.get_params(),
                         fit_predict_time,
                         multivariate_covariates,
                         len(multivariate_covariates),
                         survival_auc,
                         survival_mauc,
                         c_index,
                         brier_score_val[0]])
    print(f'\tFit and predict time of RSF at transition {pathway}: {fit_predict_time} seconds with {len(multivariate_covariates)} covariates.')
    print(f'\tC-index: {np.round(c_index, 2)} \tBrier score: {np.round(brier_score_val[0], 2)}')
    # NOTE(review): this `break` stops after the first pathway and leaves the permutation
    # importance below unreachable — looks like a debugging leftover; confirm before removing.
    break
    importances_mean, importances_std = permutation_importance_rsf(best_survival,
                                                                    X_test.values, y_test,
                                                                    calculate_cindex,
                                                                    n_repeats = 3,
                                                                    random_state = 42)
    feature_importance_df = pd.DataFrame({'feature': X_test.columns, 'importance_mean': importances_mean, 'importance_std' : importances_std})
    feature_importance_df = feature_importance_df.sort_values(by = 'importance_mean', ascending = False)
    feature_importance_df.to_csv(resu_path + f'modeling/randomsurvivalforest/feature_importance/{pathway}_TODEL.csv', index = False)