In [24]:
# basic imports
import sys
# NOTE(review): this appends a hard-coded, user-specific absolute path to the
# module search path. It will not exist on other machines; prefer launching
# the kernel from the intended environment instead.
sys.path.append("/home/raygoza/anaconda3/envs/humann/lib/python3.8/site-packages/")
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

# models and metrics
import xgboost as xgb
from sklearn.model_selection import train_test_split
from xgbse.metrics import concordance_index
from xgbse.non_parametric import get_time_bins
import xgbse
from xgbse import (
    XGBSEKaplanNeighbors,
    XGBSEKaplanTree,
    XGBSEDebiasedBCE,
    XGBSEBootstrapEstimator
)
from xgbse.converters import (
    convert_data_to_xgb_format,
    convert_to_structured
)

# better plots
# NOTE(review): `IPython.display.set_matplotlib_formats` is, as far as I know,
# deprecated in recent IPython releases -- confirm against the installed version.
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('retina')
plt.style.use('bmh')

# setting seed
# This seeds NumPy's global random state only.
np.random.seed(42)
# NOTE(review): `tree` is not used in any cell visible here -- confirm it is needed.
from sklearn import tree
from sksurv.datasets import get_x_y
from sksurv.io.arffread import loadarff
# NOTE(review): pandas is already imported as `pd` earlier in this cell; this
# repeat is redundant.
import pandas as pd
# Print the installed xgbse version.
print(xgbse.__version__)

0.2.1


In [25]:
# to easily plot confidence intervals

def plot_ci(mean, upper_ci, lower_ci, i=42, title=r'Probability of survival $P(T \geq t)$'):
    """
    Plot one row of a survival-curve table together with its confidence band.

    Parameters
    ----------
    mean, upper_ci, lower_ci :
        Objects exposing `.columns` and `.iloc` (presumably pandas DataFrames
        with one row per sample and one column per time point -- confirm
        against the caller). `mean.columns` is used as the x axis for all
        three.
    i : int
        Positional row index (passed to `.iloc`) of the sample to plot.
    title : str
        Figure title. The default is a raw string so that the LaTeX `\geq`
        is not parsed as an (invalid) Python escape sequence.

    Returns
    -------
    None. A new matplotlib figure is created and drawn on as a side effect.
    """
    # plotting mean and confidence intervals
    plt.figure(figsize=(12, 4), dpi=120)
    plt.plot(mean.columns, mean.iloc[i])
    plt.fill_between(mean.columns, lower_ci.iloc[i], upper_ci.iloc[i], alpha=0.2)

    plt.title(title)
    plt.xlabel('Time [days]')
    plt.ylabel('Probability')
    plt.tight_layout()

In [26]:
# to write data as markdown for publication

def df_to_markdown(df, float_format='%.2g'):
    """
    Export a pandas.DataFrame to markdown-formatted text.
    DataFrame should not contain any `|` characters.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to render. It is not modified: column labels are stringified
        on a local copy only.
    float_format : str
        Format string forwarded to `DataFrame.to_csv` for floating-point
        values.

    Returns
    -------
    str
        A header row, a `----` separator row and the data rows, with cells
        separated by ` | `. The index is not included.
    """
    from os import linesep

    # Stringify the labels locally instead of assigning back to `df.columns`,
    # so the caller's DataFrame keeps its original column labels.
    header = df.columns.astype(str)
    return linesep.join([
        '|'.join(header),
        '|'.join(4 * '-' for _ in header),
        df.to_csv(sep='|', index=False, header=False, float_format=float_format)
    ]).replace('|', ' | ')

In [27]:
## pre selected params for models ##

# Fixed hyper-parameters handed to xgboost when fitting the AFT model.
PARAMS_XGB_AFT = {
    # objective and its evaluation metric
    'objective': 'survival:aft',
    'eval_metric': 'aft-nloglik',
    # AFT loss distribution settings
    'aft_loss_distribution': 'normal',
    'aft_loss_distribution_scale': 1.0,
    # booster / tree construction
    'tree_method': 'hist',
    'learning_rate': 0.05,
    'max_depth': 5,
    'booster': 'dart',
    # sampling and minimum child weight
    'subsample': 0.5,
    'min_child_weight': 58,
    'colsample_bynode': 0.5,
}

# Neighbour count constant (not referenced in the cells visible here).
N_NEIGHBORS = 50

# Evenly spaced time grid: 15, 30, ..., 300 (presumably days -- confirm).
TIME_BINS = np.arange(start=15, stop=315, step=15)

In [28]:
# Training cohort: read the ARFF file, then separate the features from the
# ('events', 'TimeInStudy') survival outcome, with 'TRUE' marking an event.
gem_train = loadarff('training_rsf_fin.arff')
X_train, y_train = get_x_y(
    gem_train,
    attr_labels=['events', 'TimeInStudy'],
    pos_label='TRUE',
)

# Testing cohort: same treatment; it is used as the validation split below.
gem_test = loadarff('testing_rsf_fin.arff')
X_valid, y_valid = get_x_y(
    gem_test,
    attr_labels=['events', 'TimeInStudy'],
    pos_label='TRUE',
)

# Feature column names, in training-frame order.
feature_names = list(X_train.columns)

# Seed constant (not referenced in the cells visible here).
random_state = 20

#random_state=30

In [29]:
# Build the xgboost-format inputs for the 'survival:aft' objective, one per split.
dtrain, dval = (
    convert_data_to_xgb_format(features, target, 'survival:aft')
    for features, target in ((X_train, y_train), (X_valid, y_valid))
)

# Fit the booster, monitoring the validation split as the only eval set.
bst = xgb.train(
    params=PARAMS_XGB_AFT,
    dtrain=dtrain,
    evals=[(dval, 'val')],
    num_boost_round=100,
    early_stopping_rounds=100,
    verbose_eval=0,
)

# Score the validation split: the predictions are negated and passed to the
# concordance index as a precomputed risk.
preds = bst.predict(dval)
cind = concordance_index(y_valid, -preds, risk_strategy='precomputed')
print(f"C-index: {cind:.3f}")


def pred(df):
    """
    Unfinished stub: builds an xgboost-format matrix and returns an empty tuple.

    NOTE(review): the `df` argument is never read. The body converts the
    module-level `X_valid` / `y_valid` instead, discards the result, and
    returns `()`. Presumably this was meant to convert `df` and return model
    predictions -- confirm the intent before relying on this function.
    """
    # The converted matrix is assigned to `b` but never used.
    b=convert_data_to_xgb_format(X_valid, y_valid, 'survival:aft')
    # `return()` returns an empty tuple, not None.
    return()

C-index: 0.608
