In [10]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from lifelines import CoxPHFitter
from lifelines.utils import concordance_index
import numpy as np
from sklearn.metrics import roc_auc_score  # Import for ROC-AUC

In [40]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from lifelines.utils import concordance_index

# Basic read
df = pd.read_csv('./data/equity-post-HCT-survival-predictions/train.csv')

def prepare_data(df, categorical_cols):
    # Create a copy of the dataframe
    data = df.copy()
    
    # Convert efs to integer
    data['efs'] = data['efs'].apply(int)
    
    # # Handle categorical variables by converting to 'category' dtype
    for col in categorical_cols:
        if col in data.columns:
            # Convert to string first (handles mixed types), then to category
            data[col] = data[col].astype(str).replace('nan', 'Missing').astype('category')
    
    # # Handle numerical columns (fill NaN with median)
    numerical_cols = [col for col in data.columns if col not in categorical_cols]
    for col in numerical_cols:
        if col in data.columns:
            data[col] = data[col].fillna(data[col].median())
    
    return data

def split_train_and_evaluate_with_cindex(df, train_size=0.7, val_size=0.15, test_size=0.15,
                                         categorical_cols=[
        'dri_score', 'psych_disturb', 'cyto_score', 'diabetes', 'tbi_status',
        'arrhythmia', 'graft_type', 'vent_hist', 'renal_issue', 'pulm_severe',
        'prim_disease_hct', 'cmv_status', 'tce_imm_match', 'rituximab',
        'prod_type', 'cyto_score_detail', 'conditioning_intensity', 'ethnicity',
        'obesity', 'mrd_hct', 'in_vivo_tcd', 'tce_match', 'hepatic_severe',
        'prior_tumor', 'peptic_ulcer', 'gvhd_proph', 'rheum_issue', 'sex_match',
        'race_group', 'hepatic_mild', 'tce_div_match', 'donor_related',
        'melphalan_dose', 'cardiac', 'pulm_moderate'
    ]):
    assert train_size + val_size + test_size == 1.0, "Split sizes must sum to 1"
    
    # Prepare the data
    print(df['efs'].dtype)  # Should show int64 after apply(int)
    data = prepare_data(df, categorical_cols)
    print(data['efs'].dtype)  # Should show int64
    
    X = data.drop(['efs', 'efs_time'], axis=1)
    y = data['efs']  # Event indicator (0 for Censoring, 1 for Event)
    event_times = data['efs_time']  # Time to event
    
    # First split: Train + (Val + Test)
    X_train, X_temp, y_train, y_temp, t_train, t_temp = train_test_split(
        X, y, event_times,
        test_size=(val_size + test_size),
        random_state=42
    )
    
    # Second split: Validation and Test
    val_proportion = val_size / (val_size + test_size)
    X_val, X_test, y_val, y_test, t_val, t_test = train_test_split(
        X_temp, y_temp, t_temp,
        test_size=(1 - val_proportion),
        random_state=42
    )
    
    # Print sizes
    print(f"Training set size: {len(X_train)} ({len(X_train)/len(X):.2%})")
    print(f"Validation set size: {len(X_val)} ({len(X_val)/len(X):.2%})")
    print(f"Test set size: {len(X_test)} ({len(X_test)/len(X):.2%})")
    
    # Create DMatrix objects with enable_categorical=True
    dtrain = xgb.DMatrix(X_train, label=y_train, enable_categorical=True)
    dval = xgb.DMatrix(X_val, label=y_val, enable_categorical=True)
    dtest = xgb.DMatrix(X_test, label=y_test, enable_categorical=True)
    
    # Define parameters
    params = {
        'objective': 'rank:pairwise',
        'eval_metric': 'auc',
        'max_depth': 6,
        'learning_rate': 0.06,
        'n_estimators': 150,
        'min_child_weight': 4,
        'gamma': 0.1,
        'subsample': 0.7,
        'colsample_bytree': 0.7,
        'reg_alpha': 1.0,
        'reg_lambda': 2.0,
        'random_state': 42
    }
    
    # Train the model with early stopping
    evals = [(dtrain, 'train'), (dval, 'val')]
    xgb_model = xgb.train(
        params,
        dtrain,
        num_boost_round=params['n_estimators'],
        evals=evals,
        early_stopping_rounds=10,
        verbose_eval=False,
        # enable_categorical=True
    )
    
    # Predict probabilities for evaluation
    train_pred_proba = xgb_model.predict(dtrain)
    val_pred_proba = xgb_model.predict(dval)
    test_pred_proba = xgb_model.predict(dtest)
    
    # Calculate accuracy (threshold at 0.5)
    train_score = ((train_pred_proba > 0.5).astype(int) == y_train).mean()
    val_score = ((val_pred_proba > 0.5).astype(int) == y_val).mean()
    test_score = ((test_pred_proba > 0.5).astype(int) == y_test).mean()
    print(f"\nTraining Accuracy: {train_score:.4f}")
    print(f"Validation Accuracy: {val_score:.4f}")
    print(f"Test Accuracy: {test_score:.4f}")
    
    # Calculate C-index for the test set
    c_index = concordance_index(event_times=t_test, 
                               predicted_scores=test_pred_proba, 
                               event_observed=y_test)
    print(f"\nTest Set C-index: {c_index:.4f}")
    
    # Calculate ROC-AUC
    roc_auc = roc_auc_score(y_test, test_pred_proba)
    print(f"Test Set ROC-AUC: {roc_auc:.4f}")
    
    # Feature importance
    importance_dict = xgb_model.get_score(importance_type='weight')
    feature_importance = pd.DataFrame({
        'feature': list(importance_dict.keys()),
        'importance': list(importance_dict.values())
    }).sort_values('importance', ascending=False)

    print("\nTop 10 most important features:")
    print(feature_importance.head(10))
    
    return xgb_model, X_train, X_val, X_test, y_train, y_val, y_test, t_train, t_val, t_test

# Load and evaluate the model
df = pd.read_csv('./data/equity-post-HCT-survival-predictions/train.csv')

# Call the function
model, X_train, X_val, X_test, y_train, y_val, y_test, t_train, t_val, t_test = split_train_and_evaluate_with_cindex(
    df,
    train_size=0.7,
    val_size=0.15,
    test_size=0.15
)

float64
int64
Training set size: 20160 (70.00%)
Validation set size: 4320 (15.00%)
Test set size: 4320 (15.00%)


Parameters: { "n_estimators" } are not used.




Training Accuracy: 0.4605
Validation Accuracy: 0.4683
Test Accuracy: 0.4542

Test Set C-index: 0.4130
Test Set ROC-AUC: 0.6126

Top 10 most important features:
             feature  importance
4         age_at_hct         6.0
2  cyto_score_detail         5.0
1         cyto_score         2.0
3            mrd_hct         2.0
0                 ID         1.0
5    karnofsky_score         1.0
6      tce_div_match         1.0
7     melphalan_dose         1.0


# Bin together dri categories
- dri categories are 
  ['Intermediate' 'High' 'N/A - non-malignant indication' 'N/A - pediatric'
 'High - TED AML case <missing cytogenetics' 'TBD cytogenetics' 'Low'
 'Intermediate - TED AML case <missing cytogenetics'
 'N/A - disease not classifiable' nan 'Very high' 'Missing disease status']

 new categories:
 High - 'High'
 
 Medium - 'Intermediate', 'High - TED AML case <missing cytogenetics', 'Intermediate - TED AML case <missing cytogenetics', 'Low', 'Missing disease status'

Low -  'N/A - disease not classifiable', 'N/A - non-malignant indication', 'N/A - pediatric''TBD cytogenetics', 

In [26]:
# Define the mapping for the bins
dri_bins = {
    'High': ['High'],
    'Medium': ['Intermediate', 'High - TED AML case <missing cytogenetics', 
               'Intermediate - TED AML case <missing cytogenetics', 'Low', 
               'Missing disease status'],
    'Low': ['N/A - disease not classifiable', 'N/A - non-malignant indication', 
            'N/A - pediatric', 'TBD cytogenetics', 'Very high']
}

# Function to map dri_score to new bins
def bin_dri_score(score):
    if pd.isna(score):  # Handle NaN values
        return 'Low'  # Assuming NaN goes to 'Low', adjust if needed
    for bin_name, values in dri_bins.items():
        if score in values:
            return bin_name
    return 'Low'  # Default for any unmapped values (e.g., edge cases)


df['dri_score'] = df['dri_score'].apply(bin_dri_score)

model, X_train, X_val, X_test, y_train, y_val, y_test, t_train, t_val, t_test = split_train_and_evaluate_with_cindex(
    df,
    train_size=0.7,
    val_size=0.15,
    test_size=0.15
)

float64
int64
Training set size: 20160 (70.00%)
Validation set size: 4320 (15.00%)
Test set size: 4320 (15.00%)


Parameters: { "n_estimators" } are not used.




Training Accuracy: 0.4605
Validation Accuracy: 0.4683
Test Accuracy: 0.4542

Test Set C-index: 0.4013
Test Set ROC-AUC: 0.6455

Top 10 most important features:
                  feature  importance
1  conditioning_intensity         3.0
2               donor_age         3.0
3              age_at_hct         3.0
5       comorbidity_score         3.0
4              gvhd_proph         2.0
0              cmv_status         1.0


- drop tce_imm_match



In [27]:
df = pd.read_csv('./data/equity-post-HCT-survival-predictions/train.csv')
df = df.drop(["tce_imm_match"], axis=1)
df['dri_score'] = df['dri_score'].apply(bin_dri_score)

model, X_train, X_val, X_test, y_train, y_val, y_test, t_train, t_val, t_test = split_train_and_evaluate_with_cindex(
    df,
    train_size=0.7,
    val_size=0.15,
    test_size=0.15,
     categorical_cols = [
        'dri_score', 'psych_disturb', 'cyto_score', 'diabetes', 'tbi_status',
        'arrhythmia', 'graft_type', 'vent_hist', 'renal_issue', 'pulm_severe',
        'prim_disease_hct', 'cmv_status', 'rituximab',
        'prod_type', 'cyto_score_detail', 'conditioning_intensity', 'ethnicity',
        'obesity', 'mrd_hct', 'in_vivo_tcd', 'tce_match', 'hepatic_severe',
        'prior_tumor', 'peptic_ulcer', 'gvhd_proph', 'rheum_issue', 'sex_match',
        'race_group', 'hepatic_mild', 'tce_div_match', 'donor_related',
        'melphalan_dose', 'cardiac', 'pulm_moderate'
    ]
)

float64
int64
Training set size: 20160 (70.00%)
Validation set size: 4320 (15.00%)
Test set size: 4320 (15.00%)


Parameters: { "n_estimators" } are not used.




Training Accuracy: 0.4605
Validation Accuracy: 0.4683
Test Accuracy: 0.4542

Test Set C-index: 0.4202
Test Set ROC-AUC: 0.6161

Top 10 most important features:
             feature  importance
4          donor_age         7.0
0         cyto_score         2.0
1   prim_disease_hct         1.0
2  cyto_score_detail         1.0
3           year_hct         1.0
5         age_at_hct         1.0


In [28]:


df = pd.read_csv('./data/equity-post-HCT-survival-predictions/train.csv')

# df = df.drop(["tce_imm_match"], axis=1)
# df['dri_score'] = df['dri_score'].apply(bin_dri_score)

df['has_hodgekins'] = df['prim_disease_hct'].apply(lambda x: 1 if x == 'HD' else 0)
df['has_hemophagocyticImmuneSyndrome'] = df['prim_disease_hct'].apply(lambda x: 1 if x == 'HIS' else 0)

# Drop the original prim_disease_hct column
# df = df.drop('prim_disease_hct', axis=1)

model, X_train, X_val, X_test, y_train, y_val, y_test, t_train, t_val, t_test = split_train_and_evaluate_with_cindex(
    df,
    train_size=0.8,
    val_size=0.08,
    test_size=0.12,
    categorical_cols= [
        'dri_score', 'psych_disturb', 'cyto_score', 'diabetes', 'tbi_status',
        'arrhythmia', 'graft_type', 'vent_hist', 'renal_issue', 'pulm_severe',
        'prim_disease_hct', 'cmv_status', 'tce_imm_match', 'rituximab',
        'prod_type', 'cyto_score_detail', 'conditioning_intensity', 'ethnicity',
        'obesity', 'mrd_hct', 'in_vivo_tcd', 'tce_match', 'hepatic_severe',
        'prior_tumor', 'peptic_ulcer', 'gvhd_proph', 'rheum_issue', 'sex_match',
        'race_group', 'hepatic_mild', 'tce_div_match', 'donor_related',
        'melphalan_dose', 'cardiac', 'pulm_moderate','has_hodgekins','has_hemophagocyticImmuneSyndrome'
    ]
)

float64
int64
Training set size: 23040 (80.00%)
Validation set size: 2303 (8.00%)
Test set size: 3457 (12.00%)


Parameters: { "n_estimators" } are not used.




Training Accuracy: 0.4594
Validation Accuracy: 0.4907
Test Accuracy: 0.4492

Test Set C-index: 0.4164
Test Set ROC-AUC: 0.6322

Top 10 most important features:
                  feature  importance
5               donor_age         5.0
3  conditioning_intensity         4.0
0                      ID         1.0
1              cyto_score         1.0
2           tce_imm_match         1.0
4             in_vivo_tcd         1.0
6              age_at_hct         1.0
7           pulm_moderate         1.0


# drop 'hla_high_res_8', 'hla_low_res_8', 'hla_low_res_6', 'hla_high_res_6', 'hla_match_dqb1_high'

In [29]:



df = pd.read_csv('./data/equity-post-HCT-survival-predictions/train.csv')

# df = df.drop(["tce_imm_match"], axis=1)
# df['dri_score'] = df['dri_score'].apply(bin_dri_score)

df['has_hodgekins'] = df['prim_disease_hct'].apply(lambda x: 1 if x == 'HD' else 0)
df['has_hemophagocyticImmuneSyndrome'] = df['prim_disease_hct'].apply(lambda x: 1 if x == 'HIS' else 0)
df = df.drop(['hla_high_res_8', 'hla_low_res_8', 'hla_low_res_6', 'hla_high_res_6', 'hla_match_dqb1_high'], axis=1)
# Drop the original prim_disease_hct column
df = df.drop('prim_disease_hct', axis=1)

model, X_train, X_val, X_test, y_train, y_val, y_test, t_train, t_val, t_test = split_train_and_evaluate_with_cindex(
    df,
    train_size=0.8,
    val_size=0.08,
    test_size=0.12,
    categorical_cols= [
        'dri_score', 'psych_disturb', 'cyto_score', 'diabetes', 'tbi_status',
        'arrhythmia', 'graft_type', 'vent_hist', 'renal_issue', 'pulm_severe',
        'prim_disease_hct', 'cmv_status', 'tce_imm_match', 'rituximab',
        'prod_type', 'cyto_score_detail', 'conditioning_intensity', 'ethnicity',
        'obesity', 'mrd_hct', 'in_vivo_tcd', 'tce_match', 'hepatic_severe',
        'prior_tumor', 'peptic_ulcer', 'gvhd_proph', 'rheum_issue', 'sex_match',
        'race_group', 'hepatic_mild', 'tce_div_match', 'donor_related',
        'melphalan_dose', 'cardiac', 'pulm_moderate','has_hodgekins','has_hemophagocyticImmuneSyndrome'
    ]
)

float64
int64
Training set size: 23040 (80.00%)
Validation set size: 2303 (8.00%)
Test set size: 3457 (12.00%)


Parameters: { "n_estimators" } are not used.




Training Accuracy: 0.4594
Validation Accuracy: 0.4907
Test Accuracy: 0.4492

Test Set C-index: 0.4231
Test Set ROC-AUC: 0.6143

Top 10 most important features:
                  feature  importance
2               donor_age         5.0
1  conditioning_intensity         4.0
3              age_at_hct         3.0
0                      ID         1.0
4       comorbidity_score         1.0


# convert the specified HLA features (hla_high_res_8, hla_match_a_high, hla_match_b_high, hla_low_res_6) in a DataFrame into categorical features based on their 25th percentile

In [23]:
df = pd.read_csv('./data/equity-post-HCT-survival-predictions/train.csv')


# Assuming df is the DataFrame with the data
# List of HLA features to convert
hla_features = ['hla_high_res_8', 'hla_match_a_high', 'hla_match_b_high', 'hla_low_res_6']

# Function to categorize based on 25th percentile
def categorize_by_percentile(series):
    threshold = series.quantile(0.25)  # Calculate the 25th percentile
    return np.where(series <= threshold, 0, 1)  # 0 if <= threshold, 1 if above

# Convert each HLA feature to categorical (0 or 1)
for feature in hla_features:
    # new_column = f"{feature}_cat"  # Create a new column name for the categorical version
    df[feature] = categorize_by_percentile(df[feature])

# Display the first few rows to verify
# print("First 5 rows with original and categorical features:")
    
    
# df = df.drop(["tce_imm_match"], axis=1)
df['dri_score'] = df['dri_score'].apply(bin_dri_score)

df['has_hodgekins'] = df['prim_disease_hct'].apply(lambda x: 1 if x == 'HD' else 0)
df['has_hemophagocyticImmuneSyndrome'] = df['prim_disease_hct'].apply(lambda x: 1 if x == 'HIS' else 0)
# Drop the original prim_disease_hct column
# df = df.drop('prim_disease_hct', axis=1)

df['pediatric_and_arrhythmia'] = ((df['dri_score'] == 'N/A - pediatric') & (df['arrhythmia'] == 'Yes')).astype(int)

model, X_train, X_val, X_test, y_train, y_val, y_test, t_train, t_val, t_test = split_train_and_evaluate_with_cindex(
    df,
    train_size=0.8,
    val_size=0.08,
    test_size=0.12,
    categorical_cols= [
        'dri_score', 'psych_disturb', 'cyto_score', 'diabetes', 'tbi_status',
        'arrhythmia', 'graft_type', 'vent_hist', 'renal_issue', 'pulm_severe',
        'prim_disease_hct', 'cmv_status', 'tce_imm_match', 'rituximab',
        'prod_type', 'cyto_score_detail', 'conditioning_intensity', 'ethnicity',
        'obesity', 'mrd_hct', 'in_vivo_tcd', 'tce_match', 'hepatic_severe',
        'prior_tumor', 'peptic_ulcer', 'gvhd_proph', 'rheum_issue', 'sex_match',
        'race_group', 'hepatic_mild', 'tce_div_match', 'donor_related',
        'melphalan_dose', 'cardiac', 'pulm_moderate','has_hodgekins','has_hemophagocyticImmuneSyndrome', 'pediatric_and_arrhythmia'
    ]
)

float64
int64
Training set size: 23040 (80.00%)
Validation set size: 2303 (8.00%)
Test set size: 3457 (12.00%)


Parameters: { "n_estimators" } are not used.




Training Accuracy: 0.4594
Validation Accuracy: 0.4907
Test Accuracy: 0.4492

Test Set C-index: 0.3976
Test Set ROC-AUC: 0.6510

Top 10 most important features:
                  feature  importance
3              age_at_hct         3.0
5       comorbidity_score         3.0
2  conditioning_intensity         2.0
0                      ID         1.0
1       cyto_score_detail         1.0
4              gvhd_proph         1.0
