In [2]:
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from lifelines.utils import concordance_index
import numpy as np

In [5]:
def prepare_data(df):
    data = df.copy()
    categorical_cols = [
        'dri_score', 'psych_disturb', 'cyto_score', 'diabetes', 'tbi_status',
        'arrhythmia', 'graft_type', 'vent_hist', 'renal_issue', 'pulm_severe',
        'prim_disease_hct', 'cmv_status', 'tce_imm_match', 'rituximab',
        'prod_type', 'cyto_score_detail', 'conditioning_intensity', 'ethnicity',
        'obesity', 'mrd_hct', 'in_vivo_tcd', 'tce_match', 'hepatic_severe',
        'prior_tumor', 'peptic_ulcer', 'gvhd_proph', 'rheum_issue', 'sex_match',
        'race_group', 'hepatic_mild', 'tce_div_match', 'donor_related',
        'melphalan_dose', 'cardiac', 'pulm_moderate', 'efs'
    ]
    le = LabelEncoder()
    for col in categorical_cols:
        if col in data.columns:
            data[col] = data[col].astype(str).fillna('Missing')
            data[col] = le.fit_transform(data[col])
    numerical_cols = [col for col in data.columns if col not in categorical_cols]
    for col in numerical_cols:
        if col in data.columns:
            data[col] = data[col].fillna(data[col].median())
    return data

def train_lightgbm(df, train_size=0.7, val_size=0.15, test_size=0.15):
    assert train_size + val_size + test_size == 1.0, "Split sizes must sum to 1"
    
    # Prepare the data
    data = prepare_data(df)
    X = data.drop(['efs', 'efs_time'], axis=1)
    y = data['efs']  # Event indicator
    event_times = data['efs_time']  # Time to event
    
    # Split data
    X_train, X_temp, y_train, y_temp, t_train, t_temp = train_test_split(
        X, y, event_times, test_size=(val_size + test_size), random_state=42
    )
    val_proportion = val_size / (val_size + test_size)
    X_val, X_test, y_val, y_test, t_val, t_test = train_test_split(
        X_temp, y_temp, t_temp, test_size=(1 - val_proportion), random_state=42
    )
    
    print(f"Training set size: {len(X_train)} ({len(X_train)/len(X):.2%})")
    print(f"Validation set size: {len(X_val)} ({len(X_val)/len(X):.2%})")
    print(f"Test set size: {len(X_test)} ({len(X_test)/len(X):.2%})")
    
    # Define LightGBM classifier with parameters to prevent overfitting
    lgb_model = lgb.LGBMClassifier(
        objective='binary',         # Binary classification
        metric='binary_logloss',    # Evaluation metric
        max_depth=4,                # Limit tree depth
        learning_rate=0.05,         # Slow learning rate
        n_estimators=200,           # Number of trees
        min_child_samples=20,       # Minimum samples per leaf (similar to min_child_weight)
        subsample=0.7,              # Fraction of data per tree
        colsample_bytree=0.7,       # Fraction of features per tree
        reg_alpha=1.0,              # L1 regularization
        reg_lambda=2.0,             # L2 regularization
        random_state=42,
        verbose=-1                  # Suppress warnings
    )
    
    # Train with early stopping
    lgb_model.fit(
        X_train, y_train,
        eval_set=[(X_train, y_train), (X_val, y_val)],
        eval_metric='binary_logloss',
        callbacks=[lgb.early_stopping(stopping_rounds=20)],  # Early stopping
    )
    
    # Evaluate
    train_score = lgb_model.score(X_train, y_train)
    val_score = lgb_model.score(X_val, y_val)
    test_score = lgb_model.score(X_test, y_test)
    print(f"\nTraining Accuracy: {train_score:.4f}")
    print(f"Validation Accuracy: {val_score:.4f}")
    print(f"Test Accuracy: {test_score:.4f}")
    
    # Predict probabilities for C-index
    y_test_pred_proba = lgb_model.predict_proba(X_test)[:, 1]
    le = LabelEncoder().fit(df['efs'])  # Re-fit to get original encoding
        # Calculate C-index for the test set using lifelines
    c_index = concordance_index(event_times=t_test, 
                               predicted_scores=y_test_pred_proba, 
                               event_observed=y_test)
    
    print(f"\nTest Set C-index: {c_index:.4f}")
    
    # Feature importance
    feature_importance = pd.DataFrame({
        'feature': X.columns,
        'importance': lgb_model.feature_importances_
    }).sort_values('importance', ascending=False)
    print("\nTop 10 Features:")
    print(feature_importance.head(10))
    
    return lgb_model, X_train, X_val, X_test, y_train, y_val, y_test, t_train, t_val, t_test

# Load and evaluate the model
df = pd.read_csv('./data/equity-post-HCT-survival-predictions/train.csv')

# Train the model
model, X_train, X_val, X_test, y_train, y_val, y_test, t_train, t_val, t_test = train_lightgbm(
    df,
    train_size=0.7,
    val_size=0.15,
    test_size=0.15
)

Training set size: 20160 (70.00%)
Validation set size: 4320 (15.00%)
Test set size: 4320 (15.00%)
Training until validation scores don't improve for 20 rounds
Did not meet early stopping. Best iteration is:
[200]	training's binary_logloss: 0.562588	valid_1's binary_logloss: 0.584943

Training Accuracy: 0.7106
Validation Accuracy: 0.6910
Test Accuracy: 0.6810

Test Set C-index: 0.3474

Top 10 Features:
                   feature  importance
40              age_at_hct         229
29                year_hct         185
36               donor_age         174
42              gvhd_proph         155
0                       ID         155
14        prim_disease_hct         151
48         karnofsky_score         139
47       comorbidity_score         132
44               sex_match         125
27  conditioning_intensity         116


In [7]:

df = pd.read_csv('./data/equity-post-HCT-survival-predictions/train.csv')

# df = df.drop(["tce_imm_match"], axis=1)
# df['dri_score'] = df['dri_score'].apply(bin_dri_score)

df['has_hodgekins'] = df['prim_disease_hct'].apply(lambda x: 1 if x == 'HD' else 0)
df['has_hemophagocyticImmuneSyndrome'] = df['prim_disease_hct'].apply(lambda x: 1 if x == 'HIS' else 0)

# Drop the original prim_disease_hct column
# df = df.drop('prim_disease_hct', axis=1)

model, X_train, X_val, X_test, y_train, y_val, y_test, t_train, t_val, t_test = train_lightgbm(
    df,
    train_size=0.8,
    val_size=0.10,
    test_size=0.10
)

Training set size: 23040 (80.00%)
Validation set size: 2880 (10.00%)
Test set size: 2880 (10.00%)
Training until validation scores don't improve for 20 rounds
Did not meet early stopping. Best iteration is:
[194]	training's binary_logloss: 0.566034	valid_1's binary_logloss: 0.598775

Training Accuracy: 0.7077
Validation Accuracy: 0.6719
Test Accuracy: 0.6969

Test Set C-index: 0.3458

Top 10 Features:
              feature  importance
29           year_hct         194
40         age_at_hct         178
36          donor_age         157
47  comorbidity_score         144
42         gvhd_proph         143
48    karnofsky_score         143
14   prim_disease_hct         139
0                  ID         137
44          sex_match         136
16         cmv_status         109
