In [None]:
git push -u origin main

In [12]:


# Load data
df = pd.read_csv('./data/equity-post-HCT-survival-predictions/train.csv')

def prepare_data(df, categorical_cols, id_col='ID'):
    # Create a copy of the dataframe
    data = df.copy()
    
    # Ensure efs is integer (event indicator: 0 or 1)
    data['efs'] = data['efs'].astype(int)
    
    # Drop the ID column if it exists
    if id_col in data.columns:
        data = data.drop(columns=[id_col])
        print(f"Dropped column: {id_col}")
    else:
        print(f"No column named '{id_col}' found in the dataset")
    
    # Separate features and target
    X = data.drop(['efs', 'efs_time'], axis=1)
    y = Surv.from_arrays(event=data['efs'], time=data['efs_time'])
    
    # Define preprocessing for categorical and numerical columns
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])
    
    numerical_cols = [col for col in X.columns if col not in categorical_cols]
    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median'))
    ])
    
    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', categorical_transformer, categorical_cols),
            ('num', numerical_transformer, numerical_cols)
        ])
    
    # Fit and transform the data
    X_preprocessed = preprocessor.fit_transform(X)
    
    # Get feature names after one-hot encoding
    cat_feature_names = preprocessor.named_transformers_['cat']['onehot'].get_feature_names_out(categorical_cols)
    feature_names = np.concatenate([cat_feature_names, numerical_cols])
    
    return X_preprocessed, y, feature_names, preprocessor

def split_train_and_evaluate_with_sksurv(df, train_size=0.7, val_size=0.15, test_size=0.15,
                                         categorical_cols=[
        'dri_score', 'psych_disturb', 'cyto_score', 'diabetes', 'tbi_status',
        'arrhythmia', 'graft_type', 'vent_hist', 'renal_issue', 'pulm_severe',
        'prim_disease_hct', 'cmv_status', 'tce_imm_match', 'rituximab',
        'prod_type', 'cyto_score_detail', 'conditioning_intensity', 'ethnicity',
        'obesity', 'mrd_hct', 'in_vivo_tcd', 'tce_match', 'hepatic_severe',
        'prior_tumor', 'peptic_ulcer', 'gvhd_proph', 'rheum_issue', 'sex_match',
        'race_group', 'hepatic_mild', 'tce_div_match', 'donor_related',
        'melphalan_dose', 'cardiac', 'pulm_moderate'
    ], id_col='ID'):
    assert train_size + val_size + test_size == 1.0, "Split sizes must sum to 1"
    
    # Prepare data
    X, y, feature_names, preprocessor = prepare_data(df, categorical_cols, id_col)
    
    # Split into train + (val + test)
    X_train, X_temp, y_train, y_temp = train_test_split(
        X, y, test_size=(val_size + test_size), random_state=42
    )
    
    # Split temp into validation and test
    val_proportion = val_size / (val_size + test_size)
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp, test_size=(1 - val_proportion), random_state=42
    )
    
    # Print sizes
    print(f"Training set size: {len(X_train)} ({len(X_train)/len(X):.2%})")
    print(f"Validation set size: {len(X_val)} ({len(X_val)/len(X):.2%})")
    print(f"Test set size: {len(X_test)} ({len(X_test)/len(X):.2%})")
    
    # Define and train the model
    model = GradientBoostingSurvivalAnalysis(
        n_estimators=150,
        learning_rate=0.06,
        max_depth=8,
        min_samples_split=4,
        subsample=0.7,
        random_state=42,
        max_features='log2', # 'sqrt', 0.3-0.7, 
        n_iter_no_change=10, # Set to 10, 20, or 50, and pair with a validation fraction
        validation_fraction=0.1, # 0.1–0.3
        verbose=1
    )
    
    model.fit(X_train, y_train)
    
    # Predict risk scores
    train_pred = model.predict(X_train)
    val_pred = model.predict(X_val)
    test_pred = model.predict(X_test)
    
    # Extract event times and indicators for evaluation
    t_train = y_train['time']
    e_train = y_train['event']
    t_val = y_val['time']
    e_val = y_val['event']
    t_test = y_test['time']
    e_test = y_test['event']
    
    # Calculate C-index
    c_index_train = concordance_index(t_train, -train_pred, e_train)
    c_index_val = concordance_index(t_val, -val_pred, e_val)
    c_index_test = concordance_index(t_test, -test_pred, e_test)
    print(f"\nTraining C-index: {c_index_train:.4f}")
    print(f"Validation C-index: {c_index_val:.4f}")
    print(f"Test C-index: {c_index_test:.4f}")
    
    # Calculate ROC-AUC
    roc_auc_test = roc_auc_score(e_test, test_pred)
    print(f"Test Set ROC-AUC: {roc_auc_test:.4f}")
    
    # Feature importance
    importance = pd.DataFrame({
        'feature': feature_names,
        'importance': model.feature_importances_
    }).sort_values('importance', ascending=False)
    
    print("\nTop 10 most important features:")
    print(importance.head(10))
    
    return model, preprocessor, X_train, X_val, X_test, y_train, y_val, y_test

# Load and evaluate the model
df = pd.read_csv('./data/equity-post-HCT-survival-predictions/train.csv')

# Call the function, specifying the ID column to remove
model, preprocessor, X_train, X_val, X_test, y_train, y_val, y_test = split_train_and_evaluate_with_sksurv(
    df,
    train_size=0.75,
    val_size=0.10,
    test_size=0.15,
    id_col='ID'  # Adjust this to match your actual ID column name
)

Dropped column: ID
Training set size: 21600 (75.00%)
Validation set size: 2880 (10.00%)
Test set size: 4320 (15.00%)
      Iter       Train Loss      OOB Improve   Remaining Time 
         1       75577.4952          31.9504           16.06m
         2       75678.3447          27.6487           16.28m
         3       75825.7507          26.3442           15.96m
         4       75653.7739          25.3938           15.65m
         5       75098.3863          23.1405           15.50m
         6       75205.2447          22.0882           15.36m
         7       75199.7085          21.3058           15.19m
         8       75202.5603          19.8694           15.07m
         9       75134.2842          20.1340           14.96m
        10       75257.0674          17.7319           14.91m
        20       74743.8373          10.5010           13.87m
        30       73795.7747           7.0241           12.83m
        40       73363.5955           4.7301           11.75m
        50    

In [3]:
# Load and evaluate the model
df = pd.read_csv('./data/equity-post-HCT-survival-predictions/train.csv')

# Define the mapping for the bins
dri_bins = {
    'High': ['High'],
    'Medium': ['Intermediate', 'High - TED AML case <missing cytogenetics', 
               'Intermediate - TED AML case <missing cytogenetics', 'Low', 
               'Missing disease status'],
    'Low': ['N/A - disease not classifiable', 'N/A - non-malignant indication', 
            'N/A - pediatric', 'TBD cytogenetics', 'Very high']
}

# Function to map dri_score to new bins
def bin_dri_score(score):
    if pd.isna(score):  # Handle NaN values
        return 'Low'  # Assuming NaN goes to 'Low', adjust if needed
    for bin_name, values in dri_bins.items():
        if score in values:
            return bin_name
    return 'Low'  # Default for any unmapped values (e.g., edge cases)


df['dri_score'] = df['dri_score'].apply(bin_dri_score)



# Call the function, specifying the ID column to remove
model, preprocessor, X_train, X_val, X_test, y_train, y_val, y_test = split_train_and_evaluate_with_sksurv(
    df,
    train_size=0.7,
    val_size=0.15,
    test_size=0.15,
    id_col='ID'  # Adjust this to match your actual ID column name
)

Dropped column: ID
Training set size: 20160 (70.00%)
Validation set size: 4320 (15.00%)
Test set size: 4320 (15.00%)
      Iter       Train Loss      OOB Improve   Remaining Time 
         1       70358.7919          25.3387           13.83m
         2       69998.1983          24.1738           13.74m
         3       69726.3241          21.9602           13.64m
         4       69891.6005          21.9284           13.52m
         5       70081.8751          19.8657           13.46m
         6       69508.4244          19.8745           13.36m
         7       69415.7921          17.5501           13.25m
         8       70364.4059          16.5063           13.16m
         9       69662.1926          16.0336           13.06m
        10       69502.9004          15.4836           12.97m
        20       69211.3332           7.2526           12.04m
        30       68485.3863           5.7107           11.17m
        40       69326.0684           3.5123           10.27m
        50    

In [4]:
df = pd.read_csv('./data/equity-post-HCT-survival-predictions/train.csv')


df['has_hodgekins'] = df['prim_disease_hct'].apply(lambda x: 1 if x == 'HD' else 0)
df['has_hemophagocyticImmuneSyndrome'] = df['prim_disease_hct'].apply(lambda x: 1 if x == 'HIS' else 0)

# df['dri_score'] = df['dri_score'].apply(bin_dri_score)

# Call the function, specifying the ID column to remove
model, preprocessor, X_train, X_val, X_test, y_train, y_val, y_test = split_train_and_evaluate_with_sksurv(
    df,
    train_size=0.7,
    val_size=0.15,
    test_size=0.15,
    id_col='ID'  # Adjust this to match your actual ID column name
)

def categorize_hla_by_percantile(df):
    
    hla_features = ['hla_high_res_8', 'hla_match_a_high', 'hla_match_b_high', 'hla_low_res_6']

    # Function to categorize based on 25th percentile
    def categorize_by_percentile(series):
        threshold = series.quantile(0.25)  # Calculate the 25th percentile
        return np.where(series <= threshold, 0, 1)  # 0 if <= threshold, 1 if above

    # Convert each HLA feature to categorical (0 or 1)
    for feature in hla_features:
        # new_column = f"{feature}_cat"  # Create a new column name for the categorical version
        df[feature] = categorize_by_percentile(df[feature])
        
    return df

df = categorize_hla_by_percantile(df)


    
# Call the function, specifying the ID column to remove
model, preprocessor, X_train, X_val, X_test, y_train, y_val, y_test = split_train_and_evaluate_with_sksurv(
    df,
    train_size=0.7,
    val_size=0.15,
    test_size=0.15,
    id_col='ID'  # Adjust this to match your actual ID column name
)

Dropped column: ID
Training set size: 20160 (70.00%)
Validation set size: 4320 (15.00%)
Test set size: 4320 (15.00%)
      Iter       Train Loss      OOB Improve   Remaining Time 
         1       70358.7919          25.3388           13.63m
         2       69998.1983          24.1876           13.58m
         3       69726.2457          21.9754           13.70m
         4       69891.5447          22.1225           13.83m
         5       70081.8732          19.6981           14.09m
         6       69508.3196          19.5528           13.87m
         7       69415.7827          17.4511           13.84m
         8       70364.7218          16.4501           13.65m
         9       69662.3271          16.0417           13.52m
        10       69502.5947          15.1632           13.37m
        20       69211.7872           7.2756           12.15m
        30       68483.6621           6.1609           11.20m
        40       69322.7656           3.0109           10.25m
        50    

KeyboardInterrupt: 

In [5]:
df = pd.read_csv('./data/equity-post-HCT-survival-predictions/train.csv')

df['dri_score'] = df['dri_score'].apply(bin_dri_score)

df['has_hodgekins'] = df['prim_disease_hct'].apply(lambda x: 1 if x == 'HD' else 0)
df['has_hemophagocyticImmuneSyndrome'] = df['prim_disease_hct'].apply(lambda x: 1 if x == 'HIS' else 0)


df['pediatric_and_arrhythmia'] = ((df['dri_score'] == 'N/A - pediatric') & (df['arrhythmia'] == 'Yes')).astype(int)

df = categorize_hla_by_percantile(df)

# Call the function, specifying the ID column to remove
model, preprocessor, X_train, X_val, X_test, y_train, y_val, y_test = split_train_and_evaluate_with_sksurv(
    df,
    train_size=0.7,
    val_size=0.15,
    test_size=0.15,
    id_col='ID'  # Adjust this to match your actual ID column name
)

Dropped column: ID
Training set size: 20160 (70.00%)
Validation set size: 4320 (15.00%)
Test set size: 4320 (15.00%)
      Iter       Train Loss      OOB Improve   Remaining Time 
         1       70358.7919          25.3388           13.98m
         2       69998.0170          24.1502           13.86m
         3       69726.1121          21.8861           13.69m
         4       69891.5344          21.9805           13.57m
         5       70081.8364          19.9264           13.45m
         6       69508.5773          20.0008           13.34m
         7       69415.6389          17.6574           13.24m
         8       70364.2840          16.7081           13.14m
         9       69661.6153          15.9291           13.20m
        10       69502.7763          15.3668           13.21m
        20       69211.9050           7.2304           12.19m
        30       68485.2945           5.9317           11.19m
        40       69326.4869           3.1606           10.25m
        50    

In [10]:
# Load and evaluate the model
df = pd.read_csv('./data/equity-post-HCT-survival-predictions/train.csv')

# Define the mapping for the bins
dri_bins = {
    'High': ['High'],
    'Medium': ['Intermediate', 'High - TED AML case <missing cytogenetics', 
               'Intermediate - TED AML case <missing cytogenetics', 'Low', 
               'Missing disease status'],
    'Low': ['N/A - disease not classifiable', 'N/A - non-malignant indication', 
            'N/A - pediatric', 'TBD cytogenetics', 'Very high']
}

# Function to map dri_score to new bins
def bin_dri_score(score):
    if pd.isna(score):  # Handle NaN values
        return 'Low'  # Assuming NaN goes to 'Low', adjust if needed
    for bin_name, values in dri_bins.items():
        if score in values:
            return bin_name
    return 'Low'  # Default for any unmapped values (e.g., edge cases)


df['dri_score'] = df['dri_score'].apply(bin_dri_score)

df = df.drop(["race_group"], axis=1)

# Call the function, specifying the ID column to remove
model, preprocessor, X_train, X_val, X_test, y_train, y_val, y_test = split_train_and_evaluate_with_sksurv(
    df,
    train_size=0.7,
    val_size=0.15,
    test_size=0.15,
    id_col='ID',  # Adjust this to match your actual ID column name
    categorical_cols = [
        'dri_score', 'psych_disturb', 'cyto_score', 'diabetes', 'tbi_status',
        'arrhythmia', 'graft_type', 'vent_hist', 'renal_issue', 'pulm_severe',
        'prim_disease_hct', 'cmv_status', 'tce_imm_match', 'rituximab',
        'prod_type', 'cyto_score_detail', 'conditioning_intensity', 'ethnicity',
        'obesity', 'mrd_hct', 'in_vivo_tcd', 'tce_match', 'hepatic_severe',
        'prior_tumor', 'peptic_ulcer', 'gvhd_proph', 'rheum_issue', 'sex_match', 'hepatic_mild', 'tce_div_match', 'donor_related',
        'melphalan_dose', 'cardiac', 'pulm_moderate'
    ]
)

Dropped column: ID
Training set size: 20160 (70.00%)
Validation set size: 4320 (15.00%)
Test set size: 4320 (15.00%)
      Iter       Train Loss      OOB Improve   Remaining Time 
         1       70358.8190          25.4356           13.54m
         2       69998.2960          24.1827           13.53m
         3       69726.6986          22.2303           13.43m
         4       69891.9708          21.6603           13.32m
         5       70082.3080          19.7763           13.26m
         6       69508.6662          19.9060           13.16m
         7       69415.7453          17.7379           13.08m
         8       70365.4704          16.4794           12.99m
         9       69662.3624          15.8926           12.90m
        10       69503.8917          15.3288           12.81m
        20       69214.0825           7.2036           11.88m
        30       68487.4745           6.1151           10.95m
        40       69325.7869           3.3005           10.05m
        50    