# Simple XGBoost for Diagnosis Prediction
Loads data, 80:20 train/test split, vanilla XGBoost, outputs test AUC for each diag_type

In [1]:
import pandas as pd
import numpy as np
import random
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.impute import SimpleImputer

import warnings
warnings.filterwarnings('ignore')

data_path = "/orcd/pool/003/dbertsim_shared/ukb/"
RANDOM_STATE = 42
PREDICTION_HORIZON = 1.0  # years

np.random.seed(RANDOM_STATE)
random.seed(RANDOM_STATE)

Note: You have installed the 'manylinux2014' variant of XGBoost. Certain features such as GPU algorithms or federated learning are not available. To use these features, please upgrade to a recent Linux distro with glibc 2.28+, and install the 'manylinux_2_28' variant.


In [6]:
demo_cols = [
    'Age at recruitment',
    'Sex_male',
    'Body mass index (BMI)',
    'Systolic blood pressure, automated reading',
    'Diastolic blood pressure, automated reading',
    'Townsend deprivation index at recruitment',
]

## Load Data

In [2]:
df = pd.read_csv(f"{data_path}blood_protein_diagnoses_clean_new.csv", low_memory=False)
print(f"Loaded {len(df)} rows, {len(df.columns)} columns")
df.head()

Loaded 52995 rows, 3020 columns


Unnamed: 0,eid,Age at recruitment,Sex_male,Ethnic background,Body mass index (BMI),"Systolic blood pressure, automated reading","Diastolic blood pressure, automated reading",Townsend deprivation index at recruitment,Smoking status,Alcohol intake frequency.,...,stroke,t2d,alzheimers_time_to_diagnosis,copd_time_to_diagnosis,hhd_time_to_diagnosis,ischemia_time_to_diagnosis,kidney_time_to_diagnosis,lower_resp_time_to_diagnosis,stroke_time_to_diagnosis,t2d_time_to_diagnosis
0,1000083,49,0,British,24.7295,116.0,71.0,-3.96,Previous,Three or four times a week,...,,,,,,,,,,
1,1000380,62,0,British,31.2026,124.0,81.0,-5.0,Never,Daily or almost daily,...,,,,,,,,,,
2,1001803,47,0,Any other white background,24.2187,98.0,57.0,2.0,Never,Never,...,,,,,,,,,,
3,1002917,52,1,British,20.1477,132.0,67.0,-4.23,Current,Special occasions only,...,,,,,,,,,,
4,1003287,69,0,British,28.1479,166.0,61.0,6.38,Previous,Three or four times a week,...,,,,,,,,,,


In [3]:
# Find diagnosis types (columns ending with _time_to_diagnosis)
time_cols = [c for c in df.columns if c.endswith('_time_to_diagnosis')]
diag_types = [c.replace('_time_to_diagnosis', '') for c in time_cols]
print(f"Found {len(diag_types)} diagnosis types: {diag_types}")

Found 11 diagnosis types: ['colorectal_cancer', 'lung_cancer', 'stomach_cancer', 'alzheimers', 'copd', 'hhd', 'ischemia', 'kidney', 'lower_resp', 'stroke', 't2d']


## Define Features

In [4]:
# Get feature columns (olink proteins + blood biomarkers + demographics)
olink_cols = [c for c in df.columns if c.startswith('olink_')]
blood_cols = [c for c in df.columns if c.startswith('blood_')]
demo_cols = [c for c in demo_cols if c in df.columns]

feature_cols = olink_cols + blood_cols + demo_cols
print(f"Features: {len(olink_cols)} olink, {len(blood_cols)} blood, {len(demo_cols)} demo = {len(feature_cols)} total")

Features: 2923 olink, 61 blood, 6 demo = 2990 total


## Helper Functions

In [2]:
def get_labels(df, diag_type, horizon=1.0):
    """Get binary labels: 1 if diagnosed within horizon years, 0 otherwise."""
    time_col = f"{diag_type}_time_to_diagnosis"
    if time_col not in df.columns:
        return None
    y = ((df[time_col] > 30/365.25) & (df[time_col] <= horizon)).astype(int)
    return y


def filter_already_diagnosed(df, diag_type):
    """Remove patients already diagnosed at baseline."""
    time_col = f"{diag_type}_time_to_diagnosis"
    if time_col not in df.columns:
        return df
    mask = (df[time_col] > 0) | (df[time_col].isna())
    return df[mask].copy()

## Run XGBoost for Each Diagnosis Type (80:20 Split)

In [8]:
results = []

for diag_type in diag_types:
    print(f"\n{'='*50}")
    print(f"Processing: {diag_type}")
    print(f"{'='*50}")
    
    # Filter out already diagnosed patients
    df_filtered = filter_already_diagnosed(df, diag_type)
    
    # Get features and labels
    X = df_filtered[feature_cols].copy()
    y = get_labels(df_filtered, diag_type, PREDICTION_HORIZON)
    
    if y is None or y.sum() < 10:
        print(f"  Skipping: insufficient positive samples ({y.sum() if y is not None else 0})")
        results.append({'diag_type': diag_type, 'status': 'skipped'})
        continue
    
    print(f"  Samples: {len(y)}, Positive: {y.sum()} ({y.mean()*100:.2f}%)")
    
    # 80:20 train/test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
    )
    print(f"  Train: {len(y_train)}, Test: {len(y_test)}")
    
    # Handle class imbalance
    n_neg = (y_train == 0).sum()
    n_pos = (y_train == 1).sum()
    scale_pos_weight = n_neg / n_pos if n_pos > 0 else 1.0
    
    # Vanilla XGBoost
    model = xgb.XGBClassifier(
        objective='binary:logistic',
        eval_metric='auc',
        tree_method='hist',
        enable_categorical=True,
        colsample_bytrue=0.8,
        # scale_pos_weight=scale_pos_weight,
        random_state=RANDOM_STATE,
        n_estimators=100,
        max_depth=5,
        learning_rate=0.01,
        verbosity=0
    )
    
    model.fit(X_train, y_train)
    
    # Evaluate
    y_pred = model.predict_proba(X_test)[:, 1]
    test_auc = roc_auc_score(y_test, y_pred)
    
    print(f"  Test AUC: {test_auc:.4f}")
    
    results.append({
        'diag_type': diag_type,
        'n_samples': len(y),
        'n_positive': int(y.sum()),
        'prevalence': y.mean(),
        'test_auc': test_auc,
        'status': 'completed'
    })


Processing: colorectal_cancer
  Samples: 52750, Positive: 240 (0.45%)
  Train: 42200, Test: 10550
  Test AUC: 0.6359

Processing: lung_cancer
  Samples: 52950, Positive: 186 (0.35%)
  Train: 42360, Test: 10590
  Test AUC: 0.8385

Processing: stomach_cancer
  Samples: 52978, Positive: 31 (0.06%)
  Train: 42382, Test: 10596
  Test AUC: 0.7628

Processing: alzheimers
  Samples: 52989, Positive: 83 (0.16%)
  Train: 42391, Test: 10598
  Test AUC: 0.8716

Processing: copd
  Samples: 52619, Positive: 663 (1.26%)
  Train: 42095, Test: 10524
  Test AUC: 0.8691

Processing: hhd
  Samples: 52982, Positive: 12 (0.02%)
  Train: 42385, Test: 10597
  Test AUC: 0.2965

Processing: ischemia
  Samples: 50678, Positive: 1461 (2.88%)
  Train: 40542, Test: 10136
  Test AUC: 0.7659

Processing: kidney
  Samples: 52682, Positive: 865 (1.64%)
  Train: 42145, Test: 10537
  Test AUC: 0.8610

Processing: lower_resp
  Samples: 51215, Positive: 1645 (3.21%)
  Train: 40972, Test: 10243
  Test AUC: 0.6961

Processi

## Summary (80:20 Split)

In [9]:
results_df = pd.DataFrame(results)
completed = results_df[results_df['status'] == 'completed'].sort_values('test_auc', ascending=False)

print("\n" + "="*60)
print("SUMMARY - Test AUC by Diagnosis Type (80:20 Split)")
print("="*60)
print(completed[['diag_type', 'n_positive', 'prevalence', 'test_auc']].to_string(index=False))


SUMMARY - Test AUC by Diagnosis Type (80:20 Split)
        diag_type  n_positive  prevalence  test_auc
              t2d        1191    0.022951  0.926935
       alzheimers          83    0.001566  0.871623
             copd         663    0.012600  0.869073
           kidney         865    0.016419  0.861050
      lung_cancer         186    0.003513  0.838476
           stroke         543    0.010331  0.769320
         ischemia        1461    0.028829  0.765878
   stomach_cancer          31    0.000585  0.762827
       lower_resp        1645    0.032119  0.696079
colorectal_cancer         240    0.004550  0.635899
              hhd          12    0.000226  0.296461


In [18]:
# Save results
results_df.to_csv(f'simple_xgboost_results_{PREDICTION_HORIZON}yrs.csv', index=False)
print(f"\nSaved to simple_xgboost_results_{PREDICTION_HORIZON}yrs.csv")


Saved to simple_xgboost_results_5.0yrs.csv


---
# Using Pre-Split Train/Test Data
Now run the same model using the pre-split ukb_diag_train.csv and ukb_diag_test.csv files

In [3]:
train_df = pd.read_csv(f"{data_path}ukb_diag_train.csv", low_memory=False)
test_df = pd.read_csv(f"{data_path}ukb_diag_test.csv", low_memory=False)

print(f"Train: {len(train_df)} rows, {len(train_df.columns)} columns")
print(f"Test: {len(test_df)} rows, {len(test_df.columns)} columns")

Train: 37032 rows, 3028 columns
Test: 9258 rows, 3028 columns


In [4]:
# Find diagnosis types
time_cols_presplit = [c for c in train_df.columns if c.endswith('_time_to_diagnosis')]
diag_types_presplit = [c.replace('_time_to_diagnosis', '') for c in time_cols_presplit]
print(f"Found {len(diag_types_presplit)} diagnosis types: {diag_types_presplit}")

Found 11 diagnosis types: ['colorectal_cancer', 'lung_cancer', 'stomach_cancer', 'alzheimers', 'copd', 'hhd', 'ischemia', 'kidney', 'lower_resp', 'stroke', 't2d']


In [7]:
# Get feature columns
olink_cols_ps = [c for c in train_df.columns if c.startswith('olink_')]
blood_cols_ps = [c for c in train_df.columns if c.startswith('blood_')]
demo_cols_ps = [c for c in demo_cols if c in train_df.columns]
feature_cols_ps = olink_cols_ps + blood_cols_ps + demo_cols_ps
print(f"Features: {len(olink_cols_ps)} olink, {len(blood_cols_ps)} blood, {len(demo_cols_ps)} demo = {len(feature_cols_ps)} total")

Features: 2920 olink, 61 blood, 6 demo = 2987 total


In [21]:
results_presplit = []
top_features_per_outcome = {}  # Store top 50 protein features for each outcome

feature_cols_ps = olink_cols_ps


for diag_type in diag_types_presplit:
    if diag_type == "alzheimers":
        PREDICTION_HORIZON = 5.0
    else:
        PREDICTION_HORIZON = 1.0
    print(f"\n{'='*50}")
    print(f"Processing: {diag_type}")
    print(f"{'='*50}")
    
    # Filter out already diagnosed patients
    train_filtered = filter_already_diagnosed(train_df, diag_type)
    test_filtered = filter_already_diagnosed(test_df, diag_type)
    
    # Get features and labels
    X_train = train_filtered[feature_cols_ps].copy()
    X_test = test_filtered[feature_cols_ps].copy()
    y_train = get_labels(train_filtered, diag_type, PREDICTION_HORIZON)
    y_test = get_labels(test_filtered, diag_type, PREDICTION_HORIZON)
    
    imputer = SimpleImputer(strategy="median")
    X_train = imputer.fit_transform(X_train)
    X_test = imputer.fit_transform(X_test)
    
    if y_train is None or y_train.sum() < 10:
        print(f"  Skipping: insufficient positive samples ({y_train.sum() if y_train is not None else 0})")
        results_presplit.append({'diag_type': diag_type, 'status': 'skipped'})
        continue
    
    print(f"  Train: {len(y_train)}, Positive: {y_train.sum()} ({y_train.mean()*100:.2f}%)")
    print(f"  Test: {len(y_test)}, Positive: {y_test.sum()} ({y_test.mean()*100:.2f}%)")
    
    # Handle class imbalance
    n_neg = (y_train == 0).sum()
    n_pos = (y_train == 1).sum()
    scale_pos_weight = n_neg / n_pos if n_pos > 0 else 1.0
    
    # Vanilla XGBoost
    model = xgb.XGBClassifier(
        objective='binary:logistic',
        eval_metric='auc',
        tree_method='hist',
        enable_categorical=True,
        colsample_bytree=0.8,
        random_state=RANDOM_STATE,
        n_estimators=100,
        max_depth=5,
        learning_rate=0.01,
        verbosity=0,
        # For GPU determinism:
        # deterministic_histogram=True
    )
    
    model.fit(X_train, y_train)
    
    # Extract top 50 protein features
    feature_importance = pd.DataFrame({
        'feature': feature_cols_ps,
        'importance': model.feature_importances_
    }).sort_values('importance', ascending=False)
    
    # Filter to only protein (olink) features and get top 50
    protein_importance = feature_importance[feature_importance['feature'].str.startswith('olink_')]
    top_50_proteins = protein_importance.head(50)['feature'].tolist()
    top_features_per_outcome[diag_type] = top_50_proteins
    print(f"  Top 5 proteins: {top_50_proteins[:5]}")
    
    # Evaluate
    y_pred = model.predict_proba(X_test)[:, 1]
    test_auc = roc_auc_score(y_test, y_pred)
    
    print(f"  Test AUC: {test_auc:.4f}")
    
    results_presplit.append({
        'diag_type': diag_type,
        'prediction_horizon': PREDICTION_HORIZON,
        'n_train': len(y_train),
        'n_test': len(y_test),
        'n_positive_train': int(y_train.sum()),
        'n_positive_test': int(y_test.sum()),
        'prevalence': y_train.mean(),
        'test_auc': test_auc,
        'status': 'completed'
    })
    
    feature_cols_top_50 = top_50_proteins
    
    # Get features and labels
    X_train = train_filtered[feature_cols_top_50].copy()
    X_test = test_filtered[feature_cols_top_50].copy()
    
    imputer = SimpleImputer(strategy="median")
    X_train = imputer.fit_transform(X_train)
    X_test = imputer.fit_transform(X_test)

    model.fit(X_train, y_train)
    y_pred = model.predict_proba(X_test)[:, 1]
    test_auc = roc_auc_score(y_test, y_pred)
    
    print(f"  Test AUC (top 50): {test_auc:.4f}")


Processing: colorectal_cancer
  Train: 36862, Positive: 30 (0.08%)
  Test: 9215, Positive: 8 (0.09%)
  Top 5 proteins: ['olink_pbld', 'olink_adgrf5', 'olink_ddx4', 'olink_tcl1b', 'olink_ambn']
  Test AUC: 0.7597
  Test AUC (top 50): 0.7280

Processing: lung_cancer
  Train: 36998, Positive: 23 (0.06%)
  Test: 9250, Positive: 6 (0.06%)
  Top 5 proteins: ['olink_mmp12', 'olink_slitrk2', 'olink_wfdc2', 'olink_epb41l5', 'olink_esam']
  Test AUC: 0.8270
  Test AUC (top 50): 0.8625

Processing: stomach_cancer
  Skipping: insufficient positive samples (2)

Processing: alzheimers
  Train: 37029, Positive: 58 (0.16%)
  Test: 9257, Positive: 16 (0.17%)
  Top 5 proteins: ['olink_gfap', 'olink_cd40lg', 'olink_snap25', 'olink_ca7', 'olink_raly']
  Test AUC: 0.9147
  Test AUC (top 50): 0.8618

Processing: copd
  Train: 36769, Positive: 72 (0.20%)
  Test: 9191, Positive: 18 (0.20%)
  Top 5 proteins: ['olink_smpdl3b', 'olink_psme1', 'olink_tnfrsf10b', 'olink_wfdc2', 'olink_scgb3a2']
  Test AUC: 0.7771

## Summary (Pre-Split Data)

In [14]:
results_presplit_df = pd.DataFrame(results_presplit)
completed_ps = results_presplit_df[results_presplit_df['status'] == 'completed'].sort_values('test_auc', ascending=False)

print("\n" + "="*60)
print("SUMMARY - Test AUC by Diagnosis Type (Pre-Split Data)")
print("="*60)
print(completed_ps[['diag_type', 'n_positive_train', 'n_positive_test', 'prevalence', 'test_auc']].to_string(index=False))


SUMMARY - Test AUC by Diagnosis Type (Pre-Split Data)
        diag_type  n_positive_train  n_positive_test  prevalence  test_auc
              t2d             129.0             32.0    0.003555  0.953293
       alzheimers              58.0             16.0    0.001566  0.916980
           kidney              42.0             10.0    0.001139  0.846459
      lung_cancer              23.0              6.0    0.000622  0.842583
colorectal_cancer              30.0              8.0    0.000814  0.772578
             copd              72.0             18.0    0.001958  0.751711
           stroke              56.0             14.0    0.001524  0.732778
         ischemia             162.0             41.0    0.004568  0.676925
       lower_resp             190.0             47.0    0.005308  0.664961


In [15]:
# Save results
results_presplit_df.to_csv(f'simple_xgboost_results.csv', index=False)
print(f"\nSaved to simple_xgboost_results_presplit.csv")


Saved to simple_xgboost_results_presplit.csv


---
# Re-run with Union of Top 50 Protein Features
Using demo + blood + union of all top 50 protein features from each outcome

In [22]:
# Compute union of all top 50 protein features
all_top_proteins = set()
for diag_type, proteins in top_features_per_outcome.items():
    all_top_proteins.update(proteins)

union_protein_cols = list(all_top_proteins)
print(f"Union of top 50 proteins across all outcomes: {len(union_protein_cols)} unique proteins")

# New feature set: demo + blood + union of top proteins
feature_cols_union = demo_cols_ps + blood_cols_ps + union_protein_cols
print(f"New feature set: {len(demo_cols_ps)} demo + {len(blood_cols_ps)} blood + {len(union_protein_cols)} proteins = {len(feature_cols_union)} total")

Union of top 50 proteins across all outcomes: 391 unique proteins
New feature set: 6 demo + 61 blood + 391 proteins = 458 total


In [23]:
# Re-run XGBoost with union of top protein features
results_union = []

for diag_type in diag_types_presplit:
    print(f"\n{'='*50}")
    print(f"Processing: {diag_type}")
    print(f"{'='*50}")
    
    if diag_type == "alzheimers":
        PREDICTION_HORIZON = 5.0
    else:
        PREDICTION_HORIZON = 1.0
    
    # Filter out already diagnosed patients
    train_filtered = filter_already_diagnosed(train_df, diag_type)
    test_filtered = filter_already_diagnosed(test_df, diag_type)
    
    # Get features and labels with union feature set
    X_train = train_filtered[feature_cols_union].copy()
    X_test = test_filtered[feature_cols_union].copy()
    y_train = get_labels(train_filtered, diag_type, PREDICTION_HORIZON)
    y_test = get_labels(test_filtered, diag_type, PREDICTION_HORIZON)
    
    if y_train is None or y_train.sum() < 10:
        print(f"  Skipping: insufficient positive samples ({y_train.sum() if y_train is not None else 0})")
        results_union.append({'diag_type': diag_type, 'status': 'skipped'})
        continue
    
    print(f"  Train: {len(y_train)}, Positive: {y_train.sum()} ({y_train.mean()*100:.2f}%)")
    print(f"  Test: {len(y_test)}, Positive: {y_test.sum()} ({y_test.mean()*100:.2f}%)")
    
    # Handle class imbalance
    n_neg = (y_train == 0).sum()
    n_pos = (y_train == 1).sum()
    scale_pos_weight = n_neg / n_pos if n_pos > 0 else 1.0
    
    # Vanilla XGBoost
    model = xgb.XGBClassifier(
        objective='binary:logistic',
        eval_metric='auc',
        tree_method='hist',
        enable_categorical=True,
        colsample_bytree=0.8,
        random_state=RANDOM_STATE,
        n_estimators=100,
        max_depth=5,
        learning_rate=0.01,
        verbosity=0
    )
    
    model.fit(X_train, y_train)
    
    # Evaluate
    y_pred = model.predict_proba(X_test)[:, 1]
    test_auc = roc_auc_score(y_test, y_pred)
    
    print(f"  Test AUC: {test_auc:.4f}")
    
    results_union.append({
        'diag_type': diag_type,
        'prediction_horizon': PREDICTION_HORIZON,
        'n_train': len(y_train),
        'n_test': len(y_test),
        'n_positive_train': int(y_train.sum()),
        'n_positive_test': int(y_test.sum()),
        'prevalence': y_train.mean(),
        'test_auc': test_auc,
        'status': 'completed'
    })


Processing: colorectal_cancer
  Train: 36862, Positive: 30 (0.08%)
  Test: 9215, Positive: 8 (0.09%)
  Test AUC: 0.7307

Processing: lung_cancer
  Train: 36998, Positive: 23 (0.06%)
  Test: 9250, Positive: 6 (0.06%)
  Test AUC: 0.8293

Processing: stomach_cancer
  Skipping: insufficient positive samples (2)

Processing: alzheimers
  Train: 37029, Positive: 58 (0.16%)
  Test: 9257, Positive: 16 (0.17%)
  Test AUC: 0.9222

Processing: copd
  Train: 36769, Positive: 72 (0.20%)
  Test: 9191, Positive: 18 (0.20%)
  Test AUC: 0.7544

Processing: hhd
  Skipping: insufficient positive samples (0)

Processing: ischemia
  Train: 35461, Positive: 162 (0.46%)
  Test: 8863, Positive: 41 (0.46%)
  Test AUC: 0.6748

Processing: kidney
  Train: 36864, Positive: 42 (0.11%)
  Test: 9216, Positive: 10 (0.11%)
  Test AUC: 0.8124

Processing: lower_resp
  Train: 35795, Positive: 190 (0.53%)
  Test: 8947, Positive: 47 (0.53%)
  Test AUC: 0.6650

Processing: stroke
  Train: 36740, Positive: 56 (0.15%)
  Tes

## Summary (Union of Top Proteins)

In [18]:
results_union_df = pd.DataFrame(results_union)
completed_union = results_union_df[results_union_df['status'] == 'completed'].sort_values('test_auc', ascending=False)

print("\n" + "="*60)
print(f"SUMMARY - Test AUC (Union of Top {len(union_protein_cols)} Proteins + Demo + Blood)")
print("="*60)
print(completed_union[['diag_type', 'n_positive_train', 'n_positive_test', 'prevalence', 'test_auc']].to_string(index=False))


SUMMARY - Test AUC (Union of Top 402 Proteins + Demo + Blood)
        diag_type  n_positive_train  n_positive_test  prevalence  test_auc
              t2d             129.0             32.0    0.003555  0.950427
       alzheimers              58.0             16.0    0.001566  0.918637
      lung_cancer              23.0              6.0    0.000622  0.836128
           kidney              42.0             10.0    0.001139  0.806040
colorectal_cancer              30.0              8.0    0.000814  0.754535
           stroke              56.0             14.0    0.001524  0.743714
             copd              72.0             18.0    0.001958  0.717256
       lower_resp             190.0             47.0    0.005308  0.687772
         ischemia             162.0             41.0    0.004568  0.659579


In [19]:
# Compare results: All features vs Union of top proteins
comparison = results_presplit_df[results_presplit_df['status'] == 'completed'][['diag_type', 'test_auc']].merge(
    results_union_df[results_union_df['status'] == 'completed'][['diag_type', 'test_auc']],
    on='diag_type',
    suffixes=('_all_features', '_union_top50')
)
comparison['diff'] = comparison['test_auc_union_top50'] - comparison['test_auc_all_features']
comparison = comparison.sort_values('test_auc_union_top50', ascending=False)

print("\n" + "="*70)
print("COMPARISON: All Features vs Union of Top 50 Proteins")
print(f"All features: {len(feature_cols_ps)} | Union features: {len(feature_cols_union)}")
print("="*70)
print(comparison.to_string(index=False))


COMPARISON: All Features vs Union of Top 50 Proteins
All features: 2987 | Union features: 469
        diag_type  test_auc_all_features  test_auc_union_top50      diff
              t2d               0.953293              0.950427 -0.002866
       alzheimers               0.916980              0.918637  0.001657
      lung_cancer               0.842583              0.836128 -0.006455
           kidney               0.846459              0.806040 -0.040419
colorectal_cancer               0.772578              0.754535 -0.018043
           stroke               0.732778              0.743714  0.010936
             copd               0.751711              0.717256 -0.034455
       lower_resp               0.664961              0.687772  0.022811
         ischemia               0.676925              0.659579 -0.017346


In [20]:
# Save results and top features
results_union_df.to_csv(f'simple_xgboost_results_union_top50_{PREDICTION_HORIZON}yrs.csv', index=False)
print(f"Saved results to simple_xgboost_results_union_top50_{PREDICTION_HORIZON}yrs.csv")

# Save top 50 proteins per outcome
top_features_df = pd.DataFrame([
    {'diag_type': diag, 'protein': prot, 'rank': i+1}
    for diag, prots in top_features_per_outcome.items()
    for i, prot in enumerate(prots)
])
top_features_df.to_csv(f'top50_proteins_per_outcome_{PREDICTION_HORIZON}yrs.csv', index=False)
print(f"Saved top 50 proteins per outcome to top50_proteins_per_outcome_{PREDICTION_HORIZON}yrs.csv")

# Save union protein list
pd.DataFrame({'protein': union_protein_cols}).to_csv(f'union_top50_proteins_{PREDICTION_HORIZON}yrs.csv', index=False)
print(f"Saved union protein list ({len(union_protein_cols)} proteins) to union_top50_proteins_{PREDICTION_HORIZON}yrs.csv")

Saved results to simple_xgboost_results_union_top50_1.0yrs.csv
Saved top 50 proteins per outcome to top50_proteins_per_outcome_1.0yrs.csv
Saved union protein list (402 proteins) to union_top50_proteins_1.0yrs.csv


# Blood only

In [26]:
feature_cols_ps = blood_cols_ps + demo_cols_ps
results_blood = []

for diag_type in diag_types_presplit:
    if diag_type == "alzheimers":
        PREDICTION_HORIZON = 5.0
    else:
        PREDICTION_HORIZON = 1.0
    print(f"\n{'='*50}")
    print(f"Processing: {diag_type}")
    print(f"{'='*50}")
    
    # Filter out already diagnosed patients
    train_filtered = filter_already_diagnosed(train_df, diag_type)
    test_filtered = filter_already_diagnosed(test_df, diag_type)
    
    # Get features and labels
    X_train = train_filtered[feature_cols_ps].copy()
    X_test = test_filtered[feature_cols_ps].copy()
    y_train = get_labels(train_filtered, diag_type, PREDICTION_HORIZON)
    y_test = get_labels(test_filtered, diag_type, PREDICTION_HORIZON)
    
    imputer = SimpleImputer(strategy="median")
    X_train = imputer.fit_transform(X_train)
    X_test = imputer.fit_transform(X_test)
    
    if y_train is None or y_train.sum() < 10:
        print(f"  Skipping: insufficient positive samples ({y_train.sum() if y_train is not None else 0})")
        results_presplit.append({'diag_type': diag_type, 'status': 'skipped'})
        continue
    
    print(f"  Train: {len(y_train)}, Positive: {y_train.sum()} ({y_train.mean()*100:.2f}%)")
    print(f"  Test: {len(y_test)}, Positive: {y_test.sum()} ({y_test.mean()*100:.2f}%)")
    
    # Handle class imbalance
    n_neg = (y_train == 0).sum()
    n_pos = (y_train == 1).sum()
    scale_pos_weight = n_neg / n_pos if n_pos > 0 else 1.0
    
    # Vanilla XGBoost
    model = xgb.XGBClassifier(
        objective='binary:logistic',
        eval_metric='auc',
        tree_method='hist',
        enable_categorical=True,
        colsample_bytree=0.8,
        random_state=RANDOM_STATE,
        n_estimators=100,
        max_depth=5,
        learning_rate=0.01,
        verbosity=0,
        # For GPU determinism:
        # deterministic_histogram=True
    )
    
    model.fit(X_train, y_train)
    
    # Extract top 50 protein features
    feature_importance = pd.DataFrame({
        'feature': feature_cols_ps,
        'importance': model.feature_importances_
    }).sort_values('importance', ascending=False)
    
    # Filter to only protein (olink) features and get top 50
    top_10_features = feature_importance.head(10)['feature'].tolist()
    print(f"  Top 10 features: {top_10_features}")
    
    # Evaluate
    y_pred = model.predict_proba(X_test)[:, 1]
    test_auc = roc_auc_score(y_test, y_pred)
    
    print(f"  Test AUC: {test_auc:.4f}")
    
    results_blood.append({
        'diag_type': diag_type,
        'prediction_horizon': PREDICTION_HORIZON,
        'n_train': len(y_train),
        'n_test': len(y_test),
        'n_positive_train': int(y_train.sum()),
        'n_positive_test': int(y_test.sum()),
        'prevalence': y_train.mean(),
        'test_auc': test_auc,
        'status': 'completed'
    })


Processing: colorectal_cancer
  Train: 36862, Positive: 30 (0.08%)
  Test: 9215, Positive: 8 (0.09%)
  Top 10 features: ['blood_White blood cell (leukocyte) count', 'Sex_male', 'blood_Lymphocyte count', 'blood_Neutrophill count', 'blood_IGF-1', 'blood_Mean corpuscular haemoglobin', 'blood_Reticulocyte count', 'blood_Testosterone', 'blood_LDL direct', 'Age at recruitment']
  Test AUC: 0.7656

Processing: lung_cancer
  Train: 36998, Positive: 23 (0.06%)
  Test: 9250, Positive: 6 (0.06%)
  Top 10 features: ['blood_Mean sphered cell volume', 'blood_Mean corpuscular volume', 'Age at recruitment', 'blood_Alkaline phosphatase', 'blood_Immature reticulocyte fraction', 'blood_Neutrophill count', 'blood_Glycated haemoglobin (HbA1c)', 'blood_Cholesterol', 'blood_Triglycerides', 'blood_C-reactive protein']
  Test AUC: 0.7599

Processing: stomach_cancer
  Skipping: insufficient positive samples (2)

Processing: alzheimers
  Train: 37029, Positive: 58 (0.16%)
  Test: 9257, Positive: 16 (0.17%)
  To

In [27]:
results_blood_df = pd.DataFrame(results_blood)
completed_ps = results_blood_df[results_blood_df['status'] == 'completed'].sort_values('test_auc', ascending=False)

print("\n" + "="*60)
print("SUMMARY - Test AUC by Diagnosis Type (Pre-Split Data)")
print("="*60)
print(completed_ps[['diag_type', 'n_positive_train', 'n_positive_test', 'prevalence', 'test_auc']].to_string(index=False))


SUMMARY - Test AUC by Diagnosis Type (Pre-Split Data)
        diag_type  n_positive_train  n_positive_test  prevalence  test_auc
              t2d               129               32    0.003555  0.958541
           kidney                42               10    0.001139  0.791853
colorectal_cancer                30                8    0.000814  0.765552
      lung_cancer                23                6    0.000622  0.759862
           stroke                56               14    0.001524  0.725775
             copd                72               18    0.001958  0.719264
       alzheimers                58               16    0.001566  0.714641
         ischemia               162               41    0.004568  0.698669
       lower_resp               190               47    0.005308  0.655329
