# Simple XGBoost for Diagnosis Prediction
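Loads the blood/protein cancer dataset, then for each diagnosis type (`diag_type`) makes a stratified 80:20 train/test split, fits a vanilla XGBoost classifier, and reports the test AUC. A patient is a positive case when the diagnosis occurs more than 30 days and at most `PREDICTION_HORIZON` years after baseline.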

In [1]:
import os
import warnings

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
# Silence every warning so the cell outputs stay readable.
# NOTE(review): this is a blanket filter; it also hides warnings from pandas,
# scikit-learn and XGBoost that may point at real data problems.
warnings.filterwarnings('ignore')

# Directory that holds the input CSV. Set the UKB_DATA_DIR environment variable
# to run the notebook somewhere other than the default cluster location.
# os.path.join(..., "") guarantees the trailing separator, because the load
# cell builds the file path with f"{data_path}<file name>".
data_path = os.path.join(
    os.environ.get("UKB_DATA_DIR") or "/orcd/pool/003/dbertsim_shared/ukb/",
    "",
)
RANDOM_STATE = 42         # seed shared by the train/test split and XGBoost
PREDICTION_HORIZON = 1.0  # years

Note: You have installed the 'manylinux2014' variant of XGBoost. Certain features such as GPU algorithms or federated learning are not available. To use these features, please upgrade to a recent Linux distro with glibc 2.28+, and install the 'manylinux_2_28' variant.


## Load Data

In [None]:
# Read the full cleaned table in one pass (low_memory=False) and report its shape.
csv_file = f"{data_path}blood_protein_cancers_clean.csv"
df = pd.read_csv(csv_file, low_memory=False)
n_rows, n_cols = df.shape
print(f"Loaded {n_rows} rows, {n_cols} columns")
df.head()

In [5]:
# Find diagnosis types (columns ending with _time_to_diagnosis)
TIME_SUFFIX = '_time_to_diagnosis'
time_cols = [c for c in df.columns if c.endswith(TIME_SUFFIX)]
# Strip only the trailing suffix. str.replace would also remove the same text
# elsewhere in a column name, and the helper functions rebuild the column as
# f"{diag_type}_time_to_diagnosis", so the name must round-trip exactly.
diag_types = [c[:-len(TIME_SUFFIX)] for c in time_cols]
print(f"Found {len(diag_types)} diagnosis types: {diag_types}")

Found 8 diagnosis types: ['bladder', 'breast', 'colorectal', 'liver', 'lung', 'pancreatic', 'prostate', 'cancer']


## Define Features

In [6]:
# Assemble the model inputs: olink protein columns, blood biomarker columns,
# and whichever of the listed demographic columns are present in the table.
def _columns_with_prefix(frame, prefix):
    """Return the column names of `frame` that start with `prefix`, in order."""
    return [name for name in frame.columns if name.startswith(prefix)]


olink_cols = _columns_with_prefix(df, 'olink_')
blood_cols = _columns_with_prefix(df, 'blood_')

demo_candidates = (
    'Age at recruitment',
    'Sex_male',
    'Body mass index (BMI)',
    'Systolic blood pressure, automated reading',
    'Diastolic blood pressure, automated reading',
    'Townsend deprivation index at recruitment',
)
# Demographic columns missing from the table are dropped silently.
available_columns = set(df.columns)
demo_cols = [name for name in demo_candidates if name in available_columns]

feature_cols = [*olink_cols, *blood_cols, *demo_cols]
print(f"Features: {len(olink_cols)} olink, {len(blood_cols)} blood, {len(demo_cols)} demo = {len(feature_cols)} total")

Features: 2923 olink, 61 blood, 6 demo = 2990 total


## Helper Functions

In [10]:
def get_labels(df, diag_type, horizon=1.0, min_lag=30/365.25):
    """Build binary labels for a diagnosis inside a time window after baseline.

    A row is labelled 1 when ``min_lag < time_to_diagnosis <= horizon`` and 0
    otherwise. Rows with a missing (NaN) time, rows at or below ``min_lag``
    and rows beyond ``horizon`` therefore all receive 0.

    Args:
        df: DataFrame expected to hold a ``<diag_type>_time_to_diagnosis`` column.
        diag_type: Diagnosis name used as the column prefix.
        horizon: Inclusive upper bound of the window, in the same unit as the
            time column (the defaults treat it as years).
        min_lag: Exclusive lower bound of the window. Defaults to 30 days
            expressed in years, matching the previously hard-coded value.

    Returns:
        An integer (0/1) Series aligned with ``df.index``, or ``None`` when the
        time column is absent.
    """
    time_col = f"{diag_type}_time_to_diagnosis"
    if time_col not in df.columns:
        return None
    time_to_dx = df[time_col]
    # NOTE(review): rows diagnosed in (0, min_lag] are labelled 0 here rather
    # than removed from the cohort -- confirm that is the intended design.
    in_window = (time_to_dx > min_lag) & (time_to_dx <= horizon)
    return in_window.astype(int)


def filter_already_diagnosed(df, diag_type):
    """Drop rows whose time to diagnosis is zero or negative.

    Rows with a strictly positive time, or with no recorded time (NaN), are
    kept and returned as a copy. If the ``<diag_type>_time_to_diagnosis``
    column does not exist, the input frame itself is returned unchanged.
    """
    column = f"{diag_type}_time_to_diagnosis"
    if column in df.columns:
        times = df[column]
        keep = times.gt(0) | times.isna()
        return df[keep].copy()
    return df

## Run XGBoost for Each Diagnosis Type

In [11]:
# Minimum number of positive cases required before a model is trained.
MIN_POSITIVES = 10

# One record per diagnosis type; consumed by the summary cell.
results = []

for diag_type in diag_types:
    print(f"\n{'='*50}")
    print(f"Processing: {diag_type}")
    print(f"{'='*50}")

    # Filter out already diagnosed patients, then label the remaining cohort.
    df_filtered = filter_already_diagnosed(df, diag_type)
    y = get_labels(df_filtered, diag_type, PREDICTION_HORIZON)

    n_samples = len(df_filtered)
    n_positive = int(y.sum()) if y is not None else 0

    # Decide whether to skip before building the feature matrix, so skipped
    # diagnosis types do not pay for selecting thousands of feature columns.
    if n_positive < MIN_POSITIVES:
        print(f"  Skipping: insufficient positive samples ({n_positive})")
        # Record the counts for skipped types too. This keeps 'n_positive'
        # an integer column in the results table instead of float-with-NaN.
        results.append({
            'diag_type': diag_type,
            'n_samples': n_samples,
            'n_positive': n_positive,
            'prevalence': n_positive / n_samples if n_samples else float('nan'),
            'status': 'skipped',
        })
        continue

    print(f"  Samples: {len(y)}, Positive: {y.sum()} ({y.mean()*100:.2f}%)")

    # Selecting a list of columns already yields a new frame and X is never
    # mutated, so no extra .copy() is needed.
    X = df_filtered[feature_cols]

    # 80:20 train/test split, stratified so both folds contain positives.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
    )
    print(f"  Train: {len(y_train)}, Test: {len(y_test)}")

    # Handle class imbalance: weight positives by the negative/positive ratio.
    n_neg = (y_train == 0).sum()
    n_pos = (y_train == 1).sum()
    scale_pos_weight = n_neg / n_pos if n_pos > 0 else 1.0

    # Vanilla XGBoost
    model = xgb.XGBClassifier(
        objective='binary:logistic',
        eval_metric='auc',
        tree_method='hist',
        enable_categorical=True,
        scale_pos_weight=scale_pos_weight,
        random_state=RANDOM_STATE,
        n_estimators=100,
        max_depth=6,
        learning_rate=0.1,
        verbosity=0
    )

    model.fit(X_train, y_train)

    # Evaluate on the held-out fold.
    # NOTE: the test fold holds only about 20% of the positives, so with few
    # positive cases the AUC from this single split has high variance.
    y_pred = model.predict_proba(X_test)[:, 1]
    test_auc = roc_auc_score(y_test, y_pred)

    print(f"  Test AUC: {test_auc:.4f}")

    results.append({
        'diag_type': diag_type,
        'n_samples': n_samples,
        'n_positive': n_positive,
        'prevalence': y.mean(),
        'test_auc': test_auc,
        'status': 'completed'
    })


Processing: bladder
  Skipping: insufficient positive samples (9)

Processing: breast
  Samples: 51989, Positive: 87 (0.17%)
  Train: 41591, Test: 10398
  Test AUC: 0.7118

Processing: colorectal
  Samples: 52775, Positive: 42 (0.08%)
  Train: 42220, Test: 10555
  Test AUC: 0.6603

Processing: liver
  Skipping: insufficient positive samples (4)

Processing: lung
  Samples: 52951, Positive: 30 (0.06%)
  Train: 42360, Test: 10591
  Test AUC: 0.9386

Processing: pancreatic
  Skipping: insufficient positive samples (9)

Processing: prostate
  Samples: 52648, Positive: 79 (0.15%)
  Train: 42118, Test: 10530
  Test AUC: 0.8631

Processing: cancer
  Samples: 48072, Positive: 526 (1.09%)
  Train: 38457, Test: 9615
  Test AUC: 0.5896


## Summary

In [12]:
results_df = pd.DataFrame(results)

# 'test_auc' only exists when at least one diagnosis type finished training;
# without this guard the filter/sort below raises KeyError in that case.
if 'test_auc' in results_df.columns:
    completed = (
        results_df[results_df['status'] == 'completed']
        .sort_values('test_auc', ascending=False)
        # Skipped records can leave NaN in this column, which makes pandas
        # store the counts as float ("30.0"); completed rows always have a
        # count, so cast back to int for display.
        .astype({'n_positive': int})
    )
else:
    completed = results_df.iloc[0:0]

print("\n" + "="*60)
print("SUMMARY - Test AUC by Diagnosis Type")
print("="*60)
if completed.empty:
    print("No diagnosis type completed training.")
else:
    print(completed[['diag_type', 'n_positive', 'prevalence', 'test_auc']].to_string(index=False))


SUMMARY - Test AUC by Diagnosis Type
 diag_type  n_positive  prevalence  test_auc
      lung        30.0    0.000567  0.938592
  prostate        79.0    0.001501  0.863069
    breast        87.0    0.001673  0.711758
colorectal        42.0    0.000796  0.660283
    cancer       526.0    0.010942  0.589605
