In [1]:
import numpy as np
import xgboost as xgb
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
import pandas as pd
models_dir = "output/"

Note: You have installed the 'manylinux2014' variant of XGBoost. Certain features such as GPU algorithms or federated learning are not available. To use these features, please upgrade to a recent Linux distro with glibc 2.28+, and install the 'manylinux_2_28' variant.


In [18]:
DIR = "/../../orcd/pool/003/dbertsim_shared/ukb/"
demo_embeddings = pd.read_csv(f"{DIR}demo_embeddings.csv")
embedding_cols = [col for col in demo_embeddings.columns if col.startswith("cn_")]
cancer_df = pd.read_csv(f"{DIR}ukb_cancer_test.csv")


In [29]:
y = ((cancer_df["breast_time_to_diagnosis"] > 0) & (cancer_df["breast_time_to_diagnosis"] <= 1)).astype(int)

In [30]:
len(y)

10599

In [31]:
sum(y)

17

In [52]:
def preprocess(df: pd.DataFrame, onehot = False):
    df['Ethnic background'] = df['Ethnic background'].astype("category")
    df['Smoking status'] = df['Smoking status'].astype("category")
    df['Alcohol intake frequency.'] = df['Alcohol intake frequency.'].astype("category")
    df['Medication for cholesterol, blood pressure or diabetes'] = df['Medication for cholesterol, blood pressure or diabetes'].astype('category')
    
    if onehot:
        for col in ['Illnesses of father', 'Illnesses of mother', 'Illnesses of siblings']:
            df[col] = df[col].str.replace(r'None of the above \(group 1\)', '', regex=True) \
            .str.replace(r'None of the above \(group 2\)', '', regex=True) \
            .str.replace(r'Do not know \(group 1\)', '', regex=True) \
            .str.replace(r'Do not know \(group 2\)', '', regex=True) \
            .str.replace(r'Prefer not to answer \(group 1\)', '', regex=True) \
            .str.replace(r'Prefer not to answer \(group 2\)', '', regex=True) \
            .str.strip()

            # One hot encoding
            illness_dummies = df[col].str.get_dummies(sep='|')
            illness_dummies.columns = col + '_' + illness_dummies.columns.str.strip()

            # Insert new columns in place of original 'illnesses'
            ill_idx = df.columns.get_loc(col)  # get index position
            df.drop(columns=[col], inplace=True)

            # Insert dummy columns at the same position
            for i, col in enumerate(illness_dummies.columns):
                df.insert(ill_idx + i, col, illness_dummies[col])
                
    return df

def get_X(df):
    feat_cols = list(df.columns[1:45]) + [c for c in df.columns if c.startswith(('blood', 'olink'))]
    return df.loc[:, feat_cols]

def train_xgboost(dtrain, dvalid, num_epochs=100):
    params = {
        "objective": "binary:logistic",
        "eval_metric": "auc",
        "tree_method": "hist",
        # add other params here
    }
    params = {
        # "objective": "multi:softprob",
        "objective": "binary:logistic",
        "eval_metric": "auc",
        "tree_method": "hist",  # Efficient for large datasets
        "max_depth": 10,
        "learning_rate": 0.1,
        "subsample": 0.8,
        "colsample_bytree": 0.8,
    }

    evals = [(dtrain, "train"), (dvalid, "valid")]

#     model = xgb.train(
#         params,
#         dtrain,
#         num_boost_round=num_epochs,
#         evals=evals,
#         early_stopping_rounds=20,   # uses validation to stop
#         verbose_eval=False
#     )
#     return model

def train_xgboost(
    dtrain, dvalid, num_epochs=5
):
    params = {
        # "objective": "multi:softprob",
        "objective": "binary:logistic",
        "eval_metric": "auc",
        "tree_method": "hist",  # Efficient for large datasets
        "max_depth": 10,
        "learning_rate": 0.1,
        "subsample": 0.8,
        "colsample_bytree": 0.8,
    }
    
    model = None
    evals = [(dtrain, "train"), (dvalid, "valid")]
    for epoch in range(num_epochs):
        print(f"Starting epoch {epoch + 1}/{num_epochs}...")

        # Train incrementally
        if model is None:
            model = xgb.train(
                params,
                dtrain,
                num_boost_round=200,
                evals=evals,
                early_stopping_rounds=20,
                verbose_eval=False
            )
        else:
            model = xgb.train(
                params,
                dtrain,
                num_boost_round=200,
                evals=evals,
                early_stopping_rounds=20,
                xgb_model=model,
                verbose_eval=False
            )
    print("Training complete!")
    return model

In [24]:
label_cols = [
    "cancer",
    "breast_cancer",
    "prostate_cancer",
    "lung_cancer",
    "colorectal_cancer",
    "bladder_cancer", 
    "pancreatic_cancer"
]

label_time_cols = [
    "cancer",
    "breast",
    "prostate",
    "lung",
    "colorectal",
    "bladder", 
    "pancreatic"
]

In [None]:
## Create train, validation, and test datasets for predicting current cancer
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit

def multilabel_stratified_split(
    df: pd.DataFrame,
    label_cols,
    test_size=0.4,
    random_state=42,
    time=0
):
    """
    Splits df into train/test so that each label in label_cols
    has (approximately) the same prevalence in both splits,
    accounting for multi-label rows.
    """
    # 1) Build the multi-label target matrix (n_samples x n_labels)
    # Y = df[label_cols].astype(int).to_numpy()
    if time > 0:
        new_df = df[df["cancer_time_to_diagnosis"] > 0].copy() # need to rethink this
        time_mask_df = pd.DataFrame(
            c: ((new_df[f"{c}_time_to_diagnosis"] > 0) & (new_df[f"{c}_time_to_diagnosis"] <= time)).astype(int)
            for c in label_time_cols
        )
        Y = time_mask_df.to_numpy()
    else:
        new_df = df
        Y = new_df[label_cols].astype(int).to_numpy()

    # 2) Set up the multi-label stratified splitter
    msss = MultilabelStratifiedShuffleSplit(
        n_splits=1, test_size=test_size, random_state=random_state
    )

    # 3) Run the split; indices refer to rows of df
    (train_idx, test_idx), = msss.split(new_df, Y)

    train_df = new_df.iloc[train_idx].copy()
    test_df  = new_df.iloc[test_idx].copy()

    return train_df, test_df

# Make the split
train_df, validtest_df = multilabel_stratified_split(df, label_cols, test_size=0.4, random_state=42)
valid_df, test_df = multilabel_stratified_split(validtest_df, label_cols, test_size=0.5, random_state=42)

# Sanity check: compare prevalences per label in full vs train vs test
def prevalence(table, cols):
    n = len(table)
    return pd.Series({c: table[c].mean() for c in cols}).to_frame("prevalence").assign(n=n)

summary = pd.concat(
    {
        "full": prevalence(df, label_cols),
        "train": prevalence(train_df, label_cols),
        "valid": prevalence(valid_df, label_cols),
        "test": prevalence(test_df, label_cols),
    },
    axis=1,
)
print(summary)

# train_df.to_csv("data/cancer_now_train.csv", index=False)
# valid_df.to_csv("data/cancer_now_valid.csv", index=False)
# test_df.to_csv("data/cancer_now_test.csv", index=False)

In [22]:
DIR = "/../../orcd/pool/003/dbertsim_shared/ukb/"
train_df = pd.read_csv(f'{DIR}ukb_cancer_train.csv')
valid_df = pd.read_csv(f'{DIR}ukb_cancer_valid.csv')
test_df = pd.read_csv(f'{DIR}ukb_cancer_test.csv')

  train_df = pd.read_csv(f'{DIR}ukb_cancer_train.csv')
  valid_df = pd.read_csv(f'{DIR}ukb_cancer_valid.csv')
  test_df = pd.read_csv(f'{DIR}ukb_cancer_test.csv')


## Each cancer: current diagnosis

In [45]:
def filter_cohort_by_time(df, c, start_year):
    df = df.loc[(df[f'{c}_time_to_diagnosis']>start_year) | (df[f'{c}_time_to_diagnosis'].isna())]
    return df

def get_y(df, c):
    return (df[f"{c}_time_to_diagnosis"] > 0) & (df[f"{c}_time_to_diagnosis"] <= 1)

## Predict 0-1 diagnosis
# train_df = preprocess(train_df, onehot=True)
# valid_df = preprocess(valid_df, onehot=True)
# test_df = preprocess(test_df, onehot=True)

for c in label_time_cols:
    print(c)
    
    filtered_train_df = filter_cohort_by_time(train_df, c, 0)
    filtered_valid_df = filter_cohort_by_time(valid_df, c, 0)
    filtered_test_df = filter_cohort_by_time(test_df, c, 0)
    
    if c == "breast":
        filtered_train_df = filtered_train_df.loc[filtered_train_df['Sex_male'] == 0]
        filtered_valid_df = filtered_valid_df.loc[filtered_valid_df['Sex_male'] == 0]
        filtered_test_df = filtered_test_df.loc[filtered_test_df['Sex_male'] == 0]
    elif c == "prostate":
        filtered_train_df = filtered_train_df.loc[filtered_train_df['Sex_male'] == 1]
        filtered_valid_df = filtered_valid_df.loc[filtered_valid_df['Sex_male'] == 1]
        filtered_test_df = filtered_test_df.loc[filtered_test_df['Sex_male'] == 1]

    X_train = get_X(filtered_train_df)
    X_valid = get_X(filtered_valid_df)
    X_test = get_X(filtered_test_df)

    y_train = get_y(filtered_train_df, c)
    y_valid = get_y(filtered_valid_df, c)
    y_test = get_y(filtered_test_df, c)

    dtrain = xgb.DMatrix(data=X_train, label=y_train, enable_categorical=True)
    dvalid = xgb.DMatrix(data=X_valid, label=y_valid, enable_categorical=True)
    dtest = xgb.DMatrix(data=X_test, label=y_test, enable_categorical=True)

    model = train_xgboost(dtrain, dvalid)
    
    train_auc = roc_auc_score(y_train, model.predict(dtrain))
    valid_auc = roc_auc_score(y_valid, model.predict(dvalid))
    test_auc = roc_auc_score(y_test, model.predict(dtest))
    print((sum(y_train) + sum(y_test) + sum(y_valid))/(len(y_train) + len(y_test) + len(y_valid)))
    print(f"Train AUC: {round(train_auc,3)}")
    print(f"Validation AUC: {round(valid_auc,3)}")
    print(f"Test AUC: {round(test_auc,3)}")
    print("---------------------------------\n")
    model.save_model(f"{models_dir}/xgb_model_0-1_{c}.json")
    

cancer
0.011669995007488767
Train AUC: 1.0
Validation AUC: 0.579
Test AUC: 0.606
---------------------------------

breast
0.003156176310538727
Train AUC: 1.0
Validation AUC: 0.572
Test AUC: 0.507
---------------------------------

prostate
0.003280594659690212
Train AUC: 1.0
Validation AUC: 0.718
Test AUC: 0.808
---------------------------------

lung
0.0005665615380257219
Train AUC: 1.0
Validation AUC: 0.8
Test AUC: 0.757
---------------------------------

colorectal
0.0007958313595452393
Train AUC: 1.0
Validation AUC: 0.648
Test AUC: 0.702
---------------------------------

bladder
0.00016997167138810198
Train AUC: 1.0
Validation AUC: 0.585
Test AUC: 0.686
---------------------------------

pancreatic
0.00016985298280710362
Train AUC: 1.0
Validation AUC: 0.165
Test AUC: 0.312
---------------------------------



In [47]:
def filter_cohort_by_time(df, c, start_year):
    df = df.loc[(df[f'{c}_time_to_diagnosis']>start_year) | (df[f'{c}_time_to_diagnosis'].isna())]
    return df

def get_y(df, c):
    return (df[f"{c}_time_to_diagnosis"] > 0) & (df[f"{c}_time_to_diagnosis"] <= 1)

## Predict 0-1 diagnosis
# train_df = preprocess(train_df, onehot=True)
# valid_df = preprocess(valid_df, onehot=True)
# test_df = preprocess(test_df, onehot=True)

for c in label_time_cols:
    print(c)
    
    filtered_train_df = filter_cohort_by_time(train_df, c, 0)
    filtered_valid_df = filter_cohort_by_time(valid_df, c, 0)
    filtered_test_df = filter_cohort_by_time(test_df, c, 0)
    
    if c == "breast":
        filtered_train_df = filtered_train_df.loc[filtered_train_df['Sex_male'] == 0]
        filtered_valid_df = filtered_valid_df.loc[filtered_valid_df['Sex_male'] == 0]
        filtered_test_df = filtered_test_df.loc[filtered_test_df['Sex_male'] == 0]
    elif c == "prostate":
        filtered_train_df = filtered_train_df.loc[filtered_train_df['Sex_male'] == 1]
        filtered_valid_df = filtered_valid_df.loc[filtered_valid_df['Sex_male'] == 1]
        filtered_test_df = filtered_test_df.loc[filtered_test_df['Sex_male'] == 1]

    X_train = get_X(filtered_train_df)
    X_valid = get_X(filtered_valid_df)
    X_test = get_X(filtered_test_df)

    y_train = get_y(filtered_train_df, c)
    y_valid = get_y(filtered_valid_df, c)
    y_test = get_y(filtered_test_df, c)

    dtrain = xgb.DMatrix(data=X_train, label=y_train, enable_categorical=True)
    dvalid = xgb.DMatrix(data=X_valid, label=y_valid, enable_categorical=True)
    dtest = xgb.DMatrix(data=X_test, label=y_test, enable_categorical=True)

    model = train_xgboost(dtrain, dvalid)
    
    train_auc = roc_auc_score(y_train, model.predict(dtrain))
    valid_auc = roc_auc_score(y_valid, model.predict(dvalid))
    test_auc = roc_auc_score(y_test, model.predict(dtest))
    print((sum(y_train) + sum(y_test) + sum(y_valid))/(len(y_train) + len(y_test) + len(y_valid)))
    print(f"Train AUC: {round(train_auc,3)}")
    print(f"Validation AUC: {round(valid_auc,3)}")
    print(f"Test AUC: {round(test_auc,3)}")
    print("---------------------------------\n")
    model.save_model(f"{models_dir}/xgb_model_0-1_{c}.json")
    

cancer
0.011669995007488767
Train AUC: 1.0
Validation AUC: 0.609
Test AUC: 0.636
---------------------------------

breast
0.003156176310538727
Train AUC: 1.0
Validation AUC: 0.591
Test AUC: 0.522
---------------------------------

prostate
0.003280594659690212
Train AUC: 1.0
Validation AUC: 0.828
Test AUC: 0.908
---------------------------------

lung
0.0005665615380257219
Train AUC: 1.0
Validation AUC: 0.714
Test AUC: 0.651
---------------------------------

colorectal
0.0007958313595452393
Train AUC: 1.0
Validation AUC: 0.58
Test AUC: 0.881
---------------------------------

bladder
0.00016997167138810198
Train AUC: 1.0
Validation AUC: 0.501
Test AUC: 0.52
---------------------------------

pancreatic
0.00016985298280710362
Train AUC: 1.0
Validation AUC: 0.53
Test AUC: 0.904
---------------------------------



In [None]:
def filter_cohort_by_time(df, c, start_year):
    df = df.loc[(df[f'{c}_time_to_diagnosis']>start_year) | (df[f'{c}_time_to_diagnosis'].isna())]
    return df

def get_y(df, c):
    return (df[f"{c}_time_to_diagnosis"] > 0) & (df[f"{c}_time_to_diagnosis"] <= 1)

## Predict 0-1 diagnosis
# train_df = preprocess(train_df, onehot=True)
# valid_df = preprocess(valid_df, onehot=True)
# test_df = preprocess(test_df, onehot=True)

for c in label_time_cols:
    print(c)
    
    filtered_train_df = filter_cohort_by_time(train_df, c, 0)
    filtered_valid_df = filter_cohort_by_time(valid_df, c, 0)
    filtered_test_df = filter_cohort_by_time(test_df, c, 0)
    
    if c == "breast":
        filtered_train_df = filtered_train_df.loc[filtered_train_df['Sex_male'] == 0]
        filtered_valid_df = filtered_valid_df.loc[filtered_valid_df['Sex_male'] == 0]
        filtered_test_df = filtered_test_df.loc[filtered_test_df['Sex_male'] == 0]
    elif c == "prostate":
        filtered_train_df = filtered_train_df.loc[filtered_train_df['Sex_male'] == 1]
        filtered_valid_df = filtered_valid_df.loc[filtered_valid_df['Sex_male'] == 1]
        filtered_test_df = filtered_test_df.loc[filtered_test_df['Sex_male'] == 1]

    X_train = get_X(filtered_train_df)
    X_valid = get_X(filtered_valid_df)
    X_test = get_X(filtered_test_df)

    y_train = get_y(filtered_train_df, c)
    y_valid = get_y(filtered_valid_df, c)
    y_test = get_y(filtered_test_df, c)

    dtrain = xgb.DMatrix(data=X_train, label=y_train, enable_categorical=True)
    dvalid = xgb.DMatrix(data=X_valid, label=y_valid, enable_categorical=True)
    dtest = xgb.DMatrix(data=X_test, label=y_test, enable_categorical=True)

    model = train_xgboost(dtrain, dvalid)
    
    train_auc = roc_auc_score(y_train, model.predict(dtrain))
    valid_auc = roc_auc_score(y_valid, model.predict(dvalid))
    test_auc = roc_auc_score(y_test, model.predict(dtest))
    print((sum(y_train) + sum(y_test) + sum(y_valid))/(len(y_train) + len(y_test) + len(y_valid)))
    print(f"Train AUC: {round(train_auc,3)}")
    print(f"Validation AUC: {round(valid_auc,3)}")
    print(f"Test AUC: {round(test_auc,3)}")
    print("---------------------------------\n")
    

cancer
Starting epoch 1/5...
Starting epoch 2/5...
Starting epoch 3/5...
Starting epoch 4/5...
Starting epoch 5/5...
Training complete!
0.011669995007488767
Train AUC: 1.0
Validation AUC: 0.605
Test AUC: 0.644
---------------------------------

breast
Starting epoch 1/5...
Starting epoch 2/5...
Starting epoch 3/5...
Starting epoch 4/5...
Starting epoch 5/5...
Training complete!
0.003156176310538727
Train AUC: 1.0
Validation AUC: 0.611
Test AUC: 0.512
---------------------------------

prostate
Starting epoch 1/5...
Starting epoch 2/5...
Starting epoch 3/5...
Starting epoch 4/5...
Starting epoch 5/5...
Training complete!
0.003280594659690212
Train AUC: 1.0
Validation AUC: 0.85
Test AUC: 0.874
---------------------------------

lung
Starting epoch 1/5...
Starting epoch 2/5...
Starting epoch 3/5...
Starting epoch 4/5...
Starting epoch 5/5...
Training complete!
0.0005665615380257219
Train AUC: 1.0
Validation AUC: 0.801
Test AUC: 0.606
---------------------------------

colorectal
Starting 

In [38]:
(sum(y_train) + sum(y_test) + sum(y_valid))/(len(y_train) + len(y_test) + len(y_valid))

87

In [39]:
len(y_train) + len(y_test) + len(y_valid)

27565

In [40]:
87/276565

0.0003145734275848354

In [12]:
## Predict current diagnosis
train_df = pd.read_csv('data/cancer_now_train.csv')
# valid_df = pd.read_csv('data/cancer_now_valid.csv')
test_df = pd.read_csv('data/cancer_now_test.csv')
NUM_EPOCHS = 3

preprocess(train_df)
preprocess(test_df)

X_train = get_X(train_df)
X_test = get_X(test_df)

for k in label_cols:
    print(k)
    y_train = train_df[k]
    model = train_xgboost(X_train, y_train, num_epochs = NUM_EPOCHS)

    dtrain = xgb.DMatrix(data=X_train, label=y_train, enable_categorical=True)
    train_auc = roc_auc_score(y_train, model.predict(dtrain))
    
    y_test = test_df[k]
    model = xgb.XGBClassifier()
    model.load_model(f"{models_dir}/xgb_model_{k}.json")
    y_test = test_df[k]

    dtest = xgb.DMatrix(data=X_test, label=y_test, enable_categorical=True)
    y_pred_proba = model.predict(dtest)
    test_auc = roc_auc_score(y_test, y_pred_proba)
    print(f"Train AUC: {round(train_auc,3)}")
    print(f"Test AUC: {round(test_auc,3)}")
    print("---------------------------------\n")
    # model.save_model(f"{models_dir}/xgb_model_{k}.json")
    

cancer
Test AUC: 0.67
---------------------------------

breast_cancer
Test AUC: 0.876
---------------------------------

prostate_cancer
Test AUC: 0.924
---------------------------------

lung_cancer
Test AUC: 0.616
---------------------------------

colorectal_cancer
Test AUC: 0.722
---------------------------------

bladder_cancer
Test AUC: 0.609
---------------------------------

pancreatic_cancer
Test AUC: 0.765
---------------------------------



In [5]:
test_df = pd.read_csv('data/cancer_now_test.csv')
models = []
for k in label_cols:
    model = xgb.XGBClassifier()
    model.load_model(f"{models_dir}/xgb_model_{k}.json")
    models.append(model)

model_names = [f"model_{k}" for k in label_cols]  # names for the columns

# --- 1) Collect each model’s predicted probabilities on the *training* set ---
#    We’ll use these to *train* a policy tree that chooses among the 7 models
#    (or optionally defers) *based on patient features X*.
def proba_df(models, X, names):
    # Each XGB model’s predict_proba returns shape (n, 2); we keep P(y=1)
    cols = {}
    for m, name in zip(models, names):
        p = m.predict_proba(X)[:, 1]
        cols[name] = p
    return pd.DataFrame(cols, index=getattr(X, "index", None))

preprocess(test_df)
X_test = get_X(test_df)
pred_proba = proba_df(models, X_test, model_names)
pred_proba.to_csv("data/xgb_test_pred_proba.csv",index = False)


## Each cancer: Predict within 5 years

In [23]:
## Predict diagnosis within 5 years

for cancer in ["breast", "prostate", "lung", "colorectal", "bladder", "pancreatic", "liver"]:
    print(cancer)
    y = (df[f"{cancer}_time_to_diagnosis"] <= 5).astype(int)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model = train_xgboost(X_train, y_train, num_epochs = 3)

    dtrain = xgb.DMatrix(data=X_train, label=y_train)
    train_auc = roc_auc_score(y_train, model.predict(dtrain))

    dtest = xgb.DMatrix(data=X_test, label=y_test)
    y_pred_proba = model.predict(dtest)
    test_auc = roc_auc_score(y_test, y_pred_proba)
    print(f"Train AUC: {round(train_auc,3)}")
    print(f"Test AUC: {round(test_auc,3)}")
    print("---------------------------------\n")

breast
Starting epoch 1/3...
Starting epoch 2/3...
Starting epoch 3/3...
Training complete!
Train AUC: 1.0
Test AUC: 0.837
---------------------------------

prostate
Starting epoch 1/3...
Starting epoch 2/3...
Starting epoch 3/3...
Training complete!
Train AUC: 1.0
Test AUC: 0.914
---------------------------------

lung
Starting epoch 1/3...
Starting epoch 2/3...
Starting epoch 3/3...
Training complete!
Train AUC: 1.0
Test AUC: 0.85
---------------------------------

colorectal
Starting epoch 1/3...
Starting epoch 2/3...
Starting epoch 3/3...
Training complete!
Train AUC: 1.0
Test AUC: 0.664
---------------------------------

bladder
Starting epoch 1/3...
Starting epoch 2/3...
Starting epoch 3/3...
Training complete!
Train AUC: 1.0
Test AUC: 0.673
---------------------------------

pancreatic
Starting epoch 1/3...
Starting epoch 2/3...
Starting epoch 3/3...
Training complete!
Train AUC: 1.0
Test AUC: 0.693
---------------------------------

liver
Starting epoch 1/3...
Starting epoch 2

In [None]:
## Excluding patients already positive at time of diagnosis

for cancer in ["breast", "prostate", "lung", "colorectal", "bladder", "pancreatic", "liver"]:
    print(cancer)
    df_sub = df.loc[df[f"{cancer}_cancer"] == 0]
    X_sub = df_sub.drop(columns = ['eid', 'assessment_date', 'bladder_cancer', 'breast_cancer',
       'colorectal_cancer', 'liver_cancer', 'lung_cancer', 'pancreatic_cancer',
       'prostate_cancer', 'bladder_time_to_diagnosis',
       'breast_time_to_diagnosis', 'colorectal_time_to_diagnosis',
       'liver_time_to_diagnosis', 'lung_time_to_diagnosis',
       'pancreatic_time_to_diagnosis', 'prostate_time_to_diagnosis'])
    y_sub = (df_sub[f"{cancer}_time_to_diagnosis"] <= 5).astype(int)
    
    X_train, X_test, y_train, y_test = train_test_split(X_sub, y_sub, test_size=0.2, random_state=42)
    model = train_xgboost(X_train, y_train, num_epochs = 3)

    dtrain = xgb.DMatrix(data=X_train, label=y_train)
    train_auc = roc_auc_score(y_train, model.predict(dtrain))

    dtest = xgb.DMatrix(data=X_test, label=y_test)
    y_pred_proba = model.predict(dtest)
    test_auc = roc_auc_score(y_test, y_pred_proba)
    print(f"Train AUC: {round(train_auc,3)}")
    print(f"Test AUC: {round(test_auc,3)}")
    print("---------------------------------")

breast
Starting epoch 1/3...
Starting epoch 2/3...
Starting epoch 3/3...
Training complete!
Train AUC: 1.0
Test AUC: 0.744
---------------------------------
prostate
Starting epoch 1/3...
Starting epoch 2/3...
Starting epoch 3/3...
Training complete!
Train AUC: 1.0
Test AUC: 0.931
---------------------------------
lung
Starting epoch 1/3...
Starting epoch 2/3...
Starting epoch 3/3...
Training complete!
Train AUC: 1.0
Test AUC: 0.869
---------------------------------
colorectal
Starting epoch 1/3...


## Any cancer: current diagnosis

In [13]:
# y = df["cancer"] == 1
y = df["cancer_time_to_diagnosis"] <= 2
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = train_xgboost(X_train, y_train, num_epochs = 3)

dtrain = xgb.DMatrix(data=X_train, label=y_train, enable_categorical=True)
train_auc = roc_auc_score(y_train, model.predict(dtrain))

dtest = xgb.DMatrix(data=X_test, label=y_test, enable_categorical=True)
y_pred_proba = model.predict(dtest)
test_auc = roc_auc_score(y_test, y_pred_proba)
print(f"Train AUC: {round(train_auc,3)}")
print(f"Test AUC: {round(test_auc,3)}")

Starting epoch 1/3...
Starting epoch 2/3...
Starting epoch 3/3...
Training complete!
Train AUC: 1.0
Test AUC: 0.679


In [29]:
fmap = {f"f{i}": col for i, col in enumerate(X_train.columns)}

raw_imp = model.get_score(importance_type='total_gain')  # dict like {'f12': 1.23, ...}
imp_series = (
    pd.Series({fmap.get(k, k): v for k, v in raw_imp.items()})
      .reindex(X_train.columns, fill_value=0.0)  # keep order; zero for unused
)
top_feats = imp_series.sort_values(ascending=False).head(100).index.tolist()

with open('output/xgb_breast_cancer_top_100_features.txt', 'w') as f:
    for feat in top_feats:
        f.write(f"{feat}\n")

In [75]:
models_dir = "output/"
model.save_model(f"{models_dir}/xgb_model_cancer.json")

### Feature Selection

In [None]:
from sklearn.feature_selection import RFE
from xgboost import XGBClassifier

# X_train_rfe = X_train.drop(columns = ['Ethnic background', 'Smoking status', 'Alcohol intake frequency.', 'Medication for cholesterol, blood pressure or diabetes'])

# model = XGBClassifier(random_state=42)
# rfe = RFE(estimator=model, n_features_to_select=300)
# rfe.fit(X_train_rfe, y_train)
# model_selected = XGBClassifier(random_state=42)
# model_selected.fit(X_train[:, rfe.support_], y_train)[:,1]
model_selected.predict_proba(X_test)

roc_auc_score(y_test, y_pred_proba)

In [None]:
rfe.support_

In [77]:
# --- 1) Get top-300 important features from the trained model ---
# Map f0,f1,... back to real column names
fmap = {f"f{i}": col for i, col in enumerate(X_train.columns)}

# Use 'total_gain' (more informative than count-only 'weight')
raw_imp = model.get_score(importance_type='total_gain')  # dict like {'f12': 1.23, ...}
imp_series = (
    pd.Series({fmap.get(k, k): v for k, v in raw_imp.items()})
      .reindex(X_train.columns, fill_value=0.0)  # keep order; zero for unused
)

top_k = min(300, X_train.shape[1])
top_feats = imp_series.sort_values(ascending=False).head(top_k).index.tolist()
print(f"Selected top {len(top_feats)} features.")

# --- 2) Subset data to top features ---
X_train_top = X_train[top_feats].copy()
X_test_top  = X_test[top_feats].copy()

# --- 3) Retrain and evaluate on top features ---
model_top = train_xgboost(X_train_top, y_train, num_epochs=3)

dtrain_top = xgb.DMatrix(data=X_train_top, label=y_train, enable_categorical=True)
train_auc_top = roc_auc_score(y_train, model_top.predict(dtrain_top))

dtest_top = xgb.DMatrix(data=X_test_top, label=y_test, enable_categorical=True)
y_pred_proba_top = model_top.predict(dtest_top)
test_auc_top = roc_auc_score(y_test, y_pred_proba_top)

model_top.save_model(f"{models_dir}/xgb_model_cancer_top_300.json")

print(f"[Top-300] Train AUC: {train_auc_top:.3f} | Test AUC: {test_auc_top:.3f}")

[Top-300] Train AUC: 1.000 | Test AUC: 0.675


In [84]:
# --- 1) Get top-30 important features from the trained model ---

# Use 'total_gain' (more informative than count-only 'weight')
raw_imp = model_top.get_score(importance_type='total_gain')  # dict like {'f12': 1.23, ...}
imp_series = (
    pd.Series({fmap.get(k, k): v for k, v in raw_imp.items()})
      .reindex(X_train.columns, fill_value=0.0)  # keep order; zero for unused
)

top_k = min(30, X_train.shape[1])
top_feats = imp_series.sort_values(ascending=False).head(top_k).index.tolist()

# --- 2) Subset data to top features ---
X_train_top = X_train[top_feats].copy()
X_test_top  = X_test[top_feats].copy()

# --- 3) Retrain and evaluate on top features ---
model_top = train_xgboost(X_train_top, y_train, num_epochs=3)

dtrain_top = xgb.DMatrix(data=X_train_top, label=y_train, enable_categorical=True)
train_auc_top = roc_auc_score(y_train, model_top.predict(dtrain_top))

dtest_top = xgb.DMatrix(data=X_test_top, label=y_test, enable_categorical=True)
y_pred_proba_top = model_top.predict(dtest_top)
test_auc_top = roc_auc_score(y_test, y_pred_proba_top)

print(f"[Top-30] Train AUC: {train_auc_top:.3f} | Test AUC: {test_auc_top:.3f}")

Starting epoch 1/3...
Starting epoch 2/3...
Starting epoch 3/3...
Training complete!
[Top-30] Train AUC: 1.000 | Test AUC: 0.648


In [86]:
sum(y)

4958

## Any cancer: within 5 years

In [None]:
df_sub = df.loc[(df["breast_cancer"] == 0) & (df["prostate_cancer"] == 0) & (df["lung_cancer"] == 0) \
            & (df["colorectal_cancer"] == 0) & (df["bladder_cancer"] == 0) \
            & (df["pancreatic_cancer"] == 0) & (df["liver_cancer"] == 0)]
X_sub = df_sub.drop(columns = ['eid', 'assessment_date', 'bladder_cancer', 'breast_cancer',
   'colorectal_cancer', 'liver_cancer', 'lung_cancer', 'pancreatic_cancer',
   'prostate_cancer', 'bladder_time_to_diagnosis',
   'breast_time_to_diagnosis', 'colorectal_time_to_diagnosis',
   'liver_time_to_diagnosis', 'lung_time_to_diagnosis',
   'pancreatic_time_to_diagnosis', 'prostate_time_to_diagnosis'])
y_sub = (df_sub[f"breast_time_to_diagnosis"] <= 5) | (df_sub[f"prostate_time_to_diagnosis"] <= 5) \
            | (df_sub[f"lung_time_to_diagnosis"] <= 5) | (df_sub[f"colorectal_time_to_diagnosis"] <= 5) \
            | (df_sub[f"bladder_time_to_diagnosis"] <= 5) | (df_sub[f"pancreatic_time_to_diagnosis"] <= 5) \
            | (df_sub[f"liver_time_to_diagnosis"] <= 5)

X_train, X_test, y_train, y_test = train_test_split(X_sub, y_sub, test_size=0.2, random_state=42)
model = train_xgboost(X_train, y_train, num_epochs = 3)

dtrain = xgb.DMatrix(data=X_train, label=y_train)
train_auc = roc_auc_score(y_train, model.predict(dtrain))

dtest = xgb.DMatrix(data=X_test, label=y_test)
y_pred_proba = model.predict(dtest)
test_auc = roc_auc_score(y_test, y_pred_proba)
print(f"Train AUC: {round(train_auc,3)}")
print(f"Test AUC: {round(test_auc,3)}")