In [51]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, GroupKFold
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report, f1_score
from sklearn.utils import class_weight
from datetime import datetime
import time
import sys
import gc
import os
import psutil
import signal
import re
import xgboost as xgb

# Cancer outcome modelled by this notebook; it is interpolated into column and
# file names further down (e.g. f"{OUTCOME}_cancer").
OUTCOME = "breast"

# Every run writes into its own directory, named after the start time (to the second).
RESULTS_DIR = f'results/{datetime.now().strftime("%d-%m-%Y-%H-%M-%S")}/'
# exist_ok=True replaces the exists()-then-makedirs() pair, which could raise if
# the directory appeared between the check and the creation.
os.makedirs(RESULTS_DIR, exist_ok=True)

def get_X(df):
    """Return the feature columns of ``df``.

    A column counts as a feature when its name starts with ``'c18'``. The
    selected columns keep the order they have in ``df.columns`` and all rows
    are returned.
    """
    selected = []
    for name in df.columns:
        if name.startswith('c18'):
            selected.append(name)
    return df.loc[:, selected]

def get_y(df):
    """Return a boolean label per row: True where the outcome flag equals 1.

    The flag column is ``"<OUTCOME>_cancer"``, with ``OUTCOME`` read from the
    module-level constant at call time. A missing column raises ``KeyError``
    from the DataFrame lookup.
    """
    outcome_column = f"{OUTCOME}_cancer"
    is_case = df[outcome_column] == 1
    return is_case

def train_xgboost(
    X_train, y_train, X_valid, y_valid, num_epochs=5
):
    """Fit a binary-logistic XGBoost booster while monitoring a validation set.

    Training runs ``num_epochs`` passes. Each pass calls ``xgb.train`` for up to
    200 boosting rounds with ``early_stopping_rounds=20``; every pass after the
    first continues from the previous booster via ``xgb_model``.

    Parameters
    ----------
    X_train, y_train : training features and labels, passed to ``xgb.DMatrix``.
    X_valid, y_valid : validation features and labels, passed to ``xgb.DMatrix``
        and listed last in ``evals``.
    num_epochs : int, default 5
        Number of ``xgb.train`` passes. With 0 passes the returned model is None.

    Returns
    -------
    (model, params)
        The booster from the last pass and the parameter dict used to train it.
        Callers in this notebook unpack two values, so the pair is returned
        rather than the bare booster.
    """
    dtrain = xgb.DMatrix(data=X_train, label=y_train, enable_categorical=True)
    dvalid = xgb.DMatrix(data=X_valid, label=y_valid, enable_categorical=True)
    params = {
        "objective": "binary:logistic",
        "eval_metric": "auc",
        "tree_method": "hist",  # Efficient for large datasets
        "max_depth": 10,
        "learning_rate": 0.1,
        "subsample": 0.8,
        "colsample_bytree": 0.8,
    }

    model = None
    evals = [(dtrain, "train"), (dvalid, "valid")]
    for epoch in range(num_epochs):
        print(f"Starting epoch {epoch + 1}/{num_epochs}...")

        # xgb_model=None (first pass) starts from scratch; afterwards training
        # resumes from the booster produced by the previous pass.
        model = xgb.train(
            params,
            dtrain,
            num_boost_round=200,
            evals=evals,
            early_stopping_rounds=20,
            xgb_model=model,
            verbose_eval=False
        )
    print("Training complete!")
    return model, params



def model_report(title=None):
    """Train the XGBoost model on the notebook's splits and write an evaluation report.

    Reads the module-level DataFrames ``train``, ``valid`` and ``test``, derives
    features with ``get_X`` and labels with ``get_y``, trains via
    ``train_xgboost`` and scores the test split.

    Side effects
    ------------
    * saves the booster to ``output/xgb_ch18_<OUTCOME>.json``;
    * writes ``<stem>.txt`` (the report), ``<stem>_feature_importances.csv`` and
      four ``.npy`` arrays (predicted probabilities and true labels for the
      train and test splits) into ``RESULTS_DIR``.

    All artefacts go to ``RESULTS_DIR`` because it is the only output directory
    defined alongside this function.

    Parameters
    ----------
    title : str, optional
        Report heading and stem of the output file names. When omitted the
        heading is "Experiment Report" and the stem is ``xgb_ch18_<OUTCOME>``.

    Returns
    -------
    list
        ``[test AUC, seconds spent training and scoring the test split]``.
    """
    file_stem = title if title else f"xgb_ch18_{OUTCOME}"

    train_X = get_X(train)
    valid_X = get_X(valid)
    test_X = get_X(test)

    train_y = get_y(train)
    valid_y = get_y(valid)
    # Labels must come from the test split; they were previously taken from `valid`.
    test_y = get_y(test)

    # Train the models
    start = time.time()

    print("Doing xgb")
    fitted = train_xgboost(train_X, train_y, valid_X, valid_y)
    # Accept either a bare booster or a (booster, params) pair.
    model = fitted[0] if isinstance(fitted, tuple) else fitted

    dtest = xgb.DMatrix(data=test_X, label=test_y, enable_categorical=True)
    # binary:logistic yields one probability per row, so y_prob is 1-D.
    y_prob = model.predict(dtest)
    y_pred = (y_prob > 0.5).astype(int)

    # A Booster has no `feature_importances_`; get_score() only lists features
    # that were used in a split, so the rest are filled in with 0.
    gain_by_feature = model.get_score(importance_type="gain")
    importances = (
        pd.Series(gain_by_feature, dtype=float)
        .reindex(train_X.columns, fill_value=0.0)
        .sort_values(ascending=False)
    )

    elapsed = time.time() - start

    # Train-split predictions are only needed for the saved arrays, so they are
    # computed after the timer stops.
    dtrain = xgb.DMatrix(data=train_X, label=train_y, enable_categorical=True)
    y_prob_train = model.predict(dtrain)

    test_auc = roc_auc_score(test_y, y_prob)

    os.makedirs("output", exist_ok=True)
    os.makedirs(RESULTS_DIR, exist_ok=True)
    model.save_model(f"output/xgb_ch18_{OUTCOME}.json")

    # Write straight to the file instead of swapping sys.stdout, so an exception
    # mid-report can neither leave stdout redirected nor leak the file handle.
    with open(f'{RESULTS_DIR}{file_stem}.txt', 'w') as report:
        print(f'# {title if title else "Experiment Report"} #', file=report)
        print(f'Report generated at {datetime.now().strftime("%d/%m/%Y %H:%M:%S")}', file=report)
        print('\n\n', file=report)

        print('## Metadata', file=report)
        print(f'Total time elapsed: {elapsed : .2f} sec', file=report)
        print(f'Train rows: {len(train_X)}', file=report)
        print(f'Validation rows: {len(valid_X)}', file=report)
        print(f'Test rows: {len(test_X)}', file=report)
        print(f'Total rows: {len(train_X) + len(valid_X) + len(test_X)}', file=report)
        print('\n\n', file=report)

        print('## Metrics', file=report)
        print(f'AUC: {test_auc:.4f}', file=report)
        print(f'Confusion matrix:\n{confusion_matrix(test_y, y_pred)}', file=report)
        print(f'Classification report:\n{classification_report(test_y, y_pred)}', file=report)
        print('\n\n', file=report)

        print('## Importances', file=report)
        print(f'\nTop:\n{importances.head(20)}', file=report)
        print(f'\nBottom:\n{importances.tail(20)}', file=report)

    importances.to_csv(f'{RESULTS_DIR}{file_stem}_feature_importances.csv', header=True)

    # Save predicted probabilities
    np.save(f'{RESULTS_DIR}{file_stem}_y_prob_test', y_prob)
    np.save(f'{RESULTS_DIR}{file_stem}_y_prob_train', y_prob_train)
    # Save true
    np.save(f'{RESULTS_DIR}{file_stem}_y_true_test', np.asarray(test_y))
    np.save(f'{RESULTS_DIR}{file_stem}_y_true_train', np.asarray(train_y))

    return [test_auc, elapsed]



In [52]:
# Run the full train / evaluate / report pipeline with no arguments.
# The output recorded below shows this run stopped with KeyError: 'breast_cancer'.
# NOTE(review): presumably the loaded frame has no 'breast_cancer' column - confirm
# against the CSV before re-running.
model_report()

KeyError: 'breast_cancer'

In [67]:
def train_xgboost(
    X_train, y_train, num_epochs=5
):
    """Fit a binary-logistic XGBoost booster on the training data only.

    No validation set is passed to ``xgb.train``. Training runs ``num_epochs``
    passes of 200 boosting rounds each; every pass after the first resumes from
    the booster produced by the previous pass.

    Returns
    -------
    (model, params)
        The booster from the last pass (None when ``num_epochs`` is 0) and the
        parameter dict it was trained with.
    """
    dtrain = xgb.DMatrix(data=X_train, label=y_train, enable_categorical=True)
    params = dict(
        objective="binary:logistic",
        eval_metric="auc",
        tree_method="hist",  # Efficient for large datasets
        max_depth=10,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
    )

    booster = None
    for epoch in range(num_epochs):
        print(f"Starting epoch {epoch + 1}/{num_epochs}...")
        # On the first pass `booster` is None, which is xgb.train's default for
        # xgb_model (train from scratch); later passes continue from it.
        booster = xgb.train(
            params,
            dtrain,
            num_boost_round=200,
            xgb_model=booster,
            verbose_eval=False,
        )

    print("Training complete!")
    return booster, params

In [69]:
# Score the held-out split with the booster currently bound to `model` in the kernel.
X_test = get_X(test)
y_test = get_y(test)
# Build the test DMatrix with the same enable_categorical flag that train_xgboost
# uses for the training DMatrix, so both splits are encoded the same way.
dtest = xgb.DMatrix(data=X_test, label=y_test, enable_categorical=True)
y_prob = model.predict(dtest)
# Hard labels at a 0.5 cut-off; the AUC below is computed from the raw scores.
y_pred = (y_prob > 0.5).astype(int)
roc_auc_score(y_test, y_prob) # 0 years

0.5159621777268837

In [57]:
# Same AUC computation, on whatever `y_test` / `y_prob` the kernel held when this
# cell ran; the trailing tag records the label window used for that run.
# NOTE(review): presumably produced with the (0, 1]-year get_y variant - confirm.
roc_auc_score(y_test, y_prob) # 0-1 years

0.4970535856972112

In [27]:
# Loading and feature/label extraction are commented out here, so this cell relies
# on `train_X` and `train_y` already existing in the kernel.
# train = pd.read_csv("/orcd/pool/003/dbertsim_shared/ukb/bgen/ch18/train_c18_selected_features_score0.03_with_outcome.csv")
# train_X = get_X(train)
# train_y = get_y(train)
# Train the models
# NOTE(review): `start` is recorded but no elapsed time is computed in this cell.
start = time.time()

print("Doing xgb")
# Uses the two-argument train_xgboost (no validation set); binds `model` and
# `params` as kernel globals for the scoring cells.
model, params = train_xgboost(train_X, train_y)


Doing xgb
Starting epoch 1/5...
Starting epoch 2/5...
Starting epoch 3/5...
Starting epoch 4/5...
Starting epoch 5/5...
Training complete!


In [68]:
c = "breast"
def get_y(df, max_years=1):
    """Return a boolean label per row: diagnosed within ``max_years`` after baseline.

    A row is positive when ``"<c>_time_to_diagnosis"`` is strictly greater than 0
    and at most ``max_years``; ``c`` is the module-level cancer name. Missing
    (NaN) times compare False on both sides and are therefore labelled negative.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the ``"<c>_time_to_diagnosis"`` column.
    max_years : number, default 1
        Upper bound (inclusive) of the diagnosis window. The default keeps the
        previously hard-coded one-year window.
        NOTE(review): the unit is assumed to be years, from the "0-1 years" tag
        used elsewhere in this notebook - confirm.
    """
    time_to_diagnosis = df[f"{c}_time_to_diagnosis"]
    return (time_to_diagnosis > 0) & (time_to_diagnosis <= max_years)

# Rebuild features and labels from the `train` frame in the kernel, using
# whichever get_X / get_y definitions are currently bound.
train_X = get_X(train)
train_y = get_y(train)
# Train the models
# NOTE(review): `start` is recorded but no elapsed time is computed in this cell.
start = time.time()

print("Doing xgb")
# Rebinds the kernel-level `model` and `params`, replacing any earlier booster.
model, params = train_xgboost(train_X, train_y)

Doing xgb
Starting epoch 1/5...
Starting epoch 2/5...
Starting epoch 3/5...
Starting epoch 4/5...
Starting epoch 5/5...
Training complete!


In [54]:
# Load the test split from a hard-coded absolute path.
# NOTE(review): this reads the "*_final.csv" file while the train cell reads
# "*_with_outcome.csv" - confirm both files carry the same outcome columns.
test = pd.read_csv("/orcd/pool/003/dbertsim_shared/ukb/bgen/ch18/test_c18_selected_features_score0.03_final.csv")

In [63]:
# Load the training split from a hard-coded absolute path; rebinds the
# kernel-level `train` that get_X / get_y are applied to in other cells.
train = pd.read_csv("/orcd/pool/003/dbertsim_shared/ukb/bgen/ch18/train_c18_selected_features_score0.03_with_outcome.csv")

In [43]:
# Remember which 'eid' values belong to each existing split.
# NOTE(review): `valid` and `df` are not created in any cell shown here - they must
# already exist in the kernel for this cell to run.
train_samples = train['eid']
valid_samples = valid['eid']
test_samples = test['eid']
# Re-derive each split from `df`, keeping the rows whose 'eid' is in that split.
# This rebinds `train`, `valid` and `test`; the previous frames are no longer
# reachable under those names.
train = df.loc[df['eid'].isin(train_samples)]
valid = df.loc[df['eid'].isin(valid_samples)]
test = df.loc[df['eid'].isin(test_samples)]