In [7]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
import pandas as pd
import random as rd
import matplotlib.pyplot as plt
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import ray
from ray import tune
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [8]:
df = pd.read_excel('symile_labs_complete.xlsx')
df.head()

Unnamed: 0,subject_id,51221_percentile,51265_percentile,50912_percentile,50971_percentile,51222_percentile,51301_percentile,51249_percentile,51279_percentile,51250_percentile,...,50813_percentile,50863_percentile,50885_percentile,50820_percentile,50862_percentile,50802_percentile,50821_percentile,50804_percentile,50818_percentile,50910_percentile
0,12138413,0.653935,0.471492,0.66817,0.122564,0.58566,0.379092,0.273604,0.508236,0.829349,...,0.32833,0.188457,0.750168,0.371901,0.539988,0.660286,0.141536,0.662118,0.72287,0.642922
1,12988457,0.744261,0.549488,0.764983,0.87392,0.600321,0.128942,0.115307,0.782644,0.367417,...,0.557945,0.414605,0.663706,0.807163,0.60397,0.469087,0.234647,0.287035,0.146159,0.226079
2,18624683,0.074887,0.071672,0.197234,0.012156,0.120607,0.56417,0.834371,0.080755,0.603556,...,0.875884,0.103988,0.750168,0.485908,0.374253,0.273487,0.058992,0.287035,0.304645,0.898581
3,11914968,0.870576,0.616041,0.66817,0.122564,0.921972,0.945371,0.875151,0.799719,0.661008,...,0.9746,0.565034,0.396615,0.154482,0.770091,0.108031,0.535329,0.117103,0.471935,0.878056
4,17957742,0.433584,0.264405,0.56364,0.886779,0.430006,0.735389,0.424468,0.512455,0.30675,...,0.119353,0.475349,0.780999,0.872643,0.42513,0.660286,0.916355,0.436496,0.173674,0.886206


In [9]:

all_columns = df.columns.tolist()
all_columns.remove('subject_id')
input_columns = rd.sample(all_columns, 25)

remaining_columns = [col for col in all_columns if col not in input_columns]
output_columns = rd.sample(remaining_columns, 5)


In [10]:
# Common, inexpensive blood markers (good inputs)
INPUT_MARKERS = [
    "51221_percentile",  # Hematocrit
    "51222_percentile",  # Hemoglobin
    "51265_percentile",  # Platelet Count
    "51301_percentile",  # White Blood Cells
    "51279_percentile",  # Red Blood Cells
    "51250_percentile",  # MCV
    "51248_percentile",  # MCH
    "51249_percentile",  # MCHC
    # "51277_percentile",  # RDW
    "50983_percentile",  # Sodium
    "50971_percentile",  # Potassium
    "50902_percentile",  # Chloride
    "50882_percentile",  # Bicarbonate
    "50893_percentile",  # Calcium, Total
    "50931_percentile",  # Glucose
    "50912_percentile",  # Creatinine
    "51006_percentile",  # Urea Nitrogen (BUN)
    "50960_percentile",  # Magnesium
    "50970_percentile",  # Phosphate
    "50862_percentile",  # Albumin
    "50820_percentile",  # pH
]

# Less common, more expensive blood markers (good outputs)
OUTPUT_MARKERS = [
    "51237_percentile",  # INR (PT)
    "51274_percentile",  # PT
    "51275_percentile",  # PTT
    "52172_percentile",  # RDW-SD
    "50813_percentile",  # Lactate
    "50885_percentile",  # Bilirubin, Total
    "50861_percentile",  # Alanine Aminotransferase (ALT)
    "50878_percentile",  # Aspartate Aminotransferase (AST)
    "50863_percentile",  # Alkaline Phosphatase
    
    "50821_percentile",  # pO2
    "50818_percentile",  # pCO2
    "50804_percentile",  # Calculated Total CO2
    "50910_percentile",  # Creatine Kinase (CK)
]


In [28]:
ray.shutdown()
ray.init()  # Use only 2 CPU cores
param_space = {
    "n_estimators": tune.randint(50, 500),
    "learning_rate": tune.loguniform(0.001, 0.2),
    "max_depth": tune.randint(3, 10),
    "subsample": tune.uniform(0.5, 1.0),
    "colsample_bytree": tune.uniform(0.5, 1.0),
    "gamma": tune.loguniform(0.001, 10),
    "reg_alpha": tune.loguniform(0.0001, 10),  # L1 regularization
    "reg_lambda": tune.loguniform(0.0001, 10),  # L2 regularization
}

# 📌 Define objective function for Ray Tune
def train_xgboost(config):
    model = xgb.XGBRegressor(
        n_estimators=config["n_estimators"],
        learning_rate=config["learning_rate"],
        max_depth=config["max_depth"],
        subsample=config["subsample"],
        colsample_bytree=config["colsample_bytree"],
        gamma=config["gamma"],
        reg_alpha=config["reg_alpha"],
        reg_lambda=config["reg_lambda"],
        objective="reg:squarederror",
        tree_method="hist",  # Faster training
        random_state=42
    )

    # Train model
    model.fit(X_train, y_train)
    
    # Predict and calculate MSE
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    
    # Report back to Ray Tune
    return {"mse": mse}

# 📌 Run Ray Tune's hyperparameter search

tuner = tune.Tuner(train_xgboost, param_space=param_space, tune_config=tune.TuneConfig(num_samples=20))  # ③

results = tuner.fit()
best_config = results.get_best_result(metric="mse", mode="min").config
print("Best Parameters Found:", best_config)

# 📌 Train final XGBoost model using best params
best_model = xgb.XGBRegressor(**best_config, objective="reg:squarederror", tree_method="hist")
best_model.fit(X_train, y_train)

# 📌 Evaluate the best model
y_pred = best_model.predict(X_test)
final_mse = mean_squared_error(y_test, y_pred)
print(f"Final Test MSE: {final_mse:.4f}")

0,1
Current time:,2025-03-12 08:02:44
Running for:,00:00:07.99
Memory:,10.0/16.0 GiB

Trial name,status,loc,colsample_bytree,gamma,learning_rate,max_depth,n_estimators,reg_alpha,reg_lambda,subsample,iter,total time (s),mse
train_xgboost_45393_00000,TERMINATED,127.0.0.1:55328,0.582149,1.70837,0.0201816,8,101,0.100169,0.000171219,0.743356,1,0.12166,0.0745239
train_xgboost_45393_00001,TERMINATED,127.0.0.1:55327,0.590236,1.73732,0.00370167,3,112,0.0262619,0.590624,0.697843,1,0.0802219,0.0817618
train_xgboost_45393_00002,TERMINATED,127.0.0.1:55326,0.834119,0.259217,0.0331295,9,465,0.309394,0.000103432,0.934757,1,0.426264,0.0673634
train_xgboost_45393_00003,TERMINATED,127.0.0.1:55324,0.811217,3.38787,0.00712334,7,218,0.000121612,0.0618517,0.58036,1,0.174964,0.0791589
train_xgboost_45393_00004,TERMINATED,127.0.0.1:55325,0.684428,0.562773,0.00217662,8,236,4.46511,0.0979391,0.82549,1,0.207211,0.0806095
train_xgboost_45393_00005,TERMINATED,127.0.0.1:55319,0.691828,0.107617,0.0846719,9,454,0.374775,0.109444,0.905853,1,0.370641,0.0685018
train_xgboost_45393_00006,TERMINATED,127.0.0.1:55320,0.537336,3.19986,0.0549228,7,204,1.02157,0.000668129,0.947207,1,0.164593,0.0780025
train_xgboost_45393_00007,TERMINATED,127.0.0.1:55323,0.658496,0.179006,0.00451612,9,80,0.0016265,0.000569149,0.966517,1,0.580201,0.0781063
train_xgboost_45393_00008,TERMINATED,127.0.0.1:55322,0.951701,0.991914,0.022111,8,92,0.0141497,0.32457,0.757673,1,0.103752,0.0705354
train_xgboost_45393_00009,TERMINATED,127.0.0.1:55321,0.926231,1.04444,0.0226245,7,215,0.000356161,9.03961,0.941183,1,0.1573,0.0713316


2025-03-12 08:02:44,602	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to '/Users/ssen23/ray_results/train_xgboost_2025-03-12_08-02-36' in 0.0190s.
2025-03-12 08:02:44,606	INFO tune.py:1041 -- Total run time: 8.00 seconds (7.97 seconds for the tuning loop).


Best Parameters Found: {'n_estimators': 465, 'learning_rate': 0.03312951798222195, 'max_depth': 9, 'subsample': 0.9347572685230339, 'colsample_bytree': 0.8341186145756794, 'gamma': 0.2592171513671164, 'reg_alpha': 0.3093942512487369, 'reg_lambda': 0.00010343201730027426}
Final Test MSE: 0.0671


In [27]:
LABS = {
    "51221": "Hematocrit",
    "51265": "Platelet Count",
    "50912": "Creatinine",
    "50971": "Potassium",
    "51222": "Hemoglobin",
    "51301": "White Blood Cells",
    "51249": "MCHC",
    "51279": "Red Blood Cells",
    "51250": "MCV",
    "51248": "MCH",
    "51277": "RDW",
    "51006": "Urea Nitrogen",
    "50983": "Sodium",
    "50902": "Chloride",
    "50882": "Bicarbonate",
    "50868": "Anion Gap",
    "50931": "Glucose",
    "50960": "Magnesium",
    "50893": "Calcium, Total",
    "50970": "Phosphate",
    "51237": "INR(PT)",
    "51274": "PT",
    "51275": "PTT",
    "51146": "Basophils",
    "51256": "Neutrophils",
    "51254": "Monocytes",
    "51200": "Eosinophils",
    "51244": "Lymphocytes",
    "52172": "RDW-SD",
    "50934": "H",
    "51678": "L",
    "50947": "I",
    "50861": "Alanine Aminotransferase (ALT)",
    "50878": "Asparate Aminotransferase (AST)",
    "50813": "Lactate",
    "50863": "Alkaline Phosphatase",
    "50885": "Bilirubin, Total",
    "50820": "pH",
    "50862": "Albumin",
    "50802": "Base Excess",
    "50821": "pO2",
    "50804": "Calculated Total CO2",
    "50818": "pCO2",
    "52075": "Absolute Neutrophil Count",
    "52073": "Absolute Eosinophil Count",
    "52074": "Absolute Monocyte Count",
    "52069": "Absolute Basophil Count",
    "51133": "Absolute Lymphocyte Count",
    "50910": "Creatine Kinase (CK)",
    "52135": "Immature Granulocytes"
}

# Replace IDs with ID_percentile
LABS_PERCENTILE = {f"{key}_percentile": value for key, value in LABS.items()}
print("inputs\n")
for id_p in INPUT_MARKERS:
    if id_p in LABS_PERCENTILE:
        print(LABS_PERCENTILE[id_p])
print("outputs\n")
for id_p in OUTPUT_MARKERS:
    if id_p in LABS_PERCENTILE:
        print(LABS_PERCENTILE[id_p])


inputs

Hematocrit
Hemoglobin
Platelet Count
White Blood Cells
Red Blood Cells
MCV
MCH
MCHC
Sodium
Potassium
Chloride
Bicarbonate
Calcium, Total
Glucose
Creatinine
Urea Nitrogen
Magnesium
Phosphate
Albumin
pH
outputs

INR(PT)
PT
PTT
RDW-SD
Lactate
Bilirubin, Total
Alanine Aminotransferase (ALT)
Asparate Aminotransferase (AST)
Alkaline Phosphatase
pO2
pCO2
Calculated Total CO2
Creatine Kinase (CK)
