In [6]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
import pandas as pd
import random as rd
import matplotlib.pyplot as plt
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import ray
from ray import tune
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [7]:
df = pd.read_excel('symile_labs_complete.xlsx')
df.head()

Unnamed: 0,subject_id,51221_percentile,51265_percentile,50912_percentile,50971_percentile,51222_percentile,51301_percentile,51249_percentile,51279_percentile,51250_percentile,...,50813_percentile,50863_percentile,50885_percentile,50820_percentile,50862_percentile,50802_percentile,50821_percentile,50804_percentile,50818_percentile,50910_percentile
0,12138413,0.653935,0.471492,0.66817,0.122564,0.58566,0.379092,0.273604,0.508236,0.829349,...,0.32833,0.188457,0.750168,0.371901,0.539988,0.660286,0.141536,0.662118,0.72287,0.642922
1,12988457,0.744261,0.549488,0.764983,0.87392,0.600321,0.128942,0.115307,0.782644,0.367417,...,0.557945,0.414605,0.663706,0.807163,0.60397,0.469087,0.234647,0.287035,0.146159,0.226079
2,18624683,0.074887,0.071672,0.197234,0.012156,0.120607,0.56417,0.834371,0.080755,0.603556,...,0.875884,0.103988,0.750168,0.485908,0.374253,0.273487,0.058992,0.287035,0.304645,0.898581
3,11914968,0.870576,0.616041,0.66817,0.122564,0.921972,0.945371,0.875151,0.799719,0.661008,...,0.9746,0.565034,0.396615,0.154482,0.770091,0.108031,0.535329,0.117103,0.471935,0.878056
4,17957742,0.433584,0.264405,0.56364,0.886779,0.430006,0.735389,0.424468,0.512455,0.30675,...,0.119353,0.475349,0.780999,0.872643,0.42513,0.660286,0.916355,0.436496,0.173674,0.886206


In [8]:

all_columns = df.columns.tolist()
all_columns.remove('subject_id')
input_columns = rd.sample(all_columns, 25)

remaining_columns = [col for col in all_columns if col not in input_columns]
output_columns = rd.sample(remaining_columns, 5)


In [29]:
# Common, inexpensive blood markers (good inputs)
INPUT_MARKERS = [
    "51221_percentile",  # Hematocrit
    "51222_percentile",  # Hemoglobin
    "51265_percentile",  # Platelet Count
    "51301_percentile",  # White Blood Cells
    "51279_percentile",  # Red Blood Cells
    "51250_percentile",  # MCV
    "51248_percentile",  # MCH
    "51249_percentile",  # MCHC
    # "51277_percentile",  # RDW
    "50983_percentile",  # Sodium
    "50971_percentile",  # Potassium
    "50902_percentile",  # Chloride
    "50882_percentile",  # Bicarbonate
    "50893_percentile",  # Calcium, Total
    "50931_percentile",  # Glucose
    "50912_percentile",  # Creatinine
    "51006_percentile",  # Urea Nitrogen (BUN)
    "50960_percentile",  # Magnesium
    "50970_percentile",  # Phosphate
    "50862_percentile",  # Albumin
    "50820_percentile",  # pH
]

# Less common, more expensive blood markers (good outputs)
OUTPUT_MARKERS = [
    "51237_percentile",  # INR (PT)
    "51274_percentile",  # PT
    "51275_percentile",  # PTT
    "52172_percentile",  # RDW-SD
    "50813_percentile",  # Lactate
    "50885_percentile",  # Bilirubin, Total
    "50861_percentile",  # Alanine Aminotransferase (ALT)
    "50878_percentile",  # Aspartate Aminotransferase (AST)
    "50863_percentile",  # Alkaline Phosphatase
    
    "50821_percentile",  # pO2
    "50818_percentile",  # pCO2
    "50804_percentile",  # Calculated Total CO2
    "50910_percentile",  # Creatine Kinase (CK)
]

X = df[INPUT_MARKERS]
y = df["50813_percentile"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [31]:
ray.shutdown()
ray.init()  # Use only 2 CPU cores
param_space = {
    "n_estimators": tune.randint(50, 500),
    "learning_rate": tune.loguniform(0.001, 0.2),
    "max_depth": tune.randint(3, 10),
    "subsample": tune.uniform(0.5, 1.0),
    "colsample_bytree": tune.uniform(0.5, 1.0),
    
    
}

# 📌 Define objective function for Ray Tune
def train_xgboost(config):
    model = xgb.XGBRegressor(
        n_estimators=config["n_estimators"],
        learning_rate=config["learning_rate"],
        max_depth=config["max_depth"],
        subsample=config["subsample"],
        colsample_bytree=config["colsample_bytree"],
        objective="reg:squarederror",
        tree_method="hist",  # Faster training
        random_state=42
    )

    # Train model
    model.fit(X_train, y_train)
    
    # Predict and calculate MSE
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    
    # Report back to Ray Tune
    return {"mse": mse}

# 📌 Run Ray Tune's hyperparameter search

tuner = tune.Tuner(train_xgboost, param_space=param_space, tune_config=tune.TuneConfig(num_samples=50))  # ③

results = tuner.fit()
best_config = results.get_best_result(metric="mse", mode="min").config
print("Best Parameters Found:", best_config)

# 📌 Train final XGBoost model using best params
best_model = xgb.XGBRegressor(**best_config, objective="reg:squarederror", tree_method="hist")
best_model.fit(X_train, y_train)

# 📌 Evaluate the best model
y_pred = best_model.predict(X_test)
final_mse = mean_squared_error(y_test, y_pred)
print(f"Final Test MSE: {final_mse:.4f}")

0,1
Current time:,2025-03-13 12:48:34
Running for:,00:00:14.42
Memory:,10.1/16.0 GiB

Trial name,status,loc,colsample_bytree,learning_rate,max_depth,n_estimators,subsample,iter,total time (s),mse
train_xgboost_59d16_00000,TERMINATED,127.0.0.1:64814,0.501149,0.045225,8,226,0.704936,1,0.143727,0.0740758
train_xgboost_59d16_00001,TERMINATED,127.0.0.1:64821,0.88615,0.0713686,7,72,0.745219,1,0.0796108,0.074762
train_xgboost_59d16_00002,TERMINATED,127.0.0.1:64816,0.833455,0.00753413,9,481,0.732699,1,0.520774,0.0713892
train_xgboost_59d16_00003,TERMINATED,127.0.0.1:64813,0.853525,0.00560269,5,290,0.69348,1,0.123237,0.073583
train_xgboost_59d16_00004,TERMINATED,127.0.0.1:64817,0.501343,0.00171139,4,129,0.764796,1,0.037679,0.0830579
train_xgboost_59d16_00005,TERMINATED,127.0.0.1:64822,0.747871,0.00201583,3,410,0.777951,1,0.066988,0.0768264
train_xgboost_59d16_00006,TERMINATED,127.0.0.1:64820,0.930128,0.134401,3,231,0.814112,1,0.0555792,0.0787288
train_xgboost_59d16_00007,TERMINATED,127.0.0.1:64819,0.723505,0.133405,6,495,0.958197,1,0.107877,0.079162
train_xgboost_59d16_00008,TERMINATED,127.0.0.1:64818,0.689554,0.022815,6,337,0.519052,1,0.153312,0.0685392
train_xgboost_59d16_00009,TERMINATED,127.0.0.1:64815,0.617927,0.0546927,9,76,0.702667,1,0.0850542,0.0728507


2025-03-13 12:48:34,291	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to '/Users/ssen23/ray_results/train_xgboost_2025-03-13_12-48-19' in 0.0239s.
2025-03-13 12:48:34,297	INFO tune.py:1041 -- Total run time: 14.43 seconds (14.39 seconds for the tuning loop).


Best Parameters Found: {'n_estimators': 485, 'learning_rate': 0.014678401714523757, 'max_depth': 3, 'subsample': 0.5168479715080281, 'colsample_bytree': 0.7764294758622377}
Final Test MSE: 0.0673


In [None]:
LABS = {
    "51221": "Hematocrit",
    "51265": "Platelet Count",
    "50912": "Creatinine",
    "50971": "Potassium",
    "51222": "Hemoglobin",
    "51301": "White Blood Cells",
    "51249": "MCHC",
    "51279": "Red Blood Cells",
    "51250": "MCV",
    "51248": "MCH",
    "51277": "RDW",
    "51006": "Urea Nitrogen",
    "50983": "Sodium",
    "50902": "Chloride",
    "50882": "Bicarbonate",
    "50868": "Anion Gap",
    "50931": "Glucose",
    "50960": "Magnesium",
    "50893": "Calcium, Total",
    "50970": "Phosphate",
    "51237": "INR(PT)",
    "51274": "PT",
    "51275": "PTT",
    "51146": "Basophils",
    "51256": "Neutrophils",
    "51254": "Monocytes",
    "51200": "Eosinophils",
    "51244": "Lymphocytes",
    "52172": "RDW-SD",
    "50934": "H",
    "51678": "L",
    "50947": "I",
    "50861": "Alanine Aminotransferase (ALT)",
    "50878": "Asparate Aminotransferase (AST)",
    "50813": "Lactate",
    "50863": "Alkaline Phosphatase",
    "50885": "Bilirubin, Total",
    "50820": "pH",
    "50862": "Albumin",
    "50802": "Base Excess",
    "50821": "pO2",
    "50804": "Calculated Total CO2",
    "50818": "pCO2",
    "52075": "Absolute Neutrophil Count",
    "52073": "Absolute Eosinophil Count",
    "52074": "Absolute Monocyte Count",
    "52069": "Absolute Basophil Count",
    "51133": "Absolute Lymphocyte Count",
    "50910": "Creatine Kinase (CK)",
    "52135": "Immature Granulocytes"
}

# Replace IDs with ID_percentile
LABS_PERCENTILE = {f"{key}_percentile": value for key, value in LABS.items()}
print("inputs\n")
for id_p in INPUT_MARKERS:
    if id_p in LABS_PERCENTILE:
        print(LABS_PERCENTILE[id_p])
print("outputs\n")
for id_p in OUTPUT_MARKERS:
    if id_p in LABS_PERCENTILE:
        print(LABS_PERCENTILE[id_p])
