In [36]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
import pandas as pd
import random as rd
import matplotlib.pyplot as plt
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [37]:
df = pd.read_excel('symile_labs_complete.xlsx')
df.head()

Unnamed: 0,subject_id,51221_percentile,51265_percentile,50912_percentile,50971_percentile,51222_percentile,51301_percentile,51249_percentile,51279_percentile,51250_percentile,...,50813_percentile,50863_percentile,50885_percentile,50820_percentile,50862_percentile,50802_percentile,50821_percentile,50804_percentile,50818_percentile,50910_percentile
0,12138413,0.653935,0.471492,0.66817,0.122564,0.58566,0.379092,0.273604,0.508236,0.829349,...,0.32833,0.188457,0.750168,0.371901,0.539988,0.660286,0.141536,0.662118,0.72287,0.642922
1,12988457,0.744261,0.549488,0.764983,0.87392,0.600321,0.128942,0.115307,0.782644,0.367417,...,0.557945,0.414605,0.663706,0.807163,0.60397,0.469087,0.234647,0.287035,0.146159,0.226079
2,18624683,0.074887,0.071672,0.197234,0.012156,0.120607,0.56417,0.834371,0.080755,0.603556,...,0.875884,0.103988,0.750168,0.485908,0.374253,0.273487,0.058992,0.287035,0.304645,0.898581
3,11914968,0.870576,0.616041,0.66817,0.122564,0.921972,0.945371,0.875151,0.799719,0.661008,...,0.9746,0.565034,0.396615,0.154482,0.770091,0.108031,0.535329,0.117103,0.471935,0.878056
4,17957742,0.433584,0.264405,0.56364,0.886779,0.430006,0.735389,0.424468,0.512455,0.30675,...,0.119353,0.475349,0.780999,0.872643,0.42513,0.660286,0.916355,0.436496,0.173674,0.886206


In [38]:

all_columns = df.columns.tolist()
all_columns.remove('subject_id')
input_columns = rd.sample(all_columns, 25)

remaining_columns = [col for col in all_columns if col not in input_columns]
output_columns = rd.sample(remaining_columns, 5)


In [39]:
X = df[input_columns]  # Input columns (features)
y = df[output_columns]  # Output columns (targets)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [40]:
xgb_model = xgb.XGBRegressor(
    n_estimators=500,        # Number of boosting rounds
    learning_rate=0.05,      # Step size shrinkage
    max_depth=6,             # Maximum depth of trees
    subsample=0.8,           # Fraction of samples used per tree
    colsample_bytree=0.8,    # Fraction of features used per tree
    objective="reg:squarederror",  # For regression tasks
    random_state=42
)

# Train the model
xgb_model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=100)

[0]	validation_0-rmse:0.29014
[100]	validation_0-rmse:0.21518
[200]	validation_0-rmse:0.21611
[300]	validation_0-rmse:0.21612
[400]	validation_0-rmse:0.21617
[499]	validation_0-rmse:0.21617


In [41]:
y_pred = xgb_model.predict(X_test)

# Compute MSE
mse = mean_squared_error(y_test, y_pred)
print(f"Test MSE: {mse:.4f}")
print("Inputs: {}".format(input_columns))
print("Outputs: {}".format(output_columns))


Test MSE: 0.0467
Inputs: ['50818_percentile', '51237_percentile', '50960_percentile', '50971_percentile', '51250_percentile', '51265_percentile', '50878_percentile', '50861_percentile', '50820_percentile', '51256_percentile', '51248_percentile', '51277_percentile', '51274_percentile', '50910_percentile', '51222_percentile', '50912_percentile', '51301_percentile', '51249_percentile', '50804_percentile', '50931_percentile', '50862_percentile', '50893_percentile', '50821_percentile', '50885_percentile', '51221_percentile']
Outputs: ['50802_percentile', '51275_percentile', '51244_percentile', '51200_percentile', '50902_percentile']


In [42]:
LABS = {
    "51221": "Hematocrit",
    "51265": "Platelet Count",
    "50912": "Creatinine",
    "50971": "Potassium",
    "51222": "Hemoglobin",
    "51301": "White Blood Cells",
    "51249": "MCHC",
    "51279": "Red Blood Cells",
    "51250": "MCV",
    "51248": "MCH",
    "51277": "RDW",
    "51006": "Urea Nitrogen",
    "50983": "Sodium",
    "50902": "Chloride",
    "50882": "Bicarbonate",
    "50868": "Anion Gap",
    "50931": "Glucose",
    "50960": "Magnesium",
    "50893": "Calcium, Total",
    "50970": "Phosphate",
    "51237": "INR(PT)",
    "51274": "PT",
    "51275": "PTT",
    "51146": "Basophils",
    "51256": "Neutrophils",
    "51254": "Monocytes",
    "51200": "Eosinophils",
    "51244": "Lymphocytes",
    "52172": "RDW-SD",
    "50934": "H",
    "51678": "L",
    "50947": "I",
    "50861": "Alanine Aminotransferase (ALT)",
    "50878": "Asparate Aminotransferase (AST)",
    "50813": "Lactate",
    "50863": "Alkaline Phosphatase",
    "50885": "Bilirubin, Total",
    "50820": "pH",
    "50862": "Albumin",
    "50802": "Base Excess",
    "50821": "pO2",
    "50804": "Calculated Total CO2",
    "50818": "pCO2",
    "52075": "Absolute Neutrophil Count",
    "52073": "Absolute Eosinophil Count",
    "52074": "Absolute Monocyte Count",
    "52069": "Absolute Basophil Count",
    "51133": "Absolute Lymphocyte Count",
    "50910": "Creatine Kinase (CK)",
    "52135": "Immature Granulocytes"
}

# Replace IDs with ID_percentile
LABS_PERCENTILE = {f"{key}_percentile": value for key, value in LABS.items()}
print("inputs")
for id_p in input_columns:
    if id_p in LABS_PERCENTILE:
        print(LABS_PERCENTILE[id_p])
print("outputs")
for id_p in output_columns:
    if id_p in LABS_PERCENTILE:
        print(LABS_PERCENTILE[id_p])


inputs
pCO2
INR(PT)
Magnesium
Potassium
MCV
Platelet Count
Asparate Aminotransferase (AST)
Alanine Aminotransferase (ALT)
pH
Neutrophils
MCH
RDW
PT
Creatine Kinase (CK)
Hemoglobin
Creatinine
White Blood Cells
MCHC
Calculated Total CO2
Glucose
Albumin
Calcium, Total
pO2
Bilirubin, Total
Hematocrit
outputs
Base Excess
PTT
Lymphocytes
Eosinophils
Chloride
