In [1]:
import pandas as pd
import os 
import xgboost as xgb
from sklearn.metrics import accuracy_score
import numpy as np
from sklearn.metrics import mean_squared_error

In [2]:
labs = {
    "51221": "Hematocrit",
    "51265": "Platelet Count",
    "50912": "Creatinine",
    "50971": "Potassium",
    "51222": "Hemoglobin",
    "51301": "White Blood Cells",
    "51249": "MCHC",
    "51279": "Red Blood Cells",
    "51250": "MCV",
    "51248": "MCH",
    "51277": "RDW",
    "51006": "Urea Nitrogen",
    "50983": "Sodium",
    "50902": "Chloride",
    "50882": "Bicarbonate",
    "50868": "Anion Gap",
    "50931": "Glucose",
    "50960": "Magnesium",
    "50893": "Calcium, Total",
    "50970": "Phosphate",
    "51237": "INR(PT)",
    "51274": "PT",
    "51275": "PTT",
    "51146": "Basophils",
    "51256": "Neutrophils",
    "51254": "Monocytes",
    "51200": "Eosinophils",
    "51244": "Lymphocytes",
    "52172": "RDW-SD",
    "50934": "H",
    "51678": "L",
    "50947": "I",
    "50861": "Alanine Aminotransferase (ALT)",
    "50878": "Asparate Aminotransferase (AST)",
    "50813": "Lactate",
    "50863": "Alkaline Phosphatase",
    "50885": "Bilirubin, Total",
    "50820": "pH",
    "50862": "Albumin",
    "50802": "Base Excess",
    "50821": "pO2",
    "50804": "Calculated Total CO2",
    "50818": "pCO2",
    "52075": "Absolute Neutrophil Count",
    "52073": "Absolute Eosinophil Count",
    "52074": "Absolute Monocyte Count",
    "52069": "Absolute Basophil Count",
    "51133": "Absolute Lymphocyte Count",
    "50910": "Creatine Kinase (CK)",
    "52135": "Immature Granulocytes"
}
labs_reversed = {value: key for key, value in labs.items()}

In [3]:
df_train = pd.read_excel(r"C:\Users\joshu\train_imputed.xlsx", index_col = "Unnamed: 0")
df_test = pd.read_excel(r"C:\Users\joshu\test_imputed.xlsx", index_col = "Unnamed: 0")

In [4]:
total_feats = ['Hematocrit',
 'PTT',
 'Asparate Aminotransferase (AST)',
 'Chloride',
 'White Blood Cells',
 'Potassium',
 'Calcium, Total',
 'Phosphate',
 'Monocytes',
 'Eosinophils',
 'Urea Nitrogen',
 'pH',
 'pCO2']

encode = lambda x: [labs_reversed[i] for i in x]
decode = lambda x: [labs[i] for i in x]

cols = decode(df_train.columns.to_list())
targets = list(set(cols) - set(total_feats))
targets

['Calculated Total CO2',
 'Lactate',
 'Base Excess',
 'Sodium',
 'INR(PT)',
 'Platelet Count',
 'Albumin',
 'RDW',
 'Hemoglobin',
 'MCV',
 'RDW-SD',
 'Anion Gap',
 'Glucose',
 'Lymphocytes',
 'PT',
 'Alanine Aminotransferase (ALT)',
 'MCHC',
 'Bilirubin, Total',
 'Basophils',
 'Creatinine',
 'MCH',
 'Neutrophils',
 'pO2',
 'Alkaline Phosphatase',
 'Magnesium',
 'Red Blood Cells',
 'Bicarbonate',
 'Creatine Kinase (CK)']

In [5]:
X_train = df_train[encode(total_feats)]
y_train = df_train[encode(targets)]

y_test = df_test[encode(targets)]
X_test =  df_test[encode(total_feats)]

In [6]:
# Create DMatrix
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Parameters for multi-output regression
params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'max_depth': 4,
    'eta': 0.1,
    'num_targets': y_train.shape[1],  # Number of output dimensions
    'seed': 42
}

# Train
bst = xgb.train(params, dtrain, num_boost_round=100)

# Predict (will return 2D array)
y_pred = bst.predict(dtest)

errors = []
for i in range(y_test.shape[1]):
    mse = mean_squared_error(y_test.iloc[:, i], y_pred[:, i])
    errors.append(mse)
print(np.mean(errors))

Parameters: { "num_targets" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


0.03815346391322389


In [7]:
params = {'eta': 0.06014477612764848, 
          'max_depth': 5, 
          'min_child_weight': 14, 
          'lambda': 0.23378311898486798, 
          'alpha': 0.00011202585063587642, 
          'gamma': 0.0009675173727657638, 
          'subsample': 0.6661968185586394, 
          'colsample_bytree': 0.871814732691916, 
          'grow_policy': 'depthwise', 
          'max_bin': 123}
# Train
bst = xgb.train(params, dtrain, num_boost_round=100)

# Predict (will return 2D array)
y_pred = bst.predict(dtest)

errors = []
loss_df= pd.DataFrame(columns=['output', 'loss'])
for i in range(y_test.shape[1]):
    mse = mean_squared_error(y_test.iloc[:, i], y_pred[:, i])
    colname = decode([y_test.iloc[:,i].name])[0]
    errors.append(mse)
    loss_df.loc[len(loss_df)] = [colname, mse]
loss_df

Unnamed: 0,output,loss
0,Calculated Total CO2,0.000992
1,Lactate,0.036813
2,Base Excess,0.002344
3,Sodium,0.031836
4,INR(PT),0.03663
5,Platelet Count,0.057221
6,Albumin,0.028573
7,RDW,0.058904
8,Hemoglobin,0.004334
9,MCV,0.076088


In [8]:
order = ['Albumin',
 'Alkaline Phosphatase',
 'Neutrophils',
 'pO2',
 'Magnesium',
 'MCH',
 'Red Blood Cells',
 'Creatinine',
 'Platelet Count',
 'PT',
 'Alanine Aminotransferase (ALT)',
 'Base Excess',
 'MCV',
 'Hemoglobin',
 'RDW-SD',
 'Creatine Kinase (CK)',
 'Glucose',
 'Bicarbonate',
 'Bilirubin, Total',
 'INR(PT)',
 'Lymphocytes',
 'MCHC',
 'Sodium',
 'Anion Gap',
 'RDW',
 'Lactate',
 'Calculated Total CO2',
 'Basophils']

loss_df = loss_df.set_index('output').reindex(order).reset_index()
loss_df

Unnamed: 0,output,loss
0,Albumin,0.028573
1,Alkaline Phosphatase,0.040711
2,Neutrophils,0.033755
3,pO2,0.02673
4,Magnesium,0.05644
5,MCH,0.076179
6,Red Blood Cells,0.011873
7,Creatinine,0.028668
8,Platelet Count,0.057221
9,PT,0.044784


In [9]:
bst.save_model('xgboost_model.model')

  bst.save_model('xgboost_model.model')


In [None]:
import optuna
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error

def objective(trial):
    # Define hyperparameters to tune
    params = {
        # Core
        'eta': trial.suggest_float('eta', 0.01, 0.3, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 20),
        
        # Regularization
        'lambda': trial.suggest_float('lambda', 1e-8, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-8, 10.0, log=True),
        'gamma': trial.suggest_float('gamma', 1e-8, 5.0),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        
        # Advanced
        'grow_policy': trial.suggest_categorical('grow_policy', ['depthwise', 'lossguide']),
        'max_bin': trial.suggest_int('max_bin', 64, 512),
    }
    # Train
    bst = xgb.train(params, dtrain, num_boost_round=100)

    # Predict (will return 2D array)
    y_pred = bst.predict(dtest)

    errors = []
    for i in range(y_test.shape[1]):
        mse = mean_squared_error(y_test.iloc[:, i], y_pred[:, i])
        errors.append(mse)
    return np.mean(errors)

# Create a study object and optimize the objective function
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials= 200)

# Print the best hyperparameters
print('Best trial:')
trial = study.best_trial
print(f'  MSE: {trial.value}')
print('  Params: ')
for key, value in trial.params.items():
    print(f'    {key}: {value}')

[I 2025-04-08 20:43:19,075] A new study created in memory with name: no-name-aec7f9bd-091d-4d6b-ad07-246404b9653f
[I 2025-04-08 20:43:25,480] Trial 0 finished with value: 0.04218033999448346 and parameters: {'eta': 0.021352485831920082, 'max_depth': 5, 'min_child_weight': 19, 'lambda': 1.1006638207660057e-07, 'alpha': 1.7237961284388739, 'gamma': 3.031407814549883, 'subsample': 0.8047913606820911, 'colsample_bytree': 0.9928523742073645, 'grow_policy': 'depthwise', 'max_bin': 291}. Best is trial 0 with value: 0.04218033999448346.
[I 2025-04-08 20:43:54,678] Trial 1 finished with value: 0.04372850201844431 and parameters: {'eta': 0.011554113079030232, 'max_depth': 11, 'min_child_weight': 3, 'lambda': 5.384057402725335e-07, 'alpha': 1.7612221620197726e-06, 'gamma': 0.7977975969233191, 'subsample': 0.8363541141906494, 'colsample_bytree': 0.7051192489259468, 'grow_policy': 'lossguide', 'max_bin': 368}. Best is trial 0 with value: 0.04218033999448346.
[I 2025-04-08 20:46:07,172] Trial 2 fini

In [None]:
# Plot optimization history
optuna.visualization.plot_optimization_history(study)

# Plot parameter importance
optuna.visualization.plot_param_importances(study)

# Plot slice plot
optuna.visualization.plot_slice(study)