In [4]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
import pandas as pd
import random as rd
import matplotlib.pyplot as plt
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error


In [5]:
df = pd.read_excel('symile_labs_complete.xlsx')
INPUT_MARKERS = [
    "51221_percentile",  # Hematocrit
    "51222_percentile",  # Hemoglobin
    "51265_percentile",  # Platelet Count
    "51301_percentile",  # White Blood Cells
    "51279_percentile",  # Red Blood Cells
    "51250_percentile",  # MCV
    "51248_percentile",  # MCH
    "51249_percentile",  # MCHC
    # "51277_percentile",  # RDW
    "50983_percentile",  # Sodium
    "50971_percentile",  # Potassium
    "50902_percentile",  # Chloride
    "50882_percentile",  # Bicarbonate
    "50893_percentile",  # Calcium, Total
    "50931_percentile",  # Glucose
    "50912_percentile",  # Creatinine
    "51006_percentile",  # Urea Nitrogen (BUN)
    "50960_percentile",  # Magnesium
    "50970_percentile",  # Phosphate
    "50862_percentile",  # Albumin
    "50820_percentile",  # pH
]

# Less common, more expensive blood markers (good outputs)
OUTPUT_MARKERS = [
    "51237_percentile",  # INR (PT)
    "51274_percentile",  # PT
    "51275_percentile",  # PTT
    "52172_percentile",  # RDW-SD
    "50813_percentile",  # Lactate
    "50885_percentile",  # Bilirubin, Total
    "50861_percentile",  # Alanine Aminotransferase (ALT)
    "50878_percentile",  # Aspartate Aminotransferase (AST)
    "50863_percentile",  # Alkaline Phosphatase
    
    "50821_percentile",  # pO2
    "50818_percentile",  # pCO2
    "50804_percentile",  # Calculated Total CO2
    "50910_percentile",  # Creatine Kinase (CK)
]

In [20]:
X = df[INPUT_MARKERS]  # Input columns (features)
y = df[OUTPUT_MARKERS]  # Output columns (targets)
scaler_X = StandardScaler()
scaler_y = StandardScaler()

X = scaler_X.fit_transform(X)
y = scaler_y.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
svr_models = {}
y_pred = np.zeros_like(y_test)  # To store predictions
    

In [49]:
for i, target in enumerate(OUTPUT_MARKERS):
    svr = SVR(kernel="rbf", C=5, epsilon=0.1, gamma=0.05)  # RBF Kernel works well for nonlinear data
    svr.fit(X_train, y_train[:, i])
    svr_models[target] = svr
    y_pred[:, i] = svr.predict(X_test)

    

In [50]:
mse_scores = {target: mean_squared_error(y_test[:, i], y_pred[:, i]) for i, target in enumerate(OUTPUT_MARKERS)}

print("MSE for each predicted biomarker:", mse_scores)

MSE for each predicted biomarker: {'51237_percentile': 1.0040828830853907, '51274_percentile': 1.0490126738343633, '51275_percentile': 1.2411860031285713, '52172_percentile': 0.7837383659200077, '50813_percentile': 0.8948481878012255, '50885_percentile': 0.9792081927772578, '50861_percentile': 1.0278241199805602, '50878_percentile': 0.8728548964526895, '50863_percentile': 1.0367758552798285, '50821_percentile': 1.1922864292431445, '50818_percentile': 0.4614737611465759, '50804_percentile': 0.411855527864404, '50910_percentile': 0.9762678875006633}
