In [None]:
!pip install -r ../requirements.txt

In [70]:
import itertools
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import r2_score, mean_squared_error
import joblib
import pandas as pd
import os
from tqdm import tqdm

# Overall Settings

In [71]:
# --- Locations -------------------------------------------------------------
DATA_PATH = "../data/data.csv"   # CSV file the training data is read from
MODEL_DIR = "../saved_models"    # directory the trained models are written to
os.makedirs(MODEL_DIR, exist_ok=True)  # no error if the directory already exists

# --- Data loading ----------------------------------------------------------
NUMBER_OF_ROWS = None            # passed to read_csv(nrows=...); None reads every row

# --- Training --------------------------------------------------------------
N_ESTIMATORS = 100               # number of boosting rounds per model
SEED = 42                        # seed for reproducible runs
NUMBER_OF_THREADS = os.cpu_count()  # logical CPU count (os.cpu_count() may return None)

# Define Inputs
We use one mandatory input, the "CCSR Procedure Code", and several optional inputs.
For each combination of the optional inputs together with the mandatory input, we train a separate XGBoost model.

In [72]:
# Inputs a caller may or may not provide.
optional_features = ['Age Group', 'Gender', 'Race', 'Ethnicity']
# Input that is part of every feature set.
base_feature = ['CCSR Procedure Code', ]

# Every subset of the optional features (from the empty subset up to all of
# them), each prefixed with the mandatory feature.
all_combinations = [
    base_feature + list(subset)
    for subset_size in range(len(optional_features) + 1)
    for subset in itertools.combinations(optional_features, subset_size)
]

# Define Outputs

In [73]:
# Columns of the data set that the models predict.
targets = [
    'Total Costs',
    'Total Charges',
    'Length of Stay',
    'APR Risk of Mortality',
]

# Loading the Data
Now we load our preprocessed data and convert the numeric columns to floats.
We also label-encode the "APR Risk of Mortality" column.

In [74]:
# Read every column as text; NUMBER_OF_ROWS=None loads the complete file.
df = pd.read_csv(DATA_PATH, nrows=NUMBER_OF_ROWS, dtype=str, low_memory=False)

# Cast the two monetary columns from text to float.
for numeric_column in ('Total Costs', 'Total Charges'):
    df[numeric_column] = df[numeric_column].astype(float)

# The literal "120 +" is mapped to 140 so that the column can be cast to float.
length_of_stay = df['Length of Stay'].replace("120 +", "140")
df['Length of Stay'] = length_of_stay.astype(float)

# Turn the mortality-risk labels into integer codes. The fitted encoder is kept
# so the codes can be mapped back to labels later.
# NOTE(review): LabelEncoder numbers the classes in sorted order, which need not
# match the severity order of the labels — confirm this is acceptable for a
# column that is later used as a regression target.
mortality_encoder = LabelEncoder()
mortality_codes = mortality_encoder.fit_transform(df['APR Risk of Mortality'])
df['APR Risk of Mortality'] = mortality_codes

# Report how many rows were loaded.
print(f"Number of loaded rows: {len(df)}")

Number of loaded rows: 1239850


# Define the model training function

In [77]:
def train_model(features):
    """Fit, score and persist one multi-output XGBoost model for a feature set.

    The given columns of the module-level ``df`` are one-hot encoded and used
    to fit one ``XGBRegressor`` per entry of the module-level ``targets``
    (wrapped in a ``MultiOutputRegressor``). The fitted model is written to
    ``MODEL_DIR`` together with its encoder and metadata.

    Args:
        features: List of column names of ``df`` to use as model inputs.

    Returns:
        A dict with the keys ``"features"``, ``"model_path"`` and, for every
        target, ``"<target>_r2"`` and ``"<target>_mse"``. ``None`` if any step
        raised an exception; the error is printed instead of propagated so a
        single failing combination does not abort the remaining ones.

    Note:
        The scores are computed on the same rows the model was fitted on, so
        they are in-sample metrics and not an estimate of generalisation error.
    """
    try:
        # One-hot encode the input features. Categories unseen during fitting
        # are ignored (encoded as all zeros) at prediction time.
        encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
        X = encoder.fit_transform(df[features])
        y = df[targets].to_numpy()

        # XGBoost configuration (histogram tree method on the CPU).
        base_model = xgb.XGBRegressor(
            n_estimators=N_ESTIMATORS,   # number of boosting rounds
            tree_method='hist',          # histogram-based split finding
            booster='gbtree',            # tree booster
            device='cpu',                # train on the CPU
            max_depth=5,                 # deeper = more complex trees
            subsample=0.8,               # row subsampling per tree
            colsample_bytree=0.8,        # feature subsampling per tree
            learning_rate=0.1,           # use a smaller rate with more estimators
            n_jobs=-1,                   # use all CPU cores
            random_state=SEED,           # row/feature subsampling is random; seed it
        )

        # MultiOutputRegressor fits one clone of the base model per target column.
        model = MultiOutputRegressor(base_model)
        model.fit(X, y)

        # Predict on the very same array the model was fitted on, so that fit
        # and predict see identical input types (plain NumPy, no column names).
        y_pred = model.predict(X)

        scores = {}
        for i, target in enumerate(targets):
            scores[f"{target}_r2"] = r2_score(y[:, i], y_pred[:, i])
            scores[f"{target}_mse"] = mean_squared_error(y[:, i], y_pred[:, i])

        # Persist the model; the file name is derived from the feature names.
        model_name = f"{'__'.join(f.replace(' ', '_') for f in features)}.pkl"
        model_path = os.path.join(MODEL_DIR, model_name)

        joblib.dump({
            "model": model,
            "features": features,
            "encoder": encoder,
            "target_columns": targets,
            "mortality_encoder": mortality_encoder
        }, model_path)

        return {
            "features": features,
            "model_path": model_path,
            **scores
        }

    except Exception as e:
        # Deliberate best effort: report the failure and let the caller skip it.
        print(f"Error training model for features {features}: {e}", flush=True)
        return None

# Train the models (sequential loop; XGBoost itself runs multithreaded)

In [78]:
# Train the feature combinations one after another. train_model returns None
# when a combination fails; those are collected so they can be reported instead
# of silently disappearing from the summary.
results = []
failed_combinations = []
for feature_comb in tqdm(all_combinations, desc="Training Models"):
    result = train_model(feature_comb)
    if result is None:
        failed_combinations.append(feature_comb)
    else:
        results.append(result)

# Save summary
results_df = pd.DataFrame(results)
results_df.to_csv(os.path.join(MODEL_DIR, "model_overview.csv"), index=False)
print("\n📦 Models saved in:", MODEL_DIR)
print("📄 Summary saved as: model_overview.csv")

# Make failures visible: the summary file only contains successful runs.
if failed_combinations:
    print(f"⚠️ {len(failed_combinations)} of {len(all_combinations)} feature combinations failed:")
    for failed_comb in failed_combinations:
        print(f"   - {failed_comb}")


Training Models:   0%|          | 0/16 [00:00<?, ?it/s][A
Training Models:   6%|▋         | 1/16 [00:34<08:37, 34.51s/it][A
Training Models:  12%|█▎        | 2/16 [01:13<08:40, 37.21s/it][A
Training Models:  19%|█▉        | 3/16 [01:52<08:15, 38.10s/it][A
Training Models:  25%|██▌       | 4/16 [02:32<07:46, 38.91s/it][A
Training Models:  31%|███▏      | 5/16 [03:13<07:14, 39.52s/it][A
Training Models:  38%|███▊      | 6/16 [03:57<06:50, 41.06s/it][A
Training Models:  44%|████▍     | 7/16 [04:42<06:20, 42.27s/it][A
Training Models:  50%|█████     | 8/16 [05:26<05:42, 42.87s/it][A
Training Models:  56%|█████▋    | 9/16 [06:10<05:02, 43.21s/it][A
Training Models:  62%|██████▎   | 10/16 [06:54<04:21, 43.54s/it][A
Training Models:  69%|██████▉   | 11/16 [07:40<03:40, 44.11s/it][A
Training Models:  75%|███████▌  | 12/16 [08:26<02:59, 44.82s/it][A
Training Models:  81%|████████▏ | 13/16 [09:12<02:15, 45.15s/it][A
Training Models:  88%|████████▊ | 14/16 [09:59<01:31, 45.61s/it]


📦 Models saved in: ../saved_models
📄 Summary saved as: model_overview.csv



