In [None]:
!pip install -r ../requirements.txt

In [20]:
import itertools
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import r2_score, mean_squared_error
import joblib
import pandas as pd
import os
from tqdm import tqdm

# Overall Settings

In [21]:
DATA_PATH = "../data/data.csv"
MODEL_DIR = "../saved_models_filtered_rf_final"
os.makedirs(MODEL_DIR, exist_ok=False)
os.makedirs(os.path.join(MODEL_DIR, "feature_importance"))

NUMBER_OF_ROWS = None # None means all rows

N_ESTIMATORS = 100
MAX_DEPTH = 5
FILTER_OUTLIERS_FAC = 0.65
SEED = 42
NUMBER_OF_THREADS = os.cpu_count()

print(f"Using {NUMBER_OF_THREADS} threads for model training.")
print(f"Models will be saved in: {MODEL_DIR}")

FileExistsError: [Errno 17] File exists: '../saved_models_filtered_rf_final'

# Define Inputs
We decided to have one mandatory Input with the "CCSR Procedure Code" and many other optional inputs.
For each combination of the optional inputs and the one mandatory input we have to train a random forrest models.

In [22]:
optional_features = ['Age Group', 'Gender', 'Race', 'Ethnicity']
base_feature = ['CCSR Procedure Code', 'Type of Admission']
all_combinations = []

for r in range(len(optional_features) + 1):
    for combo in itertools.combinations(optional_features, r):
        all_combinations.append(base_feature + list(combo))

# Define Outputs

In [23]:
targets = ['Total Costs', 'Length of Stay', 'APR Risk of Mortality']

# Loading the Data
Now we load our preprocessed data and clean some parts up.
We also encode the "APR Risk of Mortality"

In [24]:
df = pd.read_csv(DATA_PATH, dtype=str, low_memory=False, nrows=NUMBER_OF_ROWS)

# make numbers correct
df['Total Costs'] = df['Total Costs'].astype(float)
#df['Total Charges'] = df['Total Charges'].astype(float)
# Replace "120 +" with 140 and convert to float
df['Length of Stay'] = df['Length of Stay'].replace("120 +", "140").astype(float)

# Encode the risk of mortality
mortality_encoder = LabelEncoder()
df['APR Risk of Mortality'] = mortality_encoder.fit_transform(df['APR Risk of Mortality'])

# Print the number of loaded rows
print(f"Number of loaded rows: {len(df)}")

Number of loaded rows: 10


In [25]:
# Python
def filter_outliers(df, feature):
    median = df[feature].median()
    lower_bound = median * (1 - FILTER_OUTLIERS_FAC)
    upper_bound = median * (1 + FILTER_OUTLIERS_FAC)
    return df[(df[feature] >= lower_bound) & (df[feature] <= upper_bound)]

filtered_dfs = []

# Iterate over unique combinations of procedure code and type of admission
for procedure_code, admission_type in df[['CCSR Procedure Code', 'Type of Admission']].drop_duplicates().itertuples(index=False):
    subset = df[(df['CCSR Procedure Code'] == procedure_code) & (df['Type of Admission'] == admission_type)]
    #for feature in targets:
    #    subset = filter_outliers(subset, feature)
    subset = filter_outliers(subset, 'Total Costs')
    filtered_dfs.append(subset)

# Combine all filtered subsets
filtered_df = pd.concat(filtered_dfs, ignore_index=True)
df = filtered_df

# Print the number of rows after filtering
print(f"Number of rows after filtering: {len(filtered_df)}")

Number of rows after filtering: 7


# Define the model training function

In [26]:
def train_model_xg(features):
    try:
        # OneHot-Encoding der Features
        encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
        X_encoded = encoder.fit_transform(df[features])
        X_df = pd.DataFrame(X_encoded, columns=encoder.get_feature_names_out(features))

        y_df = df[targets].reset_index(drop=True)

        # Move data to GPU
        # dtrain = xgb.DMatrix(X_df, label=y_df, device='cuda')

        # XGBoost-Konfiguration für maximale Performance
        base_model = xgb.XGBRegressor(
            n_estimators=N_ESTIMATORS,             # z. B. 100 oder 500
            tree_method='hist',                # GPU!
            booster='gbtree',                      # Tree Booster
            device = 'cpu',                        # GPU!
            max_depth=5,                          # Tiefer = komplexer
            subsample=0.8,                         # Bagging
            colsample_bytree=0.8,                  # Feature Sampling
            learning_rate=0.1,                     # kleiner bei mehr Estimators
            n_jobs = -1  # nutzt alle CPU-Kerne
        )

        # MultiOutputRegressor für 4 Zielspalten
        model = MultiOutputRegressor(base_model)
        model.fit(X_df.to_numpy(), y_df.to_numpy())

        # Vorhersage und Bewertung
        y_pred = model.predict(X_df)

        scores = {}
        for i, target in enumerate(targets):
            scores[f"{target}_r2"] = r2_score(y_df.iloc[:, i], y_pred[:, i])
            scores[f"{target}_mse"] = mean_squared_error(y_df.iloc[:, i], y_pred[:, i])

        # Modell speichern
        model_name = f"{'__'.join(f.replace(' ', '_') for f in features)}.pkl"
        model_path = os.path.join(MODEL_DIR, model_name)

        joblib.dump({
            "model": model,
            "features": features,
            "encoder": encoder,
            "target_columns": targets,
            "mortality_encoder": mortality_encoder
        }, model_path)

        return {
            "features": features,
            "model_path": model_path,
            **scores
        }

    except Exception as e:
        print(f"Error training model for features {features}: {e}", flush=True)
        return None

# Real Random Forest Model Training

In [27]:
def train_model_rf(features):
    try:
        # OneHot-Encoding der Features
        encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
        X_encoded = encoder.fit_transform(df[features])
        X_df = pd.DataFrame(X_encoded, columns=encoder.get_feature_names_out(features))

        y_df = df[targets].reset_index(drop=True)

        # Random Forest-Konfiguration für maximale Performance
        base_model = RandomForestRegressor(
            n_estimators=N_ESTIMATORS,  # Number of trees
            max_depth=MAX_DEPTH,               # Maximum depth of the trees
            n_jobs=NUMBER_OF_THREADS,                 # Use all CPU cores
            random_state=SEED          # Ensure reproducibility
        )

        # MultiOutputRegressor für 4 Zielspalten
        model = MultiOutputRegressor(base_model)
        model.fit(X_df.to_numpy(), y_df.to_numpy())

        # Vorhersage und Bewertung
        y_pred = model.predict(X_df.to_numpy())

        scores = {}
        for i, target in enumerate(targets):
            scores[f"{target}_r2"] = r2_score(y_df.iloc[:, i], y_pred[:, i])
            scores[f"{target}_mse"] = mean_squared_error(y_df.iloc[:, i], y_pred[:, i])

        # Modell speichern
        model_name = f"{'__'.join(f.replace(' ', '_') for f in features)}.pkl"
        model_path = os.path.join(MODEL_DIR, model_name)
        
        # feature importance speichern
        feature_importances = model.estimators_[0].feature_importances_
        feature_importances_df = pd.DataFrame({
            'feature': X_df.columns,
            'importance': feature_importances
        }).sort_values(by='importance', ascending=False)
        feature_importances_path = os.path.join(MODEL_DIR, f"{model_name.replace('.pkl', '_feature_importances.csv')}")
        feature_importances_df.to_csv(feature_importances_path, index=False)

        joblib.dump({
            "model": model,
            "features": features,
            "encoder": encoder,
            "target_columns": targets,
            "mortality_encoder": mortality_encoder
        }, model_path)

        return {
            "features": features,
            "model_path": model_path,
            **scores
        }

    except Exception as e:
        print(f"Error training model for features {features}: {e}", flush=True)
        return None

# Use Multithreading for the model training

In [28]:
train_model = train_model_rf  # Change to train_model_xg for XGBoost

#results = Parallel(n_jobs=NUMBER_OF_THREADS)(
#    delayed(train_model)(feature_comb) for feature_comb in tqdm(all_combinations, desc="Training Models")
#)

results = []
for feature_comb in tqdm(all_combinations, desc="Training Models"):
    result = train_model(feature_comb)
    if result is not None:
        results.append(result)

# Save summary
results_df = pd.DataFrame(results)
results_df.to_csv(os.path.join(MODEL_DIR, "model_overview.csv"), index=False)
print("\n Models saved in:", MODEL_DIR)

Training Models: 100%|██████████| 16/16 [00:03<00:00,  4.01it/s]


 Models saved in: ../saved_models_filtered_rf_final



