In [1]:
import pandas as pd
import numpy as np
import optuna
import pickle
import joblib


from sklearn.preprocessing import StandardScaler, OneHotEncoder,OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_pinball_loss, mean_squared_error, mean_absolute_error,mean_absolute_percentage_error
from sklearn.model_selection import KFold


In [2]:
# More information about this dataset: https://vitaldb.net/dataset/?query=overview#h.1fo5zknztqnw
# Load the VitalDB CSV into a DataFrame; all preprocessing below starts from `df`.
df = pd.read_csv("datasets/operation_length_predictions_vitalDB.csv") 

### Preprocessing

In [3]:
def calculate_duration_cols(df):
    """Add hospital-stay, operation and anesthesia durations, in hours.

    Each duration is the difference between the matching end/start columns
    (``dis``/``adm``, ``opend``/``opstart``, ``aneend``/``anestart``) divided
    by 3600, i.e. the timestamps are treated as seconds.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the six timestamp columns listed above.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``duration_stay``, ``duration_operation`` and
        ``duration_anesthesia`` added. The input frame is left untouched so
        that re-running the pipeline always starts from the raw data.
    """
    seconds_per_hour = 3600
    # `assign` builds a new frame instead of writing into the caller's one.
    return df.assign(
        duration_stay=(df['dis'] - df['adm']) / seconds_per_hour,
        duration_operation=(df['opend'] - df['opstart']) / seconds_per_hour,
        duration_anesthesia=(df['aneend'] - df['anestart']) / seconds_per_hour,
    )

def drop_id_cols(df):
    """Remove the columns that are not kept as model inputs.

    Two groups are removed: every column whose name contains ``intraop``,
    and the fixed list below (subject id, raw timestamps, free-text fields,
    a set of pre-op lab / airway / line columns, ``duration_stay`` and
    ``death_inhosp``).

    Returns a new frame; raises ``KeyError`` if a column from the fixed list
    is absent.
    """
    identifiers_and_timestamps = [
        'subjectid',
        'dis', 'adm',
        'opend', 'opstart',
        'aneend', 'anestart',
        'casestart', 'caseend',
    ]
    free_text_and_outcomes = ['icu_days', 'dx', 'opname', 'duration_stay', 'death_inhosp']
    preop_measurements = [
        'preop_ecg', 'preop_ph', 'preop_hco3', 'preop_be', 'preop_pao2',
        'preop_paco2', 'preop_sao2', 'preop_na', 'preop_k',
    ]
    airway_and_lines = [
        'cormack', 'tubesize', 'dltubesize', 'lmasize', 'airway',
        'iv1', 'iv2', 'aline1', 'aline2', 'cline1', 'cline2',
    ]
    fixed_columns = (
        identifiers_and_timestamps
        + free_text_and_outcomes
        + preop_measurements
        + airway_and_lines
    )

    # Substring match on the column name, same rule as DataFrame.filter(like=...).
    intraop_columns = [name for name in df.columns if 'intraop' in str(name)]

    without_intraop = df.drop(columns=intraop_columns)
    return without_intraop.drop(columns=fixed_columns)

def map_position(df):
    """Collapse the raw ``position`` labels into broader groups.

    Values that are not listed below become NaN. The ``position`` column is
    overwritten on the frame that was passed in, which is also returned.
    """
    # Target group -> raw labels that belong to it.
    raw_labels_by_group = {
        'Supine': ['Supine'],
        'Lithotomy': ['Lithotomy'],
        'Lateral Decubitus': [
            'Left lateral decubitus',
            'Right lateral decubitus',
            'Left kidney',
            'Right kidney',
        ],
        'Prone': ['Prone'],
        'Trendelenburg (Inclined)': ['Reverse Trendelenburg', 'Trendelenburg'],
        'Sitting': ['Sitting'],
    }
    group_of = {
        raw_label: group
        for group, raw_labels in raw_labels_by_group.items()
        for raw_label in raw_labels
    }

    df['position'] = df['position'].map(group_of)
    return df

def map_ane_type(df):
    """Reduce ``ane_type`` to 'General Anesthesia' or 'Regional Anesthesia'.

    'General' maps to the former; 'Spinal' and 'Sedationalgesia' map to the
    latter. Any other value becomes NaN. The column is overwritten on the
    frame that was passed in, which is also returned.
    """
    regional_labels = dict.fromkeys(('Spinal', 'Sedationalgesia'), 'Regional Anesthesia')
    anesthesia_group = {'General': 'General Anesthesia', **regional_labels}

    df['ane_type'] = df['ane_type'].map(anesthesia_group)
    return df

def handle_age_more_than_89(df, replacement_age=92):
    """Replace the censored age label ``'>89'`` and make ``age`` numeric.

    Because of the ``'>89'`` label the ``age`` column can hold strings.
    Replacing only that label would leave a column that mixes strings and
    numbers, so the whole column is converted to a numeric dtype afterwards.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``age`` column (numeric, or strings such as ``'45'``
        and ``'>89'``).
    replacement_age : int or float, default 92
        Value substituted for ``'>89'``.

    Returns
    -------
    pandas.DataFrame
        The same frame object, with ``age`` overwritten by numeric values.

    Raises
    ------
    ValueError
        If ``age`` contains a value other than ``'>89'`` that cannot be
        parsed as a number.
    """
    ages = df['age'].astype(object)
    ages = ages.where(ages != '>89', replacement_age)
    # Fail here rather than later inside the scaler if something is unparseable.
    df['age'] = pd.to_numeric(ages)
    return df

def map_preop_pft(df):
    """Group the raw ``preop_pft`` labels into five coarser categories.

    Labels that are not listed below become NaN. The column is overwritten
    on the frame that was passed in, which is also returned.
    """
    # Target category -> raw labels folded into it.
    raw_labels_by_category = {
        "Normal": ["Normal"],
        "Mild/Moderate Obstructive": [
            "Mild obstructive",
            "Moderate obstructive",
            "Borderline obstructive",
        ],
        "Severe Obstructive": ["Severe obstructive", "Mixed or pure obstructive"],
        "Mild/Moderate Restrictive": ["Mild restrictive", "Moderate restrictive"],
        "Severe Restrictive": ["Severe restrictive"],
    }
    category_of = {}
    for category, raw_labels in raw_labels_by_category.items():
        for raw_label in raw_labels:
            category_of[raw_label] = category

    df['preop_pft'] = df['preop_pft'].map(category_of)
    return df
    

def drop_na_rows(df):
    """Return a copy without rows that have any missing value, re-indexed from 0."""
    complete_rows = df.dropna(axis=0, how='any')
    return complete_rows.reset_index(drop=True)

def encode_scale(df, save_path='models_and_preprocessing/preprocessor_duration_anesthesia.pkl'):
    """Fit the preprocessing transformer on ``df``, persist it, and return the transformed data.

    Numerical columns are standardised, categorical columns are one-hot
    encoded (categories unseen at fit time are ignored at transform time),
    and every other column is passed through unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Training features; must contain every column named in
        ``numerical_features`` and ``onehot_features`` below.
    save_path : str, default 'models_and_preprocessing/preprocessor_duration_anesthesia.pkl'
        Where the fitted transformer is pickled. The default matches the one
        used by ``load_and_apply_transformers``. Missing parent directories
        are created.

    Returns
    -------
    numpy.ndarray
        The transformed features, always dense.
    """
    from pathlib import Path  # local import: only needed to create the output folder

    numerical_features = ['age', 'weight','height', 'bmi', 'asa', 'preop_hb', 'preop_plt', 'preop_pt', 'preop_aptt', 'preop_gluc', 'preop_alb', 'preop_ast', 'preop_alt', 'preop_bun','preop_cr']
    onehot_features = ['preop_pft','sex', 'department', 'optype', 'approach', 'ane_type', 'position']

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_features),
            ('onehot', OneHotEncoder(handle_unknown='ignore'), onehot_features),
        ],
        remainder='passthrough',
        # The one-hot block is sparse; without this the stacked output can be
        # a sparse matrix depending on its density, which the later
        # np.array(...) conversion and positional fold indexing do not handle.
        sparse_threshold=0,
    )

    df_processed = preprocessor.fit_transform(df)

    Path(save_path).parent.mkdir(parents=True, exist_ok=True)
    with open(save_path, 'wb') as f:
        pickle.dump(preprocessor, f)

    return df_processed

def load_and_apply_transformers(df, save_path='models_and_preprocessing/preprocessor_duration_anesthesia.pkl'):
    """Load the pickled preprocessor from ``save_path`` and apply it to ``df``.

    The loaded object's ``transform`` is called (no re-fitting) and its
    result is returned as-is.
    """
    # Unpickling runs arbitrary code: only point this at files written by this notebook.
    with open(save_path, 'rb') as pickled_file:
        fitted_preprocessor = pickle.loads(pickled_file.read())

    return fitted_preprocessor.transform(df)

def split_dataset(df,target_column='duration_anesthesia'):
    """Separate features from the target and make an 80/20 train/test split.

    Both duration columns are removed from the features, whichever one is
    the target. The split uses a fixed random_state of 42.

    Returns ``(X_train, X_test, y_train, y_test)``.
    """
    target = df[target_column]
    duration_columns = ['duration_operation', 'duration_anesthesia']
    features = df.drop(columns=duration_columns)

    parts = train_test_split(features, target, test_size=0.2, random_state=42)
    return tuple(parts)

def handle_outliers(X_train, y_train, target_column, lower_bound=0, upper_bound=10):
    """Remove training rows whose target lies outside ``[lower_bound, upper_bound]``.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features.
    y_train : pandas.Series
        Training target; its name must be ``target_column`` and its index
        must match ``X_train``.
    target_column : str
        Name of the target column.
    lower_bound, upper_bound : float, default 0 and 10
        Rows with a target strictly below ``lower_bound`` or strictly above
        ``upper_bound`` are removed. Rows with a missing target are kept.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        The filtered features and target, with their original index labels.
    """
    combined = pd.concat([X_train, y_train], axis=1)
    target = combined[target_column]

    # Written as "not out of range" so NaN targets are kept.
    out_of_range = (target > upper_bound) | (target < lower_bound)
    kept = combined.loc[~out_of_range]

    return kept.drop(columns=[target_column]), kept[target_column]

def process_data(df,target_column):
    """Run the full preprocessing pipeline and return model-ready splits.

    Steps: compute durations, drop unused columns, drop incomplete rows,
    regroup the categorical columns, fix the ``'>89'`` age label, hold out
    the cases with ``caseid < 6`` as a small demo sample, split the rest
    into train/test, remove target outliers from the training split only,
    then fit the encoder/scaler on the training split and apply it to the
    test split.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw case table.
    target_column : str
        Duration column to predict.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, sample)``. ``X_train`` and
        ``X_test`` are the transformed feature matrices; ``sample`` is the
        held-out frame, cleaned but neither encoded nor scaled.
    """
    df = calculate_duration_cols(df)
    df = drop_id_cols(df)
    df = drop_na_rows(df)
    # NOTE(review): the map_* steps run after drop_na_rows, so a label missing
    # from their mappings becomes NaN and stays in the data - confirm intended.
    df = map_position(df)
    df = map_ane_type(df)
    df = handle_age_more_than_89(df)
    df = map_preop_pft(df)

    sample = df[df["caseid"] < 6]
    # `>= 6` (previously `> 6`): caseid 6 was being dropped from both the
    # demo sample and the modelling data.
    df = df[df["caseid"] >= 6].reset_index(drop=True)
    df = df.drop(columns=['caseid'])

    X_train, X_test, y_train, y_test = split_dataset(df,target_column)
    X_train, y_train = handle_outliers(X_train,y_train,target_column)
    # Fit the preprocessor on the training split only, then reuse it for test.
    X_train = encode_scale(X_train)
    X_test = load_and_apply_transformers(X_test)
    return X_train, X_test, y_train, y_test,sample

# Build the train/test splits for the anesthesia-duration target, plus the small held-out sample.
X_train, X_test, y_train, y_test,sample = process_data(df, target_column='duration_anesthesia')

In [4]:
# sample.to_csv("sample.csv", index=False)

### Tuning with optuna

In [5]:
def objective(trial, X, y, n_splits=4):
    """Optuna objective: mean cross-validated pinball loss at the 0.90 quantile.

    ``X`` and ``y`` are indexed positionally with the fold indices, so they
    are expected to be NumPy arrays. Lower return values are better.
    """
    quantile = 0.90

    # Hyperparameter search space (suggestion order is kept stable on purpose).
    candidate = dict(
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.3),
        n_estimators=trial.suggest_int('n_estimators', 100, 500),
        max_depth=trial.suggest_int('max_depth', 2, 6),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 3, 15),
        min_samples_split=trial.suggest_int('min_samples_split', 3, 15),
        subsample=trial.suggest_float('subsample', 0.6, 1.0),
    )

    def fold_loss(fit_rows, held_out_rows):
        """Fit on one fold's training rows and score on its validation rows."""
        regressor = GradientBoostingRegressor(
            loss="quantile",
            alpha=quantile,
            random_state=42,
            **candidate
        )
        regressor.fit(X[fit_rows], y[fit_rows])
        predicted = regressor.predict(X[held_out_rows])
        return mean_pinball_loss(y[held_out_rows], predicted, alpha=quantile)

    folds = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    losses = [fold_loss(fit_rows, held_out_rows) for fit_rows, held_out_rows in folds.split(X)]
    return np.mean(losses)

def optimize_and_train_model(X_train, y_train, n_trials=10, seed=42):
    """Tune the 0.90-quantile gradient-boosting model with Optuna, then refit it.

    Parameters
    ----------
    X_train, y_train : array-like
        Training features and target, passed unchanged to ``objective`` and
        to the final fit.
    n_trials : int, default 10
        Number of Optuna trials.
    seed : int or None, default 42
        Seed for the TPE sampler so that Restart & Run All reproduces the
        same trials and best parameters. Pass ``None`` for an unseeded
        sampler.

    Returns
    -------
    tuple
        ``(final_model, study, best_params)``: the model refitted on all of
        ``X_train`` with the best parameters, the Optuna study, and the
        best-parameter dict.
    """
    # TPE is Optuna's default single-objective sampler; only the seed is new.
    sampler = optuna.samplers.TPESampler(seed=seed)
    study = optuna.create_study(direction="minimize", sampler=sampler)

    study.optimize(lambda trial: objective(trial, X_train, y_train),
                  n_trials=n_trials,
                  show_progress_bar=True)

    best_params = study.best_params
    print("Best parameters:", best_params)

    # Train final model with best parameters
    final_model = GradientBoostingRegressor(
        loss="quantile",
        alpha=0.90,
        random_state=42,
        **best_params
    )
    final_model.fit(X_train, y_train)

    return final_model, study,best_params

def evaluate_model(model, X_test, y_test):
    """Score a fitted quantile model on held-out data.

    Returns a dict with:
    - ``pinball_loss``: mean pinball loss at alpha = 0.90;
    - ``coverage``: fraction of rows whose actual value is less than or
      equal to the prediction.
    """
    predictions = model.predict(X_test)
    loss_at_90 = mean_pinball_loss(y_test, predictions, alpha=0.90)

    is_covered = y_test <= predictions
    return dict(pinball_loss=loss_at_90, coverage=np.mean(is_covered))

# `objective` indexes X and y positionally with the fold indices, so hand it plain NumPy arrays.
X_train_array = np.array(X_train)
y_train_array = np.array(y_train)

# Run the hyperparameter search and refit on the whole training split.
# `best_params` is reused later when the final model is trained.
best_model, study,best_params = optimize_and_train_model(
    X_train_array, 
    y_train_array, 
    n_trials=10
)

# Report pinball loss and empirical coverage on the untouched test split.
metrics = evaluate_model(best_model, X_test, y_test)
print("\nModel Evaluation:")
print(f"Pinball Loss: {metrics['pinball_loss']:.4f}")
print(f"Coverage (should be close to 0.90): {metrics['coverage']:.4f}")

[I 2025-02-16 03:21:04,847] A new study created in memory with name: no-name-adba6c6f-8a5f-4df8-8060-d5e005131e3f


  0%|          | 0/10 [00:00<?, ?it/s]

[I 2025-02-16 03:21:22,113] Trial 0 finished with value: 0.2761709670153281 and parameters: {'learning_rate': 0.11593253773852119, 'n_estimators': 372, 'max_depth': 2, 'min_samples_leaf': 14, 'min_samples_split': 10, 'subsample': 0.8948923707683191}. Best is trial 0 with value: 0.2761709670153281.
[I 2025-02-16 03:21:45,211] Trial 1 finished with value: 0.2885702806878709 and parameters: {'learning_rate': 0.21278747078520102, 'n_estimators': 206, 'max_depth': 5, 'min_samples_leaf': 6, 'min_samples_split': 6, 'subsample': 0.9259242185523049}. Best is trial 0 with value: 0.2761709670153281.
[I 2025-02-16 03:21:52,382] Trial 2 finished with value: 0.27600923123348897 and parameters: {'learning_rate': 0.06406078222241268, 'n_estimators': 144, 'max_depth': 3, 'min_samples_leaf': 8, 'min_samples_split': 13, 'subsample': 0.6256636900136544}. Best is trial 2 with value: 0.27600923123348897.
[I 2025-02-16 03:22:28,271] Trial 3 finished with value: 0.27901865745960097 and parameters: {'learning_

### Train on all data

In [6]:
def train_upper_quantile_model(X_full, y_full, params=None, random_state=42):
    """Train a single Gradient Boosting model for the 0.90 quantile.

    Parameters
    ----------
    X_full, y_full : array-like
        Features and target to fit on.
    params : dict or None, default None
        Hyperparameters forwarded to ``GradientBoostingRegressor``. When
        omitted, the notebook-level ``best_params`` found during tuning is
        used.
    random_state : int or None, default 42
        Seed for the regressor. The models built during tuning all use 42;
        without a seed the fit is not reproducible when ``subsample < 1``.

    Returns
    -------
    GradientBoostingRegressor
        The fitted model.
    """
    if params is None:
        params = best_params  # best params identified during the tuning procedure
    model = GradientBoostingRegressor(
        loss="quantile",
        alpha=0.9,
        random_state=random_state,
        **params
    )
    model.fit(X_full, y_full)
    return model

# Combine training and test data.
# handle_outliers only filtered the training split, so apply the same bounds
# (target between 0 and 10 hours) to the test rows before they become
# training data for the final model.
y_test_values = np.asarray(y_test)
test_in_range = ~((y_test_values > 10) | (y_test_values < 0))
X_full = np.concatenate([X_train, np.asarray(X_test)[test_in_range]], axis=0)
y_full = np.concatenate([y_train, y_test_values[test_in_range]], axis=0)

# Train model
upper_model = train_upper_quantile_model(X_full, y_full)

### Save final model

In [7]:
# Persist the final 0.90-quantile model in the same folder as the pickled preprocessor.
joblib.dump(upper_model, 'models_and_preprocessing/quantile_model_90_duration_anesthesia.joblib')

['models_and_preprocessing/quantile_model_90_duration_anesthesia.joblib']

In [8]:
# Actual anesthesia durations (in hours) of the held-out sample cases (caseid < 6).
sample['duration_anesthesia']

0    3.166667
1    4.433333
2    1.333333
3    5.833333
4    6.500000
Name: duration_anesthesia, dtype: float64