In [1]:
import pandas as pd 


In [2]:
df = pd.read_csv("../data/cleaned_choreographies.csv")

feature_columns = ["timeDuration",
"nMovements",
"movementsDifficulty",
"robotSpeech",
"acrobaticMovements",
"movementsRepetition",
"movementsTransitionsDuration",
"humanMovements",
"balance",
"speed",
"bodyPartsCombination",
"musicBPM",
"sameStartEndPositionPlace",
"headMovement",
"armsMovement",
"handsMovement",
"legsMovement",
"feetMovement",
"musicGenre_electronic",
"musicGenre_folk",
"musicGenre_indie",
"musicGenre_latin",
"musicGenre_pop",
"musicGenre_rap",
"musicGenre_rock"
]
target_columns = [c for c in df.columns if c not in feature_columns]
df = df[feature_columns + target_columns]


In [3]:
df.head(20)

Unnamed: 0,timeDuration,nMovements,movementsDifficulty,robotSpeech,acrobaticMovements,movementsRepetition,movementsTransitionsDuration,humanMovements,balance,speed,...,musicGenre_pop,musicGenre_rap,musicGenre_rock,EvaluationChoreographyStoryTelling,EvaluationChoreographyRhythm,EvaluationChoreographyMovementTechnique,EvaluationChoreographyPublicInvolvement,EvaluationChoreographySpaceUse,EvaluationChoreographyHumanCharacterization,EvaluationChoreographyHumanReproducibility
0,140,20,2,0,2,1,1,2,1,1,...,0,0,0,2,2,2,1,2,2,3
1,115,21,2,0,3,1,2,3,2,2,...,0,0,0,3,3,3,3,3,3,2
2,110,20,2,0,3,2,2,3,2,2,...,0,0,0,1,1,1,1,1,1,1
3,100,19,1,0,1,1,1,1,1,1,...,0,0,0,1,2,2,2,2,2,2
4,125,20,3,0,3,2,2,3,3,2,...,0,0,0,1,1,1,1,1,1,3
5,108,20,3,0,3,2,2,3,3,2,...,0,0,0,1,2,1,1,2,1,3
6,140,20,2,0,2,1,1,2,1,1,...,0,0,0,1,1,1,1,2,1,4
7,115,21,2,0,3,1,2,3,2,2,...,0,0,0,2,3,4,2,3,4,4
8,110,20,2,0,3,2,2,3,2,2,...,0,0,0,2,4,3,3,3,4,5
9,100,19,1,0,1,1,1,1,1,1,...,0,0,0,4,4,4,4,4,5,5


In [3]:
df_binary = df.copy()
df_binary[target_columns] = (df_binary[target_columns] >= 4).astype(int)


In [5]:
print(df_binary.head(5))

   timeDuration  nMovements  movementsDifficulty  robotSpeech  \
0           140          20                    2            0   
1           115          21                    2            0   
2           110          20                    2            0   
3           100          19                    1            0   
4           125          20                    3            0   

   acrobaticMovements  movementsRepetition  movementsTransitionsDuration  \
0                   2                    1                             1   
1                   3                    1                             2   
2                   3                    2                             2   
3                   1                    1                             1   
4                   3                    2                             2   

   humanMovements  balance  speed  ...  musicGenre_pop  musicGenre_rap  \
0               2        1      1  ...               0               0   
1   

In [4]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score, make_scorer

In [5]:
# CELL 0: SETUP (RUN THIS CELL FIRST)
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
# import seaborn as sns # Optional, for potentially nicer plots

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, RandomizedSearchCV, cross_validate # Added cross_validate
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler # For Logistic Regression
from sklearn.linear_model import LogisticRegression # For Logistic Regression
from sklearn.ensemble import RandomForestClassifier # For Random Forest
from xgboost import XGBClassifier                     # For XGBoost
from catboost import CatBoostClassifier               # For CatBoost
import torch                                          # For TabPFN (GPU check)
from tabpfn import TabPFNClassifier                   # For TabPFN

from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score

# --- 0.1 Helper function to print metrics ---
def print_metrics(metrics_dict, prefix=""):
    print(f"{prefix}AUC: {metrics_dict.get('roc_auc', float('nan')):.4f}")
    print(f"{prefix}F1: {metrics_dict.get('f1', float('nan')):.4f}")
    print(f"{prefix}Precision: {metrics_dict.get('precision', float('nan')):.4f}")
    print(f"{prefix}Recall: {metrics_dict.get('recall', float('nan')):.4f}")

# --- 0.2 Global Configuration ---
RANDOM_STATE = 42
scoring_metrics = { # Common scoring metrics for CV
    'roc_auc': 'roc_auc',
    'f1': 'f1',
    'precision': 'precision',
    'recall': 'recall'
}

# --- 0.3 Data Preparation (Using your pre-loaded variables) ---
if 'df_binary' not in globals() or not isinstance(df_binary, pd.DataFrame):
    raise NameError("DataFrame 'df_binary' is not defined. Please ensure it's loaded and named correctly.")
if 'feature_columns' not in globals() or not isinstance(feature_columns, list):
    raise NameError("List 'feature_columns' is not defined. Please ensure it's defined correctly.")
if 'target_columns' not in globals() or not isinstance(target_columns, list):
    raise NameError("List 'target_columns' is not defined. Please ensure it's defined correctly.")

print("--- Data Summary (Using Pre-loaded Data) ---")
print(f"Full dataset ('df_binary') shape: {df_binary.shape}")
print(f"Number of features: {len(feature_columns)}") # Should be <= 100 for TabPFN default
print(f"Number of target variables: {len(target_columns)}")

missing_feats = [col for col in feature_columns if col not in df_binary.columns]
if missing_feats:
    raise ValueError(f"The following feature_columns are not in df_binary: {missing_feats}")
missing_targets = [col for col in target_columns if col not in df_binary.columns]
if missing_targets:
    raise ValueError(f"The following target_columns are not in df_binary: {missing_targets}")

X = df_binary[feature_columns].copy()

# --- 0.4 Initialize Global Results Dictionary ---
if 'all_models_results' not in globals():
    all_models_results = {}
    print("Initialized global `all_models_results` dictionary.")
else:
    print("Global `all_models_results` dictionary already exists. New model results will be added/updated.")

--- Data Summary (Using Pre-loaded Data) ---
Full dataset ('df_binary') shape: (8563, 32)
Number of features: 25
Number of target variables: 7
Initialized global `all_models_results` dictionary.


In [8]:
# CELL 1: LOGISTIC REGRESSION (ASSUMES CELL 0 HAS BEEN RUN)

model_name_lr = "LogisticRegression"
# Initialize or clear previous results for this model if re-running cell
all_models_results[model_name_lr] = {} 

print(f"\n\n--- Training Model: {model_name_lr} ---")

for target_name in target_columns:
    print(f"\n===== Training {model_name_lr} for Target: {target_name} =====")
    y = df_binary[target_name].copy()

    # --- Start of copy-pasteable block for each model cell ---
    print(f"Target: {target_name}, N_total: {len(y)}")
    class_counts = y.value_counts(normalize=True) * 100
    print(f"  Class distribution: 0: {class_counts.get(0, 0):.2f}%, 1: {class_counts.get(1, 0):.2f}%")
    if y.nunique() < 2:
        print(f"  WARNING: Target '{target_name}' has only one class. Skipping.")
        all_models_results[model_name_lr][target_name] = {'status': 'Skipped - single class'}
        continue
    current_min_class_samples = y.value_counts().min()
    if current_min_class_samples < 2 :
        print(f"  WARNING: Smallest class in target {target_name} has {current_min_class_samples} sample(s). Cannot stratify. Skipping.")
        all_models_results[model_name_lr][target_name] = {'status': f'Skipped - too few samples for stratification ({current_min_class_samples})'}
        continue
    try:
        X_train, X_holdout, y_train, y_holdout = train_test_split(
            X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
        )
    except ValueError as e:
        print(f"  ERROR during train_test_split for target {target_name}: {e}")
        all_models_results[model_name_lr][target_name] = {'status': f'Skipped - split error: {e}'}
        continue
    print(f"  Train set size: {X_train.shape[0]}, Hold-out set size: {X_holdout.shape[0]}")
    min_class_count_train = y_train.value_counts().min()
    n_cv_splits = min(5, min_class_count_train)
    if n_cv_splits < 2:
        print(f"  WARNING: Smallest class in y_train for target {target_name} has {min_class_count_train} samples. Cannot perform CV. Skipping.")
        all_models_results[model_name_lr][target_name] = {'status': f'Skipped - too few samples for CV ({min_class_count_train})'}
        continue
    cv_splitter = StratifiedKFold(n_splits=n_cv_splits, shuffle=True, random_state=RANDOM_STATE)
    print(f"  Using Stratified {n_cv_splits}-Fold CV.")
    # --- End of copy-pasteable block ---

    # Pipeline and Parameters for Logistic Regression
    lr_pipeline = Pipeline([
        ('preprocessing', Pipeline([
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', RobustScaler())
        ])),
        ('clf', LogisticRegression(solver='liblinear', class_weight='balanced', random_state=RANDOM_STATE, max_iter=1000))
    ])
    lr_param_grid = {
        'clf__C': np.logspace(-3, 3, 7),
        'clf__penalty': ['l1', 'l2']
    }

    # GridSearchCV
    search_cv_lr = GridSearchCV(
        lr_pipeline, lr_param_grid, cv=cv_splitter, scoring=scoring_metrics,
        refit='roc_auc', verbose=0, n_jobs=-1
    )

    print("  Starting GridSearchCV...")
    start_time_cv_lr = time.time()
    try:
        search_cv_lr.fit(X_train, y_train)
    except Exception as e:
        print(f"  ERROR during GridSearchCV for target {target_name}: {e}")
        all_models_results[model_name_lr][target_name] = {'status': f'Skipped - SearchCV error: {e}'}
        continue
    cv_training_time_lr = time.time() - start_time_cv_lr
    print(f"  GridSearchCV completed in {cv_training_time_lr:.2f} seconds.")

    # CV Metrics (Simplified: using mean_test_score from cv_results_)
    results_df_lr = pd.DataFrame(search_cv_lr.cv_results_)
    best_index_lr = search_cv_lr.best_index_
    fold_metrics_summary_lr = {}
    print("\n  Cross-Validation Mean Metrics (for best estimator):")
    for metric_name_cv in scoring_metrics.keys():
        mean_cv_score = results_df_lr.iloc[best_index_lr][f'mean_test_{metric_name_cv}']
        std_cv_score = results_df_lr.iloc[best_index_lr][f'std_test_{metric_name_cv}']
        fold_metrics_summary_lr[metric_name_cv] = {'mean': mean_cv_score, 'std': std_cv_score}
        print(f"    Mean CV {metric_name_cv.upper()}: {mean_cv_score:.4f} ± {std_cv_score:.4f}")
    
    best_params_lr = search_cv_lr.best_params_
    print(f"\n  Best Hyperparameters: {best_params_lr}")
    print(f"  Best CV ROC AUC: {search_cv_lr.best_score_:.4f}")

    # Hold-out Evaluation
    best_model_lr = search_cv_lr.best_estimator_
    start_time_inference_lr = time.time()
    y_holdout_pred_lr = best_model_lr.predict(X_holdout)
    y_holdout_proba_lr = best_model_lr.predict_proba(X_holdout)[:, 1]
    inference_time_lr = time.time() - start_time_inference_lr

    holdout_metrics_lr = {
        'roc_auc': roc_auc_score(y_holdout, y_holdout_proba_lr),
        'f1': f1_score(y_holdout, y_holdout_pred_lr, zero_division=0),
        'precision': precision_score(y_holdout, y_holdout_pred_lr, zero_division=0),
        'recall': recall_score(y_holdout, y_holdout_pred_lr, zero_division=0)
    }
    print("\n  Hold-Out Set Metrics:")
    print_metrics(holdout_metrics_lr)
    print(f"  Inference time on hold-out: {inference_time_lr:.4f} seconds")

    # Coefficients
    coefficients_lr = best_model_lr.named_steps['clf'].coef_[0]
    coeff_df_lr = pd.DataFrame({'feature': X_train.columns, 'coefficient': coefficients_lr})
    coeff_df_lr = coeff_df_lr.sort_values(by='coefficient', key=abs, ascending=False)

    # Store results
    all_models_results[model_name_lr][target_name] = {
        'status': 'Completed',
        'best_params': best_params_lr,
        'cv_training_time': cv_training_time_lr,
        'cv_metrics': fold_metrics_summary_lr,
        'best_cv_roc_auc': search_cv_lr.best_score_,
        'holdout_metrics': holdout_metrics_lr,
        'inference_time': inference_time_lr,
        'coefficients': coeff_df_lr, # LR specific
        'oob_score': np.nan # Not applicable for LR
    }
    print(f"===== Finished {model_name_lr} for Target: {target_name} =====")

print(f"\n--- Completed Training for {model_name_lr} ---")
# Optional: print summary for LR only
# for target_name, results in all_models_results[model_name_lr].items():
#     if results.get('status') == 'Completed':
#         print(f"  Target: {target_name}, Hold-out AUC: {results['holdout_metrics']['roc_auc']:.4f}")
#     else:
#         print(f"  Target: {target_name}, Status: {results.get('status')}")



--- Training Model: LogisticRegression ---

===== Training LogisticRegression for Target: EvaluationChoreographyStoryTelling =====
Target: EvaluationChoreographyStoryTelling, N_total: 8563
  Class distribution: 0: 65.98%, 1: 34.02%
  Train set size: 6850, Hold-out set size: 1713
  Using Stratified 5-Fold CV.
  Starting GridSearchCV...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


  GridSearchCV completed in 1.89 seconds.

  Cross-Validation Mean Metrics (for best estimator):
    Mean CV ROC_AUC: 0.6880 ± 0.0162
    Mean CV F1: 0.5562 ± 0.0149
    Mean CV PRECISION: 0.5111 ± 0.0194
    Mean CV RECALL: 0.6103 ± 0.0092

  Best Hyperparameters: {'clf__C': np.float64(0.01), 'clf__penalty': 'l2'}
  Best CV ROC AUC: 0.6880

  Hold-Out Set Metrics:
AUC: 0.6802
F1: 0.5316
Precision: 0.4985
Recall: 0.5695
  Inference time on hold-out: 0.0024 seconds
===== Finished LogisticRegression for Target: EvaluationChoreographyStoryTelling =====

===== Training LogisticRegression for Target: EvaluationChoreographyRhythm =====
Target: EvaluationChoreographyRhythm, N_total: 8563
  Class distribution: 0: 57.58%, 1: 42.42%
  Train set size: 6850, Hold-out set size: 1713
  Using Stratified 5-Fold CV.
  Starting GridSearchCV...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


  GridSearchCV completed in 0.39 seconds.

  Cross-Validation Mean Metrics (for best estimator):
    Mean CV ROC_AUC: 0.6799 ± 0.0141
    Mean CV F1: 0.6039 ± 0.0181
    Mean CV PRECISION: 0.5991 ± 0.0142
    Mean CV RECALL: 0.6090 ± 0.0242

  Best Hyperparameters: {'clf__C': np.float64(0.001), 'clf__penalty': 'l2'}
  Best CV ROC AUC: 0.6799

  Hold-Out Set Metrics:
AUC: 0.6776
F1: 0.5973
Precision: 0.6062
Recall: 0.5887
  Inference time on hold-out: 0.0024 seconds
===== Finished LogisticRegression for Target: EvaluationChoreographyRhythm =====

===== Training LogisticRegression for Target: EvaluationChoreographyMovementTechnique =====
Target: EvaluationChoreographyMovementTechnique, N_total: 8563
  Class distribution: 0: 65.27%, 1: 34.73%
  Train set size: 6850, Hold-out set size: 1713
  Using Stratified 5-Fold CV.
  Starting GridSearchCV...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


  GridSearchCV completed in 0.43 seconds.

  Cross-Validation Mean Metrics (for best estimator):
    Mean CV ROC_AUC: 0.6637 ± 0.0092
    Mean CV F1: 0.5416 ± 0.0065
    Mean CV PRECISION: 0.5038 ± 0.0125
    Mean CV RECALL: 0.5860 ± 0.0075

  Best Hyperparameters: {'clf__C': np.float64(0.01), 'clf__penalty': 'l2'}
  Best CV ROC AUC: 0.6637

  Hold-Out Set Metrics:
AUC: 0.6696
F1: 0.5529
Precision: 0.5176
Recall: 0.5933
  Inference time on hold-out: 0.0024 seconds
===== Finished LogisticRegression for Target: EvaluationChoreographyMovementTechnique =====

===== Training LogisticRegression for Target: EvaluationChoreographyPublicInvolvement =====
Target: EvaluationChoreographyPublicInvolvement, N_total: 8563
  Class distribution: 0: 68.33%, 1: 31.67%
  Train set size: 6850, Hold-out set size: 1713
  Using Stratified 5-Fold CV.
  Starting GridSearchCV...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


  GridSearchCV completed in 0.39 seconds.

  Cross-Validation Mean Metrics (for best estimator):
    Mean CV ROC_AUC: 0.6577 ± 0.0108
    Mean CV F1: 0.5065 ± 0.0130
    Mean CV PRECISION: 0.4367 ± 0.0106
    Mean CV RECALL: 0.6031 ± 0.0187

  Best Hyperparameters: {'clf__C': np.float64(0.001), 'clf__penalty': 'l2'}
  Best CV ROC AUC: 0.6577

  Hold-Out Set Metrics:
AUC: 0.6658
F1: 0.5267
Precision: 0.4498
Recall: 0.6354
  Inference time on hold-out: 0.0024 seconds
===== Finished LogisticRegression for Target: EvaluationChoreographyPublicInvolvement =====

===== Training LogisticRegression for Target: EvaluationChoreographySpaceUse =====
Target: EvaluationChoreographySpaceUse, N_total: 8563
  Class distribution: 0: 73.02%, 1: 26.98%
  Train set size: 6850, Hold-out set size: 1713
  Using Stratified 5-Fold CV.
  Starting GridSearchCV...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


  GridSearchCV completed in 0.38 seconds.

  Cross-Validation Mean Metrics (for best estimator):
    Mean CV ROC_AUC: 0.6718 ± 0.0203
    Mean CV F1: 0.4962 ± 0.0198
    Mean CV PRECISION: 0.4079 ± 0.0154
    Mean CV RECALL: 0.6337 ± 0.0302

  Best Hyperparameters: {'clf__C': np.float64(0.001), 'clf__penalty': 'l2'}
  Best CV ROC AUC: 0.6718

  Hold-Out Set Metrics:
AUC: 0.6517
F1: 0.4808
Precision: 0.3966
Recall: 0.6104
  Inference time on hold-out: 0.0021 seconds
===== Finished LogisticRegression for Target: EvaluationChoreographySpaceUse =====

===== Training LogisticRegression for Target: EvaluationChoreographyHumanCharacterization =====
Target: EvaluationChoreographyHumanCharacterization, N_total: 8563
  Class distribution: 0: 59.27%, 1: 40.73%
  Train set size: 6850, Hold-out set size: 1713
  Using Stratified 5-Fold CV.
  Starting GridSearchCV...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


  GridSearchCV completed in 0.37 seconds.

  Cross-Validation Mean Metrics (for best estimator):
    Mean CV ROC_AUC: 0.5367 ± 0.0129
    Mean CV F1: 0.4794 ± 0.0124
    Mean CV PRECISION: 0.4311 ± 0.0114
    Mean CV RECALL: 0.5409 ± 0.0262

  Best Hyperparameters: {'clf__C': np.float64(0.1), 'clf__penalty': 'l1'}
  Best CV ROC AUC: 0.5367

  Hold-Out Set Metrics:
AUC: 0.5313
F1: 0.4810
Precision: 0.4317
Recall: 0.5430
  Inference time on hold-out: 0.0024 seconds
===== Finished LogisticRegression for Target: EvaluationChoreographyHumanCharacterization =====

===== Training LogisticRegression for Target: EvaluationChoreographyHumanReproducibility =====
Target: EvaluationChoreographyHumanReproducibility, N_total: 8563
  Class distribution: 0: 23.27%, 1: 76.73%
  Train set size: 6850, Hold-out set size: 1713
  Using Stratified 5-Fold CV.
  Starting GridSearchCV...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


  GridSearchCV completed in 0.40 seconds.

  Cross-Validation Mean Metrics (for best estimator):
    Mean CV ROC_AUC: 0.5500 ± 0.0095
    Mean CV F1: 0.6357 ± 0.0176
    Mean CV PRECISION: 0.7919 ± 0.0029
    Mean CV RECALL: 0.5314 ± 0.0244

  Best Hyperparameters: {'clf__C': np.float64(1000.0), 'clf__penalty': 'l1'}
  Best CV ROC AUC: 0.5500

  Hold-Out Set Metrics:
AUC: 0.5323
F1: 0.6433
Precision: 0.7866
Recall: 0.5441
  Inference time on hold-out: 0.0020 seconds
===== Finished LogisticRegression for Target: EvaluationChoreographyHumanReproducibility =====

--- Completed Training for LogisticRegression ---


In [9]:
# CELL 2: RANDOM FOREST (ASSUMES CELL 0 HAS BEEN RUN)

model_name_rf = "RandomForest"
# Initialize or clear previous results for this model if re-running cell
all_models_results[model_name_rf] = {} 

print(f"\n\n--- Training Model: {model_name_rf} ---")

# RF Specific Hyperparameters (as per your context)
rf_param_dist = {
    'clf__n_estimators': [100, 200, 300], # Reduced for quicker demo, adjust as needed
    'clf__max_depth': [None, 10, 20],
    'clf__min_samples_split': [2, 5, 10],
    'clf__min_samples_leaf': [1, 2, 4],
    'clf__max_features': ['sqrt', 'log2', 0.3] # Use 'sqrt' or 'log2' instead of 'auto'
}
n_iter_random_rf = 10 # Number of iterations for RandomizedSearchCV

for target_name in target_columns:
    print(f"\n===== Training {model_name_rf} for Target: {target_name} =====")
    y = df_binary[target_name].copy()

    # --- Start of copy-pasteable block for each model cell ---
    print(f"Target: {target_name}, N_total: {len(y)}")
    class_counts = y.value_counts(normalize=True) * 100
    print(f"  Class distribution: 0: {class_counts.get(0, 0):.2f}%, 1: {class_counts.get(1, 0):.2f}%")
    if y.nunique() < 2:
        print(f"  WARNING: Target '{target_name}' has only one class. Skipping.")
        all_models_results[model_name_rf][target_name] = {'status': 'Skipped - single class'}
        continue
    current_min_class_samples = y.value_counts().min()
    if current_min_class_samples < 2 :
        print(f"  WARNING: Smallest class in target {target_name} has {current_min_class_samples} sample(s). Cannot stratify. Skipping.")
        all_models_results[model_name_rf][target_name] = {'status': f'Skipped - too few samples for stratification ({current_min_class_samples})'}
        continue
    try:
        X_train, X_holdout, y_train, y_holdout = train_test_split(
            X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
        )
    except ValueError as e:
        print(f"  ERROR during train_test_split for target {target_name}: {e}")
        all_models_results[model_name_rf][target_name] = {'status': f'Skipped - split error: {e}'}
        continue
    print(f"  Train set size: {X_train.shape[0]}, Hold-out set size: {X_holdout.shape[0]}")
    min_class_count_train = y_train.value_counts().min()
    n_cv_splits = min(5, min_class_count_train)
    if n_cv_splits < 2:
        print(f"  WARNING: Smallest class in y_train for target {target_name} has {min_class_count_train} samples. Cannot perform CV. Skipping.")
        all_models_results[model_name_rf][target_name] = {'status': f'Skipped - too few samples for CV ({min_class_count_train})'}
        continue
    cv_splitter = StratifiedKFold(n_splits=n_cv_splits, shuffle=True, random_state=RANDOM_STATE)
    print(f"  Using Stratified {n_cv_splits}-Fold CV.")
    # --- End of copy-pasteable block ---

    # Pipeline for Random Forest
    rf_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='median')), # As per your context
        ('clf', RandomForestClassifier(random_state=RANDOM_STATE,
                                      class_weight='balanced', # As per your context
                                      oob_score=True)) # Enable OOB score
    ])

    # RandomizedSearchCV (as per your context for RF)
    search_cv_rf = RandomizedSearchCV(
        rf_pipeline,
        param_distributions=rf_param_dist,
        n_iter=n_iter_random_rf,
        cv=cv_splitter,
        scoring=scoring_metrics,
        refit='roc_auc',
        verbose=0,
        n_jobs=-1,
        random_state=RANDOM_STATE
    )

    print("  Starting RandomizedSearchCV...")
    start_time_cv_rf = time.time()
    try:
        search_cv_rf.fit(X_train, y_train)
    except Exception as e:
        print(f"  ERROR during RandomizedSearchCV for target {target_name}: {e}")
        all_models_results[model_name_rf][target_name] = {'status': f'Skipped - SearchCV error: {e}'}
        continue
    cv_training_time_rf = time.time() - start_time_cv_rf
    print(f"  RandomizedSearchCV completed in {cv_training_time_rf:.2f} seconds.")

    # CV Metrics
    results_df_rf = pd.DataFrame(search_cv_rf.cv_results_)
    best_index_rf = search_cv_rf.best_index_
    fold_metrics_summary_rf = {}
    print("\n  Cross-Validation Mean Metrics (for best estimator):")
    for metric_name_cv in scoring_metrics.keys():
        mean_cv_score = results_df_rf.iloc[best_index_rf][f'mean_test_{metric_name_cv}']
        std_cv_score = results_df_rf.iloc[best_index_rf][f'std_test_{metric_name_cv}']
        fold_metrics_summary_rf[metric_name_cv] = {'mean': mean_cv_score, 'std': std_cv_score}
        print(f"    Mean CV {metric_name_cv.upper()}: {mean_cv_score:.4f} ± {std_cv_score:.4f}")

    best_params_rf = search_cv_rf.best_params_
    print(f"\n  Best Hyperparameters: {best_params_rf}")
    print(f"  Best CV ROC AUC: {search_cv_rf.best_score_:.4f}")

    # Hold-out Evaluation
    best_model_rf = search_cv_rf.best_estimator_
    
    oob_score_value_rf = np.nan
    if hasattr(best_model_rf.named_steps['clf'], 'oob_score_'):
        oob_score_value_rf = best_model_rf.named_steps['clf'].oob_score_
        print(f"  OOB Score: {oob_score_value_rf:.4f}")

    start_time_inference_rf = time.time()
    y_holdout_pred_rf = best_model_rf.predict(X_holdout)
    y_holdout_proba_rf = best_model_rf.predict_proba(X_holdout)[:, 1]
    inference_time_rf = time.time() - start_time_inference_rf

    holdout_metrics_rf = {
        'roc_auc': roc_auc_score(y_holdout, y_holdout_proba_rf),
        'f1': f1_score(y_holdout, y_holdout_pred_rf, zero_division=0),
        'precision': precision_score(y_holdout, y_holdout_pred_rf, zero_division=0),
        'recall': recall_score(y_holdout, y_holdout_pred_rf, zero_division=0)
    }
    print("\n  Hold-Out Set Metrics:")
    print_metrics(holdout_metrics_rf)
    print(f"  Inference time on hold-out: {inference_time_rf:.4f} seconds")

    # Feature Importances
    importances_rf = best_model_rf.named_steps['clf'].feature_importances_
    importances_df_rf = pd.DataFrame({'feature': X_train.columns, 'importance': importances_rf})
    importances_df_rf = importances_df_rf.sort_values(by='importance', ascending=False)

    # Store results
    all_models_results[model_name_rf][target_name] = {
        'status': 'Completed',
        'best_params': best_params_rf,
        'cv_training_time': cv_training_time_rf,
        'cv_metrics': fold_metrics_summary_rf,
        'best_cv_roc_auc': search_cv_rf.best_score_,
        'holdout_metrics': holdout_metrics_rf,
        'inference_time': inference_time_rf,
        'feature_importances': importances_df_rf, # RF specific
        'oob_score': oob_score_value_rf # RF specific
    }
    print(f"===== Finished {model_name_rf} for Target: {target_name} =====")

print(f"\n--- Completed Training for {model_name_rf} ---")



--- Training Model: RandomForest ---

===== Training RandomForest for Target: EvaluationChoreographyStoryTelling =====
Target: EvaluationChoreographyStoryTelling, N_total: 8563
  Class distribution: 0: 65.98%, 1: 34.02%
  Train set size: 6850, Hold-out set size: 1713
  Using Stratified 5-Fold CV.
  Starting RandomizedSearchCV...
  RandomizedSearchCV completed in 5.81 seconds.

  Cross-Validation Mean Metrics (for best estimator):
    Mean CV ROC_AUC: 0.7319 ± 0.0102
    Mean CV F1: 0.5728 ± 0.0062
    Mean CV PRECISION: 0.5622 ± 0.0144
    Mean CV RECALL: 0.5841 ± 0.0087

  Best Hyperparameters: {'clf__n_estimators': 300, 'clf__min_samples_split': 10, 'clf__min_samples_leaf': 2, 'clf__max_features': 0.3, 'clf__max_depth': 10}
  Best CV ROC AUC: 0.7319
  OOB Score: 0.7006

  Hold-Out Set Metrics:
AUC: 0.7243
F1: 0.5550
Precision: 0.5397
Recall: 0.5712
  Inference time on hold-out: 0.0656 seconds
===== Finished RandomForest for Target: EvaluationChoreographyStoryTelling =====

===== Tr

In [10]:
# CELL 4: XGBOOST (ASSUMES CELL 0 HAS BEEN RUN)

model_name_xgb = "XGBoost"
all_models_results[model_name_xgb] = {} 

print(f"\n\n--- Training Model: {model_name_xgb} ---")

# XGBoost Specific Hyperparameters (Reduced for RandomizedSearchCV demo)
# For a full search, expand this based on your context
xgb_param_dist = {
    'clf__learning_rate': [0.01, 0.05, 0.1], # eta
    'clf__max_depth': [3, 5, 7], # Reduced from [3, 5, 7, 10]
    'clf__subsample': [0.7, 0.8, 0.9], # Reduced from [0.6, 0.8, 1.0]
    'clf__colsample_bytree': [0.7, 0.8, 0.9], # Reduced from [0.6, 0.8, 1.0]
    'clf__n_estimators': [100, 200, 300], # Reduced from [100, 200, 500]
    'clf__min_child_weight': [1, 3], # Reduced from [1, 3, 5]
    'clf__gamma': [0, 0.1], # Reduced from [0, 0.1, 0.2]
    'clf__reg_alpha': [0, 0.1, 1], # Reduced from [0, 1, 10] (L1)
    'clf__reg_lambda': [0.1, 1, 5]  # Reduced from [0, 1, 10] (L2)
}
n_iter_random_xgb = 15 # Number of iterations for RandomizedSearchCV, increase for more thorough search

for target_name in target_columns:
    print(f"\n===== Training {model_name_xgb} for Target: {target_name} =====")
    y = df_binary[target_name].copy()

    # --- Start of copy-pasteable block for each model cell ---
    print(f"Target: {target_name}, N_total: {len(y)}")
    class_counts = y.value_counts(normalize=True) * 100
    print(f"  Class distribution: 0: {class_counts.get(0, 0):.2f}%, 1: {class_counts.get(1, 0):.2f}%")
    if y.nunique() < 2:
        print(f"  WARNING: Target '{target_name}' has only one class. Skipping.")
        all_models_results[model_name_xgb][target_name] = {'status': 'Skipped - single class'}
        continue
    current_min_class_samples = y.value_counts().min()
    if current_min_class_samples < 2 :
        print(f"  WARNING: Smallest class in target {target_name} has {current_min_class_samples} sample(s). Cannot stratify. Skipping.")
        all_models_results[model_name_xgb][target_name] = {'status': f'Skipped - too few samples for stratification ({current_min_class_samples})'}
        continue
    try:
        X_train, X_holdout, y_train, y_holdout = train_test_split(
            X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
        )
    except ValueError as e:
        print(f"  ERROR during train_test_split for target {target_name}: {e}")
        all_models_results[model_name_xgb][target_name] = {'status': f'Skipped - split error: {e}'}
        continue
    print(f"  Train set size: {X_train.shape[0]}, Hold-out set size: {X_holdout.shape[0]}")
    min_class_count_train = y_train.value_counts().min()
    n_cv_splits = min(5, min_class_count_train)
    if n_cv_splits < 2:
        print(f"  WARNING: Smallest class in y_train for target {target_name} has {min_class_count_train} samples. Cannot perform CV. Skipping.")
        all_models_results[model_name_xgb][target_name] = {'status': f'Skipped - too few samples for CV ({min_class_count_train})'}
        continue
    cv_splitter = StratifiedKFold(n_splits=n_cv_splits, shuffle=True, random_state=RANDOM_STATE)
    print(f"  Using Stratified {n_cv_splits}-Fold CV.")
    # --- End of copy-pasteable block ---

    # Define the scale_pos_weight for imbalanced datasets
    # scale_pos_weight = count(negative examples)/count(positive examples)
    # This is an alternative or complement to class_weight='balanced' used in other models
    counter_train = y_train.value_counts()
    scale_pos_weight_val = counter_train.get(0,1) / counter_train.get(1,1) # Default to 1 if a class is missing (should be caught earlier)


    # Pipeline for XGBoost
    xgb_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='median')), # XGBoost can handle NaNs, but for consistency with context
        ('clf', XGBClassifier(
            objective='binary:logistic',
            eval_metric='auc',      # This is for internal XGB eval, not for GridSearchCV scoring
            use_label_encoder=False, # Recommended for newer XGBoost versions
            tree_method='hist',     # Faster for many datasets
            random_state=RANDOM_STATE,
            scale_pos_weight=scale_pos_weight_val # Handle class imbalance
        ))
    ])

    # RandomizedSearchCV for XGBoost
    search_cv_xgb = RandomizedSearchCV(
        xgb_pipeline,
        param_distributions=xgb_param_dist,
        n_iter=n_iter_random_xgb,
        cv=cv_splitter,
        scoring=scoring_metrics,
        refit='roc_auc',
        verbose=0,
        n_jobs=-1,
        random_state=RANDOM_STATE
    )

    print("  Starting RandomizedSearchCV...")
    start_time_cv_xgb = time.time()
    try:
        search_cv_xgb.fit(X_train, y_train)
    except Exception as e:
        print(f"  ERROR during RandomizedSearchCV for {model_name_xgb}, target {target_name}: {e}")
        all_models_results[model_name_xgb][target_name] = {'status': f'Skipped - SearchCV error: {e}'}
        continue
    cv_training_time_xgb = time.time() - start_time_cv_xgb
    print(f"  RandomizedSearchCV completed in {cv_training_time_xgb:.2f} seconds.")

    # CV Metrics
    results_df_xgb = pd.DataFrame(search_cv_xgb.cv_results_)
    best_index_xgb = search_cv_xgb.best_index_
    fold_metrics_summary_xgb = {}
    print("\n  Cross-Validation Mean Metrics (for best estimator):")
    for metric_name_cv in scoring_metrics.keys():
        mean_cv_score = results_df_xgb.iloc[best_index_xgb][f'mean_test_{metric_name_cv}']
        std_cv_score = results_df_xgb.iloc[best_index_xgb][f'std_test_{metric_name_cv}']
        fold_metrics_summary_xgb[metric_name_cv] = {'mean': mean_cv_score, 'std': std_cv_score}
        print(f"    Mean CV {metric_name_cv.upper()}: {mean_cv_score:.4f} ± {std_cv_score:.4f}")

    best_params_xgb = search_cv_xgb.best_params_
    print(f"\n  Best Hyperparameters: {best_params_xgb}")
    print(f"  Best CV ROC AUC: {search_cv_xgb.best_score_:.4f}")

    # Hold-out Evaluation
    best_model_xgb = search_cv_xgb.best_estimator_
    start_time_inference_xgb = time.time()
    y_holdout_pred_xgb = best_model_xgb.predict(X_holdout)
    y_holdout_proba_xgb = best_model_xgb.predict_proba(X_holdout)[:, 1]
    inference_time_xgb = time.time() - start_time_inference_xgb

    holdout_metrics_xgb = {
        'roc_auc': roc_auc_score(y_holdout, y_holdout_proba_xgb),
        'f1': f1_score(y_holdout, y_holdout_pred_xgb, zero_division=0),
        'precision': precision_score(y_holdout, y_holdout_pred_xgb, zero_division=0),
        'recall': recall_score(y_holdout, y_holdout_pred_xgb, zero_division=0)
    }
    print("\n  Hold-Out Set Metrics:")
    print_metrics(holdout_metrics_xgb)
    print(f"  Inference time on hold-out: {inference_time_xgb:.4f} seconds")

    # Feature Importances (default is 'weight', can also be 'gain', 'cover')
    importances_xgb = best_model_xgb.named_steps['clf'].feature_importances_
    importances_df_xgb = pd.DataFrame({'feature': X_train.columns, 'importance': importances_xgb})
    importances_df_xgb = importances_df_xgb.sort_values(by='importance', ascending=False)

    # Store results
    all_models_results[model_name_xgb][target_name] = {
        'status': 'Completed',
        'best_params': best_params_xgb,
        'cv_training_time': cv_training_time_xgb,
        'cv_metrics': fold_metrics_summary_xgb,
        'best_cv_roc_auc': search_cv_xgb.best_score_,
        'holdout_metrics': holdout_metrics_xgb,
        'inference_time': inference_time_xgb,
        'feature_importances': importances_df_xgb,
        'oob_score': np.nan # Not applicable for XGBoost in this Scikit-learn pipeline setup
    }
    print(f"===== Finished {model_name_xgb} for Target: {target_name} =====")

print(f"\n--- Completed Training for {model_name_xgb} ---")



--- Training Model: XGBoost ---

===== Training XGBoost for Target: EvaluationChoreographyStoryTelling =====
Target: EvaluationChoreographyStoryTelling, N_total: 8563
  Class distribution: 0: 65.98%, 1: 34.02%
  Train set size: 6850, Hold-out set size: 1713
  Using Stratified 5-Fold CV.
  Starting RandomizedSearchCV...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


  RandomizedSearchCV completed in 1.67 seconds.

  Cross-Validation Mean Metrics (for best estimator):
    Mean CV ROC_AUC: 0.7359 ± 0.0078
    Mean CV F1: 0.5878 ± 0.0089
    Mean CV PRECISION: 0.5413 ± 0.0135
    Mean CV RECALL: 0.6433 ± 0.0085

  Best Hyperparameters: {'clf__subsample': 0.8, 'clf__reg_lambda': 5, 'clf__reg_alpha': 0, 'clf__n_estimators': 300, 'clf__min_child_weight': 3, 'clf__max_depth': 7, 'clf__learning_rate': 0.01, 'clf__gamma': 0, 'clf__colsample_bytree': 0.7}
  Best CV ROC AUC: 0.7359

  Hold-Out Set Metrics:
AUC: 0.7270
F1: 0.5696
Precision: 0.5286
Recall: 0.6175
  Inference time on hold-out: 0.0044 seconds
===== Finished XGBoost for Target: EvaluationChoreographyStoryTelling =====

===== Training XGBoost for Target: EvaluationChoreographyRhythm =====
Target: EvaluationChoreographyRhythm, N_total: 8563
  Class distribution: 0: 57.58%, 1: 42.42%
  Train set size: 6850, Hold-out set size: 1713
  Using Stratified 5-Fold CV.
  Starting RandomizedSearchCV...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


  RandomizedSearchCV completed in 1.61 seconds.

  Cross-Validation Mean Metrics (for best estimator):
    Mean CV ROC_AUC: 0.7289 ± 0.0084
    Mean CV F1: 0.6223 ± 0.0122
    Mean CV PRECISION: 0.6430 ± 0.0080
    Mean CV RECALL: 0.6034 ± 0.0231

  Best Hyperparameters: {'clf__subsample': 0.8, 'clf__reg_lambda': 5, 'clf__reg_alpha': 0, 'clf__n_estimators': 300, 'clf__min_child_weight': 3, 'clf__max_depth': 7, 'clf__learning_rate': 0.01, 'clf__gamma': 0, 'clf__colsample_bytree': 0.7}
  Best CV ROC AUC: 0.7289

  Hold-Out Set Metrics:
AUC: 0.7339
F1: 0.6219
Precision: 0.6625
Recall: 0.5860
  Inference time on hold-out: 0.0043 seconds
===== Finished XGBoost for Target: EvaluationChoreographyRhythm =====

===== Training XGBoost for Target: EvaluationChoreographyMovementTechnique =====
Target: EvaluationChoreographyMovementTechnique, N_total: 8563
  Class distribution: 0: 65.27%, 1: 34.73%
  Train set size: 6850, Hold-out set size: 1713
  Using Stratified 5-Fold CV.
  Starting RandomizedSe

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


  RandomizedSearchCV completed in 1.36 seconds.

  Cross-Validation Mean Metrics (for best estimator):
    Mean CV ROC_AUC: 0.6946 ± 0.0045
    Mean CV F1: 0.5618 ± 0.0035
    Mean CV PRECISION: 0.5112 ± 0.0079
    Mean CV RECALL: 0.6238 ± 0.0099

  Best Hyperparameters: {'clf__subsample': 0.8, 'clf__reg_lambda': 5, 'clf__reg_alpha': 1, 'clf__n_estimators': 200, 'clf__min_child_weight': 1, 'clf__max_depth': 3, 'clf__learning_rate': 0.01, 'clf__gamma': 0.1, 'clf__colsample_bytree': 0.8}
  Best CV ROC AUC: 0.6946

  Hold-Out Set Metrics:
AUC: 0.7080
F1: 0.5660
Precision: 0.5137
Recall: 0.6303
  Inference time on hold-out: 0.0022 seconds
===== Finished XGBoost for Target: EvaluationChoreographyMovementTechnique =====

===== Training XGBoost for Target: EvaluationChoreographyPublicInvolvement =====
Target: EvaluationChoreographyPublicInvolvement, N_total: 8563
  Class distribution: 0: 68.33%, 1: 31.67%
  Train set size: 6850, Hold-out set size: 1713
  Using Stratified 5-Fold CV.
  Starting

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


  RandomizedSearchCV completed in 1.52 seconds.

  Cross-Validation Mean Metrics (for best estimator):
    Mean CV ROC_AUC: 0.7084 ± 0.0169
    Mean CV F1: 0.5464 ± 0.0175
    Mean CV PRECISION: 0.4962 ± 0.0162
    Mean CV RECALL: 0.6081 ± 0.0233

  Best Hyperparameters: {'clf__subsample': 0.9, 'clf__reg_lambda': 0.1, 'clf__reg_alpha': 0.1, 'clf__n_estimators': 300, 'clf__min_child_weight': 1, 'clf__max_depth': 5, 'clf__learning_rate': 0.01, 'clf__gamma': 0.1, 'clf__colsample_bytree': 0.7}
  Best CV ROC AUC: 0.7084

  Hold-Out Set Metrics:
AUC: 0.6986
F1: 0.5357
Precision: 0.4790
Recall: 0.6077
  Inference time on hold-out: 0.0033 seconds
===== Finished XGBoost for Target: EvaluationChoreographyPublicInvolvement =====

===== Training XGBoost for Target: EvaluationChoreographySpaceUse =====
Target: EvaluationChoreographySpaceUse, N_total: 8563
  Class distribution: 0: 73.02%, 1: 26.98%
  Train set size: 6850, Hold-out set size: 1713
  Using Stratified 5-Fold CV.
  Starting RandomizedSea

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


  RandomizedSearchCV completed in 1.33 seconds.

  Cross-Validation Mean Metrics (for best estimator):
    Mean CV ROC_AUC: 0.7051 ± 0.0186
    Mean CV F1: 0.5145 ± 0.0145
    Mean CV PRECISION: 0.4121 ± 0.0136
    Mean CV RECALL: 0.6851 ± 0.0198

  Best Hyperparameters: {'clf__subsample': 0.8, 'clf__reg_lambda': 5, 'clf__reg_alpha': 1, 'clf__n_estimators': 200, 'clf__min_child_weight': 1, 'clf__max_depth': 3, 'clf__learning_rate': 0.01, 'clf__gamma': 0.1, 'clf__colsample_bytree': 0.8}
  Best CV ROC AUC: 0.7051

  Hold-Out Set Metrics:
AUC: 0.6970
F1: 0.5148
Precision: 0.4089
Recall: 0.6948
  Inference time on hold-out: 0.0023 seconds
===== Finished XGBoost for Target: EvaluationChoreographySpaceUse =====

===== Training XGBoost for Target: EvaluationChoreographyHumanCharacterization =====
Target: EvaluationChoreographyHumanCharacterization, N_total: 8563
  Class distribution: 0: 59.27%, 1: 40.73%
  Train set size: 6850, Hold-out set size: 1713
  Using Stratified 5-Fold CV.
  Starting 

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


  RandomizedSearchCV completed in 1.47 seconds.

  Cross-Validation Mean Metrics (for best estimator):
    Mean CV ROC_AUC: 0.6201 ± 0.0129
    Mean CV F1: 0.5353 ± 0.0191
    Mean CV PRECISION: 0.4925 ± 0.0144
    Mean CV RECALL: 0.5864 ± 0.0267

  Best Hyperparameters: {'clf__subsample': 0.8, 'clf__reg_lambda': 0.1, 'clf__reg_alpha': 0, 'clf__n_estimators': 200, 'clf__min_child_weight': 3, 'clf__max_depth': 7, 'clf__learning_rate': 0.01, 'clf__gamma': 0, 'clf__colsample_bytree': 0.9}
  Best CV ROC AUC: 0.6201

  Hold-Out Set Metrics:
AUC: 0.5968
F1: 0.5155
Precision: 0.4684
Recall: 0.5731
  Inference time on hold-out: 0.0034 seconds
===== Finished XGBoost for Target: EvaluationChoreographyHumanCharacterization =====

===== Training XGBoost for Target: EvaluationChoreographyHumanReproducibility =====
Target: EvaluationChoreographyHumanReproducibility, N_total: 8563
  Class distribution: 0: 23.27%, 1: 76.73%
  Train set size: 6850, Hold-out set size: 1713
  Using Stratified 5-Fold CV.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


  RandomizedSearchCV completed in 1.41 seconds.

  Cross-Validation Mean Metrics (for best estimator):
    Mean CV ROC_AUC: 0.5878 ± 0.0165
    Mean CV F1: 0.7329 ± 0.0076
    Mean CV PRECISION: 0.7966 ± 0.0082
    Mean CV RECALL: 0.6787 ± 0.0096

  Best Hyperparameters: {'clf__subsample': 0.8, 'clf__reg_lambda': 0.1, 'clf__reg_alpha': 0, 'clf__n_estimators': 200, 'clf__min_child_weight': 3, 'clf__max_depth': 7, 'clf__learning_rate': 0.01, 'clf__gamma': 0, 'clf__colsample_bytree': 0.9}
  Best CV ROC AUC: 0.5878

  Hold-Out Set Metrics:
AUC: 0.5850
F1: 0.7469
Precision: 0.7954
Recall: 0.7040
  Inference time on hold-out: 0.0035 seconds
===== Finished XGBoost for Target: EvaluationChoreographyHumanReproducibility =====

--- Completed Training for XGBoost ---


In [11]:
# CELL 5: CATBOOST (ASSUMES CELL 0 HAS BEEN RUN)

model_name_cat = "CatBoost"
all_models_results[model_name_cat] = {}

print(f"\n\n--- Training Model: {model_name_cat} ---")

# CatBoost Specific Hyperparameters (Reduced for RandomizedSearchCV demo)
cat_param_dist = {
    'clf__iterations': [200, 300, 500], # Reduced from [200, 500, 1000]
    'clf__learning_rate': [0.01, 0.03, 0.05, 0.1], # From [0.01, 0.03, 0.1]
    'clf__depth': [4, 6, 8], # Reduced from [4, 6, 8, 10]
    'clf__l2_leaf_reg': [1, 3, 5], # Reduced from [1, 3, 10]
    'clf__bagging_temperature': [0, 1], # Reduced from [0, 1, 2] (for exploration)
    'clf__border_count': [32, 64] # Reduced from [32, 50, 100] (CatBoost default is 254 if on GPU, 32 if on CPU with Hist)
}
n_iter_random_cat = 15 # Number of iterations for RandomizedSearchCV

# Identify categorical feature indices if any (example: 'robotSpeech')
# Assuming 'feature_columns' is the list of feature names used for X
cat_features_indices = [] # Default to no categorical features
# If you want to treat 'robotSpeech' as categorical:
# if 'robotSpeech' in feature_columns:
#    cat_features_indices.append(feature_columns.index('robotSpeech'))
# For this run, we'll assume all are numeric unless specified otherwise.
# CatBoost handles numeric features well.

for target_name in target_columns:
    print(f"\n===== Training {model_name_cat} for Target: {target_name} =====")
    y = df_binary[target_name].copy()

    # --- Start of copy-pasteable block for each model cell ---
    print(f"Target: {target_name}, N_total: {len(y)}")
    class_counts = y.value_counts(normalize=True) * 100
    print(f"  Class distribution: 0: {class_counts.get(0, 0):.2f}%, 1: {class_counts.get(1, 0):.2f}%")
    if y.nunique() < 2:
        print(f"  WARNING: Target '{target_name}' has only one class. Skipping.")
        all_models_results[model_name_cat][target_name] = {'status': 'Skipped - single class'}
        continue
    current_min_class_samples = y.value_counts().min()
    if current_min_class_samples < 2 :
        print(f"  WARNING: Smallest class in target {target_name} has {current_min_class_samples} sample(s). Cannot stratify. Skipping.")
        all_models_results[model_name_cat][target_name] = {'status': f'Skipped - too few samples for stratification ({current_min_class_samples})'}
        continue
    try:
        X_train, X_holdout, y_train, y_holdout = train_test_split(
            X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
        )
    except ValueError as e:
        print(f"  ERROR during train_test_split for target {target_name}: {e}")
        all_models_results[model_name_cat][target_name] = {'status': f'Skipped - split error: {e}'}
        continue
    print(f"  Train set size: {X_train.shape[0]}, Hold-out set size: {X_holdout.shape[0]}")
    min_class_count_train = y_train.value_counts().min()
    n_cv_splits = min(5, min_class_count_train)
    if n_cv_splits < 2:
        print(f"  WARNING: Smallest class in y_train for target {target_name} has {min_class_count_train} samples. Cannot perform CV. Skipping.")
        all_models_results[model_name_cat][target_name] = {'status': f'Skipped - too few samples for CV ({min_class_count_train})'}
        continue
    cv_splitter = StratifiedKFold(n_splits=n_cv_splits, shuffle=True, random_state=RANDOM_STATE)
    print(f"  Using Stratified {n_cv_splits}-Fold CV.")
    # --- End of copy-pasteable block ---
    
    # auto_class_weights for imbalanced datasets. Options: 'Balanced', 'SqrtBalanced' or None
    # This is CatBoost's way to handle class imbalance.
    auto_class_weights_val = 'Balanced'


    # Pipeline for CatBoost
    # CatBoost handles NaNs internally by default, but imputer for consistency or if specific strategy needed.
    cat_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('clf', CatBoostClassifier(
            # cat_features=cat_features_indices, # Pass if you have identified categorical features
            eval_metric='AUC',
            random_seed=RANDOM_STATE,
            verbose=0, # Suppress CatBoost's own output during training iterations
            auto_class_weights=auto_class_weights_val, # Handle class imbalance
            # early_stopping_rounds=50 # Cannot be directly used with scikit-learn CV wrappers effectively. Tune iterations instead.
        ))
    ])

    # RandomizedSearchCV for CatBoost
    search_cv_cat = RandomizedSearchCV(
        cat_pipeline,
        param_distributions=cat_param_dist,
        n_iter=n_iter_random_cat,
        cv=cv_splitter,
        scoring=scoring_metrics,
        refit='roc_auc',
        verbose=0, # Suppress RandomizedSearchCV's output
        n_jobs=-1,
        random_state=RANDOM_STATE
    )

    print("  Starting RandomizedSearchCV...")
    start_time_cv_cat = time.time()
    try:
        search_cv_cat.fit(X_train, y_train) # CatBoost can take X_train (pd.DataFrame) directly if not using imputer
    except Exception as e:
        print(f"  ERROR during RandomizedSearchCV for {model_name_cat}, target {target_name}: {e}")
        all_models_results[model_name_cat][target_name] = {'status': f'Skipped - SearchCV error: {e}'}
        continue
    cv_training_time_cat = time.time() - start_time_cv_cat
    print(f"  RandomizedSearchCV completed in {cv_training_time_cat:.2f} seconds.")

    # CV Metrics
    results_df_cat = pd.DataFrame(search_cv_cat.cv_results_)
    best_index_cat = search_cv_cat.best_index_
    fold_metrics_summary_cat = {}
    print("\n  Cross-Validation Mean Metrics (for best estimator):")
    for metric_name_cv in scoring_metrics.keys():
        mean_cv_score = results_df_cat.iloc[best_index_cat][f'mean_test_{metric_name_cv}']
        std_cv_score = results_df_cat.iloc[best_index_cat][f'std_test_{metric_name_cv}']
        fold_metrics_summary_cat[metric_name_cv] = {'mean': mean_cv_score, 'std': std_cv_score}
        print(f"    Mean CV {metric_name_cv.upper()}: {mean_cv_score:.4f} ± {std_cv_score:.4f}")

    best_params_cat = search_cv_cat.best_params_
    print(f"\n  Best Hyperparameters: {best_params_cat}")
    print(f"  Best CV ROC AUC: {search_cv_cat.best_score_:.4f}")

    # Hold-out Evaluation
    best_model_cat = search_cv_cat.best_estimator_
    start_time_inference_cat = time.time()
    y_holdout_pred_cat = best_model_cat.predict(X_holdout)
    y_holdout_proba_cat = best_model_cat.predict_proba(X_holdout)[:, 1]
    inference_time_cat = time.time() - start_time_inference_cat

    holdout_metrics_cat = {
        'roc_auc': roc_auc_score(y_holdout, y_holdout_proba_cat),
        'f1': f1_score(y_holdout, y_holdout_pred_cat, zero_division=0),
        'precision': precision_score(y_holdout, y_holdout_pred_cat, zero_division=0),
        'recall': recall_score(y_holdout, y_holdout_pred_cat, zero_division=0)
    }
    print("\n  Hold-Out Set Metrics:")
    print_metrics(holdout_metrics_cat)
    print(f"  Inference time on hold-out: {inference_time_cat:.4f} seconds")

    # Feature Importances
    importances_cat = best_model_cat.named_steps['clf'].get_feature_importance()
    importances_df_cat = pd.DataFrame({'feature': X_train.columns, 'importance': importances_cat})
    importances_df_cat = importances_df_cat.sort_values(by='importance', ascending=False)

    # Store results
    all_models_results[model_name_cat][target_name] = {
        'status': 'Completed',
        'best_params': best_params_cat,
        'cv_training_time': cv_training_time_cat,
        'cv_metrics': fold_metrics_summary_cat,
        'best_cv_roc_auc': search_cv_cat.best_score_,
        'holdout_metrics': holdout_metrics_cat,
        'inference_time': inference_time_cat,
        'feature_importances': importances_df_cat,
        'oob_score': np.nan # Not directly applicable/retrieved for CatBoost in this Scikit-learn pipeline setup
    }
    print(f"===== Finished {model_name_cat} for Target: {target_name} =====")

print(f"\n--- Completed Training for {model_name_cat} ---")



--- Training Model: CatBoost ---

===== Training CatBoost for Target: EvaluationChoreographyStoryTelling =====
Target: EvaluationChoreographyStoryTelling, N_total: 8563
  Class distribution: 0: 65.98%, 1: 34.02%
  Train set size: 6850, Hold-out set size: 1713
  Using Stratified 5-Fold CV.
  Starting RandomizedSearchCV...
  RandomizedSearchCV completed in 6.97 seconds.

  Cross-Validation Mean Metrics (for best estimator):
    Mean CV ROC_AUC: 0.7312 ± 0.0103
    Mean CV F1: 0.5824 ± 0.0057
    Mean CV PRECISION: 0.5366 ± 0.0112
    Mean CV RECALL: 0.6369 ± 0.0090

  Best Hyperparameters: {'clf__learning_rate': 0.05, 'clf__l2_leaf_reg': 5, 'clf__iterations': 200, 'clf__depth': 4, 'clf__border_count': 64, 'clf__bagging_temperature': 1}
  Best CV ROC AUC: 0.7312

  Hold-Out Set Metrics:
AUC: 0.7238
F1: 0.5795
Precision: 0.5357
Recall: 0.6312
  Inference time on hold-out: 0.0029 seconds
===== Finished CatBoost for Target: EvaluationChoreographyStoryTelling =====

===== Training CatBoost 

In [10]:
# Install TabPFN
!pip install tabpfn

# TabPFN Community installs optional functionalities around the TabPFN model
# These include post-hoc ensembles, interpretability tools, and more
!git clone https://github.com/PriorLabs/tabpfn-extensions
!pip install -e tabpfn-extensions

fatal: destination path 'tabpfn-extensions' already exists and is not an empty directory.
Obtaining file:///Users/ehsan/Desktop/industry/Evaluations_on_Robotic_Choreographies/classification/tabpfn-extensions
  Installing build dependencies ... [?25ldone
[?25h  Checking if build backend supports build_editable ... [?25ldone
[?25h  Getting requirements to build editable ... [?25ldone
[?25h  Preparing editable metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: tabpfn-extensions
  Building editable for tabpfn-extensions (pyproject.toml) ... [?25ldone
[?25h  Created wheel for tabpfn-extensions: filename=tabpfn_extensions-0.1.0-0.editable-py3-none-any.whl size=11719 sha256=08daf04d677f66805dc5341aca83135b75ecc4a2e2378d6c2f1ed84d197af501
  Stored in directory: /private/var/folders/x6/t819xn051hvgc3lldbwgshrr0000gn/T/pip-ephem-wheel-cache-38fd301m/wheels/8d/36/d8/042dedd103ee83599fe7867254f34dfb8e00cd5109663ebdad
Successfully built tabpfn-extensions
Instal

In [6]:
X = df_binary[feature_columns].copy()
y = df_binary[target_columns[0]].copy()

# Encode target labels to classes


#

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Print dataset description


display(X)

Unnamed: 0,timeDuration,nMovements,movementsDifficulty,robotSpeech,acrobaticMovements,movementsRepetition,movementsTransitionsDuration,humanMovements,balance,speed,...,handsMovement,legsMovement,feetMovement,musicGenre_electronic,musicGenre_folk,musicGenre_indie,musicGenre_latin,musicGenre_pop,musicGenre_rap,musicGenre_rock
0,140,20,2,0,2,1,1,2,1,1,...,1,2,2,0,1,0,0,0,0,0
1,115,21,2,0,3,1,2,3,2,2,...,2,3,2,0,1,0,0,0,0,0
2,110,20,2,0,3,2,2,3,2,2,...,2,3,2,0,1,0,0,0,0,0
3,100,19,1,0,1,1,1,1,1,1,...,1,2,1,1,0,0,0,0,0,0
4,125,20,3,0,3,2,2,3,3,2,...,2,3,2,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8558,131,15,2,0,3,1,1,3,1,2,...,3,3,2,0,0,0,1,0,0,0
8559,135,15,1,1,1,3,1,1,2,1,...,2,1,3,0,0,0,0,0,0,1
8560,130,13,3,1,1,1,1,1,2,2,...,3,1,3,0,0,0,0,0,0,1
8561,134,13,3,0,2,3,1,1,1,1,...,2,2,2,0,0,1,0,0,0,0


In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

# Train and evaluate TabPFN
y_pred = TabPFNClassifier(random_state=42).fit(X_train, y_train).predict_proba(X_test)

# Calculate ROC AUC (handles both binary and multiclass)
score = roc_auc_score(y_test, y_pred if len(np.unique(y)) > 2 else y_pred[:, 1])
print(f"TabPFN ROC AUC: {score:.4f}")

: 

In [13]:
# # CELL 6: COMPARISON PLOTTING (ASSUMES CELL 0 AND MODEL CELLS HAVE BEEN RUN)
# # This is identical to Cell 3 from the previous response.
# # Just re-run it after training XGBoost and CatBoost.

# def plot_model_comparison(results_dict, targets_list, metrics_to_plot=None):
#     """
#     Plots comparison metrics for models present in results_dict.
#     results_dict: The dictionary holding all model results.
#     targets_list: List of target column names.
#     metrics_to_plot: List of metrics like ['roc_auc', 'f1']. Defaults to main four.
#     """
#     if metrics_to_plot is None:
#         metrics_to_plot = ['roc_auc', 'f1', 'precision', 'recall']

#     model_names_available = [
#         model_name for model_name, res_dict in results_dict.items() if res_dict  # Check if model has any results
#     ]
    
#     if not model_names_available:
#         print("No model results found in `all_models_results`. Run model training cells first.")
#         return

#     num_models = len(model_names_available)
    
#     common_completed_targets = []
#     if targets_list: 
#         # Start with targets that completed for the first available model
#         # This ensures that if a model is added later and fails on some targets, 
#         # we still try to plot common targets from earlier successful models.
        
#         # Find all targets that have at least one 'Completed' status from any model
#         all_potentially_plottable_targets = set()
#         for m_name in model_names_available:
#             for t_name, res in results_dict.get(m_name, {}).items():
#                 if res.get('status') == 'Completed':
#                     all_potentially_plottable_targets.add(t_name)
        
#         # Now, filter these to ensure they are 'Completed' for ALL currently available models
#         for t_name in sorted(list(all_potentially_plottable_targets)): # Sort for consistent order
#             is_common_and_completed_for_all_available = True
#             for m_name in model_names_available:
#                 if results_dict.get(m_name, {}).get(t_name, {}).get('status') != 'Completed':
#                     is_common_and_completed_for_all_available = False
#                     break
#             if is_common_and_completed_for_all_available:
#                 common_completed_targets.append(t_name)
    
#     if not common_completed_targets:
#         print("\nNo common targets completed successfully across all available models. Skipping plots.")
#         return

#     num_targets = len(common_completed_targets)
#     bar_width = 0.8 / num_models 
    
#     print(f"\n--- Generating Comparison Plots for Models: {', '.join(model_names_available)} ---")
#     print(f"--- On Common Completed Targets: {', '.join([t.replace('EvaluationChoreography', '') for t in common_completed_targets])} ---")
    
#     for metric in metrics_to_plot:
#         plt.figure(figsize=(max(12, num_targets * 1.1 * (num_models / 1.5) ), 7)) # Adjusted fig size
        
#         metric_values_all_models = {}
#         for model_name in model_names_available:
#             metric_values_all_models[model_name] = [
#                 results_dict[model_name][target_name]['holdout_metrics'].get(metric, 0)
#                 for target_name in common_completed_targets
#             ]
        
#         index = np.arange(num_targets)
        
#         for i, model_name in enumerate(model_names_available):
#             bars = plt.bar(index + i * bar_width, metric_values_all_models[model_name], bar_width, label=model_name)
#             for bar_item in bars:
#                 yval = bar_item.get_height()
#                 plt.text(bar_item.get_x() + bar_item.get_width()/2.0, yval + 0.01, f'{yval:.3f}', 
#                          ha='center', va='bottom', fontsize=7, rotation=30) # Smaller font, less rotation

#         plt.xlabel('Target Variable', fontsize=12)
#         plt.ylabel(metric.replace('_', ' ').title(), fontsize=12)
#         plt.title(f'Hold-Out {metric.replace("_", " ").title()} Comparison', fontsize=14)
        
#         xtick_labels = [t.replace("EvaluationChoreography", "") for t in common_completed_targets]
        
#         plt.xticks(index + bar_width * (num_models - 1) / 2, xtick_labels, rotation=45, ha="right", fontsize=9) # Smaller font for x-ticks
#         plt.yticks(fontsize=10)
#         plt.legend(fontsize=10)
#         plt.grid(axis='y', linestyle='--', alpha=0.7)
#         plt.ylim(0, 1.15) 
#         plt.tight_layout()
#         plt.show()

# # --- Call the plotting function ---
# if 'all_models_results' in globals() and all_models_results:
#     plot_model_comparison(all_models_results, target_columns, metrics_to_plot=['roc_auc', 'f1'])
#     plot_model_comparison(all_models_results, target_columns, metrics_to_plot=['precision', 'recall'])

#     # --- Optional: Display a summary table ---
#     summary_data_list = []
#     for model_name, target_data_dict in all_models_results.items():
#         if not target_data_dict: continue 
#         for target_name, metrics_dict in target_data_dict.items():
#             if metrics_dict.get('status') == 'Completed':
#                 row = {
#                     'Model': model_name,
#                     'Target': target_name.replace("EvaluationChoreography", ""),
#                     'Holdout AUC': metrics_dict['holdout_metrics'].get('roc_auc'),
#                     'Holdout F1': metrics_dict['holdout_metrics'].get('f1'),
#                     'Holdout Precision': metrics_dict['holdout_metrics'].get('precision'),
#                     'Holdout Recall': metrics_dict['holdout_metrics'].get('recall'),
#                     'CV AUC (mean)': metrics_dict['cv_metrics']['roc_auc']['mean'] if 'roc_auc' in metrics_dict.get('cv_metrics',{}) else np.nan,
#                     'Train Time (s)': metrics_dict.get('cv_training_time'),
#                     'Infer Time (s)': metrics_dict.get('inference_time'),
#                     'OOB/Equivalent': metrics_dict.get('oob_score', np.nan)
#                 }
#                 summary_data_list.append(row)

#     if summary_data_list:
#         summary_df_all = pd.DataFrame(summary_data_list)
#         print("\n\n--- Overall Model Performance Summary ---")
        
#         # Define a preferred order for models if desired
#         model_order = ['LogisticRegression', 'RandomForest', 'XGBoost', 'CatBoost'] 
#         # Filter and reorder models present in the summary
#         summary_df_all['Model'] = pd.Categorical(summary_df_all['Model'], categories=[m for m in model_order if m in summary_df_all['Model'].unique()], ordered=True)
#         summary_df_all = summary_df_all.sort_values(by=['Target', 'Model'])

#         # Option 1: Pivot table (good for comparing models across targets for a few key metrics)
#         print("\n--- Pivot Table Summary (Holdout AUC & F1) ---")
#         try:
#             summary_pivot = summary_df_all.pivot_table(
#                 index='Target', 
#                 columns='Model', 
#                 values=['Holdout AUC', 'Holdout F1']
#             )
#             print(summary_pivot.to_string(float_format="%.3f"))
#         except Exception as e:
#             print(f"Could not create pivot table: {e}. Displaying flat table.")
#             print(summary_df_all[['Model', 'Target', 'Holdout AUC', 'Holdout F1']].to_string(float_format="%.3f", index=False))

#         # Option 2: Detailed flat table (shows more metrics)
#         print("\n--- Detailed Flat Table Summary ---")
#         print(summary_df_all[[
#             'Model', 'Target', 'Holdout AUC', 'Holdout F1', 'Holdout Precision', 'Holdout Recall', 
#             'CV AUC (mean)', 'Train Time (s)', 'Infer Time (s)', 'OOB/Equivalent'
#         ]].to_string(float_format="%.3f", index=False))
#     else:
#         print("\nNo completed model results available to summarize.")
# else:
#     print("`all_models_results` is not defined or empty. Run setup and model training cells first.")