In [5]:
import pandas as pd
import mlflow
import logging
import os
MLFLOW_EXPERIMENT_NAME = "Real Estate Price Prediction"
ARTIFACT_PATH = "../models_and_artifacts"

mlflow.set_tracking_uri("file:///Users/sergiocarcamo/Dev/thesis/mlruns")

print("Generating Summary")
print("Searching across ALL experiments for final runs...")

runs = mlflow.search_runs(
    search_all_experiments=True,  
    filter_string="tags.\"mlflow.runName\" LIKE '%_BEST'",
    order_by=["metrics.cv_mape ASC"],      
)
print(f"Found {len(runs)} runs matching the filter across all experiments.")

if runs.empty:
    print("Filtered search returned empty results. Cannot create summary.")
    raise ValueError("No runs found matching the filter criteria.")
else:
    print("\nColumns returned by filtered search (all experiments):")
    # Only print relevant columns to avoid clutter
    cols_to_print = [c for c in runs.columns if c.startswith('metrics.') or c.startswith('params.') or c.startswith('tags.') or c == 'run_id' or c == 'experiment_id']

    # Select relevant columns for summary
    summary_cols = [
        "params.model_name",
        "params.feature_set",
        "metrics.test_final_mape",
        "metrics.test_final_rmse",
        "metrics.test_final_r2",
        "metrics.train_final_mape",
        "metrics.train_final_rmse",
        "metrics.best_optuna_cv_mape",
        "params.num_selected_features",
        "metrics.total_pipeline_duration_sec",
        "metrics.tuning_duration_sec",
        "tags.mlflow.runName",
        "experiment_id",
        "params.model__learning_rate",
        "params.model__max_depth",
        "params.model__num_leaves",
        "params.model__n_estimators",
        "params.model__min_child_samples",
        "params.model__colsample_bytree",
        "params.model__subsample",
        "params.model__reg_lambda",
        "params.model__reg_alpha",
    ]
    # Filter out columns that might not exist in all runs
    valid_summary_cols = [col for col in summary_cols if col in runs.columns]

    if not valid_summary_cols or not any(c.startswith("metrics.") or c.startswith("params.") for c in valid_summary_cols):
            print("WARNING: Key parameter/metric columns for summary are missing in filtered runs!")
            print("Displaying basic info instead:")
            cols_to_show_basic = [c for c in ['run_id', 'experiment_id', 'tags.mlflow.runName', 'status'] if c in runs.columns]
            print(runs[cols_to_show_basic])
            summary_df = pd.DataFrame(columns=[]) # Ensure empty dataframe
    else:
            print(f"\nUsing columns for summary: {valid_summary_cols}")
            summary_df = runs[valid_summary_cols].round(4)

            # Display summary
            print("\n--- Experiment Summary (Sorted by Test MAPE) ---")

# Save summary (even if empty)
summary_path = os.path.join(ARTIFACT_PATH, "experiment_summary.csv")
summary_df.to_csv(summary_path, index=False)
print(f"\nSummary saved to: {summary_path}")

Generating Summary
Searching across ALL experiments for final runs...
Found 18 runs matching the filter across all experiments.

Columns returned by filtered search (all experiments):

Using columns for summary: ['params.model_name', 'params.feature_set', 'metrics.test_final_mape', 'metrics.test_final_rmse', 'metrics.test_final_r2', 'metrics.train_final_mape', 'metrics.train_final_rmse', 'metrics.best_optuna_cv_mape', 'params.num_selected_features', 'metrics.total_pipeline_duration_sec', 'metrics.tuning_duration_sec', 'tags.mlflow.runName', 'experiment_id', 'params.model__learning_rate', 'params.model__max_depth', 'params.model__num_leaves', 'params.model__n_estimators', 'params.model__min_child_samples', 'params.model__colsample_bytree', 'params.model__subsample', 'params.model__reg_lambda', 'params.model__reg_alpha']

--- Experiment Summary (Sorted by Test MAPE) ---

Summary saved to: ../models_and_artifacts/experiment_summary.csv


In [6]:
summary_df

Unnamed: 0,params.model_name,params.feature_set,metrics.test_final_mape,metrics.test_final_rmse,metrics.test_final_r2,metrics.train_final_mape,metrics.train_final_rmse,metrics.best_optuna_cv_mape,params.num_selected_features,metrics.total_pipeline_duration_sec,...,experiment_id,params.model__learning_rate,params.model__max_depth,params.model__num_leaves,params.model__n_estimators,params.model__min_child_samples,params.model__colsample_bytree,params.model__subsample,params.model__reg_lambda,params.model__reg_alpha
0,LightGBM,rfe_100,0.0564,96247.4243,0.9371,0.0051,10721.1513,0.0587,100,12871.1213,...,130308261222661570,0.071602284277405,25.0,150.0,1500.0,9.0,0.8682865876740816,0.8025867256934134,1.2905146055508157,0.0027456379719999
1,LightGBM,rfe_100_non_pca,0.0568,96301.3466,0.937,0.0069,12879.4312,0.0587,100,23979.4768,...,656904675591708458,0.062713899736203,25.0,148.0,1400.0,26.0,0.9118670283319584,0.6707380145212968,0.0019458654399173,0.0184931170190726
2,LightGBM,,0.0581,96204.4818,0.9371,0.0113,16943.9614,0.0591,44,45853.1362,...,856005883594059767,0.077256975195537,28.0,97.0,1500.0,11.0,0.8903726842049561,0.7732597144089367,0.0381488206374942,0.0158339828490663
3,XGBoost,,0.0586,96310.2907,0.937,0.0148,21488.6442,0.0603,44,16477.6296,...,856005883594059767,0.1043407598770414,8.0,,950.0,,0.8956167172374453,0.8349453562929189,0.0235340896552428,0.1849070844657376
4,XGBoost,rfe_100,0.0584,99515.0985,0.9327,0.0177,25807.3993,0.0607,100,6125.3913,...,385446753358819974,0.0236441614140082,12.0,,350.0,,0.8471805691284786,0.6978970224603848,0.0080137301796008,0.1157384672248045
5,XGBoost,rfe_100_non_pca,0.0594,97536.153,0.9354,0.0219,31421.9033,0.0608,100,1652.1849,...,250108750998019940,0.0410104421717041,9.0,,550.0,,0.8168285685046426,0.7590483217277055,2.189975959782028,0.0172284676308955
6,RandomForest,rfe_100,0.0594,101684.1417,0.9298,0.0188,35200.0003,0.062,100,73915.9645,...,666687770586148491,,36.0,,500.0,,,,,
7,XGBoostQuantile,rfe_100,0.0601,104661.6695,0.9256,0.0211,55313.4041,0.0621,100,16407.9546,...,186367452582778471,0.0498205784657197,12.0,,900.0,,0.9438945694018376,0.9477913638287904,0.3520361475433595,0.008509805905942
8,RandomForest,rfe_100_non_pca,0.0596,102371.4205,0.9288,0.0179,33794.5799,0.0624,100,45833.6061,...,373136786488109189,,31.0,,250.0,,,,,
9,RandomForest,,0.06,102411.6308,0.9288,0.0233,41003.0735,0.0625,44,167530.9438,...,856005883594059767,,20.0,,1000.0,,,,,
