# Benchmarking regression models with MLBenchmarks

## Installation

In [1]:
!pip uninstall -y MLBenchmarks && pip install git+https://github.com/rcpsilva/MLBenchmarks@main

Collecting git+https://github.com/rcpsilva/MLBenchmarks@main
  Cloning https://github.com/rcpsilva/MLBenchmarks (to revision main) to c:\users\rcpsi\appdata\local\temp\pip-req-build-60trcnex
  Resolved https://github.com/rcpsilva/MLBenchmarks to commit e656ee15e423a823e12e35c46dfb88ea48d07bcf
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'


  Running command git clone --filter=blob:none --quiet https://github.com/rcpsilva/MLBenchmarks 'C:\Users\rcpsi\AppData\Local\Temp\pip-req-build-60trcnex'


## Imports

In [2]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.decomposition import PCA
from sklearn.preprocessing import PolynomialFeatures
from MLBenchmarks.benchmarking_methods  import load_regression_datasets, run_cross_dataset_benchmark_models

## Load datasets

In [3]:
# Load all the available regression datasets
datasets = load_regression_datasets()

Running load_auto_mpg ...
Running load_student_mat ...
Running load_student_por ...
Running load_wine_quality_red ...
Running load_wine_quality_white ...


## Define models and pipelines

In [4]:
# Benchmark pipelines
pipeline_linear_rf = Pipeline([
    ('feature_extraction', FeatureUnion([
        ('pca', PCA(n_components=5)),
        ('polynomial_features', PolynomialFeatures(degree=2)),
    ])),
    ('regressor', DecisionTreeRegressor())
])

# Add the modified pipeline and selected models to a dictionary dictionary
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest Regressor": DecisionTreeRegressor(),
    "Pipeline (Linear + Random Forest)": pipeline_linear_rf
}


## Define Evaluation Metrics

In [5]:
metrics = ['neg_mean_absolute_error','explained_variance','neg_root_mean_squared_error'] # accepts scikit-learn metrics

## Run experiment

In [6]:
output_json = 'regression_benchmarks.json'
res = run_cross_dataset_benchmark_models(models, datasets, metrics, output_json, cv=5)

100%|██████████| 5/5 [00:00<00:00, 25.80it/s]
100%|██████████| 5/5 [00:00<00:00,  7.60it/s]
100%|██████████| 5/5 [00:07<00:00,  1.46s/it]
100%|██████████| 3/3 [00:08<00:00,  2.72s/it]


## Print results

In [13]:
import numpy as np

### Print in natural order 

In [None]:
for model in res:
    print(f'{model}')
    for dataset in res[model]:
        print(f'\t{dataset}')
        for metric in res[model][dataset]:
            results = res[model][dataset][metric]
            print(f'\t\t{metric}: {np.mean(results):.3f} +- {np.std(results):.3f}')

### Compare results in each dataset

In [17]:
models = list(res.keys())
models

['Linear Regression',
 'Random Forest Regressor',
 'Pipeline (Linear + Random Forest)']

In [37]:
models = list(res.keys())
datasets = list(res[models[0]].keys())
metrics = list(res[models[0]][datasets[0]].keys())

print(models)
print(datasets)
print(metrics)

for dataset in datasets:
    print(f'{dataset}')
    for metric in metrics[2:3]: # assesing neg_mean_absolute_error
        print(f'\t{metric}')
        for model in models:
            print(f'\t\t{model:>35}:\t {np.mean(res[model][dataset][metric]):.3f} \t +- {np.std(res[model][dataset][metric]):.3f}')

['Linear Regression', 'Random Forest Regressor', 'Pipeline (Linear + Random Forest)']
['load_auto_mpg', 'load_student_mat', 'load_student_por', 'load_wine_quality_red', 'load_wine_quality_white']
['fit_time', 'score_time', 'test_neg_mean_absolute_error', 'test_explained_variance', 'test_neg_root_mean_squared_error', 'memory_usage(MB)']
load_auto_mpg
	test_neg_mean_absolute_error
		                  Linear Regression:	 -2.988 	 +- 0.706
		            Random Forest Regressor:	 -2.880 	 +- 0.907
		  Pipeline (Linear + Random Forest):	 -2.647 	 +- 1.070
load_student_mat
	test_neg_mean_absolute_error
		                  Linear Regression:	 -3.418 	 +- 0.513
		            Random Forest Regressor:	 -4.380 	 +- 0.359
		  Pipeline (Linear + Random Forest):	 -4.815 	 +- 0.641
load_student_por
	test_neg_mean_absolute_error
		                  Linear Regression:	 -2.055 	 +- 0.431
		            Random Forest Regressor:	 -3.006 	 +- 0.474
		  Pipeline (Linear + Random Forest):	 -2.742 	 +- 0.454
lo