# Benchmarking regression models with MLBenchmarks

## Installation

In [None]:
!pip uninstall -y MLBenchmarks && pip install git+https://github.com/rcpsilva/MLBenchmarks@main

## Imports

In [1]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.decomposition import PCA
from sklearn.preprocessing import PolynomialFeatures
from MLBenchmarks.benchmarking_methods  import load_regression_datasets, run_cross_dataset_benchmark_models
from MLBenchmarks.benchmarking_methods import count_datasets

In [2]:
# Show the regression dataset loaders that MLBenchmarks exposes.
# NOTE(review): the saved output of this cell ends in an AttributeError raised from
# MLBenchmarks.classification_datasets_loaders — confirm the call completes on a fresh install.
count_datasets()

['load_auto_mpg', 'load_bike_sharing_day', 'load_bike_sharing_hour', 'load_concrete_strength', 'load_energy_efficiency_y1', 'load_energy_efficiency_y2', 'load_facebook_comments', 'load_facebook_engaged_users', 'load_facebook_impressions_liked', 'load_facebook_lifetime_impressions', 'load_facebook_lifetime_reach', 'load_facebook_liked_engaged', 'load_facebook_post_consumers', 'load_facebook_post_consumptions', 'load_facebook_post_interactions', 'load_facebook_post_likes', 'load_facebook_post_shares', 'load_facebook_reach_liked', 'load_forest_fires', 'load_obesity_levels', 'load_real_state_valuation', 'load_spm_demagnetization_FEM', 'load_spm_demagnetization_analytical', 'load_student_mat', 'load_student_por', 'load_wine_quality_red', 'load_wine_quality_white']
27 regression datasets


AttributeError: module 'MLBenchmarks.classification_datasets_loaders' has no attribute 'load_auto_mpg'

## Load datasets

In [None]:
# Load all the available regression datasets.
# `datasets` is later handed to run_cross_dataset_benchmark_models together with the models and metrics.
datasets = load_regression_datasets()

## Define models and pipelines

In [None]:
# Fixed seed so the decision trees give repeatable results between runs
# (scikit-learn permutes candidate features at each split even with the default splitter).
SEED = 0

# Benchmark pipeline: concatenate 5 PCA components with degree-2 polynomial
# features, then fit a decision tree on the combined feature matrix.
# NOTE(review): PCA(n_components=5) presumably needs every dataset to have at
# least 5 input features — confirm against the loaded datasets.
# The variable name is kept for compatibility; the final estimator is a decision tree.
pipeline_linear_rf = Pipeline([
    ('feature_extraction', FeatureUnion([
        ('pca', PCA(n_components=5)),
        ('polynomial_features', PolynomialFeatures(degree=2)),
    ])),
    ('regressor', DecisionTreeRegressor(random_state=SEED))
])

# Map a display name to each estimator to benchmark.
# The names describe the estimators actually used (decision trees, not random forests).
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree Regressor": DecisionTreeRegressor(random_state=SEED),
    "Pipeline (PCA + Poly + Tree)": pipeline_linear_rf
}


## Define evaluation metrics

In [None]:
# Scoring names used for the benchmark; scikit-learn scorer strings are accepted.
metrics = [
    'neg_mean_absolute_error',
    'explained_variance',
    'neg_root_mean_squared_error',
]

## Run experiment

In [None]:
# Run every model on every dataset with the chosen metrics, using cv=5.
# NOTE(review): `output_json` is presumably the file the results are written to — confirm in MLBenchmarks.
output_json = 'regression_benchmarks.json'
res = run_cross_dataset_benchmark_models(
    models,
    datasets,
    metrics,
    output_json,
    cv=5,
)

## Print results

In [None]:
import numpy as np

### Print in natural order 

In [None]:
# Walk the nested results (model -> dataset -> metric) and print the mean and
# standard deviation of the scores stored under each metric.
for model_name, per_dataset in res.items():
    print(f'{model_name}')
    for dataset_name, per_metric in per_dataset.items():
        print(f'\t{dataset_name}')
        for metric_name, scores in per_metric.items():
            avg = np.mean(scores)
            spread = np.std(scores)
            print(f'\t\t{metric_name}: {avg:.3f} +- {spread:.3f}')

### Compare results in each dataset

In [None]:
# Names of the benchmarked models (the top-level keys of `res`).
# Stored under its own name so the `models` dict of estimators is not overwritten,
# which keeps the benchmark cell re-runnable.
model_names = list(res.keys())
model_names

In [None]:
# Collect the keys at each level of the nested results (model -> dataset -> metric).
# Separate *_names variables are used so the `models`, `datasets` and `metrics`
# objects passed to the benchmark are not overwritten by lists of strings.
model_names = list(res.keys())
dataset_names = list(res[model_names[0]].keys())
metric_names = list(res[model_names[0]][dataset_names[0]].keys())

print(model_names)
print(dataset_names)
print(metric_names)

# Pick the metric to compare by name instead of by position, so the choice
# does not depend on the order of the keys stored in `res`.
# NOTE(review): keys may carry a prefix such as 'test_' — hence the suffix match; confirm.
TARGET_METRIC = 'neg_mean_absolute_error'
selected_metrics = [name for name in metric_names if name.endswith(TARGET_METRIC)]
if not selected_metrics:
    # Fall back to the original positional choice when no key matches by name.
    selected_metrics = metric_names[2:3]

for dataset_name in dataset_names:
    print(f'{dataset_name}')
    for metric_name in selected_metrics:  # assessing neg_mean_absolute_error
        print(f'\t{metric_name}')
        for model_name in model_names:
            scores = res[model_name][dataset_name][metric_name]
            print(f'\t\t{model_name:>35}:\t {np.mean(scores):.3f} \t +- {np.std(scores):.3f}')