# Distribution feature

This notebook uses ML benchmarks to test a pipeline that builds a new feature based on the target variable distribution.

In [1]:
!pip uninstall -y MLBenchmarks && pip install git+https://github.com/rcpsilva/MLBenchmarks@main

Found existing installation: MLBenchmarks 0.1
Uninstalling MLBenchmarks-0.1:
  Successfully uninstalled MLBenchmarks-0.1
Collecting git+https://github.com/rcpsilva/MLBenchmarks@main
  Cloning https://github.com/rcpsilva/MLBenchmarks (to revision main) to c:\users\rcpsi\appdata\local\temp\pip-req-build-yr8tx_tz
  Resolved https://github.com/rcpsilva/MLBenchmarks to commit a4661de432fcb82365c97a35e58b34897ddef248
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: MLBenchmarks
  Building wheel for MLBenchmarks (setup.py): started
  Building wheel for MLBenchmarks (setup.py): finished with status 'done'
  Created wheel for MLBenchmarks: filename=MLBenchmarks-0.1-py3-none-any.whl size=17375 sha256=23fd602bb8bb2eebe7fe4dfc6d8a31f1bc1ee06570eea94176a9af8836d3f794
  Stored in directory: C:\Users\rcpsi\AppData\Local\Temp\pip-ephem-wheel-cache-o2ludehf\wheels\c3\f7\95\155bc37c57bbc7281b0addda642a4521ee2d82

  Running command git clone --filter=blob:none --quiet https://github.com/rcpsilva/MLBenchmarks 'C:\Users\rcpsi\AppData\Local\Temp\pip-req-build-yr8tx_tz'


In [2]:
from MLBenchmarks import classification_datasets_loaders as cdls
from MLBenchmarks import regression_datasets_loaders as rdls
from MLBenchmarks.benchmarking_methods import load_regression_datasets, run_cross_dataset_benchmark_models

In [3]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier,DecisionTreeRegressor
from xgboost import XGBRegressor,XGBClassifier
import numpy as np

In [4]:
class QuartileFeatureRF(BaseEstimator, TransformerMixin):
    """Transformer that outputs a predicted target-quartile label as one feature.

    ``fit`` buckets the target into four classes (0-3) using its 25th, 50th
    and 75th percentiles and trains a ``RandomForestClassifier`` to predict
    that class from ``X``. ``transform`` returns the classifier's prediction
    as a single column.

    NOTE(review): when this transformer is fitted and applied to the same
    rows, the emitted labels are in-sample predictions of a classifier that
    has already seen those targets -- confirm this is intended.
    """

    def __init__(self):
        self.quartile_classifier = RandomForestClassifier()

    def fit(self, X, y):
        """Derive quartile labels from ``y`` and fit the classifier on them.

        Parameters
        ----------
        X : array-like
            Feature matrix forwarded unchanged to the classifier.
        y : array-like
            Numeric target, either 1-D or a single column (e.g. shape
            ``(n, 1)`` or a one-column DataFrame).

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``y`` is neither 1-D nor a single column.
        """
        target = np.asarray(y)
        # Accept column-shaped targets; iterating over them element-wise
        # does not yield scalars, so flatten explicitly.
        if target.ndim == 2 and target.shape[1] == 1:
            target = target[:, 0]
        elif target.ndim != 1:
            raise ValueError(
                f"y must be 1-D or a single column, got shape {target.shape}"
            )

        # np.percentile does not require sorted input.
        thresholds = np.percentile(target, [25, 50, 75])
        # Label = number of thresholds strictly below the value (0..3).
        quartile_labels = (target[:, None] > thresholds).sum(axis=1)

        self.quartile_classifier.fit(X, quartile_labels)
        return self

    def transform(self, X):
        """Return the predicted quartile label for each row, shaped ``(n, 1)``."""
        return self.quartile_classifier.predict(X).reshape(-1, 1)


In [5]:
class QuartileFeatureDT(BaseEstimator, TransformerMixin):
    """Transformer that outputs a predicted target-quartile label as one feature.

    ``fit`` buckets the target into four classes (0-3) using its 25th, 50th
    and 75th percentiles and trains a ``DecisionTreeClassifier`` to predict
    that class from ``X``. ``transform`` returns the classifier's prediction
    as a single column.

    NOTE(review): when this transformer is fitted and applied to the same
    rows, the emitted labels are in-sample predictions of a classifier that
    has already seen those targets -- confirm this is intended.
    """

    def __init__(self):
        self.quartile_classifier = DecisionTreeClassifier()

    def fit(self, X, y):
        """Derive quartile labels from ``y`` and fit the classifier on them.

        Parameters
        ----------
        X : array-like
            Feature matrix forwarded unchanged to the classifier.
        y : array-like
            Numeric target, either 1-D or a single column (e.g. shape
            ``(n, 1)`` or a one-column DataFrame).

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``y`` is neither 1-D nor a single column.
        """
        target = np.asarray(y)
        # Accept column-shaped targets; iterating over them element-wise
        # does not yield scalars, so flatten explicitly.
        if target.ndim == 2 and target.shape[1] == 1:
            target = target[:, 0]
        elif target.ndim != 1:
            raise ValueError(
                f"y must be 1-D or a single column, got shape {target.shape}"
            )

        # np.percentile does not require sorted input.
        thresholds = np.percentile(target, [25, 50, 75])
        # Label = number of thresholds strictly below the value (0..3).
        quartile_labels = (target[:, None] > thresholds).sum(axis=1)

        self.quartile_classifier.fit(X, quartile_labels)
        return self

    def transform(self, X):
        """Return the predicted quartile label for each row, shaped ``(n, 1)``."""
        return self.quartile_classifier.predict(X).reshape(-1, 1)

In [6]:
class QuartileFeatureGB(BaseEstimator, TransformerMixin):
    """Transformer that outputs a predicted target-quartile label as one feature.

    ``fit`` buckets the target into four classes (0-3) using its 25th, 50th
    and 75th percentiles and trains a ``GradientBoostingClassifier`` to
    predict that class from ``X``. ``transform`` returns the classifier's
    prediction as a single column.

    NOTE(review): when this transformer is fitted and applied to the same
    rows, the emitted labels are in-sample predictions of a classifier that
    has already seen those targets -- confirm this is intended.
    """

    def __init__(self):
        self.quartile_classifier = GradientBoostingClassifier()

    def fit(self, X, y):
        """Derive quartile labels from ``y`` and fit the classifier on them.

        Parameters
        ----------
        X : array-like
            Feature matrix forwarded unchanged to the classifier.
        y : array-like
            Numeric target, either 1-D or a single column (e.g. shape
            ``(n, 1)`` or a one-column DataFrame).

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``y`` is neither 1-D nor a single column.
        """
        target = np.asarray(y)
        # Accept column-shaped targets; iterating over them element-wise
        # does not yield scalars, so flatten explicitly.
        if target.ndim == 2 and target.shape[1] == 1:
            target = target[:, 0]
        elif target.ndim != 1:
            raise ValueError(
                f"y must be 1-D or a single column, got shape {target.shape}"
            )

        # np.percentile does not require sorted input.
        thresholds = np.percentile(target, [25, 50, 75])
        # Label = number of thresholds strictly below the value (0..3).
        quartile_labels = (target[:, None] > thresholds).sum(axis=1)

        self.quartile_classifier.fit(X, quartile_labels)
        return self

    def transform(self, X):
        """Return the predicted quartile label for each row, shaped ``(n, 1)``."""
        return self.quartile_classifier.predict(X).reshape(-1, 1)

In [7]:
# Baseline regressor; it is benchmarked on its own and used as the template
# for the final step of every quartile-feature pipeline below.
base_model = DecisionTreeRegressor(max_depth=3)


def make_quartile_pipeline(quartile_transformer, regressor):
    """Build a pipeline: scaled features + predicted-quartile feature -> regressor.

    Parameters
    ----------
    quartile_transformer : transformer
        Custom quartile predictor whose output column is appended to the
        standardized original features.
    regressor : estimator
        Template for the final regression step. It is cloned so that each
        pipeline owns an independent, unfitted copy and fitting a pipeline
        never refits the template instance.

    Returns
    -------
    Pipeline
    """
    return Pipeline([
        ('feature_union', FeatureUnion([
            ('original_features', StandardScaler()),  # Standardize the original features (X)
            ('quartile_feature', quartile_transformer),  # Use the custom quartile predictor
        ])),
        ('regression_model', clone(regressor)),  # Any regression model can be used here
    ])


DT_DT = make_quartile_pipeline(QuartileFeatureDT(), base_model)
DT_RF = make_quartile_pipeline(QuartileFeatureRF(), base_model)
DT_GB = make_quartile_pipeline(QuartileFeatureGB(), base_model)

In [8]:
# Benchmark candidates, keyed by the label used in the results:
# the plain baseline plus the three quartile-feature pipelines.
models = dict(
    DT=base_model,
    DT_DT=DT_DT,
    DT_RF=DT_RF,
    DT_GB=DT_GB,
)

In [9]:
# Scoring names passed to the benchmark (accepts scikit-learn metrics).
metrics = [
    'neg_mean_absolute_percentage_error',
    'neg_mean_absolute_error',
]

In [10]:
# File name handed to the benchmark runner
# (presumably where the results are written -- confirm against MLBenchmarks).
output_json = 'quartile_features.json'

# Regression datasets provided by MLBenchmarks.
datasets = load_regression_datasets()

# Evaluate every entry of `models` on every dataset with the selected metrics.
res = run_cross_dataset_benchmark_models(
    models,
    datasets,
    metrics,
    output_json,
    cv=5,
)

Running load_auto_mpg ...
Running load_bike_sharing_day ...
Running load_bike_sharing_hour ...
Running load_energy_efficiency_y1 ...
Running load_energy_efficiency_y2 ...
Running load_forest_fires ...
Running load_real_state_valuation ...
Running load_student_mat ...
Running load_student_por ...
Running load_wine_quality_red ...
Running load_wine_quality_white ...


100%|██████████| 11/11 [00:00<00:00, 31.06it/s]
100%|██████████| 11/11 [00:01<00:00,  7.75it/s]
100%|██████████| 11/11 [00:50<00:00,  4.57s/it]
100%|██████████| 11/11 [02:28<00:00, 13.52s/it]
100%|██████████| 4/4 [03:20<00:00, 50.21s/it]


In [11]:
# NOTE(review): these assignments rebind `models`, `datasets` and `metrics`,
# replacing the objects that were passed to the benchmark run. Re-run the
# cells that define them before re-running the benchmark cell.
models = list(res.keys())
datasets = list(res[models[0]].keys())
metrics = list(res[models[0]][datasets[0]].keys())

print(models)
print(datasets)
print(metrics)

# Select the metric by key instead of by its position in `metrics`, so the
# report does not depend on the ordering of the result dictionary.
metric = 'test_neg_mean_absolute_percentage_error'  # MAPE

for dataset in datasets:
    print(f'{dataset}')
    print(f'\t{metric}')
    for model in models:
        # Per-fold scores for this model/dataset pair; summarized as mean +- std.
        scores = res[model][dataset][metric]
        print(f'\t\t{model:>35}:\t {np.mean(scores):.3f} \t +- {np.std(scores):.3f}')

['DT', 'DT_DT', 'DT_RF', 'DT_GB']
['load_auto_mpg', 'load_bike_sharing_day', 'load_bike_sharing_hour', 'load_energy_efficiency_y1', 'load_energy_efficiency_y2', 'load_forest_fires', 'load_real_state_valuation', 'load_student_mat', 'load_student_por', 'load_wine_quality_red', 'load_wine_quality_white']
['fit_time', 'score_time', 'test_neg_mean_absolute_percentage_error', 'test_neg_mean_absolute_error', 'memory_usage(MB)']
load_auto_mpg
	test_neg_mean_absolute_percentage_error
		                                 DT:	 -0.140 	 +- 0.021
		                              DT_DT:	 -0.132 	 +- 0.008
		                              DT_RF:	 -0.114 	 +- 0.012
		                              DT_GB:	 -0.113 	 +- 0.015
load_bike_sharing_day
	test_neg_mean_absolute_percentage_error
		                                 DT:	 -0.229 	 +- 0.129
		                              DT_DT:	 -0.198 	 +- 0.137
		                              DT_RF:	 -0.210 	 +- 0.146
		                              DT_GB:	 -0.198 	 +-

In [12]:
# NOTE(review): these assignments rebind `models`, `datasets` and `metrics`,
# replacing the objects that were passed to the benchmark run. Re-run the
# cells that define them before re-running the benchmark cell.
models = list(res.keys())
datasets = list(res[models[0]].keys())
metrics = list(res[models[0]][datasets[0]].keys())

print(models)
print(datasets)
print(metrics)

# Select the metric by key instead of by its position in `metrics`, so the
# report does not depend on the ordering of the result dictionary.
metric = 'test_neg_mean_absolute_error'  # mean_absolute_error

for dataset in datasets:
    print(f'{dataset}')
    print(f'\t{metric}')
    for model in models:
        # Per-fold scores for this model/dataset pair; summarized as mean +- std.
        scores = res[model][dataset][metric]
        print(f'\t\t{model:>35}:\t {np.mean(scores):.3f} \t +- {np.std(scores):.3f}')

['DT', 'DT_DT', 'DT_RF', 'DT_GB']
['load_auto_mpg', 'load_bike_sharing_day', 'load_bike_sharing_hour', 'load_energy_efficiency_y1', 'load_energy_efficiency_y2', 'load_forest_fires', 'load_real_state_valuation', 'load_student_mat', 'load_student_por', 'load_wine_quality_red', 'load_wine_quality_white']
['fit_time', 'score_time', 'test_neg_mean_absolute_percentage_error', 'test_neg_mean_absolute_error', 'memory_usage(MB)']
load_auto_mpg
	test_neg_mean_absolute_error
		                                 DT:	 -3.224 	 +- 1.050
		                              DT_DT:	 -3.081 	 +- 0.873
		                              DT_RF:	 -2.677 	 +- 0.913
		                              DT_GB:	 -2.651 	 +- 1.011
load_bike_sharing_day
	test_neg_mean_absolute_error
		                                 DT:	 -602.228 	 +- 98.133
		                              DT_DT:	 -464.084 	 +- 97.046
		                              DT_RF:	 -523.688 	 +- 131.642
		                              DT_GB:	 -463.228 	 +- 101.574
l