# Distribution feature

This notebook uses ML benchmarks to test a pipeline that build a new feature based on the terget variable distribution

In [1]:
!pip uninstall -y MLBenchmarks && pip install git+https://github.com/rcpsilva/MLBenchmarks@main

Found existing installation: MLBenchmarks 0.1
Uninstalling MLBenchmarks-0.1:
  Successfully uninstalled MLBenchmarks-0.1
Collecting git+https://github.com/rcpsilva/MLBenchmarks@main
  Cloning https://github.com/rcpsilva/MLBenchmarks (to revision main) to c:\users\rcpsi\appdata\local\temp\pip-req-build-scojp0te
  Resolved https://github.com/rcpsilva/MLBenchmarks to commit 1097ed04d634608ee37c97bb9b5516c63109714d
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: MLBenchmarks
  Building wheel for MLBenchmarks (setup.py): started
  Building wheel for MLBenchmarks (setup.py): finished with status 'done'
  Created wheel for MLBenchmarks: filename=MLBenchmarks-0.1-py3-none-any.whl size=17234 sha256=6214ea4039f78f702e95dbeeab81a71f26ab2f25bfe54439a60fd19c1a855ae8
  Stored in directory: C:\Users\rcpsi\AppData\Local\Temp\pip-ephem-wheel-cache-vy_ja3vw\wheels\c3\f7\95\155bc37c57bbc7281b0addda642a4521ee2d82

  Running command git clone --filter=blob:none --quiet https://github.com/rcpsilva/MLBenchmarks 'C:\Users\rcpsi\AppData\Local\Temp\pip-req-build-scojp0te'


In [2]:
from MLBenchmarks import classification_datasets_loaders as cdls
from MLBenchmarks import regression_datasets_loaders as rdls
from MLBenchmarks.benchmarking_methods import load_regression_datasets, run_cross_dataset_benchmark_models

In [3]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier,DecisionTreeRegressor
import numpy as np

In [4]:
# Define a custom transformer (feature_model) to predict quartiles based on X
class QuartileRandomForest(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.quartile_classifier = RandomForestClassifier()  # You can use any classifier here
    
    def fit(self, X, y):
        # Calculate quartiles for y
        sorted_y = np.sort(y)
        q1 = np.percentile(sorted_y, 25)
        q2 = np.percentile(sorted_y, 50)
        q3 = np.percentile(sorted_y, 75)
        
        # Create quartile labels for y
        quartile_labels = np.array([self.get_quartile_label(value, q1, q2, q3) for value in y])
        
        # Fit the quartile classifier
        self.quartile_classifier.fit(X, quartile_labels)
        return self
    
    def transform(self, X):
        # Predict quartiles for X
        predicted_quartiles = self.quartile_classifier.predict(X).reshape(-1, 1)
        return predicted_quartiles
    
    def get_quartile_label(self, value, q1, q2, q3):
        if value <= q1:
            return 1
        elif value <= q2:
            return 2
        elif value <= q3:
            return 3
        else:
            return 4

In [5]:
# Define a custom transformer (feature_model) to predict quartiles based on X
class QuartileDecisionTree(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.quartile_classifier = DecisionTreeClassifier()  # You can use any classifier here
    
    def fit(self, X, y):
        # Calculate quartiles for y
        sorted_y = np.sort(y)
        q1 = np.percentile(sorted_y, 25)
        q2 = np.percentile(sorted_y, 50)
        q3 = np.percentile(sorted_y, 75)
        
        # Create quartile labels for y
        quartile_labels = np.array([self.get_quartile_label(value, q1, q2, q3) for value in y])
        
        # Fit the quartile classifier
        self.quartile_classifier.fit(X, quartile_labels)
        return self
    
    def transform(self, X):
        # Predict quartiles for X
        predicted_quartiles = self.quartile_classifier.predict(X).reshape(-1, 1)
        return predicted_quartiles
    
    def get_quartile_label(self, value, q1, q2, q3):
        if value <= q1:
            return 1
        elif value <= q2:
            return 2
        elif value <= q3:
            return 3
        else:
            return 4

In [6]:
# Create the final pipeline with a regression model
dt_lr = Pipeline([
    ('feature_union', FeatureUnion([
                        ('original_features', StandardScaler()),  # Example: Standardize the original features (X)
                        ('quartile_feature', QuartileDecisionTree())  # Use the custom quartile predictor
                    ])),
    ('regression_model', LinearRegression())  # You can use any regression model here
])

rf_lr = Pipeline([
    ('feature_union', FeatureUnion([
                        ('original_features', StandardScaler()),  # Example: Standardize the original features (X)
                        ('quartile_feature', QuartileRandomForest())  # Use the custom quartile predictor
                    ])),
    ('regression_model', LinearRegression())  # You can use any regression model here
])

dt_dt = Pipeline([
    ('feature_union', FeatureUnion([
                        ('original_features', StandardScaler()),  # Example: Standardize the original features (X)
                        ('quartile_feature', QuartileDecisionTree())  # Use the custom quartile predictor
                    ])),
    ('regression_model', DecisionTreeRegressor())  # You can use any regression model here
])

rf_dt = Pipeline([
    ('feature_union', FeatureUnion([
                        ('original_features', StandardScaler()),  # Example: Standardize the original features (X)
                        ('quartile_feature', QuartileRandomForest())  # Use the custom quartile predictor
                    ])),
    ('regression_model', DecisionTreeRegressor())  # You can use any regression model here
])

In [7]:
dataset = rdls.load_auto_mpg()
X = dataset['data']
y = dataset['target']

pipeline = rf_dt

# Fit the pipeline to the data
pipeline.fit(X, y)

# Make predictions
y_pred = pipeline.predict(X)

# Print the predictions
print("Predicted y:", y_pred)

Predicted y: [15.  18.  16.  17.  15.  14.  14.  14.  15.  15.  14.  15.  14.  24.
 22.  18.  21.  27.  26.  25.  24.  25.  26.  21.  10.  10.  11.   9.
 27.  28.  25.  19.  16.  17.  19.  18.  14.  14.  14.  14.  12.  13.
 13.  18.  22.  19.  18.  23.  28.  30.  30.  31.  35.  27.  26.  24.
 25.  23.  20.  21.  13.  14.  15.  14.  17.  11.  13.  12.  13.  19.
 15.  13.  13.  14.  18.  22.  21.  26.  22.  28.  23.  28.  27.  13.
 14.  13.  14.  15.  12.  13.  13.  14.  13.  12.  13.  18.  16.  18.
 18.  23.  26.  11.  12.  13.  12.  18.  20.  21.  22.  18.  19.  21.
 26.  15.  16.  29.  24.  20.  19.  15.  24.  20.  11.  20.  19.  15.
 31.  26.  32.  25.  16.  16.  18.  16.  13.  14.  14.  14.  29.  26.
 26.  31.  32.  28.  24.  26.  24.  26.  31.  19.  18.  15.  15.  16.
 15.  16.  14.  17.  16.  15.  18.  21.  20.  13.  29.  23.  20.  23.
 24.  25.  24.  18.  29.  19.  23.  23.  22.  25.  33.  28.  25.  25.
 26.  27.  17.5 16.  15.5 14.5 22.  22.  24.  22.5 29.  24.5 29.  33.
 20.  1

In [8]:
# Add the modified pipeline and selected models to a dictionary dictionary
models = {
    "dt+lr": dt_lr,
    "rf+lr": rf_lr,
    "dt+dt": dt_dt,
    "rf+dt": rf_dt,
    "DT": DecisionTreeRegressor(),
    "LR": LinearRegression(),
}

In [9]:
metrics = ['neg_mean_absolute_percentage_error','neg_mean_absolute_error'] # accepts scikit-learn metrics

In [10]:
datasets = load_regression_datasets()
output_json = 'quartile_features.json'
res = run_cross_dataset_benchmark_models(models, datasets, metrics, output_json, cv=5)

Running load_auto_mpg ...
Running load_energy_efficiency_y1 ...
Running load_energy_efficiency_y2 ...
Running load_forest_fires ...
Running load_student_mat ...
Running load_student_por ...
Running load_wine_quality_red ...
Running load_wine_quality_white ...


100%|██████████| 8/8 [00:00<00:00,  8.51it/s]
100%|██████████| 8/8 [00:21<00:00,  2.72s/it]
100%|██████████| 8/8 [00:01<00:00,  4.69it/s]
100%|██████████| 8/8 [00:31<00:00,  3.89s/it]
100%|██████████| 8/8 [00:01<00:00,  7.24it/s]
100%|██████████| 8/8 [00:00<00:00, 39.83it/s]
100%|██████████| 6/6 [00:56<00:00,  9.49s/it]


In [11]:
models = list(res.keys())
datasets = list(res[models[0]].keys())
metrics = list(res[models[0]][datasets[0]].keys())

print(models)
print(datasets)
print(metrics)

for dataset in datasets:
    print(f'{dataset}')
    for metric in metrics[2:3]: # MAPE
        print(f'\t{metric}')
        for model in models:
            print(f'\t\t{model:>35}:\t {np.mean(res[model][dataset][metric]):.3f} \t +- {np.std(res[model][dataset][metric]):.3f}')

['dt+lr', 'rf+lr', 'dt+dt', 'rf+dt', 'DT', 'LR']
['load_auto_mpg', 'load_energy_efficiency_y1', 'load_energy_efficiency_y2', 'load_forest_fires', 'load_student_mat', 'load_student_por', 'load_wine_quality_red', 'load_wine_quality_white']
['fit_time', 'score_time', 'test_neg_mean_absolute_percentage_error', 'test_neg_mean_absolute_error', 'memory_usage(MB)']
load_auto_mpg
	test_neg_mean_absolute_percentage_error
		                              dt+lr:	 -0.125 	 +- 0.015
		                              rf+lr:	 -0.111 	 +- 0.015
		                              dt+dt:	 -0.135 	 +- 0.028
		                              rf+dt:	 -0.122 	 +- 0.021
		                                 DT:	 -0.121 	 +- 0.012
		                                 LR:	 -0.142 	 +- 0.036
load_energy_efficiency_y1
	test_neg_mean_absolute_percentage_error
		                              dt+lr:	 -0.127 	 +- 0.032
		                              rf+lr:	 -0.125 	 +- 0.039
		                              dt+dt:	 -0.052 	 +- 0.

In [12]:
models = list(res.keys())
datasets = list(res[models[0]].keys())
metrics = list(res[models[0]][datasets[0]].keys())

print(models)
print(datasets)
print(metrics)

for dataset in datasets:
    print(f'{dataset}')
    for metric in metrics[3:4]: # mean_absolute_error
        print(f'\t{metric}')
        for model in models:
            print(f'\t\t{model:>35}:\t {np.mean(res[model][dataset][metric]):.3f} \t +- {np.std(res[model][dataset][metric]):.3f}')

['dt+lr', 'rf+lr', 'dt+dt', 'rf+dt', 'DT', 'LR']
['load_auto_mpg', 'load_energy_efficiency_y1', 'load_energy_efficiency_y2', 'load_forest_fires', 'load_student_mat', 'load_student_por', 'load_wine_quality_red', 'load_wine_quality_white']
['fit_time', 'score_time', 'test_neg_mean_absolute_percentage_error', 'test_neg_mean_absolute_error', 'memory_usage(MB)']
load_auto_mpg
	test_neg_mean_absolute_error
		                              dt+lr:	 -2.889 	 +- 0.811
		                              rf+lr:	 -2.645 	 +- 1.029
		                              dt+dt:	 -3.179 	 +- 1.431
		                              rf+dt:	 -2.877 	 +- 1.129
		                                 DT:	 -2.825 	 +- 0.804
		                                 LR:	 -2.988 	 +- 0.706
load_energy_efficiency_y1
	test_neg_mean_absolute_error
		                              dt+lr:	 -2.358 	 +- 0.391
		                              rf+lr:	 -2.326 	 +- 0.484
		                              dt+dt:	 -0.841 	 +- 0.669
		                