# Distribution feature

This notebook uses ML benchmarks to test a pipeline that builds a new feature based on the target variable distribution.

In [None]:
!pip uninstall -y MLBenchmarks && pip install git+https://github.com/rcpsilva/MLBenchmarks@main

In [None]:
from MLBenchmarks import classification_datasets_loaders as cdls
from MLBenchmarks import regression_datasets_loaders as rdls
from MLBenchmarks.benchmarking_methods import load_regression_datasets, run_cross_dataset_benchmark_models

In [None]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier,DecisionTreeRegressor
from xgboost import XGBRegressor,XGBClassifier
import numpy as np

In [None]:
class QuartileFeatureRF(BaseEstimator, TransformerMixin):
    """Transformer that emits a predicted target-quartile feature (random forest).

    ``fit`` bins the training target into four classes (0-3) using its 25th,
    50th and 75th percentiles and trains a ``RandomForestClassifier`` (default
    hyper-parameters) to predict that class from ``X``. ``transform`` returns
    the predicted class as a single column.

    NOTE(review): when this transformer is fitted and applied to the same rows
    (e.g. inside a ``FeatureUnion`` during pipeline fitting), the feature is an
    in-sample prediction of a classifier that has already seen the target, so
    it is likely more accurate on training rows than on unseen rows. Consider
    out-of-fold predictions if that matters for the experiment.
    """

    def __init__(self):
        self.quartile_classifier = RandomForestClassifier()

    def fit(self, X, y):
        """Train the quartile classifier.

        Parameters
        ----------
        X : array-like
            Feature matrix, passed unchanged to the classifier.
        y : array-like
            Continuous target; lists, Series and ``(n, 1)`` columns are
            accepted because the values are flattened first.

        Returns
        -------
        self
        """
        # Flatten so that iteration order / shape of the container is irrelevant
        # (iterating a DataFrame would yield column names, not values).
        target = np.asarray(y).ravel()
        cut_points = np.percentile(target, [25, 50, 75])
        # Label = number of cut points strictly below the value, i.e. 0..3.
        quartile_labels = (target[:, None] > cut_points).sum(axis=1)
        self.quartile_classifier.fit(X, quartile_labels)
        return self

    def transform(self, X):
        """Return the predicted quartile class of each row as an ``(n, 1)`` array."""
        return self.quartile_classifier.predict(X).reshape(-1, 1)


In [None]:
class QuartileFeatureDT(BaseEstimator, TransformerMixin):
    """Transformer that emits a predicted target-quartile feature (decision tree).

    ``fit`` bins the training target into four classes (0-3) using its 25th,
    50th and 75th percentiles and trains a ``DecisionTreeClassifier`` (default
    hyper-parameters) to predict that class from ``X``. ``transform`` returns
    the predicted class as a single column.

    NOTE(review): when this transformer is fitted and applied to the same rows
    (e.g. inside a ``FeatureUnion`` during pipeline fitting), the feature is an
    in-sample prediction of a classifier that has already seen the target, so
    it is likely more accurate on training rows than on unseen rows. Consider
    out-of-fold predictions if that matters for the experiment.
    """

    def __init__(self):
        self.quartile_classifier = DecisionTreeClassifier()

    def fit(self, X, y):
        """Train the quartile classifier.

        Parameters
        ----------
        X : array-like
            Feature matrix, passed unchanged to the classifier.
        y : array-like
            Continuous target; lists, Series and ``(n, 1)`` columns are
            accepted because the values are flattened first.

        Returns
        -------
        self
        """
        # Flatten so that iteration order / shape of the container is irrelevant
        # (iterating a DataFrame would yield column names, not values).
        target = np.asarray(y).ravel()
        cut_points = np.percentile(target, [25, 50, 75])
        # Label = number of cut points strictly below the value, i.e. 0..3.
        quartile_labels = (target[:, None] > cut_points).sum(axis=1)
        self.quartile_classifier.fit(X, quartile_labels)
        return self

    def transform(self, X):
        """Return the predicted quartile class of each row as an ``(n, 1)`` array."""
        return self.quartile_classifier.predict(X).reshape(-1, 1)

In [None]:
class QuartileFeatureGB(BaseEstimator, TransformerMixin):
    """Transformer that emits a predicted target-quartile feature (gradient boosting).

    ``fit`` bins the training target into four classes (0-3) using its 25th,
    50th and 75th percentiles and trains a ``GradientBoostingClassifier``
    (default hyper-parameters) to predict that class from ``X``. ``transform``
    returns the predicted class as a single column.

    NOTE(review): when this transformer is fitted and applied to the same rows
    (e.g. inside a ``FeatureUnion`` during pipeline fitting), the feature is an
    in-sample prediction of a classifier that has already seen the target, so
    it is likely more accurate on training rows than on unseen rows. Consider
    out-of-fold predictions if that matters for the experiment.
    """

    def __init__(self):
        self.quartile_classifier = GradientBoostingClassifier()

    def fit(self, X, y):
        """Train the quartile classifier.

        Parameters
        ----------
        X : array-like
            Feature matrix, passed unchanged to the classifier.
        y : array-like
            Continuous target; lists, Series and ``(n, 1)`` columns are
            accepted because the values are flattened first.

        Returns
        -------
        self
        """
        # Flatten so that iteration order / shape of the container is irrelevant
        # (iterating a DataFrame would yield column names, not values).
        target = np.asarray(y).ravel()
        cut_points = np.percentile(target, [25, 50, 75])
        # Label = number of cut points strictly below the value, i.e. 0..3.
        quartile_labels = (target[:, None] > cut_points).sum(axis=1)
        self.quartile_classifier.fit(X, quartile_labels)
        return self

    def transform(self, X):
        """Return the predicted quartile class of each row as an ``(n, 1)`` array."""
        return self.quartile_classifier.predict(X).reshape(-1, 1)

In [None]:
def make_quartile_pipeline(quartile_feature):
    """Build a regression pipeline augmented with a predicted-quartile feature.

    The pipeline concatenates the standardized original features with the
    single column produced by ``quartile_feature`` and feeds the result to a
    depth-3 decision tree regressor.

    Parameters
    ----------
    quartile_feature : transformer
        Transformer whose output column is appended to the scaled features
        (one of the ``QuartileFeature*`` classes in this notebook).

    Returns
    -------
    sklearn.pipeline.Pipeline
        Unfitted pipeline with steps ``'feature_union'`` and
        ``'regression_model'``.
    """
    augmented_features = FeatureUnion([
        ('original_features', StandardScaler()),
        ('quartile_feature', quartile_feature),
    ])
    return Pipeline([
        ('feature_union', augmented_features),
        ('regression_model', DecisionTreeRegressor(max_depth=3)),
    ])


# One pipeline per quartile classifier; the final regressor is the same
# depth-3 decision tree in every case so only the extra feature differs.
DT_DT = make_quartile_pipeline(QuartileFeatureDT())
DT_RF = make_quartile_pipeline(QuartileFeatureRF())
DT_GB = make_quartile_pipeline(QuartileFeatureGB())

In [None]:
# Models to benchmark, keyed by the label that identifies them in the results:
# the plain depth-3 decision tree baseline and the three quartile-feature pipelines.
models = dict(
    DT=DecisionTreeRegressor(max_depth=3),
    DT_DT=DT_DT,
    DT_RF=DT_RF,
    DT_GB=DT_GB,
)

In [None]:
# Scoring names passed to the benchmark (accepts scikit-learn metrics).
metrics = [
    'neg_mean_absolute_percentage_error',
    'neg_mean_absolute_error',
]

In [None]:
# Load the regression benchmark datasets and evaluate every entry of `models`
# on them with the scoring names in `metrics`; the outcome is kept in `res`.
# NOTE(review): presumably cv=5 means 5-fold cross-validation and the results
# are also written to `output_json` — confirm against the MLBenchmarks
# implementation of run_cross_dataset_benchmark_models.
datasets = load_regression_datasets()
output_json = 'quartile_features.json'
res = run_cross_dataset_benchmark_models(models, datasets, metrics, output_json, cv=5)

In [None]:
# Report mean +- std of the MAPE scores for every dataset and model.
# The names are read back from `res` into their own variables so the
# `models`, `datasets` and `metrics` inputs of the benchmark are not
# overwritten and the benchmark cell can still be re-run after this one.
model_names = list(res.keys())
dataset_names = list(res[model_names[0]].keys())
metric_names = list(res[model_names[0]][dataset_names[0]].keys())

print(model_names)
print(dataset_names)
print(metric_names)

# NOTE(review): the slice assumes the MAPE entry sits at position 2 of the
# per-dataset result keys — confirm against the list printed above.
for dataset in dataset_names:
    print(f'{dataset}')
    for metric in metric_names[2:3]: # MAPE
        print(f'\t{metric}')
        for model in model_names:
            scores = res[model][dataset][metric]
            print(f'\t\t{model:>35}:\t {np.mean(scores):.3f} \t +- {np.std(scores):.3f}')

In [None]:
# Report mean +- std of the mean-absolute-error scores for every dataset and model.
# The names are read back from `res` into their own variables so the
# `models`, `datasets` and `metrics` inputs of the benchmark are not
# overwritten and the benchmark cell can still be re-run after this one.
model_names = list(res.keys())
dataset_names = list(res[model_names[0]].keys())
metric_names = list(res[model_names[0]][dataset_names[0]].keys())

print(model_names)
print(dataset_names)
print(metric_names)

# NOTE(review): the slice assumes the mean-absolute-error entry sits at
# position 3 of the per-dataset result keys — confirm against the list
# printed above.
for dataset in dataset_names:
    print(f'{dataset}')
    for metric in metric_names[3:4]: # mean_absolute_error
        print(f'\t{metric}')
        for model in model_names:
            scores = res[model][dataset][metric]
            print(f'\t\t{model:>35}:\t {np.mean(scores):.3f} \t +- {np.std(scores):.3f}')