# Distribution feature

This notebook uses the MLBenchmarks package to test pipelines that build a new feature based on the target variable's distribution.

In [None]:
!pip install git+https://github.com/rcpsilva/MLBenchmarks@main

In [None]:
from MLBenchmarks import classification_datasets_loaders as cdls
from MLBenchmarks import regression_datasets_loaders as rdls
from MLBenchmarks.benchmarking_methods import load_regression_datasets, run_cross_dataset_benchmark_models

In [None]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier,DecisionTreeRegressor
import numpy as np

In [None]:
# Custom transformer (feature model): predicts the quartile of y from X.
class QuartileRandomForest(BaseEstimator, TransformerMixin):
    """Transformer that emits a predicted target-quartile label as a feature.

    ``fit`` bins the training target into four classes using its 25th, 50th
    and 75th percentiles and trains a ``RandomForestClassifier`` to predict
    that class from ``X``.  ``transform`` returns the predicted class (1-4)
    as a single column.

    NOTE(review): ``transform`` predicts with the classifier trained in
    ``fit``; when both are called on the same rows (as happens while fitting
    an enclosing pipeline) the feature is an in-sample prediction of a label
    derived from ``y``.  Out-of-fold predictions may be preferable -- confirm
    whether the current behaviour is intended.
    """

    def __init__(self):
        # Any classifier exposing fit/predict could be used here.
        self.quartile_classifier = RandomForestClassifier()

    def fit(self, X, y):
        """Fit the quartile classifier.

        Parameters
        ----------
        X : array-like
            Feature matrix passed unchanged to the classifier.
        y : array-like
            Numeric target; flattened to 1-D before binning.

        Returns
        -------
        self
        """
        # Flatten so lists, Series and single-column arrays/frames all work.
        y_values = np.asarray(y).ravel()

        # One percentile call yields all three cut points; no pre-sort needed.
        quartile_edges = np.percentile(y_values, [25, 50, 75])

        # side="left" counts the edges strictly below each value, which
        # reproduces the `value <= edge` rule of get_quartile_label:
        # 0 edges below -> label 1, ..., 3 edges below -> label 4.
        quartile_labels = np.searchsorted(quartile_edges, y_values, side="left") + 1

        self.quartile_classifier.fit(X, quartile_labels)
        return self

    def transform(self, X):
        """Return the predicted quartile label for each row as an (n, 1) array."""
        return self.quartile_classifier.predict(X).reshape(-1, 1)

    def get_quartile_label(self, value, q1, q2, q3):
        """Return 1-4 for the quartile of ``value`` given cut points q1 <= q2 <= q3.

        A value equal to a cut point belongs to the lower quartile.
        """
        if value <= q1:
            return 1
        if value <= q2:
            return 2
        if value <= q3:
            return 3
        return 4

In [None]:
# Custom transformer (feature model): predicts the quartile of y from X.
class QuartileDecisionTree(BaseEstimator, TransformerMixin):
    """Transformer that emits a predicted target-quartile label as a feature.

    ``fit`` bins the training target into four classes using its 25th, 50th
    and 75th percentiles and trains a ``DecisionTreeClassifier`` to predict
    that class from ``X``.  ``transform`` returns the predicted class (1-4)
    as a single column.

    NOTE(review): ``transform`` predicts with the classifier trained in
    ``fit``; when both are called on the same rows (as happens while fitting
    an enclosing pipeline) the feature is an in-sample prediction of a label
    derived from ``y``.  Out-of-fold predictions may be preferable -- confirm
    whether the current behaviour is intended.
    """

    def __init__(self):
        # Any classifier exposing fit/predict could be used here.
        self.quartile_classifier = DecisionTreeClassifier()

    def fit(self, X, y):
        """Fit the quartile classifier.

        Parameters
        ----------
        X : array-like
            Feature matrix passed unchanged to the classifier.
        y : array-like
            Numeric target; flattened to 1-D before binning.

        Returns
        -------
        self
        """
        # Flatten so lists, Series and single-column arrays/frames all work.
        y_values = np.asarray(y).ravel()

        # One percentile call yields all three cut points; no pre-sort needed.
        quartile_edges = np.percentile(y_values, [25, 50, 75])

        # side="left" counts the edges strictly below each value, which
        # reproduces the `value <= edge` rule of get_quartile_label:
        # 0 edges below -> label 1, ..., 3 edges below -> label 4.
        quartile_labels = np.searchsorted(quartile_edges, y_values, side="left") + 1

        self.quartile_classifier.fit(X, quartile_labels)
        return self

    def transform(self, X):
        """Return the predicted quartile label for each row as an (n, 1) array."""
        return self.quartile_classifier.predict(X).reshape(-1, 1)

    def get_quartile_label(self, value, q1, q2, q3):
        """Return 1-4 for the quartile of ``value`` given cut points q1 <= q2 <= q3.

        A value equal to a cut point belongs to the lower quartile.
        """
        if value <= q1:
            return 1
        if value <= q2:
            return 2
        if value <= q3:
            return 3
        return 4

In [None]:
def _make_quartile_pipeline(quartile_transformer, regressor):
    """Build a pipeline that augments scaled features with a quartile feature.

    The feature union concatenates the standardized original features with
    the single column produced by ``quartile_transformer``; the result is fed
    to ``regressor``.

    Parameters
    ----------
    quartile_transformer : transformer
        Quartile predictor, e.g. ``QuartileDecisionTree()``.
    regressor : estimator
        Final regression model, e.g. ``LinearRegression()``.

    Returns
    -------
    Pipeline
        Unfitted pipeline with steps ``'feature_union'`` and
        ``'regression_model'``.
    """
    return Pipeline([
        ('feature_union', FeatureUnion([
            ('original_features', StandardScaler()),  # standardize the original features (X)
            ('quartile_feature', quartile_transformer),  # custom quartile predictor
        ])),
        ('regression_model', regressor),
    ])


# Naming: <quartile classifier>_<final regressor>
# (dt = decision tree, rf = random forest, lr = linear regression).
dt_lr = _make_quartile_pipeline(QuartileDecisionTree(), LinearRegression())

rf_lr = _make_quartile_pipeline(QuartileRandomForest(), LinearRegression())

dt_dt = _make_quartile_pipeline(QuartileDecisionTree(), DecisionTreeRegressor())

rf_dt = _make_quartile_pipeline(QuartileRandomForest(), DecisionTreeRegressor())

In [None]:
# Smoke test: fit one pipeline on the dataset returned by
# rdls.load_auto_mpg() and predict on the same rows it was fitted on.
dataset = rdls.load_auto_mpg()
X, y = dataset['data'], dataset['target']

pipeline = rf_dt

# Pipeline.fit returns the fitted pipeline, so fit and predict are chained.
y_pred = pipeline.fit(X, y).predict(X)

# Show the in-sample predictions.
print("Predicted y:", y_pred)

In [None]:
# Collect every model to benchmark in one dictionary, keyed by display label.
models = {}

# Pipelines that add the predicted-quartile feature.
models.update({"dt+lr": dt_lr, "rf+lr": rf_lr, "dt+dt": dt_dt, "rf+dt": rf_dt})

# Plain regressors without the extra feature, for comparison.
models.update({"DT": DecisionTreeRegressor(), "LR": LinearRegression()})

In [None]:
# Scoring names handed to the benchmark; scikit-learn metric names are accepted.
metrics = [
    'neg_mean_absolute_percentage_error',
    'neg_mean_absolute_error',
]

In [None]:
# File name handed to the benchmark runner (presumably where the results
# are written -- confirm against MLBenchmarks).
output_json = 'quartile_features.json'

# Regression datasets provided by MLBenchmarks.
datasets = load_regression_datasets()

# Evaluate every model on every dataset with the chosen metrics; cv=5 is
# forwarded to the runner.
res = run_cross_dataset_benchmark_models(
    models,
    datasets,
    metrics,
    output_json,
    cv=5,
)

In [None]:
# Recover the result layout: res[model][dataset][metric] -> scores.
# NOTE: this rebinds `models`, `datasets` and `metrics` to lists of keys,
# replacing the objects the earlier cells stored under those names.
models = list(res)
datasets = list(res[models[0]])
metrics = list(res[models[0]][datasets[0]])

print(models, datasets, metrics, sep='\n')

# Mean +- standard deviation of the scores per dataset and model.
for dataset in datasets:
    print(f'{dataset}')
    # Position 2 is expected to hold the MAPE entry; this relies on the key
    # order of the results.
    for metric in metrics[2:3]:
        print(f'\t{metric}')
        for model in models:
            scores = res[model][dataset][metric]
            print(f'\t\t{model:>35}:\t {np.mean(scores):.3f} \t +- {np.std(scores):.3f}')

In [None]:
# Recover the result layout: res[model][dataset][metric] -> scores.
# NOTE: this rebinds `models`, `datasets` and `metrics` to lists of keys,
# replacing the objects the earlier cells stored under those names.
models = list(res)
datasets = list(res[models[0]])
metrics = list(res[models[0]][datasets[0]])

print(models, datasets, metrics, sep='\n')

# Mean +- standard deviation of the scores per dataset and model.
for dataset in datasets:
    print(f'{dataset}')
    # Position 3 is expected to hold the mean_absolute_error entry; this
    # relies on the key order of the results.
    for metric in metrics[3:4]:
        print(f'\t{metric}')
        for model in models:
            scores = res[model][dataset][metric]
            print(f'\t\t{model:>35}:\t {np.mean(scores):.3f} \t +- {np.std(scores):.3f}')