# Bagging Regressor for car price prediction

In [25]:
import numpy as np
import pandas as pd
from sklearn.ensemble import BaggingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, median_absolute_error, r2_score
from sklearn.model_selection import KFold, cross_val_predict, cross_val_score, cross_validate

In [26]:
def bagging(X_train, X_test, y_train, y_test, estimator_range, cv_folds=5, decimal_places=2):
    """Evaluate a ``BaggingRegressor`` for several ensemble sizes.

    For every value in ``estimator_range`` a ``BaggingRegressor`` (fixed
    ``random_state=22``) is cross-validated on the training data with a
    shuffled ``KFold`` (fixed ``random_state=42``), then refitted on the whole
    training set and scored on the test set.

    Parameters
    ----------
    X_train, y_train
        Training features and targets, used for cross-validation and for the
        final fit.
    X_test, y_test
        Held-out features and targets, used only for the test metrics.
    estimator_range
        Iterable of ``n_estimators`` values to try. May be empty.
    cv_folds
        Number of ``KFold`` splits.
    decimal_places
        Number of decimals the result table is rounded to.

    Returns
    -------
    y_pred_test
        Test-set predictions of the model fitted with the *last* value of
        ``estimator_range``; ``None`` when ``estimator_range`` is empty.
    df_errors
        ``pandas.DataFrame`` with one row per ``n_estimators`` value and the
        columns listed in ``columns`` below.

    Notes
    -----
    ``'MAE CV'`` / ``'medAE CV'`` are computed on the pooled out-of-fold
    predictions. ``'MAE Train (mean)'`` / ``'medAE Train (mean)'`` are, despite
    their names, the mean of the per-fold *held-out* scores; the column names
    are kept so existing consumers of the table keep working.
    """
    columns = [
        'n_estimators',
        'MAE CV',
        'medAE CV',
        'R2 CV',
        'MAE Train (mean)',
        'medAE Train (mean)',
        'MAE Test',
        'medAE Test',
        'R2 Test',
    ]

    # Materialise once so generators are supported and emptiness can be checked.
    n_estimators_values = list(estimator_range)
    if not n_estimators_values:
        # Nothing to evaluate: return an empty table instead of failing on an
        # unbound prediction variable.
        return None, pd.DataFrame(columns=columns)

    # The splitter does not depend on n_estimators; the fixed seed gives every
    # cross-validation call below the same folds.
    kf = KFold(n_splits=cv_folds, shuffle=True, random_state=42)

    # All three fold-wise metrics are collected in a single cross-validation
    # pass instead of refitting the same models once per metric.
    scoring = {
        'mae': 'neg_mean_absolute_error',
        'medae': 'neg_median_absolute_error',
        'r2': 'r2',
    }

    results = []
    y_pred_test = None

    for n_estimators in n_estimators_values:
        model = BaggingRegressor(n_estimators=n_estimators, random_state=22)

        # Per-fold scores; the "neg_*" scorers return negated errors.
        fold_scores = cross_validate(model, X_train, y_train, cv=kf, scoring=scoring)
        mae_scores = -fold_scores['test_mae']
        medae_scores = -fold_scores['test_medae']
        r2_scores = fold_scores['test_r2']

        # Out-of-fold predictions for every training sample.
        y_pred_cv = cross_val_predict(model, X_train, y_train, cv=kf)

        # Cross-validation errors on the pooled out-of-fold predictions.
        mae_cv = mean_absolute_error(y_train, y_pred_cv)
        medae_cv = median_absolute_error(y_train, y_pred_cv)

        # Refit on the full training set and predict the test set.
        model.fit(X_train, y_train)
        y_pred_test = model.predict(X_test)

        # Errors on the test set.
        mae_test = mean_absolute_error(y_test, y_pred_test)
        medae_test = median_absolute_error(y_test, y_pred_test)
        r2_test = r2_score(y_test, y_pred_test)

        # One row per n_estimators: fold-wise scores are averaged rather than
        # reported per fold.
        results.append({
            'n_estimators': n_estimators,
            'MAE CV': mae_cv,
            'medAE CV': medae_cv,
            'R2 CV': r2_scores.mean(),
            'MAE Train (mean)': mae_scores.mean(),
            'medAE Train (mean)': medae_scores.mean(),
            'MAE Test': mae_test,
            'medAE Test': medae_test,
            'R2 Test': r2_test,
        })

    # Convert the results list to a pandas DataFrame.
    df_errors = pd.DataFrame(results, columns=columns).round(decimal_places)

    return y_pred_test, df_errors