# Random Forest

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    median_absolute_error,
    r2_score,
)
from sklearn.model_selection import (
    KFold,
    cross_val_predict,
    cross_val_score,
    cross_validate,
)
from sklearn.preprocessing import StandardScaler

In [None]:
def random_forest(X_train, X_test, y_train, y_test, cv_folds=5, decimal_places=2):
    """Train a random-forest regressor on standardized data and report errors.

    Features and target are standardized with scalers fitted on the training
    split only. The model is evaluated with shuffled K-fold cross-validation
    on the training split, then refitted on the whole training split and
    evaluated on the test split.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices accepted by ``StandardScaler``.
    y_train, y_test : array-like
        Target values; coerced to 1-D NumPy arrays.
    cv_folds : int, default 5
        Number of K-fold splits.
    decimal_places : int, default 2
        Rounding applied to the values in the returned error table.

    Returns
    -------
    y_pred_test : numpy.ndarray
        Test-set predictions, transformed back to the original target units.
    df_errors : pandas.DataFrame
        One-row table with the CV and test error metrics.
    scale_X, scale_y : StandardScaler
        The feature and target scalers, both fitted on the training split.
    """
    # Coerce targets so pandas Series / lists work as well as NumPy arrays.
    y_train_arr = np.asarray(y_train).ravel()
    y_test_arr = np.asarray(y_test).ravel()

    # Standardizing the features and the target using training data only.
    # The target scaler must NOT be re-fitted on y_test: doing so would
    # overwrite the training statistics used by every inverse_transform below
    # and leak test information into the reported metrics.
    scale_X = StandardScaler()
    scale_y = StandardScaler()
    X_train_scaled = scale_X.fit_transform(X_train)
    X_test_scaled = scale_X.transform(X_test)
    y_train_scaled = scale_y.fit_transform(y_train_arr.reshape(-1, 1)).ravel()

    # Defining the Random Forest model
    rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

    # Setting up K-fold cross-validation (fixed seed => identical splits
    # for every CV call below)
    kf = KFold(n_splits=cv_folds, shuffle=True, random_state=42)

    # One cross-validation pass scoring all three metrics at once, instead of
    # re-fitting the same folds once per metric.
    cv_results = cross_validate(
        rf_model,
        X_train_scaled,
        y_train_scaled,
        cv=kf,
        scoring=('neg_mean_absolute_error', 'neg_median_absolute_error', 'r2'),
    )
    # The "neg_" scorers return negated errors; flip the sign back.
    mae_scores_scaled = -cv_results['test_neg_mean_absolute_error']
    medae_scores_scaled = -cv_results['test_neg_median_absolute_error']
    r2_scores = cv_results['test_r2']

    # Out-of-fold predictions for the training set, back in original units
    y_pred_cv_scaled = cross_val_predict(rf_model, X_train_scaled, y_train_scaled, cv=kf)
    y_pred_cv = scale_y.inverse_transform(y_pred_cv_scaled.reshape(-1, 1)).ravel()

    # Cross Validation errors on training set
    mae_cv = mean_absolute_error(y_train_arr, y_pred_cv)
    medae_cv = median_absolute_error(y_train_arr, y_pred_cv)

    # Fit the model on the entire training data
    rf_model.fit(X_train_scaled, y_train_scaled)
    y_pred_test_scaled = rf_model.predict(X_test_scaled)
    y_pred_test = scale_y.inverse_transform(y_pred_test_scaled.reshape(-1, 1)).ravel()

    # Errors on the test set
    mae_test = mean_absolute_error(y_test_arr, y_pred_test)
    medae_test = median_absolute_error(y_test_arr, y_pred_test)
    r2_test = r2_score(y_test_arr, y_pred_test)

    # Transforming the per-fold scaled errors to the original scale.
    # Absolute errors scale linearly, so multiply by the factor the target
    # scaler actually divided by.
    y_scale = scale_y.scale_[0]
    mae_scores = mae_scores_scaled * y_scale
    medae_scores = medae_scores_scaled * y_scale

    # Dictionary of errors then convert to dataframe.
    # NOTE(review): 'medAE Train (mean)' is the mean per-fold CV median
    # absolute error; the label is kept as-is so existing column lookups
    # keep working.
    error_metrics = {
        'MAE CV': mae_cv,
        'medAE CV': medae_cv,
        'R2 CV': r2_scores.mean(),
        'MAE CV (mean)': mae_scores.mean(),
        'medAE Train (mean)': medae_scores.mean(),
        'MAE Test': mae_test,
        'medAE Test': medae_test,
        'R2 Test': r2_test
    }

    # Convert the dictionary to a pandas DataFrame and round the values
    df_errors = pd.DataFrame([error_metrics]).round(decimal_places)

    return y_pred_test, df_errors, scale_X, scale_y