In [3]:
import pandas as pd
import plotly.express as px
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Load the dataset
df = pd.read_csv('ANN_df.csv')

# Name of the column to predict; every other column of df is used as a feature.
TARGET_COL = 'GDP_per_capita_(constant_2015_US$)'

# Split the data into features (X) and target variable (y)
X = df.drop(columns=[TARGET_COL])  # Features
y = df[TARGET_COL]  # Target variable

# Hold-out fractions to evaluate: 10%, 20%, and 30% of the rows
splits = [0.1, 0.2, 0.3]
random_state = 23  # One seed shared by the train/test split, the CV folds and the forests

# Create a K-fold cross-validator (reused by every grid search below)
kf = KFold(n_splits=5, shuffle=True, random_state=random_state)

# Define hyperparameter grid for the Random Forest.
# The 'regressor__' prefix routes each entry to the pipeline step named 'regressor'.
param_grid = {
    'regressor__n_estimators': [50, 100],
    'regressor__max_depth': [None, 10, 20],
    'regressor__min_samples_split': [2, 5, 10],
    'regressor__min_samples_leaf': [1, 2, 4]
}

# Pipeline that is tuned with the negative-MSE score.
# NOTE(review): only this pipeline standardizes the features; mae_pipeline
# below has no scaler step -- confirm the asymmetry is intentional.
mse_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('regressor', RandomForestRegressor(random_state=random_state))
])

# Pipeline that is tuned with the negative-MAE score (no scaling step).
mae_pipeline = Pipeline([
    ('regressor', RandomForestRegressor(random_state=random_state))
])

for split in splits:
    # Hold out `split` of the raw rows for testing. X is not scaled here:
    # scaling, where present, is a step inside the pipeline being fitted.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=split, random_state=random_state)

    # MSE Pipeline: tune on the training rows, then score on the hold-out rows
    grid_search_mse = GridSearchCV(estimator=mse_pipeline, param_grid=param_grid,
                                   scoring='neg_mean_squared_error', cv=kf)
    grid_search_mse.fit(X_train, y_train)
    best_rf_regressor_mse = grid_search_mse.best_estimator_
    y_pred_mse = best_rf_regressor_mse.predict(X_test)
    # Take the square root of the MSE explicitly: the `squared=False` keyword
    # is deprecated since scikit-learn 1.4 and removed in 1.6.
    rmse = mean_squared_error(y_test, y_pred_mse) ** 0.5
    # round() rather than int(): int() truncates float error (0.29 * 100 -> 28).
    print(f"MSE RMSE for {round(split * 100)}% test set size: {rmse:.2f}")

    # MAE Pipeline: same procedure, tuned and reported with absolute error
    grid_search_mae = GridSearchCV(estimator=mae_pipeline, param_grid=param_grid,
                                   scoring='neg_mean_absolute_error', cv=kf)
    grid_search_mae.fit(X_train, y_train)
    best_rf_regressor_mae = grid_search_mae.best_estimator_
    y_pred_mae = best_rf_regressor_mae.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred_mae)
    print(f"MAE for {round(split * 100)}% test set size: {mae:.2f}")

# Visualization using Plotly
# TODO: not implemented yet -- plotly.express is imported as `px` but no
# figure is produced in this cell.

MSE RMSE for 10% test set size: 21620728318.78
MAE for 10% test set size: 3691550124.42
MSE RMSE for 20% test set size: 21013366281.01
MAE for 20% test set size: 3113427428.63
MSE RMSE for 30% test set size: 18440654655.34
MAE for 30% test set size: 2724523709.50


In [4]:
# MAE Pipeline on the largest hold-out split.
# The split is rebuilt here explicitly instead of relying on the values of
# `split`, `X_train`, `X_test`, `y_train` and `y_test` left behind by the last
# iteration of the loop in the previous cell; with the same test_size and
# random_state the resulting split is the same one that iteration used.
split = splits[-1]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=split, random_state=random_state)

# Tune the unscaled Random Forest pipeline with 5-fold CV on the training rows,
# selecting hyperparameters by negative mean absolute error.
grid_search_mae = GridSearchCV(estimator=mae_pipeline, param_grid=param_grid,
                               scoring='neg_mean_absolute_error', cv=kf)
grid_search_mae.fit(X_train, y_train)

# Score the refitted best pipeline on the hold-out rows.
best_rf_regressor_mae = grid_search_mae.best_estimator_
y_pred_mae = best_rf_regressor_mae.predict(X_test)
mae = mean_absolute_error(y_test, y_pred_mae)
# round() rather than int(): int() truncates float error (0.29 * 100 -> 28).
print(f"MAE for {round(split * 100)}% test set size: {mae:.2f}")

MAE for 30% test set size: 2724523709.50


In [5]:
# R2 Pipeline on the largest hold-out split.
# The split is rebuilt here explicitly instead of relying on the values of
# `split`, `X_train`, `X_test`, `y_train` and `y_test` left behind by earlier
# cells; with the same test_size and random_state the resulting split is the
# same one the last loop iteration of the training cell used.
split = splits[-1]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=split, random_state=random_state)

# There is no dedicated R2 pipeline: the unscaled Random Forest pipeline
# (`mae_pipeline`) is reused as the estimator and only the selection metric
# changes to R2.
grid_search_r2 = GridSearchCV(estimator=mae_pipeline, param_grid=param_grid,
                              scoring='r2', cv=kf)
grid_search_r2.fit(X_train, y_train)

# Score the refitted best pipeline on the hold-out rows.
best_rf_regressor_r2 = grid_search_r2.best_estimator_
y_pred_r2 = best_rf_regressor_r2.predict(X_test)
r2 = r2_score(y_test, y_pred_r2)
# round() rather than int(): int() truncates float error (0.29 * 100 -> 28).
print(f"R2 for {round(split * 100)}% test set size: {r2:.2f}")

R2 for 30% test set size: 0.95
