In [1]:
import pandas as pd

# Location of the input CSV, relative to the notebook's working directory
file_path = 'ANN_df.csv'  # Adjust the file path as needed

# Parse the CSV into a DataFrame for the cells that follow
df = pd.read_csv(filepath_or_buffer=file_path)

# The data file is loaded directly in each cell (for each of the three evaluation metrics, together with the regressor and training code) so that every cell processes a clean copy of the data and avoids file errors caused by the computational complexity of the fine-tuning applied.
# I also reduced the number of estimators from 200 to just 50 and 100 to make processing more efficient.

In [16]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler

# Random Forest regression experiment: for each test-set fraction, tune the
# forest with a 5-fold grid search on the training rows and report the RMSE
# of the best model on the held-out rows.

# Load the dataset
df = pd.read_csv('ANN_df.csv')

# Split the data into features (X) and target variable (y)
target_col = 'GDP_per_capita_(constant_2015_US$)'
X = df.drop([target_col], axis=1)  # Features
y = df[target_col]  # Target variable

# Set up the splits
splits = [0.1, 0.2, 0.3]  # 10%, 20%, and 30% splits
random_state = 23  # Set a random state for reproducibility

# Define hyperparameter grid for the Random Forest
param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Create a K-fold cross-validator
kf = KFold(n_splits=5, shuffle=True, random_state=random_state)

# RMSE per test-set fraction, kept so the numbers do not have to be copied
# by hand from the printed output
rmse_by_split = {}

for split in splits:
    # Split the raw data first so the held-out rows never influence preprocessing
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        X, y, test_size=split, random_state=random_state
    )

    # Fit the scaler on the training rows only, then apply the same
    # transformation to the test rows (fitting on all rows leaks test statistics)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train_raw)
    X_test = scaler.transform(X_test_raw)

    # Create a Random Forest Regressor
    rf_regressor = RandomForestRegressor(random_state=random_state)

    # Create GridSearchCV object with K-fold cross-validation.
    # n_jobs=-1 evaluates candidates in parallel on all available cores;
    # with the fixed random_state the selected model is unaffected.
    grid_search = GridSearchCV(
        estimator=rf_regressor,
        param_grid=param_grid,
        scoring='neg_mean_squared_error',
        cv=kf,
        n_jobs=-1,
    )

    # Fit the GridSearchCV to the training data
    grid_search.fit(X_train, y_train)

    # Get the best model from grid search
    best_rf_regressor = grid_search.best_estimator_

    # Make predictions on the testing data
    y_pred = best_rf_regressor.predict(X_test)

    # Calculate the RMSE on the testing data. The square root is taken
    # explicitly because the `squared=False` argument was deprecated in
    # scikit-learn 1.4 and removed in 1.6.
    rmse = mean_squared_error(y_test, y_pred) ** 0.5
    rmse_by_split[split] = rmse

    # NOTE(review): earlier recorded runs printed RMSE values around 1e10, which
    # looks implausible for a per-capita GDP target in US$ -- verify the target
    # column's units and check for outliers.
    # Print the RMSE
    print(f"RMSE for {int(split*100)}% test set size: {rmse:.2f}")

RMSE for 10% test set size: 19061995786.15
RMSE for 20% test set size: 21811867903.45
RMSE for 30% test set size: 19579433507.72


In [17]:
import plotly.express as px

# Bar chart of test-set RMSE against the held-out fraction.

# Held-out fractions and the RMSE recorded for each.
# The RMSE numbers are transcribed from the printed output of the grid-search
# cell; update them by hand if that cell is rerun.
test_set_sizes = [0.1, 0.2, 0.3]
rmse_results = [19061995786.15, 21811867903.45, 19579433507.72]

# Tabulate one row per (test size, RMSE) pair
df_rmse = pd.DataFrame(
    list(zip(test_set_sizes, rmse_results)),
    columns=['Test Set Size', 'RMSE'],
)

# Draw the bars
fig_rmse = px.bar(
    data_frame=df_rmse,
    x='Test Set Size',
    y='RMSE',
    title='RMSE for Different Test Set Sizes',
    labels={'Test Set Size': 'Test Set Size', 'RMSE': 'RMSE'},
)

# Same axis styling on both axes: grid lines on, zero line and axis line off
axis_style = dict(showgrid=True, zeroline=False, showline=False)
fig_rmse.update_layout(xaxis=axis_style, yaxis=axis_style)

# Render the interactive figure
fig_rmse.show()
