In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error, mean_absolute_error, r2_score
from sklearn.neural_network import MLPRegressor

import warnings
warnings.filterwarnings("ignore")
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

sns.set(style="whitegrid", font_scale=1.2)

In [2]:
# 1 Load Perth and Sydney 49-WEC datasets

df_perth = pd.read_csv("Dataset/WEC_Perth_49.csv")
df_sydney = pd.read_csv("Dataset/WEC_Sydney_49.csv")

In [3]:
# 2 Coordinates of the WEC Extracted

def extract_X_y(df):
    coord_cols = [c for c in df.columns if c.startswith("X") or c.startswith("Y")]
    X = df[coord_cols].copy()
    y = df["Total_Power"].copy()
    return X, y

X_perth, y_perth = extract_X_y(df_perth)
X_sydney, y_sydney = extract_X_y(df_sydney)

In [4]:
# 3 Train-Test Split

def random_split(X, y, test_size=0.20, random_state=42):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    return X_train, X_test, y_train, y_test

Xp_train, Xp_test, yp_train, yp_test = random_split(X_perth, y_perth)
Xs_train, Xs_test, ys_train, ys_test = random_split(X_sydney, y_sydney)

In [5]:
# 4 Defining 4 Advanced Models selected to perform better than the baseline

def make_advanced_models():
    return {
        "LightGBM": LGBMRegressor(
            n_estimators=1200,
            learning_rate=0.01,
            num_leaves=64,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=42
        ),

        "XGBoost": XGBRegressor(
            n_estimators=1500,
            learning_rate=0.01,
            max_depth=8,
            subsample=0.8,
            colsample_bytree=0.8,
            tree_method="hist",
            random_state=42
        ),

        "CatBoost": CatBoostRegressor(
            depth=10,
            learning_rate=0.02,
            iterations=1200,
            loss_function="RMSE",
            verbose=False,
            random_state=42
        ),

        "MLPRegressor": MLPRegressor(
            hidden_layer_sizes=(256, 256, 128),
            activation="relu",
            solver="adam",
            max_iter=300,
            random_state=42
        ),
    }

In [6]:
# 5 Evaluation Function

def train_and_eval(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    pred = model.predict(X_test)

    rmse = root_mean_squared_error(y_test, pred)
    mae = mean_absolute_error(y_test, pred)
    r2 = r2_score(y_test, pred)

    return rmse, mae, r2, pred

In [7]:
# 6 Run Models

def run_models(region_name, X_train, y_train, X_test, y_test):
    region_results = []
    
    models = make_advanced_models()
    
    for name, model in models.items():
        rmse, mae, r2, pred = train_and_eval(model, X_train, y_train, X_test, y_test)
        region_results.append([region_name, name, rmse, mae, r2, pred])
    
    return region_results

perth_adv_results = run_models("Perth_49", Xp_train, yp_train, Xp_test, yp_test)
sydney_adv_results = run_models("Sydney_49", Xs_train, ys_train, Xs_test, ys_test)

all_adv = perth_adv_results + sydney_adv_results

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005079 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14042
[LightGBM] [Info] Number of data points in the train set: 28834, number of used features: 98
[LightGBM] [Info] Start training from score 3938391.702660
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002676 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10184
[LightGBM] [Info] Number of data points in the train set: 14371, number of used features: 98
[LightGBM] [Info] Start training from score 4026542.964234


In [8]:
# 7 Print Metrics

adv_metrics = pd.DataFrame(
    [[r[0], r[1], r[2], r[3], r[4]] for r in all_adv],
    columns=["Region", "Model", "RMSE", "MAE", "R²"]
).sort_values(["Region", "RMSE"]).reset_index(drop=True)

display(adv_metrics)

Unnamed: 0,Region,Model,RMSE,MAE,R²
0,Perth_49,CatBoost,21296.611871,9985.962019,0.969227
1,Perth_49,LightGBM,21687.056442,10405.109141,0.968088
2,Perth_49,XGBoost,22242.270077,9753.658921,0.966433
3,Perth_49,MLPRegressor,34699.687705,20585.273158,0.918303
4,Sydney_49,LightGBM,7256.901454,3315.946978,0.989751
5,Sydney_49,XGBoost,7447.537671,2696.740874,0.989205
6,Sydney_49,CatBoost,7597.399784,3282.70656,0.988766
7,Sydney_49,MLPRegressor,38055.530855,17986.544624,0.718143
