In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error, mean_absolute_error, r2_score

import warnings
# Suppress every Python warning from this point on so cell outputs stay uncluttered.
# NOTE(review): a blanket "ignore" also hides deprecation and numerical warnings
# raised by pandas / scikit-learn — consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

# Apply seaborn's "whitegrid" style with fonts scaled by 1.2 to all later plots.
sns.set_theme(style="whitegrid", font_scale=1.2)

In [2]:
# 1 Load Perth and Sydney 49-WEC datasets

# Read both CSV files with default parser settings; the paths are relative to
# the notebook's working directory.
df_perth, df_sydney = map(
    pd.read_csv,
    ("Dataset/WEC_Perth_49.csv", "Dataset/WEC_Sydney_49.csv"),
)

In [3]:
# 2 Show the total power recorded in the first row of each dataset

# Single quotes are used inside the f-strings so the cell also parses on
# Python < 3.12 (reusing the enclosing quote type inside an f-string requires
# PEP 701). `.iloc[0]` reads the first row by position rather than by the
# index label 0, so it keeps working if the frame is ever re-indexed.
print(f"Total Power Perth Example: {df_perth['Total_Power'].iloc[0]}")
print(f"Total Power Sydney Example: {df_sydney['Total_Power'].iloc[0]}")

Total Power Perth Example: 4102461.43
Total Power Sydney Example: 4065416.61


In [4]:
# 3 Extract the WEC coordinates (features) and total power (target)

def extract_X_y(df):
    """Split a WEC DataFrame into a feature matrix and a target vector.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a "Total_Power" column. Every column whose name starts
        with "X" or "Y" is treated as a feature.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is a copy of the selected feature columns (in
        their original order) and ``y`` is a copy of ``df["Total_Power"]``.

    Raises
    ------
    KeyError
        If ``df`` has no "Total_Power" column.
    """
    # str.startswith accepts a tuple of prefixes, so one call covers both.
    feature_names = [
        name for name in df.columns if name.startswith(("X", "Y"))
    ]
    features = df.loc[:, feature_names].copy()
    target = df["Total_Power"].copy()
    return features, target

# Build the feature matrix and target vector for each site with the same helper.
(X_perth, y_perth), (X_sydney, y_sydney) = (
    extract_X_y(site_df) for site_df in (df_perth, df_sydney)
)

In [5]:
# 4 Train-Test Split

def random_split(X, y, test_size=0.20, random_state=42):
    """Randomly partition features and target into train and test subsets.

    Parameters
    ----------
    X, y
        Feature matrix and target vector, forwarded to ``train_test_split``.
    test_size : float, default 0.20
        Forwarded to ``train_test_split`` as ``test_size``.
    random_state : int, default 42
        Forwarded to ``train_test_split`` as ``random_state``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    split_options = {"test_size": test_size, "random_state": random_state}
    # train_test_split returns a list; convert so callers still receive a tuple.
    return tuple(train_test_split(X, y, **split_options))

# Split each site independently, using random_split's default test size and seed.
Xp_train, Xp_test, yp_train, yp_test = random_split(X=X_perth, y=y_perth)
Xs_train, Xs_test, ys_train, ys_test = random_split(X=X_sydney, y=y_sydney)

In [6]:
# 4 Defining 3 Baseline Models

def make_baseline_models(random_state=42, n_estimators=300, ridge_alpha=1.0):
    """Create a fresh set of unfitted baseline regressors.

    The defaults reproduce the original hard-coded configuration, so calling
    ``make_baseline_models()`` with no arguments behaves exactly as before.

    Parameters
    ----------
    random_state : int, default 42
        Seed passed to the random forest.
    n_estimators : int, default 300
        Number of trees in the random forest.
    ridge_alpha : float, default 1.0
        ``alpha`` passed to the Ridge model.

    Returns
    -------
    dict
        Maps the model name ("LinearRegression", "RidgeRegression",
        "RandomForest") to a new estimator instance, in that insertion order.
    """
    return {
        "LinearRegression": LinearRegression(),
        "RidgeRegression": Ridge(alpha=ridge_alpha),
        "RandomForest": RandomForestRegressor(
            n_estimators=n_estimators,
            random_state=random_state,
            # -1 asks scikit-learn to use all available processors.
            n_jobs=-1,
        ),
    }

In [7]:
# 5 Evaluation Function

def eval_regression(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training data and score it on the test data.

    The model is fitted in place, so the caller's object is trained after
    this call.

    Parameters
    ----------
    model
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    X_train, y_train
        Data passed to ``model.fit``.
    X_test, y_test
        Data used for prediction and scoring.

    Returns
    -------
    tuple
        ``(pred, rmse, mae, r2)``: the test-set predictions followed by the
        root mean squared error, mean absolute error and R² score.
    """
    model.fit(X_train, y_train)
    y_hat = model.predict(X_test)

    # Order matters: callers unpack the result as (rmse, mae, r2).
    scorers = (root_mean_squared_error, mean_absolute_error, r2_score)
    scores = tuple(scorer(y_test, y_hat) for scorer in scorers)

    return (y_hat, *scores)

In [8]:
# 6 Run Models and Print Metrics

def run_all_models(region_name, X_train, y_train, X_test, y_test):
    """Train and score every baseline model for one region.

    Parameters
    ----------
    region_name : str
        Label stored in the first field of every result row.
    X_train, y_train, X_test, y_test
        Train/test data forwarded to ``eval_regression``.

    Returns
    -------
    list of list
        One row per model, in the order given by ``make_baseline_models``:
        ``[region_name, model_name, rmse, mae, r2, predictions]``.
    """

    def _result_row(label, estimator):
        # Fit and score one estimator, then arrange the fields in row order.
        pred, rmse, mae, r2 = eval_regression(
            estimator, X_train, y_train, X_test, y_test
        )
        return [region_name, label, rmse, mae, r2, pred]

    return [
        _result_row(label, estimator)
        for label, estimator in make_baseline_models().items()
    ]

# Fit and score the baseline models on each site's own train/test split.
perth_results = run_all_models(
    region_name="Perth_49",
    X_train=Xp_train, y_train=yp_train,
    X_test=Xp_test, y_test=yp_test,
)
sydney_results = run_all_models(
    region_name="Sydney_49",
    X_train=Xs_train, y_train=ys_train,
    X_test=Xs_test, y_test=ys_test,
)

# Perth rows first, then Sydney rows.
all_results = [*perth_results, *sydney_results]

# Tabulate the scalar metrics; the sixth field of each row (the prediction
# array) is left out of the table.
metrics_df = (
    pd.DataFrame(
        [row[:5] for row in all_results],
        columns=["Region", "Model", "RMSE", "MAE", "R²"],
    )
    .sort_values(by=["Region", "RMSE"])  # lowest RMSE first within each region
    .reset_index(drop=True)
)

display(metrics_df)

# Index 2 of every result row is the RMSE; the row with the smallest value wins.
best_perth, best_sydney = (
    min(rows, key=lambda row: row[2])
    for rows in (perth_results, sydney_results)
)

# Slice off the trailing prediction array so only the summary fields are printed.
print("Best Perth baseline:", best_perth[:5])
print("Best Sydney baseline:", best_sydney[:5])

Unnamed: 0,Region,Model,RMSE,MAE,R²
0,Perth_49,RandomForest,26910.829981,10241.562399,0.950863
1,Perth_49,LinearRegression,49726.741388,36706.651082,0.832223
2,Perth_49,RidgeRegression,49726.741459,36706.651038,0.832223
3,Sydney_49,RandomForest,9778.64811,3037.890742,0.98139
4,Sydney_49,RidgeRegression,27610.80667,16283.027167,0.851628
5,Sydney_49,LinearRegression,27610.808255,16283.027647,0.851628


Best Perth baseline: ['Perth_49', 'RandomForest', 26910.82998051728, 10241.562399356173, 0.9508631837351778]
Best Sydney baseline: ['Sydney_49', 'RandomForest', 9778.648109834117, 3037.890741991389, 0.9813898304413082]
