In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error, mean_absolute_error, r2_score

import warnings
# Silence every warning for the rest of the session (no category or module filter is given).
# NOTE(review): this also hides deprecation/convergence warnings raised by the
# libraries imported above — confirm that is intended.
warnings.filterwarnings("ignore")

# Global seaborn theme for all figures: white-grid style with fonts scaled by 1.2.
sns.set_theme(style="whitegrid", font_scale=1.2)

In [2]:
# 1 Load Perth and Sydney 100-WEC datasets
# The paths are relative, so they are resolved against the kernel's current
# working directory, which must contain the `Dataset/` folder.
# NOTE(review): later cells read columns starting with "X"/"Y" and a
# "Total_Power" column from these frames — confirm the CSV schema matches.

df_perth = pd.read_csv("Dataset/WEC_Perth_100.csv")
df_sydney = pd.read_csv("Dataset/WEC_Sydney_100.csv")

In [3]:
# 2 Show one example value (the first row) of the total power absorbed by each WEC array

# Single quotes inside the f-string keep this cell valid on Python versions
# before 3.12 (reusing the enclosing quote type is only legal from 3.12 on).
# .iloc[0] picks the first row by position, so it does not depend on the
# frame having an index label equal to 0.
print(f"Total Power Perth Example: {df_perth['Total_Power'].iloc[0]}")
print(f"Total Power Sydney Example: {df_sydney['Total_Power'].iloc[0]}")

Total Power Perth Example: 7257985.04
Total Power Sydney Example: 7247491.41


In [4]:
# 3 Coordinates of the WEC Extracted

def extract_X_y(df):
    """Split a frame into coordinate features and the total-power target.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a "Total_Power" column. Every column whose name starts
        with "X" or "Y" is treated as a feature; column names must be strings.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is a copy of the selected feature columns (in
        their original order) and ``y`` is a copy of ``df["Total_Power"]``.
    """
    feature_names = []
    for name in df.columns:
        # str.startswith accepts a tuple of prefixes: match either one.
        if name.startswith(("X", "Y")):
            feature_names.append(name)

    # Copies keep later edits to X / y from touching the source frame.
    features = df[feature_names].copy()
    target = df["Total_Power"].copy()
    return features, target

# Build the (features, target) pair for each site from its loaded frame.
X_perth, y_perth = extract_X_y(df=df_perth)
X_sydney, y_sydney = extract_X_y(df=df_sydney)

In [5]:
# 4 Train-Test Split

def random_split(X, y, test_size=0.20, random_state=42):
    """Randomly partition features and target into train and test subsets.

    Parameters
    ----------
    X, y
        Features and target, forwarded unchanged to ``train_test_split``.
    test_size : float, default 0.20
        Forwarded to ``train_test_split`` as ``test_size``.
    random_state : int, default 42
        Forwarded to ``train_test_split`` as ``random_state``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    pieces = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
    )
    # Callers receive a 4-tuple, so convert whatever sequence comes back.
    return tuple(pieces)

# Split each site independently, using random_split's default test size and seed.
Xp_train, Xp_test, yp_train, yp_test = random_split(X=X_perth, y=y_perth)
Xs_train, Xs_test, ys_train, ys_test = random_split(X=X_sydney, y=y_sydney)

In [6]:
# 4 Defining 3 Baseline Models

def make_baseline_models():
    """Create a fresh set of unfitted baseline regressors.

    Returns
    -------
    dict
        Maps a display name to a new estimator instance, in this order:
        "LinearRegression", "RidgeRegression", "RandomForest". New instances
        are built on every call.
    """
    baselines = {}
    baselines["LinearRegression"] = LinearRegression()
    baselines["RidgeRegression"] = Ridge(alpha=1.0)
    # Fixed seed for the forest; n_jobs=-1 is passed straight to scikit-learn.
    baselines["RandomForest"] = RandomForestRegressor(
        n_estimators=300, random_state=42, n_jobs=-1
    )
    return baselines

In [7]:
# 5 Evaluation Function

def eval_regression(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training data and score it on the test data.

    The model is fitted in place, so the object passed in is trained after
    this call returns.

    Returns
    -------
    tuple
        ``(pred, rmse, mae, r2)``: the test-set predictions followed by the
        root mean squared error, mean absolute error and R² on the test set.
    """
    model.fit(X_train, y_train)
    y_hat = model.predict(X_test)

    # Every scorer is called as scorer(y_true, y_pred), in the order returned.
    scorers = (root_mean_squared_error, mean_absolute_error, r2_score)
    rmse, mae, r2 = (scorer(y_test, y_hat) for scorer in scorers)

    return y_hat, rmse, mae, r2

In [8]:
# 6 Run Models and Print Metrics

def run_all_models(region_name, X_train, y_train, X_test, y_test):
    """Train and score every baseline model on one region's split.

    Returns
    -------
    list of list
        One row per model, in the order given by ``make_baseline_models()``:
        ``[region_name, model_name, rmse, mae, r2, pred]``.
    """
    rows = []
    for label, estimator in make_baseline_models().items():
        # eval_regression returns the predictions first, then the three metrics.
        predictions, *scores = eval_regression(
            estimator, X_train, y_train, X_test, y_test
        )
        # Rows store the metrics before the predictions array.
        rows.append([region_name, label, *scores, predictions])
    return rows

# Fit and score every baseline on each site's own train/test split.
perth_results = run_all_models(
    region_name="Perth_100",
    X_train=Xp_train,
    y_train=yp_train,
    X_test=Xp_test,
    y_test=yp_test,
)
sydney_results = run_all_models(
    region_name="Sydney_100",
    X_train=Xs_train,
    y_train=ys_train,
    X_test=Xs_test,
    y_test=ys_test,
)

all_results = perth_results + sydney_results

# Each result row ends with the predictions array; only the five leading
# fields (region, model, RMSE, MAE, R²) go into the summary table.
metrics_df = pd.DataFrame(
    [row[:5] for row in all_results],
    columns=["Region", "Model", "RMSE", "MAE", "R²"],
)
metrics_df = metrics_df.sort_values(["Region", "RMSE"]).reset_index(drop=True)

display(metrics_df)

# The model with the lowest test RMSE (position 2 in a result row) wins per site.
best_perth = min(perth_results, key=lambda row: row[2])
best_sydney = min(sydney_results, key=lambda row: row[2])

print("Best Perth baseline:", best_perth[:5])
print("Best Sydney baseline:", best_sydney[:5])

Unnamed: 0,Region,Model,RMSE,MAE,R²
0,Perth_100,RandomForest,59475.049089,24409.640709,0.903953
1,Perth_100,RidgeRegression,72857.502604,50468.684021,0.855868
2,Perth_100,LinearRegression,72857.503997,50468.684471,0.855868
3,Sydney_100,RandomForest,40363.452009,15459.772873,0.83328
4,Sydney_100,RidgeRegression,99218.071193,38465.725584,-0.007377
5,Sydney_100,LinearRegression,99218.317347,38465.795317,-0.007382


Best Perth baseline: ['Perth_100', 'RandomForest', 59475.04908919713, 24409.6407088304, 0.9039534742595151]
Best Sydney baseline: ['Sydney_100', 'RandomForest', 40363.45200874399, 15459.772873351738, 0.8332802900615601]
