In [1]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error
from sklearn.model_selection import KFold
from lightgbm import LGBMRegressor
from scipy.spatial import ConvexHull
from scipy.spatial.distance import pdist, squareform
import warnings

In [2]:
# 1 Physics Feature Functions

def fast_pairwise_features(X, Y, n_wec):
    X_i = X[:, :, None]
    X_j = X[:, None, :]
    Y_i = Y[:, :, None]
    Y_j = Y[:, None, :]

    dx = X_i - X_j
    dy = Y_i - Y_j

    dists = np.sqrt(dx * dx + dy * dy)
    angles = np.arctan2(dy, dx)

    iu = np.triu_indices(n_wec, k=1)

    dist_flat = dists[:, iu[0], iu[1]]
    angle_flat = angles[:, iu[0], iu[1]]

    dist_cols = [f"dist_{i}_{j}" for i in range(1, n_wec+1) for j in range(i+1, n_wec+1)]
    angle_cols = [f"angle_{i}_{j}" for i in range(1, n_wec+1) for j in range(i+1, n_wec+1)]

    return (
        pd.DataFrame(dist_flat, columns=dist_cols),
        pd.DataFrame(angle_flat, columns=angle_cols)
    )


def compute_pca_direction(X, Y):
    pts = []
    for row_x, row_y in zip(X, Y):
        pts.append(np.column_stack([row_x, row_y]))
    pts = np.vstack(pts)

    pca = PCA(n_components=2)
    pca.fit(pts)

    return pca, pca.components_[0]


def compute_alignment_features(X, Y, n_wec, wave_dir):
    N = X.shape[0]
    feats = np.zeros((N, n_wec * (n_wec - 1) // 2))

    idx = 0
    for i in range(n_wec):
        for j in range(i+1, n_wec):
            vec_x = X[:, j] - X[:, i]
            vec_y = Y[:, j] - Y[:, i]

            norm = np.sqrt(vec_x**2 + vec_y**2) + 1e-8
            u_x = vec_x / norm
            u_y = vec_y / norm

            feats[:, idx] = u_x * wave_dir[0] + u_y * wave_dir[1]
            idx += 1

    cols = [f"align_{i}_{j}" for i in range(1, n_wec+1) for j in range(i+1, n_wec+1)]
    return pd.DataFrame(feats, columns=cols)


def convex_hull_area(X, Y):
    areas = []
    for xs, ys in zip(X, Y):
        pts = np.column_stack([xs, ys])
        try:
            hull = ConvexHull(pts)
            areas.append(hull.volume)
        except:
            areas.append(0.0)
    return pd.DataFrame({"convex_hull_area": areas})


def build_physics_features(df_coords, pca_model=None, wave_dir=None):
    n_wec = df_coords.shape[1] // 2

    X = df_coords[[f"X{i}" for i in range(1, n_wec+1)]].values
    Y = df_coords[[f"Y{i}" for i in range(1, n_wec+1)]].values

    # Fit PCA only on training, reuse for test
    if pca_model is None:
        pca_model, wave_dir = compute_pca_direction(X, Y)

    # pairwise
    df_dist, df_angle = fast_pairwise_features(X, Y, n_wec)

    # spatial stats
    df_stats = pd.DataFrame({
        "min_dist": df_dist.min(axis=1),
        "mean_dist": df_dist.mean(axis=1),
        "std_dist": df_dist.std(axis=1),
        "median_dist": df_dist.median(axis=1)
    })

    # convex hull area
    df_hull = convex_hull_area(X, Y)

    # alignment
    df_align = compute_alignment_features(X, Y, n_wec, wave_dir)

    full = pd.concat([df_dist, df_angle, df_align, df_stats, df_hull], axis=1)

    return full, pca_model, wave_dir

In [None]:
# 2 Defining K Fold Validation

def run_kfold_cv(df, region_name, k=5):
    coord_cols = [c for c in df.columns if c.startswith("X") or c.startswith("Y")]
    X_coords = df[coord_cols].values
    y = df["Total_Power"].values

    kf = KFold(n_splits=k, shuffle=True, random_state=42)

    fold_metrics = []

    fold = 1
    for train_idx, test_idx in kf.split(X_coords):

        print(f"\n================ Fold {fold}/{k} ({region_name}) ================\n")

        X_train_coords = df.iloc[train_idx][coord_cols]
        X_test_coords  = df.iloc[test_idx][coord_cols]
        y_train = y[train_idx]
        y_test  = y[test_idx]


        X_train_phys, pca_model, wave_dir = build_physics_features(X_train_coords)
        X_test_phys, _, _ = build_physics_features(
            X_test_coords, pca_model=pca_model, wave_dir=wave_dir
        )


        model = LGBMRegressor(
            n_estimators=1200,
            learning_rate=0.01,
            num_leaves=64,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=42
        )

        model.fit(X_train_phys, y_train)
        preds = model.predict(X_test_phys)

        rmse = root_mean_squared_error(y_test, preds)
        mae  = mean_absolute_error(y_test, preds)
        r2   = r2_score(y_test, preds)
        rel_mae = (mae / np.mean(y_test)) * 100

        fold_metrics.append([rmse, mae, r2, rel_mae])

        print(f"RMSE = {rmse:.3f}")
        print(f"MAE = {mae:.3f}")
        print(f"R² = {r2:.4f}")
        print(f"Relative MAE = {rel_mae:.2f}%")

        fold += 1

    # Aggregate results
    fold_metrics = np.array(fold_metrics)
    results = pd.DataFrame({
        "Metric": ["RMSE", "MAE", "R²", "Relative MAE (%)"],
        "Mean": fold_metrics.mean(axis=0),
        "Std": fold_metrics.std(axis=0)
    })

    print(f"\n============== FINAL {k}-FOLD RESULTS ({region_name}) ==============\n")
    print(results)

    return results

In [4]:
# 3 Run Model

df_perth = pd.read_csv("Dataset/WEC_Perth_49.csv")
df_sydney = pd.read_csv("Dataset/WEC_Sydney_49.csv")

results_perth = run_kfold_cv(df_perth, "Perth_49", k=5)
results_sydney = run_kfold_cv(df_sydney, "Sydney_49", k=5)



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.316965 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 824849
[LightGBM] [Info] Number of data points in the train set: 28834, number of used features: 3533
[LightGBM] [Info] Start training from score 3938391.702660
RMSE       = 20481.045
MAE        = 8896.013
R²         = 0.9715
Relative MAE = 0.23%


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.262513 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 825056
[LightGBM] [Info] Number of data points in the train set: 28834, number of used features: 3533
[LightGBM] [Info] Start training from score 3937759.857651
RMSE       = 20505.223
MAE        = 8792.595
R²         = 0.9721
Relative MAE = 0.22%


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.333861 seconds.
You can set `f