#### Tuning for RBF-PSO Modelling of QFE Air Pressure
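Some of the code below reuses a cell from the "03" notebook. This notebook grid-searches the RBF network hyper-parameters (number of centres, gamma, ridge λ) for a single station and compares the best configuration against a persistence baseline.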

In [1]:
from pathlib import Path
import pandas as pd, numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import numpy as np
import pyswarms as ps, warnings ; warnings.filterwarnings("ignore")

# ── Per-station settings: change these two values to switch station ── #
STATION_ID = "96749"
DATA_FILE  = (
    Path("/home/rzby/airpressure_ann/ann_pressure_prediction/data/processed")
    / f"clean_{STATION_ID}.parquet"
)
# ──────────────────────────────────────────────────────── #

# Seed constant for the notebook's stochastic steps.
SEED = 42


In [2]:
def add_time_feats(df):
    """
    Return a copy of ``df`` with cyclical time-of-day / day-of-year features.

    Four columns are added: ``sin_hour``/``cos_hour`` (24-hour period) and
    ``sin_doy``/``cos_doy`` (365.25-day period), all derived from the index.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index exposes ``.hour`` and ``.dayofyear``
        (e.g. a ``DatetimeIndex``).

    Returns
    -------
    pandas.DataFrame
        A new frame; the caller's ``df`` is left untouched so the function
        can be re-applied safely to the same raw input.
    """
    out = df.copy()
    # Map hour and day-of-year onto the unit circle so that 23h ↔ 0h and
    # 31 Dec ↔ 1 Jan end up close together in feature space.
    ang_h = 2*np.pi*np.asarray(out.index.hour)/24
    ang_d = 2*np.pi*np.asarray(out.index.dayofyear)/365.25
    out["sin_hour"], out["cos_hour"] = np.sin(ang_h), np.cos(ang_h)
    out["sin_doy"],  out["cos_doy"]  = np.sin(ang_d), np.cos(ang_d)
    return out

def add_lags(df, cols, lags=(1, 2, 4)):
    """
    Return a copy of ``df`` with lagged versions of the given columns.

    For every column ``c`` in ``cols`` and every ``k`` in ``lags`` a column
    named ``"{c}_lag{k}"`` is appended, holding ``df[c].shift(k)`` (the value
    ``k`` rows earlier). The first ``k`` rows of each lag column are NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is not modified.
    cols : iterable of str
        Names of the columns to lag.
    lags : iterable of int, default (1, 2, 4)
        Row offsets to generate. An immutable default is used so the default
        can never be altered between calls.

    Returns
    -------
    pandas.DataFrame
        A new frame with the original columns followed by the lag columns,
        ordered column-major (all lags of the first column, then the next).
    """
    out = df.copy()
    for c in cols:
        for k in lags:
            # shift() is row-based: lag k is "k rows back", not "k hours back".
            out[f"{c}_lag{k}"] = out[c].shift(k)
    return out

def kge(y, yhat):
    """
    Kling–Gupta efficiency of predictions ``yhat`` against observations ``y``.

    Combines three components, each ideal at 1: the linear correlation, the
    ratio of standard deviations (yhat / y) and the ratio of means (yhat / y).
    The score is 1 minus the Euclidean distance of these components from
    (1, 1, 1), so a perfect prediction scores 1.

    Both arguments must provide ``.std()`` and ``.mean()`` (e.g. NumPy arrays).
    """
    corr = np.corrcoef(y, yhat)[0, 1]
    spread_ratio = yhat.std() / y.std()
    bias_ratio = yhat.mean() / y.mean()
    distance = np.sqrt((corr - 1)**2 + (spread_ratio - 1)**2 + (bias_ratio - 1)**2)
    return 1 - distance

from sklearn.cluster import MiniBatchKMeans
from numpy.linalg import pinv
import numpy as np

class RBFNetwork:
    """
    Radial-Basis-Function Neural Network with:
      • K centres from (MiniBatch) K-means
      • Gaussian basis ϕ(||x−c||)  with shared gamma
      • Closed-form ridge regression for output weights

    Parameters
    ----------
    n_centres : int, default 300
        Number of K-means centres (= hidden units).
    gamma : float or None, default None
        Coefficient of the Gaussian basis ``exp(-gamma * d**2)``. When None,
        it is derived at fit time from the spread of the centres.
    ridge : float, default 1e-4
        Ridge penalty λ added to the diagonal of HᵀH.
    random_state : int, default 42
        Seed forwarded to MiniBatchKMeans.
    batch_size : int, default 1024
        Mini-batch size forwarded to MiniBatchKMeans.

    Attributes (set by ``fit``)
    ---------------------------
    C_ : ndarray, shape (n_centres, n_features)
        Cluster centres.
    gamma_ : float
        Gamma actually used. Equals ``gamma`` when that was supplied,
        otherwise the value derived from the centres. The constructor
        argument ``gamma`` itself is never modified, so refitting on new
        data recomputes the automatic value instead of reusing a stale one.
    B_ : ndarray
        Output weights from the ridge solution.
    """
    def __init__(self, n_centres=300, gamma=None,
                 ridge=1e-4, random_state=42, batch_size=1024):
        self.n_centres  = n_centres
        self.gamma      = gamma        # if None compute from centres spread
        self.ridge      = ridge
        self.rs         = random_state
        self.batch_size = batch_size   # for MiniBatchKMeans

    # ───────────────── helpers ──────────────────
    @staticmethod
    def _euclid(a,b):
        """Pairwise Euclidean distances between rows of ``a`` and rows of ``b``.

        Broadcasts to an intermediate of shape (len(a), len(b), n_features),
        so memory grows with the product of both row counts.
        """
        return np.linalg.norm(a[:,None,:] - b[None,:,:], axis=2)

    def _rbf(self, X):
        """Hidden-layer activations: Gaussian of the distance to each centre."""
        d = self._euclid(X, self.C_)
        return np.exp(-(self.gamma_ * d**2))

    # ─────────────── public API ─────────────────
    def fit(self, X, y):
        """
        Fit centres, (optionally) gamma, and output weights.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
        y : ndarray, shape (n_samples,) or (n_samples, n_outputs)

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``gamma`` is None and all centres coincide, in which case the
            spread-based gamma is undefined.
        """
        # 1) choose K centres
        km = MiniBatchKMeans(n_clusters=self.n_centres,
                             random_state=self.rs,
                             batch_size=self.batch_size)
        km.fit(X)
        self.C_ = km.cluster_centers_

        # 2) resolve the effective gamma without touching the hyper-parameter
        if self.gamma is None:
            d_max = np.max(self._euclid(self.C_, self.C_))
            if d_max == 0:
                # sigma would be 0 → gamma = inf → NaNs in the hidden matrix
                raise ValueError(
                    "Cannot derive gamma: all centres coincide. "
                    "Pass an explicit gamma or use more distinct centres."
                )
            sigma = d_max / np.sqrt(2*self.n_centres)
            self.gamma_ = 1 / (2*sigma**2)
        else:
            self.gamma_ = self.gamma

        # 3) hidden matrix
        H = self._rbf(X)

        # 4) ridge regression: β = (HᵀH + λI)⁻¹ Hᵀ y
        penalty = self.ridge * np.eye(self.n_centres)
        self.B_ = pinv(H.T @ H + penalty) @ H.T @ y
        return self

    def predict(self, X):
        """Return predictions ``ϕ(X) @ B_`` for the rows of ``X``."""
        return self._rbf(X) @ self.B_

    def score(self, X, y):
        """Sklearn-compatible score = R-squared."""
        return r2_score(y, self.predict(X))



In [3]:
# Read the cleaned station table, then promote its 'index' column to the row index.
df = pd.read_parquet(DATA_FILE)
df = df.set_index('index')
df.head()

Unnamed: 0_level_0,WMO_ID,RAINFALL_LAST_MM,TEMP_DEWPOINT_C_TDTDTD,TEMP_DRYBULB_C_TTTTTT,TEMP_WETBULB_C,WIND_SPEED_FF,RELATIVE_HUMIDITY_PC,PRESSURE_QFF_MB_DERIVED,PRESSURE_QFE_MB_DERIVED,month
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2015-01-01 00:00:00+00:00,96749.0,-1.0,-0.121212,-0.578947,-0.515152,0.166667,0.71746,1010.8,1009.5,-1.0
2015-01-01 03:00:00+00:00,96749.0,-0.986147,0.484848,-0.302632,0.090909,0.833333,0.704762,1011.2,1009.8,-1.0
2015-01-01 06:00:00+00:00,96749.0,-1.0,0.30303,-0.092105,0.090909,0.833333,0.355556,1008.8,1007.5,-1.0
2015-01-01 09:00:00+00:00,96749.0,-0.863203,0.424242,-0.407895,0.0,0.333333,0.822222,1008.5,1007.2,-1.0
2015-01-01 12:00:00+00:00,96749.0,-0.984416,0.272727,-0.355263,-0.090909,0.166667,0.653968,1010.2,1008.8,-1.0


In [4]:
# feature engineering: cyclical time encodings + lagged copies of the core variables
# NOTE: re-running this cell without re-running the load cell re-applies the lags
# to the already-trimmed frame, so dropna() removes the leading rows again.
core = [
    "PRESSURE_QFF_MB_DERIVED",
    "PRESSURE_QFE_MB_DERIVED",
    "TEMP_DEWPOINT_C_TDTDTD",
    "TEMP_DRYBULB_C_TTTTTT",
    "TEMP_WETBULB_C",
    "WIND_SPEED_FF",
    "RAINFALL_LAST_MM",
    "RELATIVE_HUMIDITY_PC",
]
# Lags are row offsets (the sample rows shown are 3 h apart — assumes no gaps; TODO confirm).
# Rows whose lags reach before the start of the series contain NaN and are dropped.
df = add_lags(add_time_feats(df), core, lags=[1,2,4]).dropna()
df.head()

Unnamed: 0_level_0,WMO_ID,RAINFALL_LAST_MM,TEMP_DEWPOINT_C_TDTDTD,TEMP_DRYBULB_C_TTTTTT,TEMP_WETBULB_C,WIND_SPEED_FF,RELATIVE_HUMIDITY_PC,PRESSURE_QFF_MB_DERIVED,PRESSURE_QFE_MB_DERIVED,month,...,TEMP_WETBULB_C_lag4,WIND_SPEED_FF_lag1,WIND_SPEED_FF_lag2,WIND_SPEED_FF_lag4,RAINFALL_LAST_MM_lag1,RAINFALL_LAST_MM_lag2,RAINFALL_LAST_MM_lag4,RELATIVE_HUMIDITY_PC_lag1,RELATIVE_HUMIDITY_PC_lag2,RELATIVE_HUMIDITY_PC_lag4
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-01-01 12:00:00+00:00,96749.0,-0.984416,0.272727,-0.355263,-0.090909,0.166667,0.653968,1010.2,1008.8,-1.0,...,-0.515152,0.333333,0.833333,0.166667,-0.863203,-1.0,-1.0,0.822222,0.355556,0.71746
2015-01-01 15:00:00+00:00,96749.0,-0.996537,0.181818,-0.407895,-0.181818,0.333333,0.67619,1012.5,1011.1,-1.0,...,0.090909,0.166667,0.333333,0.833333,-0.984416,-0.863203,-0.986147,0.653968,0.822222,0.704762
2015-01-01 18:00:00+00:00,96749.0,-0.987879,0.181818,-0.486842,-0.242424,0.166667,0.771429,1011.2,1009.9,-1.0,...,0.090909,0.333333,0.166667,0.833333,-0.996537,-0.984416,-1.0,0.67619,0.653968,0.355556
2015-01-01 21:00:00+00:00,96749.0,-1.0,0.090909,-0.592105,-0.363636,-0.333333,0.869841,1010.8,1009.5,-1.0,...,0.0,0.166667,0.333333,0.333333,-0.987879,-0.996537,-0.863203,0.771429,0.67619,0.822222
2015-01-02 00:00:00+00:00,96749.0,-1.0,0.151515,-0.460526,-0.242424,0.333333,0.72381,1012.2,1010.9,-1.0,...,-0.090909,-0.333333,0.166667,0.166667,-1.0,-0.987879,-0.984416,0.869841,0.771429,0.653968


In [11]:

# chronological 80 / 20 split: the first 80 % of rows train, the remainder tests
split = int(len(df)*0.8)
train = df.iloc[:split]
test  = df.iloc[split:]

# Target vectors (QFE pressure) as float32
y_tr = train["PRESSURE_QFE_MB_DERIVED"].values.astype(np.float32)
y_te = test["PRESSURE_QFE_MB_DERIVED"].values.astype(np.float32)

# Candidate predictors: every column except the two same-time pressure readings,
# the station id and the month column.
X_cols = train.drop(
    columns=[
        "PRESSURE_QFF_MB_DERIVED",
        "PRESSURE_QFE_MB_DERIVED",
        "WMO_ID",
        "month",
    ]
).columns
print("Column for X: ", X_cols, len(X_cols))

X_tr = train[X_cols].values.astype(np.float32)
X_te = test[X_cols].values.astype(np.float32)


Column for X:  Index(['RAINFALL_LAST_MM', 'TEMP_DEWPOINT_C_TDTDTD', 'TEMP_DRYBULB_C_TTTTTT',
       'TEMP_WETBULB_C', 'WIND_SPEED_FF', 'RELATIVE_HUMIDITY_PC', 'sin_hour',
       'cos_hour', 'sin_doy', 'cos_doy', 'PRESSURE_QFF_MB_DERIVED_lag1',
       'PRESSURE_QFF_MB_DERIVED_lag2', 'PRESSURE_QFF_MB_DERIVED_lag4',
       'PRESSURE_QFE_MB_DERIVED_lag1', 'PRESSURE_QFE_MB_DERIVED_lag2',
       'PRESSURE_QFE_MB_DERIVED_lag4', 'TEMP_DEWPOINT_C_TDTDTD_lag1',
       'TEMP_DEWPOINT_C_TDTDTD_lag2', 'TEMP_DEWPOINT_C_TDTDTD_lag4',
       'TEMP_DRYBULB_C_TTTTTT_lag1', 'TEMP_DRYBULB_C_TTTTTT_lag2',
       'TEMP_DRYBULB_C_TTTTTT_lag4', 'TEMP_WETBULB_C_lag1',
       'TEMP_WETBULB_C_lag2', 'TEMP_WETBULB_C_lag4', 'WIND_SPEED_FF_lag1',
       'WIND_SPEED_FF_lag2', 'WIND_SPEED_FF_lag4', 'RAINFALL_LAST_MM_lag1',
       'RAINFALL_LAST_MM_lag2', 'RAINFALL_LAST_MM_lag4',
       'RELATIVE_HUMIDITY_PC_lag1', 'RELATIVE_HUMIDITY_PC_lag2',
       'RELATIVE_HUMIDITY_PC_lag4'],
      dtype='object') 34


In [12]:
def rmse(y_true, y_pred):
    """
    Root-mean-squared error between ``y_true`` and ``y_pred``.

    Computed as the square root of ``mean_squared_error`` so that it works on
    every scikit-learn version: it does not rely on the ``squared=False``
    keyword (deprecated, later removed) and needs no exception-based fallback.

    Parameters
    ----------
    y_true, y_pred : array-like
        Ground-truth and predicted values, as accepted by
        ``mean_squared_error``.

    Returns
    -------
    float
        The RMSE, in the units of the inputs.
    """
    return np.sqrt(mean_squared_error(y_true, y_pred))

In [13]:
# Feature subset fed to the RBF network in the grid search (order is preserved).
sel_cols = [
    'PRESSURE_QFE_MB_DERIVED_lag1',
    'PRESSURE_QFF_MB_DERIVED_lag1',
    'WIND_SPEED_FF',
    'TEMP_DRYBULB_C_TTTTTT',
    'WIND_SPEED_FF_lag1',
    'RAINFALL_LAST_MM_lag2',
    'RAINFALL_LAST_MM_lag1',
    'RAINFALL_LAST_MM_lag4',
]

In [14]:
# ──────────────────────────────────────────────────────────────
#  Grid search: n_centres × gamma × ridge  → picks lowest RMSE
# ──────────────────────────────────────────────────────────────
import itertools, pandas as pd, numpy as np
from sklearn.metrics import mean_absolute_error

# --- Editable search space ------------------------------------
grid_n   = [500, 700, 900, 1200, 1500]          # hidden centres
grid_gam = [0.005, 0.01, 0.02, 0.05]        # RBF spread
grid_reg = [1e-3, 1e-2, 1e-1]       # ridge λ
# --------------------------------------------------------------

# NOTE(review): every configuration is scored on the test split and the winner
# is picked by that same score, so the reported "best" metrics are optimistic.
# Consider carving a validation block out of `train` for the selection step.

# Design matrices are identical for every configuration → build them once.
X_sel_tr = train[sel_cols].values.astype(np.float32)
X_sel_te = test[sel_cols].values.astype(np.float32)
y_sel_tr = y_tr.astype(np.float32)

results = []

for n, g, r in itertools.product(grid_n, grid_gam, grid_reg):
    # Seed the K-means step from the notebook-wide SEED rather than relying
    # on the class default.
    model = RBFNetwork(n_centres=n, gamma=g, ridge=r,
                       random_state=SEED).fit(X_sel_tr, y_sel_tr)
    pred = model.predict(X_sel_te)
    res  = {
        "n_centres": n,
        "gamma":     g,
        "ridge":     r,
        "RMSE":      rmse(y_te, pred),
        "MAE":       mean_absolute_error(y_te, pred),
        "R2": r2_score(y_te, pred),
        "KGE":       kge(y_te, pred)
    }
    results.append(res)

grid_df = pd.DataFrame(results)\
            .sort_values("RMSE")\
            .reset_index(drop=True)

# Format keys must match the column names exactly ("R2", not "R-Sq"),
# otherwise that column is left unformatted.
display(grid_df.head(10).style.format({"RMSE":"{:.3f}",
                                       "MAE":"{:.3f}",
                                       "R2": "{:.3f}",
                                       "KGE":"{:.3f}"}))
best = grid_df.iloc[0]
print(f"Results for Station {STATION_ID}\n =========")
# A row extracted with .iloc is upcast to float, so cast n_centres back to int
# to print "1500" instead of "1500.0".
print(f"\n🏆  Best config → n={int(best.n_centres)}, γ={best.gamma}, "
      f"ridge={best.ridge}  •  RMSE={best.RMSE:.3f} hPa | R²={best.R2:.3f}")


Unnamed: 0,n_centres,gamma,ridge,RMSE,MAE,R2,KGE
0,1500,0.005,0.001,2.889,1.736,-1.588862,0.08
1,1200,0.005,0.1,2.978,1.763,-1.75129,0.038
2,900,0.005,0.001,3.037,1.827,-1.861009,0.011
3,1200,0.005,0.01,3.037,1.823,-1.861409,0.013
4,700,0.005,0.1,3.122,1.839,-2.022947,-0.032
5,1500,0.005,0.01,3.199,1.792,-2.173472,-0.061
6,500,0.005,0.001,3.224,2.015,-2.223097,-0.082
7,700,0.005,0.001,3.266,1.911,-2.308106,-0.099
8,1500,0.005,0.1,3.402,1.967,-2.589092,-0.174
9,700,0.005,0.01,3.462,1.984,-2.718195,-0.195


Results for Station 96749

🏆  Best config → n=1500.0, γ=0.005, ridge=0.001  •  RMSE=2.889 hPa | R²=-1.589


In [15]:
# Persistence baseline: the forecast for step t is simply the observation at t-1,
# so the first test point has no forecast and is left out of the comparison.
y_true_shifted = y_te[1:]           # truth from the second test point onward
y_pred_persist = y_te[:-1]          # previous observation used as the forecast

rmse_persist = rmse(y_true_shifted, y_pred_persist)
print(f"Persistence RMSE ≈ {rmse_persist:.3f} hPa")


Persistence RMSE ≈ 1.568 hPa
