#### Tuning for RBF-PSO Modelling of QFE Air Pressure
Some code using the "03" notebook cell.

In [1]:
from pathlib import Path
import pandas as pd, numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import numpy as np
import pyswarms as ps, warnings ; warnings.filterwarnings("ignore")

# ── EDIT ONLY THESE TWO LINES WHEN YOU CHANGE STATION ── #
STATION_ID = "97430"
DATA_FILE  = Path(f"/home/rzby/airpressure_ann/ann_pressure_prediction/data/processed/clean_{STATION_ID}.parquet")
# ──────────────────────────────────────────────────────── #

SEED = 42


In [2]:
def add_time_feats(df):
    ang_h = 2*np.pi*df.index.hour/24
    ang_d = 2*np.pi*df.index.dayofyear/365.25
    df["sin_hour"], df["cos_hour"] = np.sin(ang_h), np.cos(ang_h)
    df["sin_doy"],  df["cos_doy"]  = np.sin(ang_d), np.cos(ang_d)
    return df

def add_lags(df, cols, lags=[1,2,4]):
    for c in cols:
        for k in lags:
            df[f"{c}_lag{k}"] = df[c].shift(k)
    return df

def kge(y, yhat):
    r = np.corrcoef(y, yhat)[0,1]
    alpha = yhat.std()/y.std()
    beta  = yhat.mean()/y.mean()
    return 1-np.sqrt((r-1)**2+(alpha-1)**2+(beta-1)**2)

from sklearn.cluster import MiniBatchKMeans
from numpy.linalg import pinv
import numpy as np

class RBFNetwork:
    """
    Radial-Basis-Function Neural Network with:
      • K centres from (MiniBatch) K-means
      • Gaussian basis ϕ(||x−c||)  with shared gamma
      • Closed-form ridge regression for output weights
    """
    def __init__(self, n_centres=300, gamma=None,
                 ridge=1e-4, random_state=42, batch_size=1024):
        self.n_centres  = n_centres
        self.gamma      = gamma        # if None compute from centres spread
        self.ridge      = ridge
        self.rs         = random_state
        self.batch_size = batch_size   # for MiniBatchKMeans

    # ───────────────── helpers ──────────────────
    @staticmethod
    def _euclid(a,b):
        return np.linalg.norm(a[:,None,:] - b[None,:,:], axis=2)

    def _rbf(self, X):
        d = self._euclid(X, self.C_)
        return np.exp(-(self.gamma * d**2))

    # ─────────────── public API ─────────────────
    def fit(self, X, y):
        # 1) choose K centres
        km = MiniBatchKMeans(n_clusters=self.n_centres,
                             random_state=self.rs,
                             batch_size=self.batch_size)
        km.fit(X)
        self.C_ = km.cluster_centers_

        # 2) set gamma if not user-supplied
        if self.gamma is None:
            d_max = np.max(self._euclid(self.C_, self.C_))
            sigma = d_max / np.sqrt(2*self.n_centres)
            self.gamma = 1 / (2*sigma**2)

        # 3) hidden matrix
        H = self._rbf(X)

        # 4) ridge regression: β = (HᵀH + λI)⁻¹ Hᵀ y
        I = self.ridge * np.eye(self.n_centres)
        self.B_ = pinv(H.T @ H + I) @ H.T @ y
        return self

    def predict(self, X):
        return self._rbf(X) @ self.B_
    
    def score(self, X, y):
        """Sklearn-compatible score = R-squared."""
        return r2_score(y, self.predict(X))



In [3]:
df = pd.read_parquet(DATA_FILE).set_index('index')
df.head()

Unnamed: 0_level_0,WMO_ID,DATA_TIMESTAMP,RAINFALL_LAST_MM,TEMP_DEWPOINT_C_TDTDTD,TEMP_DRYBULB_C_TTTTTT,TEMP_WETBULB_C,WIND_SPEED_FF,RELATIVE_HUMIDITY_PC,PRESSURE_QFF_MB_DERIVED,PRESSURE_QFE_MB_DERIVED,month
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2015-01-01 00:00:00+00:00,97430,2015-01-01 00:00:00+00:00,-1.0,0.193548,-0.035714,0.111111,-0.333333,0.311628,1011.1,1007.2,-1.0
2015-01-01 03:00:00+00:00,97430,2015-01-01 03:00:00+00:00,-1.0,0.451613,0.428571,0.444444,-0.166667,-0.083721,1009.7,1005.8,-1.0
2015-01-01 06:00:00+00:00,97430,2015-01-01 06:00:00+00:00,-1.0,0.193548,0.25,0.222222,0.666667,-0.04186,1007.9,1004.0,-1.0
2015-01-01 09:00:00+00:00,97430,2015-01-01 09:00:00+00:00,-1.0,0.225806,0.071429,0.166667,0.166667,0.190698,1008.2,1004.3,-1.0
2015-01-01 12:00:00+00:00,97430,2015-01-01 12:00:00+00:00,-1.0,0.16129,-0.107143,0.055556,-0.166667,0.376744,1010.5,1006.6,-1.0


In [4]:
# feature engineering
df = add_time_feats(df)
core = ["PRESSURE_QFF_MB_DERIVED","PRESSURE_QFE_MB_DERIVED","TEMP_DEWPOINT_C_TDTDTD","TEMP_DRYBULB_C_TTTTTT", "TEMP_WETBULB_C", "WIND_SPEED_FF","RAINFALL_LAST_MM", "RELATIVE_HUMIDITY_PC"]
df = add_lags(df, core, lags=[1,2,4]).dropna()
df.head()

Unnamed: 0_level_0,WMO_ID,DATA_TIMESTAMP,RAINFALL_LAST_MM,TEMP_DEWPOINT_C_TDTDTD,TEMP_DRYBULB_C_TTTTTT,TEMP_WETBULB_C,WIND_SPEED_FF,RELATIVE_HUMIDITY_PC,PRESSURE_QFF_MB_DERIVED,PRESSURE_QFE_MB_DERIVED,...,TEMP_WETBULB_C_lag4,WIND_SPEED_FF_lag1,WIND_SPEED_FF_lag2,WIND_SPEED_FF_lag4,RAINFALL_LAST_MM_lag1,RAINFALL_LAST_MM_lag2,RAINFALL_LAST_MM_lag4,RELATIVE_HUMIDITY_PC_lag1,RELATIVE_HUMIDITY_PC_lag2,RELATIVE_HUMIDITY_PC_lag4
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-01-01 12:00:00+00:00,97430,2015-01-01 12:00:00+00:00,-1.0,0.16129,-0.107143,0.055556,-0.166667,0.376744,1010.5,1006.6,...,0.111111,0.166667,0.666667,-0.333333,-1.0,-1.0,-1.0,0.190698,-0.04186,0.311628
2015-01-01 15:00:00+00:00,97430,2015-01-01 15:00:00+00:00,-1.0,0.225806,-0.071429,0.111111,0.333333,0.376744,1010.8,1006.9,...,0.444444,-0.166667,0.166667,-0.166667,-1.0,-1.0,-1.0,0.376744,0.190698,-0.083721
2015-01-01 18:00:00+00:00,97430,2015-01-01 18:00:00+00:00,-1.0,0.225806,-0.107143,0.111111,0.0,0.44186,1009.2,1005.3,...,0.222222,0.333333,-0.166667,0.666667,-1.0,-1.0,-1.0,0.376744,0.376744,-0.04186
2015-01-01 21:00:00+00:00,97430,2015-01-01 21:00:00+00:00,-1.0,0.096774,-0.035714,0.055556,-0.166667,0.246512,1009.7,1005.8,...,0.166667,0.0,0.333333,0.166667,-1.0,-1.0,-1.0,0.44186,0.376744,0.190698
2015-01-02 00:00:00+00:00,97430,2015-01-02 00:00:00+00:00,-1.0,0.516129,0.142857,0.388889,0.166667,0.325581,1011.0,1007.1,...,0.055556,-0.166667,0.0,-0.166667,-1.0,-1.0,-1.0,0.246512,0.44186,0.376744


In [6]:

# chronological 80 / 20 split
split = int(len(df)*0.8)
train, test = df.iloc[:split], df.iloc[split:]

y_tr, y_te = train["PRESSURE_QFE_MB_DERIVED"].values.astype(np.float32), test["PRESSURE_QFE_MB_DERIVED"].values.astype(np.float32)
X_cols     = train.drop(columns=["PRESSURE_QFF_MB_DERIVED","PRESSURE_QFE_MB_DERIVED", "WMO_ID", "month", "DATA_TIMESTAMP"]).columns
print("Column for X: ", X_cols, len(X_cols))
X_tr       = train[X_cols].values.astype(np.float32)
X_te       = test[X_cols].values.astype(np.float32)


Column for X:  Index(['RAINFALL_LAST_MM', 'TEMP_DEWPOINT_C_TDTDTD', 'TEMP_DRYBULB_C_TTTTTT',
       'TEMP_WETBULB_C', 'WIND_SPEED_FF', 'RELATIVE_HUMIDITY_PC', 'sin_hour',
       'cos_hour', 'sin_doy', 'cos_doy', 'PRESSURE_QFF_MB_DERIVED_lag1',
       'PRESSURE_QFF_MB_DERIVED_lag2', 'PRESSURE_QFF_MB_DERIVED_lag4',
       'PRESSURE_QFE_MB_DERIVED_lag1', 'PRESSURE_QFE_MB_DERIVED_lag2',
       'PRESSURE_QFE_MB_DERIVED_lag4', 'TEMP_DEWPOINT_C_TDTDTD_lag1',
       'TEMP_DEWPOINT_C_TDTDTD_lag2', 'TEMP_DEWPOINT_C_TDTDTD_lag4',
       'TEMP_DRYBULB_C_TTTTTT_lag1', 'TEMP_DRYBULB_C_TTTTTT_lag2',
       'TEMP_DRYBULB_C_TTTTTT_lag4', 'TEMP_WETBULB_C_lag1',
       'TEMP_WETBULB_C_lag2', 'TEMP_WETBULB_C_lag4', 'WIND_SPEED_FF_lag1',
       'WIND_SPEED_FF_lag2', 'WIND_SPEED_FF_lag4', 'RAINFALL_LAST_MM_lag1',
       'RAINFALL_LAST_MM_lag2', 'RAINFALL_LAST_MM_lag4',
       'RELATIVE_HUMIDITY_PC_lag1', 'RELATIVE_HUMIDITY_PC_lag2',
       'RELATIVE_HUMIDITY_PC_lag4'],
      dtype='object') 34


In [7]:
def rmse(y_true, y_pred):
    """
    Returns root-mean-squared-error regardless of sklearn
    version (handles absence of 'squared' kwarg).
    """
    try:
        return mean_squared_error(y_true, y_pred, squared=False)
    except TypeError:
        return np.sqrt(mean_squared_error(y_true, y_pred))

In [8]:
sel_cols = [
    'PRESSURE_QFF_MB_DERIVED_lag4',
    'PRESSURE_QFE_MB_DERIVED_lag4',
    'RELATIVE_HUMIDITY_PC_lag2',
    'TEMP_WETBULB_C_lag4',
    'WIND_SPEED_FF_lag2',
    'TEMP_DEWPOINT_C_TDTDTD_lag1',
    'RAINFALL_LAST_MM_lag4',
    'RAINFALL_LAST_MM_lag2'
]

In [9]:
# ──────────────────────────────────────────────────────────────
#  Grid search: n_centres × gamma × ridge  → picks lowest RMSE
# ──────────────────────────────────────────────────────────────
import itertools, pandas as pd, numpy as np
from sklearn.metrics import mean_absolute_error

# --- Editable search space ------------------------------------
grid_n   = [1900, 1950, 2000]          # hidden centres
grid_gam = [0.0001, 0.00001]        # RBF spread
grid_reg = [1e-3, 1e-2, 1e-1, 1e-4]       # ridge λ
# --------------------------------------------------------------

results = []

for n, g, r in itertools.product(grid_n, grid_gam, grid_reg):
    model = RBFNetwork(n_centres=n, gamma=g, ridge=r).fit(
                train[sel_cols].values.astype(np.float32),
                y_tr.astype(np.float32))
    pred = model.predict(test[sel_cols].values.astype(np.float32))
    res  = {
        "n_centres": n,
        "gamma":     g,
        "ridge":     r,
        "RMSE":      rmse(y_te, pred),
        "MAE":       mean_absolute_error(y_te, pred),
        "R2": r2_score(y_te, pred),
        "KGE":       kge(y_te, pred)
    }
    results.append(res)

grid_df = pd.DataFrame(results)\
            .sort_values("RMSE")\
            .reset_index(drop=True)

display(grid_df.head(10).style.format({"RMSE":"{:.3f}",
                                       "MAE":"{:.3f}",
                                       "R2": "{:.3f}",
                                       "KGE":"{:.3f}"}))
best = grid_df.iloc[0]
print(f"Results for Station {STATION_ID}\n =========")
print(f"\n🏆  Best config → n={best.n_centres}, γ={best.gamma}, "
      f"ridge={best.ridge}  •  RMSE={best.RMSE:.3f} hPa | R²={best.R2:.3f}")


Unnamed: 0,n_centres,gamma,ridge,RMSE,MAE,R2,KGE
0,1950,1e-05,0.1,1.12,0.907,0.571,0.649
1,1900,1e-05,0.1,1.12,0.91,0.571,0.7
2,1950,1e-05,0.0001,1.142,0.921,0.554,0.546
3,1900,1e-05,0.0001,1.143,0.933,0.553,0.701
4,2000,1e-05,0.0001,1.166,0.956,0.535,0.678
5,1900,1e-05,0.001,1.168,0.956,0.533,0.74
6,2000,1e-05,0.001,1.169,0.944,0.533,0.53
7,1950,1e-05,0.01,1.245,1.008,0.469,0.449
8,2000,1e-05,0.01,1.257,1.016,0.459,0.39
9,1950,1e-05,0.001,1.295,1.066,0.426,0.728


Results for Station 97430

🏆  Best config → n=1950.0, γ=1e-05, ridge=0.1  •  RMSE=1.120 hPa | R²=0.571


In [10]:
# Persistence forecast: predict the previous value
y_pred_persist = y_te[:-1]          # forecast for t is value at t-1
y_true_shifted = y_te[1:]           # matching truth (start at t)

rmse_persist = rmse(y_true_shifted, y_pred_persist)
print(f"Persistence RMSE ≈ {rmse_persist:.3f} hPa")


Persistence RMSE ≈ 1.455 hPa
