# IoT Weather Station Code 

Involves Data Preprocessing & Initial ML development

In [20]:
%pip install pandas numpy matplotlib seaborn scikit-learn plotly pyarrow fastparquet xgboost

Collecting xgboost
  Downloading xgboost-3.2.0-py3-none-macosx_12_0_arm64.whl.metadata (2.1 kB)
Downloading xgboost-3.2.0-py3-none-macosx_12_0_arm64.whl (2.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 MB[0m [31m5.3 MB/s[0m  [33m0:00:00[0m eta [36m0:00:01[0m
[?25hInstalling collected packages: xgboost
Successfully installed xgboost-3.2.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.3[0m[39;49m -> [0m[32;49m26.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import classification_report, mean_squared_error, r2_score
from sklearn.ensemble import IsolationForest, GradientBoostingRegressor, StackingRegressor
from sklearn.multioutput import MultiOutputClassifier
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge, Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import plotly.express as px
import plotly.figure_factory as ff 
import math
import pathlib
from pathlib import Path
import os

In [3]:
#getting the directory of notebook 

cwd = os.getcwd()

print(f"Current Working Directory: {cwd}")

Current Working Directory: /Users/saikeerthan/Coding/NYP/IOTA/IoT_Weather_project/model-training/code


In [4]:
DATA_ROOT = Path("/Users/saikeerthan/Coding/NYP/IOTA/IoT_Weather_project/model-training/datasets/official_data.csv")

print(DATA_ROOT.exists())

True


## Data Processing 

In [5]:
# retrieving the dataset

df = pd.read_csv(DATA_ROOT) 

df


Unnamed: 0,time,cidx,cattr,temp,humi,pres,windspeed,winddirection,rainfall,uvindex
0,2026-01-24 20:23:02,1,7,28.6,74,1016.746582,5.5,54,,
1,2026-01-24 20:28:02,2,7,28.5,74,1016.802490,5.3,53,,
2,2026-01-24 20:33:02,3,7,28.6,74,1016.859131,5.0,53,,
3,2026-01-24 20:38:02,4,7,28.5,74,1016.934326,4.9,50,,
4,2026-01-24 20:43:02,5,7,28.6,74,1016.971436,5.4,50,,
...,...,...,...,...,...,...,...,...,...,...
2271,2026-02-02 10:26:15,1045,7,38.5,47,1018.157959,6.9,75,,2.0
2272,2026-02-02 10:31:15,1046,7,38.5,47,1018.206055,8.2,73,,2.0
2273,2026-02-02 10:36:15,1047,7,38.5,47,1018.149414,8.2,66,,2.0
2274,2026-02-02 10:41:15,1048,7,38.5,47,1018.127930,8.1,47,,2.0


In [6]:
#checking for duplicates 

duplicates = df.duplicated().sum()

print(f"Duplicates present in DF: {duplicates}")

Duplicates present in DF: 0


In [7]:
# checking for missing values 

missing = df.isnull().sum().sum()

print(f"Missing values in df: {missing}")

# missing values is because of last two columns

Missing values in df: 3768


In [8]:
#columns in df 

columns = df.columns

print("Columns in Dataset:\n")
for i, col in enumerate(df.columns, 1):
    print(f"{i}. {col}")

Columns in Dataset:

1. time
2. cidx
3. cattr
4. temp
5. humi
6. pres
7. windspeed
8. winddirection
9. rainfall
10. uvindex


In [9]:
# drop entries where cattr is below 7

df = df[df["cattr"] >=7]

df

Unnamed: 0,time,cidx,cattr,temp,humi,pres,windspeed,winddirection,rainfall,uvindex
0,2026-01-24 20:23:02,1,7,28.6,74,1016.746582,5.5,54,,
1,2026-01-24 20:28:02,2,7,28.5,74,1016.802490,5.3,53,,
2,2026-01-24 20:33:02,3,7,28.6,74,1016.859131,5.0,53,,
3,2026-01-24 20:38:02,4,7,28.5,74,1016.934326,4.9,50,,
4,2026-01-24 20:43:02,5,7,28.6,74,1016.971436,5.4,50,,
...,...,...,...,...,...,...,...,...,...,...
2271,2026-02-02 10:26:15,1045,7,38.5,47,1018.157959,6.9,75,,2.0
2272,2026-02-02 10:31:15,1046,7,38.5,47,1018.206055,8.2,73,,2.0
2273,2026-02-02 10:36:15,1047,7,38.5,47,1018.149414,8.2,66,,2.0
2274,2026-02-02 10:41:15,1048,7,38.5,47,1018.127930,8.1,47,,2.0


In [10]:
columns_to_delete = ["cattr", "windspeed", "winddirection", "rainfall", "uvindex", "cidx"]

df = df.drop(columns=columns_to_delete)

columns = df.columns
print(f"Remaining Columns:{columns}")

Remaining Columns:Index(['time', 'temp', 'humi', 'pres'], dtype='object')


In [11]:
# check the dtype of every column in the df 

print(df.dtypes)

time     object
temp    float64
humi      int64
pres    float64
dtype: object


In [12]:
df["time"] = pd.to_datetime(df["time"])

print(f"New Dtypes for columns in df: {df.dtypes}")

New Dtypes for columns in df: time    datetime64[ns]
temp           float64
humi             int64
pres           float64
dtype: object


In [13]:

# Convert to datetime safely
df["time"] = pd.to_datetime(df["time"], errors="coerce")

# Drop invalid timestamps
df = df.dropna(subset=["time"])

# Sort chronologically (CRITICAL for time-series ML)
df = df.sort_values("time").reset_index(drop=True)

# (Recommended) Set as index if doing time-based operations later
df = df.set_index("time")

# Extract time features (daily patterns help weather prediction)
df["hour"] = df.index.hour
df["dayofweek"] = df.index.dayofweek

# Cyclic encoding (prevents 23 → 0 discontinuity)
df["hour_sin"] = np.sin(2 * np.pi * df["hour"] / 24)
df["hour_cos"] = np.cos(2 * np.pi * df["hour"] / 24)


In [14]:
df

Unnamed: 0_level_0,temp,humi,pres,hour,dayofweek,hour_sin,hour_cos
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2026-01-24 20:23:02,28.6,74,1016.746582,20,5,-0.866025,0.500000
2026-01-24 20:28:02,28.5,74,1016.802490,20,5,-0.866025,0.500000
2026-01-24 20:33:02,28.6,74,1016.859131,20,5,-0.866025,0.500000
2026-01-24 20:38:02,28.5,74,1016.934326,20,5,-0.866025,0.500000
2026-01-24 20:43:02,28.6,74,1016.971436,20,5,-0.866025,0.500000
...,...,...,...,...,...,...,...
2026-02-02 10:26:15,38.5,47,1018.157959,10,0,0.500000,-0.866025
2026-02-02 10:31:15,38.5,47,1018.206055,10,0,0.500000,-0.866025
2026-02-02 10:36:15,38.5,47,1018.149414,10,0,0.500000,-0.866025
2026-02-02 10:41:15,38.5,47,1018.127930,10,0,0.500000,-0.866025


In [15]:
# save the new df 

df.to_csv("df_preprocessed.csv")

In [16]:
%pip install -U pyarrow
%pip instlal -U fastparquet


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.3[0m[39;49m -> [0m[32;49m26.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
ERROR: unknown command "instlal" - maybe you meant "install"
Note: you may need to restart the kernel to use updated packages.


In [17]:
WEATHER_COLS = ["temp", "humi", "pres"]

for col in WEATHER_COLS:
    df[f"{col}_lag1"] = df[col].shift(1)
    df[f"{col}_lag2"] = df[col].shift(2)
    df[f"{col}_lag3"] = df[col].shift(3)

# -----------------------------
# 5️⃣ CREATE NEXT-HOUR TARGETS
# -----------------------------
# ⚠ Adjust this based on sampling rate
# If data every 5 min → 12 steps = 1 hour
# If data every 10 min → 6 steps
# If data hourly → 1 step

SHIFT_STEPS = 12   # ← CHANGE IF NEEDED

for col in WEATHER_COLS:
    df[f"{col}_next1h"] = df[col].shift(-SHIFT_STEPS)

# -----------------------------
# 6️⃣ REMOVE NAN ROWS FROM SHIFTS
# -----------------------------
df = df.dropna()

# -----------------------------
# 7️⃣ OPTIONAL: DROP RAW HOUR IF USING SIN/COS
# -----------------------------
# df = df.drop(columns=["hour"])

# -----------------------------
# 8️⃣ SAVE ML-READY DATASET
# -----------------------------
df.to_parquet("weather_ml_ready.parquet")
df.to_csv("weather_ml_ready.csv")

print("✅ ML-ready dataset created")
print(df.shape)

✅ ML-ready dataset created
(2182, 19)


## Model Training Code 

Model Name - Climatrix

In [6]:
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, ElasticNet
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


In [2]:
print(f"Current Working Directory: {os.getcwd()}")

Current Working Directory: /Users/saikeerthan/Coding/NYP/IOTA/IoT_Weather_project/model-training/code


In [3]:
ML_DATA_ROOT = Path("/Users/saikeerthan/Coding/NYP/IOTA/IoT_Weather_project/model-training/datasets/weather_ml_ready.csv")

if ML_DATA_ROOT.exists():
    print("Data Root Exists!")
else: 
    print("Data Root doesn't exist!")

Data Root Exists!


In [4]:
df = pd.read_csv(ML_DATA_ROOT)

df

Unnamed: 0,time,temp,humi,pres,hour,dayofweek,hour_sin,hour_cos,temp_lag1,temp_lag2,temp_lag3,humi_lag1,humi_lag2,humi_lag3,pres_lag1,pres_lag2,pres_lag3,temp_next1h,humi_next1h,pres_next1h
0,2026-01-24 20:38:02,28.5,74,1016.934326,20,5,-0.866025,0.500000,28.6,28.5,28.6,74.0,74.0,74.0,1016.859131,1016.802490,1016.746582,28.5,75.0,1017.522461
1,2026-01-24 20:43:02,28.6,74,1016.971436,20,5,-0.866025,0.500000,28.5,28.6,28.5,74.0,74.0,74.0,1016.934326,1016.859131,1016.802490,28.5,75.0,1017.663086
2,2026-01-24 20:48:02,28.9,74,1017.078125,20,5,-0.866025,0.500000,28.6,28.5,28.6,74.0,74.0,74.0,1016.971436,1016.934326,1016.859131,28.5,75.0,1017.696777
3,2026-01-24 20:53:02,28.7,74,1017.150879,20,5,-0.866025,0.500000,28.9,28.6,28.5,74.0,74.0,74.0,1017.078125,1016.971436,1016.934326,28.5,75.0,1017.770752
4,2026-01-24 20:58:02,28.5,74,1017.206299,20,5,-0.866025,0.500000,28.7,28.9,28.6,74.0,74.0,74.0,1017.150879,1017.078125,1016.971436,28.5,75.0,1017.850830
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2177,2026-02-02 09:26:15,38.3,48,1017.980957,9,0,0.707107,-0.707107,38.1,38.3,38.2,49.0,49.0,49.0,1018.028076,1017.984131,1017.997070,38.5,47.0,1018.157959
2178,2026-02-02 09:31:15,38.3,48,1018.037842,9,0,0.707107,-0.707107,38.3,38.1,38.3,48.0,49.0,49.0,1017.980957,1018.028076,1017.984131,38.5,47.0,1018.206055
2179,2026-02-02 09:36:15,38.3,48,1018.024414,9,0,0.707107,-0.707107,38.3,38.3,38.1,48.0,48.0,49.0,1018.037842,1017.980957,1018.028076,38.5,47.0,1018.149414
2180,2026-02-02 09:41:15,38.3,48,1018.083984,9,0,0.707107,-0.707107,38.3,38.3,38.3,48.0,48.0,48.0,1018.024414,1018.037842,1017.980957,38.5,47.0,1018.127930


In [5]:
# Targets 
TARGETS = ["temp_next1h", "humi_next1h", "pres_next1h"]

# Drop rows with missing target(s)
df = df.dropna(subset=TARGETS).reset_index(drop=True)

exclude_cols = set(TARGETS)

# If your dataset has time columns you don't want as raw strings, exclude them:
for maybe_time_col in ["time", "timestamp", "datetime", "date"]:
    if maybe_time_col in df.columns:
        exclude_cols.add(maybe_time_col)

# Keep numeric features only
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
feature_cols = [c for c in numeric_cols if c not in exclude_cols]

X = df[feature_cols].copy()
y = df[TARGETS].copy()


In [7]:
# =========================
# 3) Train / Val / Test split (time-based)
# =========================
n = len(df)
train_end = int(n * 0.70)
val_end   = int(n * 0.85)  # train 70%, val 15%, test 15%

X_train, y_train = X.iloc[:train_end], y.iloc[:train_end]
X_val,   y_val   = X.iloc[train_end:val_end], y.iloc[train_end:val_end]
X_test,  y_test  = X.iloc[val_end:], y.iloc[val_end:]

print(f"Total rows: {n}")
print(f"Train: {len(X_train)}  Val: {len(X_val)}  Test: {len(X_test)}")

# TimeSeries CV inside TRAIN only
tscv = TimeSeriesSplit(n_splits=5)

def report_split(y_true, y_pred, split_name="SPLIT", model_name="Model"):
    mae = mean_absolute_error(y_true, y_pred, multioutput="raw_values")
    rmse = np.sqrt(mean_squared_error(y_true, y_pred, multioutput="raw_values"))
    r2 = r2_score(y_true, y_pred, multioutput="raw_values")

    print(f"\n===== {model_name} on {split_name} =====")
    for i, t in enumerate(TARGETS):
        print(f"{t}:  MAE={mae[i]:.4f}  RMSE={rmse[i]:.4f}  R2={r2[i]:.4f}")
    print(f"Avg MAE: {float(np.mean(mae)):.4f}")

def avg_mae(y_true, y_pred) -> float:
    mae = mean_absolute_error(y_true, y_pred, multioutput="raw_values")
    return float(np.mean(mae))

Total rows: 2182
Train: 1527  Val: 327  Test: 328


### Ridge Model

In [8]:
# =========================
# 4) Ridge: Pipeline + GridSearch (TRAIN only)
# =========================
ridge_pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("model", MultiOutputRegressor(
        Ridge(random_state=42, max_iter=20000)
    ))
])

ridge_param_grid = {
    "model__estimator__alpha": np.logspace(-4, 4, 25)  # robust sweep
}

ridge_search = GridSearchCV(
    estimator=ridge_pipe,
    param_grid=ridge_param_grid,
    scoring="neg_mean_absolute_error",
    cv=tscv,
    n_jobs=-1
)

ridge_search.fit(X_train, y_train)
best_ridge = ridge_search.best_estimator_
best_ridge_alpha = ridge_search.best_params_["model__estimator__alpha"]
print("\nBest Ridge alpha:", best_ridge_alpha)

# Evaluate Ridge on VAL
ridge_val_pred = best_ridge.predict(X_val)
report_split(y_val.values, ridge_val_pred, split_name="VAL", model_name="Ridge")


Best Ridge alpha: 215.44346900318823

===== Ridge on VAL =====
temp_next1h:  MAE=0.5847  RMSE=0.6671  R2=0.3522
humi_next1h:  MAE=0.8879  RMSE=1.0851  R2=0.8246
pres_next1h:  MAE=0.5267  RMSE=0.6301  R2=0.7695
Avg MAE: 0.6664


### ElasticNet

In [9]:
# =========================
# 5) ElasticNet: Pipeline + GridSearch (TRAIN only)
# =========================
enet_pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("model", MultiOutputRegressor(
        ElasticNet(
            random_state=42,
            max_iter=50000,
            tol=1e-4,
            selection="cyclic"
        )
    ))
])

enet_param_grid = {
    "model__estimator__alpha": np.logspace(-4, 2, 18),     # 1e-4 ... 1e2
    "model__estimator__l1_ratio": [0.05, 0.1, 0.2, 0.35, 0.5, 0.7, 0.9]
}

enet_search = GridSearchCV(
    estimator=enet_pipe,
    param_grid=enet_param_grid,
    scoring="neg_mean_absolute_error",
    cv=tscv,
    n_jobs=-1
)

enet_search.fit(X_train, y_train)
best_enet = enet_search.best_estimator_
print("\nBest ElasticNet params:", enet_search.best_params_)

# Evaluate ElasticNet on VAL
enet_val_pred = best_enet.predict(X_val)
report_split(y_val.values, enet_val_pred, split_name="VAL", model_name="ElasticNet")


Best ElasticNet params: {'model__estimator__alpha': np.float64(0.06660846290809161), 'model__estimator__l1_ratio': 0.9}

===== ElasticNet on VAL =====
temp_next1h:  MAE=0.4811  RMSE=0.5455  R2=0.5668
humi_next1h:  MAE=0.9008  RMSE=1.1164  R2=0.8144
pres_next1h:  MAE=0.5307  RMSE=0.6365  R2=0.7648
Avg MAE: 0.6375


In [10]:
# =========================
# 6) Select model using VAL performance
# =========================
ridge_val_mae = avg_mae(y_val.values, ridge_val_pred)
enet_val_mae  = avg_mae(y_val.values, enet_val_pred)

if ridge_val_mae <= enet_val_mae:
    selected_name = "ridge"
    selected_model = best_ridge
else:
    selected_name = "elasticnet"
    selected_model = best_enet

print(f"\nSelected model (by VAL avg MAE): {selected_name.upper()}")
print(f"Ridge VAL avg MAE: {ridge_val_mae:.4f}")
print(f"Enet  VAL avg MAE: {enet_val_mae:.4f}")


Selected model (by VAL avg MAE): ELASTICNET
Ridge VAL avg MAE: 0.6664
Enet  VAL avg MAE: 0.6375


In [11]:
# =========================
# 7) Refit selected model on TRAIN+VAL (best practice), then test
# =========================
X_trainval = pd.concat([X_train, X_val], axis=0)
y_trainval = pd.concat([y_train, y_val], axis=0)

selected_model.fit(X_trainval, y_trainval)

# Evaluate on TEST
test_pred = selected_model.predict(X_test)
report_split(y_test.values, test_pred, split_name="TEST", model_name=selected_name.upper())


===== ELASTICNET on TEST =====
temp_next1h:  MAE=0.2849  RMSE=0.3679  R2=0.7898
humi_next1h:  MAE=0.7788  RMSE=0.9512  R2=0.7631
pres_next1h:  MAE=0.4991  RMSE=0.5864  R2=0.7630
Avg MAE: 0.5209


In [12]:
# ----- Baseline: persistence (next hour = current value) -----
# Assuming your features include current readings named temp, humi, pres
# If not, use the closest available like temp_lag1 as "current".
def persistence_baseline(df_features, y_true, use="current"):
    # prefer current if present; else fall back to lag1
    temp_col = "temp" if "temp" in df_features.columns else "temp_lag1"
    humi_col = "humi" if "humi" in df_features.columns else "humi_lag1"
    pres_col = "pres" if "pres" in df_features.columns else "pres_lag1"

    y_pred = np.column_stack([
        df_features[temp_col].values,
        df_features[humi_col].values,
        df_features[pres_col].values
    ])
    return y_pred

baseline_test_pred = persistence_baseline(X_test, y_test.values)
report_split(y_test.values, baseline_test_pred, split_name="TEST", model_name="PERSISTENCE BASELINE")


===== PERSISTENCE BASELINE on TEST =====
temp_next1h:  MAE=0.2555  RMSE=0.3406  R2=0.8198
humi_next1h:  MAE=0.6768  RMSE=0.9969  R2=0.7398
pres_next1h:  MAE=0.5426  RMSE=0.6325  R2=0.7243
Avg MAE: 0.4916


### Tree-Based Models

In [25]:
%pip install xgboost

Collecting xgboost
  Using cached xgboost-3.2.0-py3-none-macosx_12_0_arm64.whl.metadata (2.1 kB)
Using cached xgboost-3.2.0-py3-none-macosx_12_0_arm64.whl (2.3 MB)
Installing collected packages: xgboost
Successfully installed xgboost-3.2.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.3[0m[39;49m -> [0m[32;49m26.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [28]:
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
from xgboost import XGBRegressor

In [14]:
# =========================
# 1) Load dataset
# =========================
CSV_PATH = "/Users/saikeerthan/Coding/NYP/IOTA/IoT_Weather_project/model-training/datasets/weather_ml_ready.csv"
df = pd.read_csv(CSV_PATH)

TARGETS = ["temp_next1h", "humi_next1h", "pres_next1h"]
df = df.dropna(subset=TARGETS).reset_index(drop=True)

# =========================
# 2) Features = numeric cols except targets (and exclude raw time cols if present)
# =========================
exclude_cols = set(TARGETS)
for maybe_time_col in ["time", "timestamp", "datetime", "date"]:
    if maybe_time_col in df.columns:
        exclude_cols.add(maybe_time_col)

numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
feature_cols = [c for c in numeric_cols if c not in exclude_cols]

X = df[feature_cols].copy()
y = df[TARGETS].copy()


In [15]:
# =========================
# 3) Train / Val / Test split (time-based)
# =========================
n = len(df)
train_end = int(n * 0.70)
val_end   = int(n * 0.85)

X_train, y_train = X.iloc[:train_end], y.iloc[:train_end]
X_val,   y_val   = X.iloc[train_end:val_end], y.iloc[train_end:val_end]
X_test,  y_test  = X.iloc[val_end:], y.iloc[val_end:]

print(f"Rows: total={n} train={len(X_train)} val={len(X_val)} test={len(X_test)}")

tscv = TimeSeriesSplit(n_splits=5)


def report_split(y_true, y_pred, split_name="SPLIT", model_name="Model"):
    mae = mean_absolute_error(y_true, y_pred, multioutput="raw_values")
    rmse = np.sqrt(mean_squared_error(y_true, y_pred, multioutput="raw_values"))
    r2 = r2_score(y_true, y_pred, multioutput="raw_values")

    print(f"\n===== {model_name} on {split_name} =====")
    for i, t in enumerate(TARGETS):
        print(f"{t}:  MAE={mae[i]:.4f}  RMSE={rmse[i]:.4f}  R2={r2[i]:.4f}")
    print(f"Avg MAE: {float(np.mean(mae)):.4f}")


def avg_mae(y_true, y_pred) -> float:
    return float(np.mean(mean_absolute_error(y_true, y_pred, multioutput="raw_values")))

Rows: total=2182 train=1527 val=327 test=328


In [16]:
# =========================
# 4) Persistence baseline (strong baseline)
# =========================
def persistence_baseline(df_features: pd.DataFrame):
    # prefer current columns if present; else lag1 as proxy
    temp_col = "temp" if "temp" in df_features.columns else "temp_lag1"
    humi_col = "humi" if "humi" in df_features.columns else "humi_lag1"
    pres_col = "pres" if "pres" in df_features.columns else "pres_lag1"

    return np.column_stack([
        df_features[temp_col].values,
        df_features[humi_col].values,
        df_features[pres_col].values
    ])

base_test_pred = persistence_baseline(X_test)
report_split(y_test.values, base_test_pred, split_name="TEST", model_name="PERSISTENCE BASELINE")
baseline_test_mae = avg_mae(y_test.values, base_test_pred)


===== PERSISTENCE BASELINE on TEST =====
temp_next1h:  MAE=0.2555  RMSE=0.3406  R2=0.8198
humi_next1h:  MAE=0.6768  RMSE=0.9969  R2=0.7398
pres_next1h:  MAE=0.5426  RMSE=0.6325  R2=0.7243
Avg MAE: 0.4916


In [17]:
# =========================
# 5) Models + robust grids (not crazy big)
# =========================
candidates = []

# A) HistGradientBoosting (fast + strong)
hgb = MultiOutputRegressor(
    HistGradientBoostingRegressor(random_state=42)
)
hgb_grid = {
    "estimator__learning_rate": [0.03, 0.06, 0.1],
    "estimator__max_depth": [3, 5, None],
    "estimator__max_leaf_nodes": [31, 63, 127],
    "estimator__min_samples_leaf": [10, 20, 40],
    "estimator__l2_regularization": [0.0, 0.1, 1.0],
}
candidates.append(("HGB", hgb, hgb_grid))

# B) ExtraTrees (often excellent on tabular)
etr = MultiOutputRegressor(
    ExtraTreesRegressor(random_state=42, n_jobs=-1)
)
etr_grid = {
    "estimator__n_estimators": [300, 600],
    "estimator__max_depth": [None, 10, 20],
    "estimator__min_samples_leaf": [1, 2, 5],
    "estimator__max_features": ["sqrt", 0.7, 1.0],
}
candidates.append(("EXTRATREES", etr, etr_grid))

# C) RandomForest (solid baseline)
rf = MultiOutputRegressor(
    RandomForestRegressor(random_state=42, n_jobs=-1)
)
rf_grid = {
    "estimator__n_estimators": [300, 600],
    "estimator__max_depth": [None, 10, 20],
    "estimator__min_samples_leaf": [1, 2, 5],
    "estimator__max_features": ["sqrt", 0.7, 1.0],
}
candidates.append(("RANDOMFOREST", rf, rf_grid))


In [18]:

# =========================
# 6) Tune each model on TRAIN (TimeSeriesSplit), choose by VAL
# =========================
best_name = None
best_model = None
best_val_mae = float("inf")

for name, model, grid in candidates:
    print(f"\n--- Tuning {name} (TRAIN only) ---")

    search = GridSearchCV(
        estimator=model,
        param_grid=grid,
        scoring="neg_mean_absolute_error",
        cv=tscv,
        n_jobs=-1,
        verbose=0
    )
    search.fit(X_train, y_train)

    tuned = search.best_estimator_
    print("Best params:", search.best_params_)

    val_pred = tuned.predict(X_val)
    report_split(y_val.values, val_pred, split_name="VAL", model_name=name)
    val_mae = avg_mae(y_val.values, val_pred)

    if val_mae < best_val_mae:
        best_val_mae = val_mae
        best_name = name
        best_model = tuned

print(f"\nSelected by VAL avg MAE: {best_name} (VAL avg MAE={best_val_mae:.4f})")


--- Tuning HGB (TRAIN only) ---
Best params: {'estimator__l2_regularization': 0.1, 'estimator__learning_rate': 0.03, 'estimator__max_depth': 5, 'estimator__max_leaf_nodes': 31, 'estimator__min_samples_leaf': 40}

===== HGB on VAL =====
temp_next1h:  MAE=0.5112  RMSE=0.6213  R2=0.4382
humi_next1h:  MAE=0.7049  RMSE=0.9611  R2=0.8624
pres_next1h:  MAE=0.3150  RMSE=0.3914  R2=0.9110
Avg MAE: 0.5103

--- Tuning EXTRATREES (TRAIN only) ---




Best params: {'estimator__max_depth': None, 'estimator__max_features': 0.7, 'estimator__min_samples_leaf': 5, 'estimator__n_estimators': 600}

===== EXTRATREES on VAL =====
temp_next1h:  MAE=0.3585  RMSE=0.4846  R2=0.6582
humi_next1h:  MAE=0.6618  RMSE=0.9080  R2=0.8772
pres_next1h:  MAE=0.2670  RMSE=0.3345  R2=0.9350
Avg MAE: 0.4291

--- Tuning RANDOMFOREST (TRAIN only) ---
Best params: {'estimator__max_depth': 20, 'estimator__max_features': 'sqrt', 'estimator__min_samples_leaf': 2, 'estimator__n_estimators': 300}

===== RANDOMFOREST on VAL =====
temp_next1h:  MAE=0.3597  RMSE=0.4933  R2=0.6457
humi_next1h:  MAE=0.7934  RMSE=1.0704  R2=0.8294
pres_next1h:  MAE=0.2677  RMSE=0.3281  R2=0.9375
Avg MAE: 0.4736

Selected by VAL avg MAE: EXTRATREES (VAL avg MAE=0.4291)


In [19]:

# =========================
# 7) Refit selected on TRAIN+VAL, evaluate on TEST
# =========================
X_trainval = pd.concat([X_train, X_val], axis=0)
y_trainval = pd.concat([y_train, y_val], axis=0)

best_model.fit(X_trainval, y_trainval)

test_pred = best_model.predict(X_test)
report_split(y_test.values, test_pred, split_name="TEST", model_name=best_name)
tree_test_mae = avg_mae(y_test.values, test_pred)

print(f"\nBaseline TEST avg MAE: {baseline_test_mae:.4f}")
print(f"{best_name} TEST avg MAE: {tree_test_mae:.4f}")

if tree_test_mae < baseline_test_mae:
    print("✅ Tree model beats persistence baseline overall on TEST.")
else:
    print("⚠️ Tree model does NOT beat persistence baseline overall on TEST (still may beat it for one target).")



===== EXTRATREES on TEST =====
temp_next1h:  MAE=0.2385  RMSE=0.3105  R2=0.8503
humi_next1h:  MAE=0.7612  RMSE=0.9387  R2=0.7693
pres_next1h:  MAE=0.3088  RMSE=0.3938  R2=0.8931
Avg MAE: 0.4361

Baseline TEST avg MAE: 0.4916
EXTRATREES TEST avg MAE: 0.4361
✅ Tree model beats persistence baseline overall on TEST.


#### XGBoost

In [29]:
# =========================
# 5) XGBoost model + tuning (TRAIN only)
# =========================
# Good defaults for tabular regression; fast + robust.
xgb_base = XGBRegressor(
    objective="reg:squarederror",
    tree_method="hist",          # fast on CPU
    random_state=42,
    n_jobs=-1,
    eval_metric="mae",
    verbosity=0
)

xgb = MultiOutputRegressor(xgb_base)

# A compact but strong grid (won't take forever)
param_grid = {
    "estimator__n_estimators": [300, 600, 900],
    "estimator__learning_rate": [0.03, 0.05, 0.08],
    "estimator__max_depth": [3, 4, 6],
    "estimator__subsample": [0.7, 0.9, 1.0],
    "estimator__colsample_bytree": [0.7, 0.9, 1.0],
    "estimator__min_child_weight": [1, 5, 10],
    "estimator__reg_lambda": [1.0, 5.0, 10.0],   # L2
}

search = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    scoring="neg_mean_absolute_error",
    cv=tscv,
    n_jobs=-1,
    verbose=0
)

print("\n--- Tuning XGB (TRAIN only) ---")
search.fit(X_train, y_train)

best_xgb = search.best_estimator_
print("Best XGB params:", search.best_params_)

# VAL
val_pred = best_xgb.predict(X_val)
report_split(y_val.values, val_pred, split_name="VAL", model_name="XGB")


--- Tuning XGB (TRAIN only) ---
Best XGB params: {'estimator__colsample_bytree': 0.7, 'estimator__learning_rate': 0.08, 'estimator__max_depth': 3, 'estimator__min_child_weight': 1, 'estimator__n_estimators': 300, 'estimator__reg_lambda': 10.0, 'estimator__subsample': 0.7}

===== XGB on VAL =====
temp_next1h:  MAE=0.4069  RMSE=0.5845  R2=0.5027
humi_next1h:  MAE=0.7682  RMSE=1.0914  R2=0.8226
pres_next1h:  MAE=0.3107  RMSE=0.3819  R2=0.9153
Avg MAE: 0.4953


In [30]:
# =========================
# 6) Refit on TRAIN+VAL, evaluate on TEST
# =========================
X_trainval = pd.concat([X_train, X_val], axis=0)
y_trainval = pd.concat([y_train, y_val], axis=0)

best_xgb.fit(X_trainval, y_trainval)

test_pred = best_xgb.predict(X_test)
report_split(y_test.values, test_pred, split_name="TEST", model_name="XGB")
xgb_test_mae = avg_mae(y_test.values, test_pred)

print(f"\nBaseline TEST avg MAE: {baseline_test_mae:.4f}")
print(f"XGB TEST avg MAE: {xgb_test_mae:.4f}")

if xgb_test_mae < baseline_test_mae:
    print("✅ XGB beats persistence baseline overall on TEST.")
else:
    print("⚠️ XGB does NOT beat persistence baseline overall on TEST.")


===== XGB on TEST =====
temp_next1h:  MAE=0.2347  RMSE=0.3632  R2=0.7952
humi_next1h:  MAE=0.7568  RMSE=0.9473  R2=0.7651
pres_next1h:  MAE=0.2801  RMSE=0.3400  R2=0.9204
Avg MAE: 0.4239

Baseline TEST avg MAE: 0.4916
XGB TEST avg MAE: 0.4239
✅ XGB beats persistence baseline overall on TEST.
