### Prepare feature group with historical weather data for a sensor location

In [245]:
import argparse
import pandas as pd
import hopsworks
from datetime import date
import json
import requests

import math
import os


# Run configuration.
#
# Every value that identifies the sensor is exposed as a command-line option so
# the notebook can be executed for another sensor without editing the code.
# The defaults describe the `storsjon_jamtland` sensor (station 20048), so a
# run without arguments behaves exactly as before. Previously the parsed
# arguments were overwritten by hard-coded assignments right after parsing,
# which silently discarded anything passed on the command line.
parser = argparse.ArgumentParser()
parser.add_argument("--latitude", type=float, default=63.2983)
parser.add_argument("--longitude", type=float, default=14.4601)
parser.add_argument("--sensor-id", type=int, default=20048)
parser.add_argument("--sensor-name", type=str, default="storsjon_jamtland")
parser.add_argument("--weather-fg-version", type=int, default=1)

# parse_known_args() tolerates the extra arguments that a Jupyter kernel puts
# in sys.argv; unknown arguments are returned separately and ignored here.
args, _ = parser.parse_known_args()

latitude = args.latitude
longitude = args.longitude
sensor_id = args.sensor_id
sensor_name = args.sensor_name
weather_fg_version = args.weather_fg_version

# Versions that are not (yet) configurable from the command line.
water_level_fg_version = 1
fv_version = 1
model_version = 1

# Derived from sensor_id so the URL always points at the configured station.
sensor_url = f"https://opendata-download-hydroobs.smhi.se/api/version/1.0/parameter/3/station/{sensor_id}/period/corrected-archive/data.json"

import pkgutil
import importlib.metadata as md  # py3.8+
# Print the installed version of the `hopsworks` package for this run.
print(md.version("hopsworks"))

4.6.0


### Step 1: Log in to Hopsworks

The environment variable `HOPSWORKS_API_KEY` must be set before running this cell; the cell raises an error otherwise.

In [223]:
# The API key is read from the environment so that it never has to appear in
# the notebook source; fail early with a clear message when it is absent or empty.
api_key = os.getenv("HOPSWORKS_API_KEY")
if api_key is None or api_key == "":
    raise RuntimeError("HOPSWORKS_API_KEY is not set")

# Open a session against the hosted Hopsworks instance and select the project.
project = hopsworks.login(
    api_key_value=api_key,
    project="ml_project",
    host="eu-west.cloud.hopsworks.ai",
    port=443,
)

[REDACTED: a Hopsworks API key was printed in this cell output. Revoke and rotate the key, and clear the output before sharing the notebook.]
2026-01-11 18:31:40,660 INFO: Closing external client and cleaning up certificates.
2026-01-11 18:31:40,663 INFO: Connection closed.
2026-01-11 18:31:40,664 INFO: Initializing external client
2026-01-11 18:31:40,665 INFO: Base URL: https://eu-west.cloud.hopsworks.ai:443
2026-01-11 18:31:42,301 INFO: Python Engine initialized.

Logged in to project, explore it here https://eu-west.cloud.hopsworks.ai:443/p/2184


### Step 2: Fetch historical weather data for the water level sensor location

In [224]:
from utils import fetch_daily_weather

# Daily weather variables to request. The same list is passed to the forecast
# fetch and to both 75 km spatial fetches further down in this notebook.
# NOTE(review): the constant's name suggests these are Open-Meteo variable
# names — confirm against utils.
OPEN_METEO_WEATHER_FEATURES = [
    "precipitation_sum",
    "snowfall_sum",
    "rain_sum",
    "temperature_2m_mean",
    "wind_speed_10m_mean",
    "surface_pressure_mean",
]

# Daily weather at the sensor coordinates, starting 2020-01-01. No end date is
# passed, so the end of the range is whatever the helper chooses by default.
df_weather = fetch_daily_weather(
    latitude,
    longitude,
    OPEN_METEO_WEATHER_FEATURES,
    start_date="2020-01-01",
)
print(df_weather)

            precipitation_sum  snowfall_sum  rain_sum  temperature_2m_mean  \
time                                                                         
2020-01-01                6.2          0.21       5.9                  3.2   
2020-01-02                0.2          0.00       0.2                  3.1   
2020-01-03                3.4          1.61       1.2                 -0.5   
2020-01-04                0.7          0.49       0.0                 -1.5   
2020-01-05                1.8          1.33       0.0                 -1.9   
...                       ...           ...       ...                  ...   
2026-01-07                2.8          1.96       0.0                -11.7   
2026-01-08                6.0          4.20       0.0                 -7.1   
2026-01-09                0.4          0.28       0.0                 -7.1   
2026-01-10                0.0          0.00       0.0                -12.3   
2026-01-11                0.0          0.00       0.0           

In [225]:
from utils import fetch_daily_weather_forecast

# Forecast of the same daily variables at the sensor coordinates,
# requesting a 14-day horizon.
df_weather_forecast = fetch_daily_weather_forecast(
    latitude,
    longitude,
    OPEN_METEO_WEATHER_FEATURES,
    forecast_days=14,
)
print(df_weather_forecast)

            precipitation_sum  snowfall_sum  rain_sum  temperature_2m_mean  \
time                                                                         
2026-01-11                0.0          0.00       0.0                -19.1   
2026-01-12                0.0          0.00       0.0                -15.7   
2026-01-13                0.2          0.21       0.0                 -7.3   
2026-01-14                0.7          0.49       0.0                 -3.2   
2026-01-15                0.0          0.00       0.0                 -2.5   
2026-01-16                1.5          1.05       0.0                 -0.2   
2026-01-17                0.6          0.42       0.0                 -0.1   
2026-01-18                3.4          2.38       0.0                 -0.8   
2026-01-19                0.2          0.14       0.0                  0.1   
2026-01-20                0.5          0.00       0.5                  1.2   
2026-01-21                0.1          0.00       0.1           

In [226]:
# Combine history and forecast into one frame indexed by date.
#
# Where both frames contain the same date, the forecast row wins. Duplicates
# are resolved on the concatenation order (history first, forecast last)
# *before* sorting, so the result does not rely on sort_index() preserving the
# relative order of rows with equal index values (its default algorithm is not
# a stable sort).
df_weather_all = pd.concat([df_weather, df_weather_forecast], axis=0)
df_weather_all = df_weather_all[
    ~df_weather_all.index.duplicated(keep="last")
].sort_index()

In [227]:
# Inspect the combined history + forecast frame.
print(df_weather_all)

            precipitation_sum  snowfall_sum  rain_sum  temperature_2m_mean  \
time                                                                         
2020-01-01                6.2          0.21       5.9                  3.2   
2020-01-02                0.2          0.00       0.2                  3.1   
2020-01-03                3.4          1.61       1.2                 -0.5   
2020-01-04                0.7          0.49       0.0                 -1.5   
2020-01-05                1.8          1.33       0.0                 -1.9   
...                       ...           ...       ...                  ...   
2026-01-20                0.5          0.00       0.5                  1.2   
2026-01-21                0.1          0.00       0.1                  1.8   
2026-01-22                0.0          0.00       0.0                 -3.0   
2026-01-23                0.0          0.00       0.0                 -7.6   
2026-01-24                0.0          0.00       0.0           

### Validate the data and display dates which are missing

In [228]:
from utils import find_missing_dates

# Check the combined frame for missing dates. The helper reports its finding
# itself (see the cell output).
# NOTE(review): the exact type of the returned value is not visible here — confirm in utils.
missing = find_missing_dates(df_weather_all)

✅ No missing dates found.


### Step 3: Add lagged and aggregate features

In [229]:
def add_lagged_aggregated_features(
    df_weather: pd.DataFrame,
    precip_windows=(3, 7, 14),
    snow_windows=(14, 30, 60),
) -> pd.DataFrame:
    """
    Add trailing rolling-sum features for water-level prediction.

    For every window length ``w`` a new column holds the sum of the source
    column over the ``w`` most recent rows, *including the current row*.
    Windows are counted in rows, so the frame is expected to contain one row
    per day without gaps. Rows that do not yet have a full window get NaN.

    Columns added (in this order):
      * ``precip_sum_{w}d`` from ``precipitation_sum`` for each ``w`` in
        ``precip_windows``
      * ``snow_sum_{w}d`` from ``snowfall_sum`` for each ``w`` in
        ``snow_windows``

    Parameters
    ----------
    df_weather : pd.DataFrame
        Daily weather data indexed by a ``DatetimeIndex``. It is not modified.
    precip_windows : iterable of int, optional
        Window lengths for the precipitation sums. Defaults to (3, 7, 14).
    snow_windows : iterable of int, optional
        Window lengths for the snowfall sums. Defaults to (14, 30, 60).

    Returns
    -------
    pd.DataFrame
        A copy of ``df_weather`` sorted by index with the new columns appended.

    Raises
    ------
    ValueError
        If the index is not a ``DatetimeIndex``.
    KeyError
        If a source column needed for a requested window is missing.
    """

    # --- Safety checks ---
    if not isinstance(df_weather.index, pd.DatetimeIndex):
        raise ValueError("df_weather index must be DatetimeIndex")

    # (source column, output prefix, window lengths)
    rolling_specs = (
        ("precipitation_sum", "precip_sum", tuple(precip_windows)),
        ("snowfall_sum", "snow_sum", tuple(snow_windows)),
    )

    missing_columns = [
        source_column
        for source_column, _, windows in rolling_specs
        if windows and source_column not in df_weather.columns
    ]
    if missing_columns:
        raise KeyError(
            f"df_weather is missing required column(s): {missing_columns}"
        )

    # sort_index() returns a new frame, so the caller's frame stays untouched.
    df = df_weather.sort_index()

    for source_column, prefix, windows in rolling_specs:
        for window in windows:
            # min_periods == window: no partial sums at the start of the series.
            df[f"{prefix}_{window}d"] = (
                df[source_column]
                .rolling(window=window, min_periods=window)
                .sum()
            )

    return df

In [230]:
# Add the rolling precipitation and snowfall sums to the combined
# history + forecast frame.
df_weather_full = add_lagged_aggregated_features(
    df_weather_all,
)
print(df_weather_full)

            precipitation_sum  snowfall_sum  rain_sum  temperature_2m_mean  \
time                                                                         
2020-01-01                6.2          0.21       5.9                  3.2   
2020-01-02                0.2          0.00       0.2                  3.1   
2020-01-03                3.4          1.61       1.2                 -0.5   
2020-01-04                0.7          0.49       0.0                 -1.5   
2020-01-05                1.8          1.33       0.0                 -1.9   
...                       ...           ...       ...                  ...   
2026-01-20                0.5          0.00       0.5                  1.2   
2026-01-21                0.1          0.00       0.1                  1.8   
2026-01-22                0.0          0.00       0.0                 -3.0   
2026-01-23                0.0          0.00       0.0                 -7.6   
2026-01-24                0.0          0.00       0.0           

### Step 4: Remove rows with NaN

In [231]:
# Drop every row that has a NaN in any column. The rolling sums require a full
# window (min_periods equals the window length), so the earliest rows — up to
# the longest window of 60 days — have no value yet and are removed here.
df_weather_full = df_weather_full.dropna()
print(df_weather_full)

# Re-run the missing-date check on the remaining rows.
missing = find_missing_dates(df_weather_full)

            precipitation_sum  snowfall_sum  rain_sum  temperature_2m_mean  \
time                                                                         
2020-02-29                0.4          0.28       0.0                 -6.5   
2020-03-01                3.1          2.17       0.0                 -9.2   
2020-03-02               12.7          8.89       0.0                 -7.6   
2020-03-03                2.9          2.03       0.0                 -6.2   
2020-03-04                7.7          5.32       0.1                 -4.5   
...                       ...           ...       ...                  ...   
2026-01-20                0.5          0.00       0.5                  1.2   
2026-01-21                0.1          0.00       0.1                  1.8   
2026-01-22                0.0          0.00       0.0                 -3.0   
2026-01-23                0.0          0.00       0.0                 -7.6   
2026-01-24                0.0          0.00       0.0           

### Fetch weather for points about 75 km away from the sensor location

In [232]:
from utils import fetch_spatial_weather_75km

# End the historical window at yesterday instead of a hard-coded calendar date.
# A fixed date goes stale: on any later run it leaves a growing gap between the
# end of the history and the start of the forecast that is appended afterwards.
history_end_date = (
    pd.Timestamp.today().normalize() - pd.Timedelta(days=1)
).strftime("%Y-%m-%d")

# Same daily variables as for the sensor location, fetched for the surrounding
# 75 km points; the result is already indexed by `time`.
df_weather_75km = fetch_spatial_weather_75km(
    latitude=latitude,
    longitude=longitude,
    features=OPEN_METEO_WEATHER_FEATURES,
    start_date="2020-01-01",
    end_date=history_end_date,
)
print(df_weather_75km)

            precipitation_sum_n_75km  snowfall_sum_n_75km  rain_sum_n_75km  \
time                                                                         
2020-01-01                      17.7                 6.23              8.8   
2020-01-02                       4.6                 0.28              4.2   
2020-01-03                       4.2                 2.94              0.0   
2020-01-04                       0.7                 0.56              0.0   
2020-01-05                       4.5                 3.15              0.0   
...                              ...                  ...              ...   
2026-01-06                       0.2                 0.14              0.0   
2026-01-07                       1.6                 1.12              0.0   
2026-01-08                      14.6                10.22              0.0   
2026-01-09                       2.3                 1.61              0.0   
2026-01-10                       1.5                 1.19       

In [233]:
# Show the first rows and the full column list of the spatial history frame.
print(df_weather_75km.head())
print(df_weather_75km.columns)

            precipitation_sum_n_75km  snowfall_sum_n_75km  rain_sum_n_75km  \
time                                                                         
2020-01-01                      17.7                 6.23              8.8   
2020-01-02                       4.6                 0.28              4.2   
2020-01-03                       4.2                 2.94              0.0   
2020-01-04                       0.7                 0.56              0.0   
2020-01-05                       4.5                 3.15              0.0   

            temperature_2m_mean_n_75km  wind_speed_10m_mean_n_75km  \
time                                                                 
2020-01-01                         2.8                        25.5   
2020-01-02                         1.6                        10.3   
2020-01-03                        -2.5                        19.9   
2020-01-04                        -3.6                        24.9   
2020-01-05                       

In [234]:
from utils import fetch_spatial_weather_forecast_75km

# Forecast of the same daily variables for the surrounding 75 km points.
# No horizon is passed, so the helper's default forecast length applies.
df_weather_75km_forecast = fetch_spatial_weather_forecast_75km(
    latitude=latitude,
    longitude=longitude,
    features=OPEN_METEO_WEATHER_FEATURES
)
print(df_weather_75km_forecast)

            precipitation_sum_n_75km  snowfall_sum_n_75km  rain_sum_n_75km  \
time                                                                         
2026-01-11                       0.0                 0.00              0.0   
2026-01-12                       1.1                 0.77              0.0   
2026-01-13                       1.2                 0.84              0.0   
2026-01-14                       5.1                 3.50              0.1   
2026-01-15                       5.1                 3.78              0.0   
2026-01-16                       5.6                 3.92              0.0   
2026-01-17                       1.4                 0.98              0.0   
2026-01-18                       8.3                 5.81              0.0   
2026-01-19                       2.4                 1.26              0.6   
2026-01-20                       2.1                 1.47              0.0   
2026-01-21                       0.4                 0.28       

In [235]:
# Combine spatial history and forecast into one frame indexed by date.
#
# Where both frames contain the same date, the forecast row wins. Duplicates
# are resolved on the concatenation order (history first, forecast last)
# *before* sorting, so the result does not rely on sort_index() preserving the
# relative order of rows with equal index values (its default algorithm is not
# a stable sort).
df_weather_75km_forecast_all = pd.concat(
    [df_weather_75km, df_weather_75km_forecast], axis=0
)
df_weather_75km_forecast_all = df_weather_75km_forecast_all[
    ~df_weather_75km_forecast_all.index.duplicated(keep="last")
].sort_index()

In [236]:
# Inspect the combined spatial history + forecast frame.
print(df_weather_75km_forecast_all)

            precipitation_sum_n_75km  snowfall_sum_n_75km  rain_sum_n_75km  \
time                                                                         
2020-01-01                      17.7                 6.23              8.8   
2020-01-02                       4.6                 0.28              4.2   
2020-01-03                       4.2                 2.94              0.0   
2020-01-04                       0.7                 0.56              0.0   
2020-01-05                       4.5                 3.15              0.0   
...                              ...                  ...              ...   
2026-01-20                       2.1                 1.47              0.0   
2026-01-21                       0.4                 0.28              0.0   
2026-01-22                       0.0                 0.00              0.0   
2026-01-23                       0.0                 0.00              0.0   
2026-01-24                       0.0                 0.00       

### TODO: Add data validation rules

### Store the data in hopsworks feature store

In [237]:
# Handle to the feature store of the project opened at login.
fs = project.get_feature_store()

In [238]:
# Join the local features with the 75 km spatial features on the date index.
# The inner join keeps only dates that are present in both frames.
df_all_weather_features = df_weather_full.join(df_weather_75km_forecast_all, how="inner")
print(df_all_weather_features)

# Re-run the missing-date check on the joined frame.
missing = find_missing_dates(df_all_weather_features)

            precipitation_sum  snowfall_sum  rain_sum  temperature_2m_mean  \
time                                                                         
2020-02-29                0.4          0.28       0.0                 -6.5   
2020-03-01                3.1          2.17       0.0                 -9.2   
2020-03-02               12.7          8.89       0.0                 -7.6   
2020-03-03                2.9          2.03       0.0                 -6.2   
2020-03-04                7.7          5.32       0.1                 -4.5   
...                       ...           ...       ...                  ...   
2026-01-20                0.5          0.00       0.5                  1.2   
2026-01-21                0.1          0.00       0.1                  1.8   
2026-01-22                0.0          0.00       0.0                 -3.0   
2026-01-23                0.0          0.00       0.0                 -7.6   
2026-01-24                0.0          0.00       0.0           

In [239]:
# Show the first rows and the full column list of the joined feature frame.
print(df_all_weather_features.head())
print(df_all_weather_features.columns)

            precipitation_sum  snowfall_sum  rain_sum  temperature_2m_mean  \
time                                                                         
2020-02-29                0.4          0.28       0.0                 -6.5   
2020-03-01                3.1          2.17       0.0                 -9.2   
2020-03-02               12.7          8.89       0.0                 -7.6   
2020-03-03                2.9          2.03       0.0                 -6.2   
2020-03-04                7.7          5.32       0.1                 -4.5   

            wind_speed_10m_mean  surface_pressure_mean  precip_sum_3d  \
time                                                                    
2020-02-29                  9.1                  960.3            0.4   
2020-03-01                 10.6                  953.1            3.5   
2020-03-02                 15.2                  959.4           16.2   
2020-03-03                 11.6                  976.0           18.7   
2020-03-04     

In [240]:
from utils import normalize_date_to_utc_day

# Turn the `time` index into an explicit `date` column (the event time) and
# tag every row with the numeric sensor id; together they form the primary key.
df_fg = (
    df_all_weather_features
    .reset_index()
    .rename(columns={"time": "date"})
    .assign(sensor_id=sensor_id)
)

# Key columns first; the feature columns keep their existing order.
df_fg = df_fg[
    ["sensor_id", "date"]
    + [column for column in df_fg.columns if column not in ("sensor_id", "date")]
]

df_fg = normalize_date_to_utc_day(df_fg)

In [241]:
# Spot-check the `date` column after normalization.
print(df_fg["date"].head(10))

0   2020-02-29 00:00:00+00:00
1   2020-03-01 00:00:00+00:00
2   2020-03-02 00:00:00+00:00
3   2020-03-03 00:00:00+00:00
4   2020-03-04 00:00:00+00:00
5   2020-03-05 00:00:00+00:00
6   2020-03-06 00:00:00+00:00
7   2020-03-07 00:00:00+00:00
8   2020-03-08 00:00:00+00:00
9   2020-03-09 00:00:00+00:00
Name: date, dtype: datetime64[ns, UTC]


In [242]:
# Fetch the feature group for this sensor, or create it if it does not exist.
# The group name embeds the sensor name and id; rows are keyed by
# (sensor_id, date), with `date` also serving as the event time.
weather_fg = fs.get_or_create_feature_group(
    name=f"weather_features_{sensor_name}_{sensor_id}",
    description=(
        "Daily weather features for water level prediction, including "
        "local conditions, lagged aggregates, and spatial features at 75 km "
        "in cardinal directions."
    ),
    version=weather_fg_version,
    primary_key=["sensor_id", "date"],
    event_time="date",
)

In [243]:
# Write the prepared rows to the feature group.
weather_fg.insert(df_fg)

Feature Group created successfully, explore it at 
https://eu-west.cloud.hopsworks.ai:443/p/2184/fs/2136/fg/2331

2026-01-11 18:32:01,933 INFO: Computing insert statistics


(None, None)

In [244]:
# Attach a human-readable description to every feature in the group.
#
# The descriptions are kept in tables instead of one call per feature so a
# wording or unit change is made in exactly one place. Two corrections compared
# with the earlier per-feature calls:
#   * the rolling sums are computed over a trailing window that ends on, and
#     includes, the current day, so they are no longer described as covering
#     only the "previous" days;
#   * spatial descriptions spell out the direction ("north") instead of
#     embedding the one-letter column suffix ("n") in the sentence.

# Identifiers, local weather and local rolling aggregates.
LOCAL_FEATURE_DESCRIPTIONS = {
    # Core identifiers
    "date": "Date of the weather observation (daily resolution, UTC)",
    "sensor_id": "Unique identifier of the water level sensor / location",
    # Local weather
    "precipitation_sum": "Total daily precipitation at the sensor location (mm/day)",
    "snowfall_sum": "Total daily snowfall at the sensor location (cm/day)",
    "rain_sum": "Total daily rainfall at the sensor location (mm/day)",
    "temperature_2m_mean": "Mean daily air temperature at 2 meters above ground (°C)",
    "wind_speed_10m_mean": "Mean daily wind speed at 10 meters above ground (km/h)",
    "surface_pressure_mean": "Mean daily surface air pressure (hPa)",
    # Rolling aggregates (window includes the current day)
    "precip_sum_3d": "Total precipitation accumulated over the trailing 3 days, including the current day (mm)",
    "precip_sum_7d": "Total precipitation accumulated over the trailing 7 days, including the current day (mm)",
    "precip_sum_14d": "Total precipitation accumulated over the trailing 14 days, including the current day (mm)",
    "snow_sum_14d": "Total snowfall accumulated over the trailing 14 days, including the current day (cm)",
    "snow_sum_30d": "Total snowfall accumulated over the trailing 30 days, including the current day (cm)",
    "snow_sum_60d": "Total snowfall accumulated over the trailing 60 days, including the current day (cm)",
}

# Base feature name -> leading part of the description for the spatial copies.
SPATIAL_FEATURE_QUANTITIES = {
    "precipitation_sum": "Total daily precipitation (mm)",
    "snowfall_sum": "Total daily snowfall (cm)",
    "rain_sum": "Total daily rainfall (mm)",
    "temperature_2m_mean": "Mean daily air temperature (°C)",
    "wind_speed_10m_mean": "Mean daily wind speed (km/h)",
    "surface_pressure_mean": "Mean daily surface air pressure (hPa)",
}

# Column suffix -> direction name used in the description text.
DIRECTION_NAMES = {"n": "north", "s": "south", "e": "east", "w": "west"}

for feature_name, description in LOCAL_FEATURE_DESCRIPTIONS.items():
    weather_fg.update_feature_description(feature_name, description)

# Spatial features (75 km): one column per base feature and direction.
for direction, direction_name in DIRECTION_NAMES.items():
    for base_feature, quantity in SPATIAL_FEATURE_QUANTITIES.items():
        weather_fg.update_feature_description(
            f"{base_feature}_{direction}_75km",
            f"{quantity} measured {direction_name} of the sensor at approximately 75 km distance",
        )
