### Prepare feature group with historical weather data for a sensor location

In [1]:
import argparse
import pandas as pd
import hopsworks
from datetime import date
import json
import requests

import math
import os


# Notebook parameters: each can be overridden on the command line and
# otherwise falls back to the default listed here.
parser = argparse.ArgumentParser()
for _flag, _arg_type, _default in (
    ("--latitude", float, 59.3284),
    ("--longitude", float, 18.0664),
    ("--sensor-id", int, 20389),
    ("--sensor-name", str, "malaren_w"),
):
    parser.add_argument(_flag, type=_arg_type, default=_default)

# parse_known_args() ignores unrecognised arguments instead of exiting,
# so extra entries on the command line do not break the run.
args, _ = parser.parse_known_args()

latitude, longitude = args.latitude, args.longitude
sensor_id, sensor_name = args.sensor_id, args.sensor_name

### Step 1: Log in to Hopsworks

The environment variable `HOPSWORKS_API_KEY` must be set before running the next cell.

In [2]:
# The API key is read from the HOPSWORKS_API_KEY environment variable.
# It must never be hard-coded or printed: notebook source and cell outputs
# are routinely committed and shared, which leaks the credential.
# SECURITY: a key was previously embedded in this cell and echoed in its
# output; treat that key as compromised and revoke/rotate it in Hopsworks.
# A missing variable raises KeyError here, before any network call is made.
project = hopsworks.login(
    host="eu-west.cloud.hopsworks.ai",
    port=443,
    project="ml_project",
    api_key_value=os.environ["HOPSWORKS_API_KEY"],
)

[REDACTED: an API key was printed here; it must be revoked and rotated]
2026-01-11 00:32:29,405 INFO: Initializing external client
2026-01-11 00:32:29,406 INFO: Base URL: https://eu-west.cloud.hopsworks.ai:443
2026-01-11 00:32:30,755 INFO: Python Engine initialized.

Logged in to project, explore it here https://eu-west.cloud.hopsworks.ai:443/p/2184


### Step 2: Fetch historical weather data for the water level sensor location

In [4]:
from utils import fetch_daily_weather

# Daily variables requested from Open-Meteo; the same list is reused by
# every fetch in this notebook.
OPEN_METEO_WEATHER_FEATURES = [
    "precipitation_sum",
    "snowfall_sum",
    "rain_sum",
    "temperature_2m_mean",
    "wind_speed_10m_mean",
    "surface_pressure_mean",
]

# Date range of the historical request.
# TODO: remove the hard-coded end date.
history_window = {
    "start_date": "2020-01-01",
    "end_date": "2026-01-10",
}

df_weather = fetch_daily_weather(
    latitude, longitude, OPEN_METEO_WEATHER_FEATURES, **history_window
)
print(df_weather)

            precipitation_sum  snowfall_sum  rain_sum  temperature_2m_mean  \
time                                                                         
2020-01-01                0.0          0.00       0.0                  2.9   
2020-01-02                0.0          0.00       0.0                  5.1   
2020-01-03                0.0          0.00       0.0                  5.1   
2020-01-04                0.0          0.00       0.0                  1.5   
2020-01-05                2.9          1.68       0.5                 -0.4   
...                       ...           ...       ...                  ...   
2026-01-06               10.5          7.28       0.1                 -2.5   
2026-01-07                1.4          0.91       0.1                 -5.4   
2026-01-08                0.7          0.49       0.0                 -0.7   
2026-01-09                0.1          0.07       0.0                 -4.4   
2026-01-10                0.0          0.00       0.0           

In [6]:
from utils import fetch_daily_weather_forecast

# Request the forecast for the sensor location, using the same variables
# as the historical fetch so the two frames can be stacked.
forecast_request = {"forecast_days": 14}
df_weather_forecast = fetch_daily_weather_forecast(
    latitude, longitude, OPEN_METEO_WEATHER_FEATURES, **forecast_request
)
print(df_weather_forecast)

            precipitation_sum  snowfall_sum  rain_sum  temperature_2m_mean  \
time                                                                         
2026-01-10                0.0          0.00       0.0                 -4.9   
2026-01-11                0.6          0.42       0.0                 -2.6   
2026-01-12                0.5          0.35       0.0                 -3.4   
2026-01-13                0.4          0.21       0.1                 -2.7   
2026-01-14                3.7          2.52       0.1                  0.5   
2026-01-15                6.5          3.92       0.9                  1.2   
2026-01-16                0.0          0.00       0.0                  1.6   
2026-01-17                1.2          0.00       1.2                  2.0   
2026-01-18                3.6          0.84       1.2                  1.9   
2026-01-19                3.8          0.00       3.3                  2.0   
2026-01-20                5.8          0.84       3.9           

In [7]:
# Stack history and forecast, then resolve overlapping dates in favour of the
# forecast rows (they come last in the concatenation).
# The de-duplication is done BEFORE sorting on purpose: sort_index() defaults
# to a non-stable quicksort, so sorting first does not guarantee that the
# forecast row is still the "last" one among rows sharing a date.
df_weather_all = pd.concat([df_weather, df_weather_forecast], axis=0)
df_weather_all = df_weather_all[~df_weather_all.index.duplicated(keep="last")]
# The index is unique at this point, so the sort result is deterministic.
df_weather_all = df_weather_all.sort_index()

In [8]:
# Show the combined historical + forecast frame for a visual sanity check.
print(df_weather_all)

            precipitation_sum  snowfall_sum  rain_sum  temperature_2m_mean  \
time                                                                         
2020-01-01                0.0          0.00       0.0                  2.9   
2020-01-02                0.0          0.00       0.0                  5.1   
2020-01-03                0.0          0.00       0.0                  5.1   
2020-01-04                0.0          0.00       0.0                  1.5   
2020-01-05                2.9          1.68       0.5                 -0.4   
...                       ...           ...       ...                  ...   
2026-01-19                3.8          0.00       3.3                  2.0   
2026-01-20                5.8          0.84       3.9                  1.6   
2026-01-21                1.1          0.00       0.6                  1.2   
2026-01-22                0.1          0.00       0.0                 -1.4   
2026-01-23                0.0          0.00       0.0           

### Validate the data and display dates which are missing

In [10]:
from utils import find_missing_dates

# Check the combined frame for missing dates. The helper reports its findings
# itself (see the cell output); its return value is kept in `missing`.
# NOTE(review): the row-based rolling windows computed later rely on a
# gap-free daily index, so a non-empty report here should be resolved first.
missing = find_missing_dates(df_weather_all)

✅ No missing dates found.


### Step 3: Add lagged and aggregated features

In [11]:
def add_lagged_aggregated_features(
    df_weather: pd.DataFrame,
    precip_windows: tuple = (3, 7, 14),
    snow_windows: tuple = (14, 30, 60),
) -> pd.DataFrame:
    """
    Add trailing rolling-sum features for water-level prediction.

    For every window ``w`` in ``precip_windows`` a column ``precip_sum_{w}d``
    is added with the rolling sum of ``precipitation_sum``; for every window
    in ``snow_windows`` a column ``snow_sum_{w}d`` is added with the rolling
    sum of ``snowfall_sum``.

    Notes:
        * Each window covers the current row plus the ``w - 1`` rows before
          it, i.e. it INCLUDES the current day.
        * Windows count rows, not calendar days, so the input is expected to
          have one row per day with no gaps.
        * ``min_periods`` equals the window size, so the first ``w - 1`` rows
          of each feature are NaN.

    Args:
        df_weather: Frame indexed by a ``DatetimeIndex`` that contains the
            columns ``precipitation_sum`` and ``snowfall_sum``.
        precip_windows: Window sizes (in rows) for the precipitation sums.
        snow_windows: Window sizes (in rows) for the snowfall sums.

    Returns:
        A new DataFrame sorted by index with the feature columns appended;
        the frame passed in is left unchanged.

    Raises:
        ValueError: If the index is not a ``DatetimeIndex``.
        KeyError: If a required source column is missing.
    """

    # --- Safety checks ---
    if not isinstance(df_weather.index, pd.DatetimeIndex):
        raise ValueError("df_weather index must be DatetimeIndex")

    # Fail early with one clear message instead of part-way through the work.
    missing_columns = [
        column
        for column in ("precipitation_sum", "snowfall_sum")
        if column not in df_weather.columns
    ]
    if missing_columns:
        raise KeyError(f"df_weather is missing required columns: {missing_columns}")

    # sort_index() returns a new frame, so the columns added below do not
    # touch the caller's DataFrame.
    df = df_weather.sort_index()

    # (feature prefix, source column, window sizes); precipitation first and
    # snowfall second keeps the original column order.
    feature_specs = (
        ("precip_sum", "precipitation_sum", precip_windows),
        ("snow_sum", "snowfall_sum", snow_windows),
    )
    for prefix, source_column, windows in feature_specs:
        for window in windows:
            df[f"{prefix}_{window}d"] = (
                df[source_column]
                .rolling(window=window, min_periods=window)
                .sum()
            )

    return df

In [12]:
# Append the rolling precipitation / snowfall sums to the combined frame.
df_weather_full = add_lagged_aggregated_features(df_weather=df_weather_all)
print(df_weather_full)

            precipitation_sum  snowfall_sum  rain_sum  temperature_2m_mean  \
time                                                                         
2020-01-01                0.0          0.00       0.0                  2.9   
2020-01-02                0.0          0.00       0.0                  5.1   
2020-01-03                0.0          0.00       0.0                  5.1   
2020-01-04                0.0          0.00       0.0                  1.5   
2020-01-05                2.9          1.68       0.5                 -0.4   
...                       ...           ...       ...                  ...   
2026-01-19                3.8          0.00       3.3                  2.0   
2026-01-20                5.8          0.84       3.9                  1.6   
2026-01-21                1.1          0.00       0.6                  1.2   
2026-01-22                0.1          0.00       0.0                 -1.4   
2026-01-23                0.0          0.00       0.0           

### Step 4: Remove rows with NaN

In [13]:
# Discard every row that has a NaN in any column. The rolling features use
# min_periods equal to the window size, so the earliest rows are NaN.
df_weather_full = df_weather_full.dropna(axis=0, how="any")
print(df_weather_full)

# Re-run the date check on the trimmed frame.
missing = find_missing_dates(df_weather_full)

            precipitation_sum  snowfall_sum  rain_sum  temperature_2m_mean  \
time                                                                         
2020-02-29                0.9          0.07       0.8                  0.5   
2020-03-01                9.0          1.19       7.3                  3.0   
2020-03-02                3.8          0.00       3.8                  3.6   
2020-03-03                2.9          0.00       2.9                  3.1   
2020-03-04               16.5          5.25       9.0                  1.7   
...                       ...           ...       ...                  ...   
2026-01-19                3.8          0.00       3.3                  2.0   
2026-01-20                5.8          0.84       3.9                  1.6   
2026-01-21                1.1          0.00       0.6                  1.2   
2026-01-22                0.1          0.00       0.0                 -1.4   
2026-01-23                0.0          0.00       0.0           

### Fetch weather for points 75 km from the sensor location

In [17]:
from utils import fetch_spatial_weather_75km

# Historical weather for the points 75 km around the sensor, requested for
# the same variables and date range as the local history.
spatial_history_request = {
    "latitude": latitude,
    "longitude": longitude,
    "features": OPEN_METEO_WEATHER_FEATURES,
    "start_date": "2020-01-01",
    "end_date": "2026-01-10",
}
df_weather_75km = fetch_spatial_weather_75km(**spatial_history_request)
print(df_weather_75km)

            precipitation_sum_n_75km  snowfall_sum_n_75km  rain_sum_n_75km  \
time                                                                         
2020-01-01                       0.4                 0.00              0.4   
2020-01-02                       0.0                 0.00              0.0   
2020-01-03                       0.1                 0.00              0.1   
2020-01-04                       0.0                 0.00              0.0   
2020-01-05                       2.9                 1.82              0.3   
...                              ...                  ...              ...   
2026-01-06                       9.5                 6.65              0.0   
2026-01-07                       1.7                 1.19              0.0   
2026-01-08                       2.4                 1.68              0.0   
2026-01-09                       0.5                 0.28              0.1   
2026-01-10                       0.0                 0.00       

In [18]:
# Preview the first rows, then list every column of the spatial frame.
for preview in (df_weather_75km.head(), df_weather_75km.columns):
    print(preview)

            precipitation_sum_n_75km  snowfall_sum_n_75km  rain_sum_n_75km  \
time                                                                         
2020-01-01                       0.4                 0.00              0.4   
2020-01-02                       0.0                 0.00              0.0   
2020-01-03                       0.1                 0.00              0.1   
2020-01-04                       0.0                 0.00              0.0   
2020-01-05                       2.9                 1.82              0.3   

            temperature_2m_mean_n_75km  wind_speed_10m_mean_n_75km  \
time                                                                 
2020-01-01                         3.1                        17.3   
2020-01-02                         4.8                        23.2   
2020-01-03                         5.1                        25.4   
2020-01-04                         1.5                        22.5   
2020-01-05                       

In [21]:
from utils import fetch_spatial_weather_forecast_75km

# Forecast for the same 75 km points, using the same variable list.
spatial_forecast_request = {
    "latitude": latitude,
    "longitude": longitude,
    "features": OPEN_METEO_WEATHER_FEATURES,
}
df_weather_75km_forecast = fetch_spatial_weather_forecast_75km(**spatial_forecast_request)
print(df_weather_75km_forecast)

            precipitation_sum_n_75km  snowfall_sum_n_75km  rain_sum_n_75km  \
time                                                                         
2026-01-10                      0.00                 0.00              0.0   
2026-01-11                      0.10                 0.07              0.0   
2026-01-12                      0.20                 0.14              0.0   
2026-01-13                      0.28                 0.12              0.1   
2026-01-14                      2.00                 1.40              0.0   
2026-01-15                      6.50                 4.13              0.6   
2026-01-16                      0.00                 0.00              0.0   
2026-01-17                      0.60                 0.00              0.6   
2026-01-18                      4.80                 0.84              3.6   
2026-01-19                      2.10                 0.35              1.6   
2026-01-20                      5.70                 2.17       

In [22]:
# Stack spatial history and forecast, then resolve overlapping dates in
# favour of the forecast rows (they come last in the concatenation).
# The de-duplication is done BEFORE sorting on purpose: sort_index() defaults
# to a non-stable quicksort, so sorting first does not guarantee that the
# forecast row is still the "last" one among rows sharing a date.
df_weather_75km_forecast_all = pd.concat([df_weather_75km, df_weather_75km_forecast], axis=0)
df_weather_75km_forecast_all = df_weather_75km_forecast_all[
    ~df_weather_75km_forecast_all.index.duplicated(keep="last")
]
# The index is unique at this point, so the sort result is deterministic.
df_weather_75km_forecast_all = df_weather_75km_forecast_all.sort_index()

In [23]:
# Show the combined spatial history + forecast frame for a visual sanity check.
print(df_weather_75km_forecast_all)

            precipitation_sum_n_75km  snowfall_sum_n_75km  rain_sum_n_75km  \
time                                                                         
2020-01-01                       0.4                 0.00              0.4   
2020-01-02                       0.0                 0.00              0.0   
2020-01-03                       0.1                 0.00              0.1   
2020-01-04                       0.0                 0.00              0.0   
2020-01-05                       2.9                 1.82              0.3   
...                              ...                  ...              ...   
2026-01-19                       2.1                 0.35              1.6   
2026-01-20                       5.7                 2.17              2.6   
2026-01-21                       0.0                 0.00              0.0   
2026-01-22                       0.0                 0.00              0.0   
2026-01-23                       0.0                 0.00       

### TODO: Add data validation rules

### Store the data in the Hopsworks feature store

In [24]:
# Feature store handle of the logged-in project; used below to create the
# weather feature group.
fs = project.get_feature_store()

In [25]:
# Combine local and 75 km features on the shared date index. The inner join
# keeps only dates that are present in both frames.
df_all_weather_features = df_weather_full.join(
    other=df_weather_75km_forecast_all,
    how="inner",
)
print(df_all_weather_features)

# Check the joined frame for missing dates.
missing = find_missing_dates(df_all_weather_features)

            precipitation_sum  snowfall_sum  rain_sum  temperature_2m_mean  \
time                                                                         
2020-02-29                0.9          0.07       0.8                  0.5   
2020-03-01                9.0          1.19       7.3                  3.0   
2020-03-02                3.8          0.00       3.8                  3.6   
2020-03-03                2.9          0.00       2.9                  3.1   
2020-03-04               16.5          5.25       9.0                  1.7   
...                       ...           ...       ...                  ...   
2026-01-19                3.8          0.00       3.3                  2.0   
2026-01-20                5.8          0.84       3.9                  1.6   
2026-01-21                1.1          0.00       0.6                  1.2   
2026-01-22                0.1          0.00       0.0                 -1.4   
2026-01-23                0.0          0.00       0.0           

In [26]:
# Preview the first rows, then list every column of the joined frame.
for preview in (df_all_weather_features.head(), df_all_weather_features.columns):
    print(preview)

            precipitation_sum  snowfall_sum  rain_sum  temperature_2m_mean  \
time                                                                         
2020-02-29                0.9          0.07       0.8                  0.5   
2020-03-01                9.0          1.19       7.3                  3.0   
2020-03-02                3.8          0.00       3.8                  3.6   
2020-03-03                2.9          0.00       2.9                  3.1   
2020-03-04               16.5          5.25       9.0                  1.7   

            wind_speed_10m_mean  surface_pressure_mean  precip_sum_3d  \
time                                                                    
2020-02-29                 10.1                  994.1            7.7   
2020-03-01                 16.8                  982.1           10.3   
2020-03-02                 11.6                  991.4           13.7   
2020-03-03                 14.0                 1000.9           15.7   
2020-03-04     

In [27]:
from utils import normalize_date_to_utc_day

# Build the frame that is written to the feature group:
#   * the "time" index becomes an explicit "date" column (the event time),
#   * "sensor_id" (the integer id, e.g. 20389) is added as a key column.
# reset_index() returns a new frame, so df_all_weather_features is untouched.
df_fg = (
    df_all_weather_features
    .reset_index()
    .rename(columns={"time": "date"})
    .assign(sensor_id=sensor_id)
)

# Put the key columns first; the feature columns keep their relative order.
key_columns = ["sensor_id", "date"]
feature_columns = [name for name in df_fg.columns if name not in key_columns]
df_fg = df_fg[key_columns + feature_columns]

df_fg = normalize_date_to_utc_day(df_fg)

In [28]:
# Inspect the first event-time values and their dtype after normalisation.
print(df_fg["date"].head(10))

0   2020-02-29 00:00:00+00:00
1   2020-03-01 00:00:00+00:00
2   2020-03-02 00:00:00+00:00
3   2020-03-03 00:00:00+00:00
4   2020-03-04 00:00:00+00:00
5   2020-03-05 00:00:00+00:00
6   2020-03-06 00:00:00+00:00
7   2020-03-07 00:00:00+00:00
8   2020-03-08 00:00:00+00:00
9   2020-03-09 00:00:00+00:00
Name: date, dtype: datetime64[ns, UTC]


In [29]:
# One feature group per sensor, keyed by (sensor_id, date) with "date" as
# the event time.
weather_fg_name = f"weather_features_{sensor_name}_{sensor_id}"
weather_fg_description = (
    "Daily weather features for water level prediction, including local conditions, "
    "lagged aggregates, and spatial features at 75 km in cardinal directions."
)

weather_fg = fs.get_or_create_feature_group(
    name=weather_fg_name,
    description=weather_fg_description,
    version=4,
    primary_key=["sensor_id", "date"],
    event_time="date",
)

In [30]:
# Write the prepared rows to the feature group.
weather_fg.insert(df_fg)


2026-01-11 00:32:54,565 INFO: Computing insert statistics


(None, None)

In [31]:
# Feature descriptions are collected in one mapping and applied in a single
# loop. Insertion order matches the order in which the features were
# previously described: identifiers, local weather, rolling sums, spatial.
feature_descriptions = {
    # Core identifiers
    "date": "Date of the weather observation (daily resolution, UTC)",
    "sensor_id": "Unique identifier of the water level sensor / location",
    # Local weather
    "precipitation_sum": "Total daily precipitation at the sensor location (mm/day)",
    "snowfall_sum": "Total daily snowfall at the sensor location (cm/day)",
    "rain_sum": "Total daily rainfall at the sensor location (mm/day)",
    "temperature_2m_mean": "Mean daily air temperature at 2 meters above ground (°C)",
    "wind_speed_10m_mean": "Mean daily wind speed at 10 meters above ground (km/h)",
    "surface_pressure_mean": "Mean daily surface air pressure (hPa)",
}

# Rolling local sums. The windows built in add_lagged_aggregated_features
# include the current day, so the text says "ending on this date (inclusive)"
# rather than "previous N days".
for window in (3, 7, 14):
    feature_descriptions[f"precip_sum_{window}d"] = (
        f"Total precipitation accumulated over the trailing {window}-day window "
        "ending on this date (inclusive) (mm)"
    )
for window in (14, 30, 60):
    feature_descriptions[f"snow_sum_{window}d"] = (
        f"Total snowfall accumulated over the trailing {window}-day window "
        "ending on this date (inclusive) (cm)"
    )

# Spatial features (75 km). The one-letter code in the column name is
# spelled out in the description so it reads "north of the sensor".
spatial_feature_labels = {
    "precipitation_sum": "Total daily precipitation (mm)",
    "snowfall_sum": "Total daily snowfall (cm)",
    "rain_sum": "Total daily rainfall (mm)",
    "temperature_2m_mean": "Mean daily air temperature (°C)",
    "wind_speed_10m_mean": "Mean daily wind speed (km/h)",
    "surface_pressure_mean": "Mean daily surface air pressure (hPa)",
}
direction_names = {"n": "north", "s": "south", "e": "east", "w": "west"}
for direction, direction_name in direction_names.items():
    for feature, label in spatial_feature_labels.items():
        feature_descriptions[f"{feature}_{direction}_75km"] = (
            f"{label} measured {direction_name} of the sensor at approximately 75 km distance"
        )

for feature_name, description in feature_descriptions.items():
    weather_fg.update_feature_description(feature_name, description)
