### Prepare feature group with historical weather data for a sensor location

In [1]:
import argparse
import pandas as pd
import hopsworks
from datetime import date
import json
import requests

import math
import os


# Command-line interface for the notebook/script: sensor coordinates plus the
# identifiers used to name and key the feature group. Every option has a
# default, so the cell also runs when no arguments are supplied.
parser = argparse.ArgumentParser()
for _flag, _type, _default in (
    ("--latitude", float, 59.3284),
    ("--longitude", float, 18.0664),
    ("--sensor-id", int, 20389),
    ("--sensor-name", str, "malaren_w"),
):
    parser.add_argument(_flag, type=_type, default=_default)

# parse_known_args() hands back unrecognised arguments instead of exiting, so
# extra argv entries (for example ones added by a notebook launcher) are ignored.
args, _ = parser.parse_known_args()

latitude, longitude = args.latitude, args.longitude
sensor_id, sensor_name = args.sensor_id, args.sensor_name

### Step 1: Login to hopsworks

The environment variable `HOPSWORKS_API_KEY` must be set before running the next cell.

In [2]:
# Authenticate against the Hopsworks cluster.
#
# The API key is read from the HOPSWORKS_API_KEY environment variable so that
# no credential is stored in the notebook source. The key is deliberately not
# printed: anything written to cell output is saved with the notebook.
# A missing variable raises KeyError here, before any network call is made.
# NOTE(review): the key that was previously hardcoded in this cell should be
# treated as compromised and rotated.
hopsworks_api_key = os.environ["HOPSWORKS_API_KEY"]

project = hopsworks.login(
    host="eu-west.cloud.hopsworks.ai",
    port=443,
    project="ml_project",
    api_key_value=hopsworks_api_key,
)

[REDACTED: API key removed from saved cell output]
2026-01-06 15:48:43,797 INFO: Initializing external client
2026-01-06 15:48:43,798 INFO: Base URL: https://eu-west.cloud.hopsworks.ai:443
2026-01-06 15:48:45,491 INFO: Python Engine initialized.

Logged in to project, explore it here https://eu-west.cloud.hopsworks.ai:443/p/2184


### Step 2: Fetch historical weather data for the water level sensor location

In [48]:
def fetch_daily_weather(
    latitude: float,
    longitude: float,
    start_date: str = "2020-01-01",
    end_date: str | None = None,
    timezone: str = "UTC",
) -> pd.DataFrame:
    """
    Fetch daily historical weather data from Open-Meteo
    and return it as a pandas DataFrame.
    """

    if end_date is None:
        end_date = date.today().isoformat()

    url = "https://archive-api.open-meteo.com/v1/archive"

    params = {
        "latitude": latitude,
        "longitude": longitude,
        "start_date": start_date,
        "end_date": end_date,
        "daily": ",".join([
            "precipitation_sum",
            "snowfall_sum",
            "rain_sum",
            "temperature_2m_mean",
            "wind_speed_10m_mean",
            "surface_pressure_mean",
        ]),
        "timezone": timezone,
    }

    response = requests.get(url, params=params, timeout=30)
    response.raise_for_status()

    data = response.json()
    daily = data["daily"]

    df = pd.DataFrame(daily)
    df["time"] = pd.to_datetime(df["time"])
    df = df.set_index("time").sort_index()

    return df

In [49]:
# Download daily weather at the sensor coordinates, from the function's default
# start date up to today.
df_weather = fetch_daily_weather(latitude, longitude)
print(df_weather)

            precipitation_sum  snowfall_sum  rain_sum  temperature_2m_mean  \
time                                                                         
2020-01-01                0.0          0.00       0.0                  2.9   
2020-01-02                0.0          0.00       0.0                  5.1   
2020-01-03                0.0          0.00       0.0                  5.1   
2020-01-04                0.0          0.00       0.0                  1.5   
2020-01-05                2.9          1.68       0.5                 -0.4   
...                       ...           ...       ...                  ...   
2026-01-02                9.1          6.16       0.3                 -1.4   
2026-01-03                5.8          4.06       0.0                 -2.2   
2026-01-04                0.0          0.00       0.0                 -5.4   
2026-01-05                4.6          3.22       0.0                 -7.6   
2026-01-06               11.7          8.12       0.1           

### Validate the data and display dates which are missing

In [50]:
def find_missing_dates(
    df: pd.DataFrame,
    freq: str = "D",
    verbose: bool = True
) -> pd.DatetimeIndex:
    """
    Check for missing dates in a time-indexed DataFrame.

    The expected timeline is the regular range from the earliest to the
    latest index value at the given frequency; every expected timestamp that
    is absent from the index is reported.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame with a DatetimeIndex.
    freq : str
        Expected frequency (default: daily 'D').
    verbose : bool
        Whether to print missing dates.

    Returns
    -------
    pd.DatetimeIndex
        Missing dates. Empty when the frame has no gaps or no rows at all.

    Raises
    ------
    ValueError
        If the DataFrame index is not a DatetimeIndex.
    """

    if not isinstance(df.index, pd.DatetimeIndex):
        raise ValueError("DataFrame index must be a DatetimeIndex")

    # With no rows, min()/max() of the index are NaT and pd.date_range would
    # raise; there is no timeline to check, so report no gaps.
    if len(df.index) == 0:
        if verbose:
            print("⚠️ DataFrame is empty; no dates to check.")
        return df.index[:0]

    full_range = pd.date_range(
        start=df.index.min(),
        end=df.index.max(),
        freq=freq
    )

    missing_dates = full_range.difference(df.index)

    if verbose:
        if len(missing_dates) == 0:
            print("✅ No missing dates found.")
        else:
            print(f"⚠️ Missing {len(missing_dates)} dates:")
            for d in missing_dates:
                print(d.date())

    return missing_dates

In [51]:
# Check the fetched series for gaps in its daily date index.
missing = find_missing_dates(df_weather)

✅ No missing dates found.


### Step 3: Add lagged and aggregate features

In [52]:
def add_lagged_aggregated_features(
    df_weather: pd.DataFrame
) -> pd.DataFrame:
    """
    Add rolling precipitation and snowfall totals for water-level prediction.

    Each new column is a rolling sum over the given number of rows, ending on
    and including the row's own date. A value is produced only once the window
    is completely filled (``min_periods`` equals the window size); earlier
    rows are NaN.

    Added columns: ``precip_sum_3d``, ``precip_sum_7d``, ``precip_sum_14d``
    (from ``precipitation_sum``) and ``snow_sum_14d``, ``snow_sum_30d``,
    ``snow_sum_60d`` (from ``snowfall_sum``).

    Returns a new DataFrame sorted by date; the input frame is not modified.

    Raises ValueError if the index is not a DatetimeIndex.
    """
    if not isinstance(df_weather.index, pd.DatetimeIndex):
        raise ValueError("df_weather index must be DatetimeIndex")

    # sort_index() returns a new frame, so the columns added below do not
    # touch the caller's DataFrame.
    features = df_weather.sort_index()

    # (output column prefix, source column, window lengths in rows)
    rolling_specs = (
        ("precip_sum", "precipitation_sum", (3, 7, 14)),
        ("snow_sum", "snowfall_sum", (14, 30, 60)),
    )

    for prefix, source_column, window_sizes in rolling_specs:
        source = features[source_column]
        for days in window_sizes:
            features[f"{prefix}_{days}d"] = (
                source.rolling(window=days, min_periods=days).sum()
            )

    return features

In [53]:
# Extend the local weather frame with the rolling precipitation and snowfall sums.
df_weather_full = add_lagged_aggregated_features(
    df_weather,
)
print(df_weather_full)

            precipitation_sum  snowfall_sum  rain_sum  temperature_2m_mean  \
time                                                                         
2020-01-01                0.0          0.00       0.0                  2.9   
2020-01-02                0.0          0.00       0.0                  5.1   
2020-01-03                0.0          0.00       0.0                  5.1   
2020-01-04                0.0          0.00       0.0                  1.5   
2020-01-05                2.9          1.68       0.5                 -0.4   
...                       ...           ...       ...                  ...   
2026-01-02                9.1          6.16       0.3                 -1.4   
2026-01-03                5.8          4.06       0.0                 -2.2   
2026-01-04                0.0          0.00       0.0                 -5.4   
2026-01-05                4.6          3.22       0.0                 -7.6   
2026-01-06               11.7          8.12       0.1           

### Step 4: Remove rows with NaN

In [54]:
# Drop every row that has a NaN in any column. The rolling sums are NaN until
# their window is full, so the leading rows (up to the 60-day snow window) go.
df_weather_full = df_weather_full.dropna()
print(df_weather_full)

# Re-check for gaps: dropping rows could leave holes inside the date range.
missing = find_missing_dates(df_weather_full)

            precipitation_sum  snowfall_sum  rain_sum  temperature_2m_mean  \
time                                                                         
2020-02-29                0.9          0.07       0.8                  0.5   
2020-03-01                9.0          1.19       7.3                  3.0   
2020-03-02                3.8          0.00       3.8                  3.6   
2020-03-03                2.9          0.00       2.9                  3.1   
2020-03-04               16.5          5.25       9.0                  1.7   
...                       ...           ...       ...                  ...   
2026-01-02                9.1          6.16       0.3                 -1.4   
2026-01-03                5.8          4.06       0.0                 -2.2   
2026-01-04                0.0          0.00       0.0                 -5.4   
2026-01-05                4.6          3.22       0.0                 -7.6   
2026-01-06               11.7          8.12       0.1           

### Fetch weather for points at a fixed distance (75 km) north, south, east and west of the sensor location

In [55]:
import math

def offset_coordinates(lat: float, lon: float, distance_km: float):
    """
    Return the points `distance_km` north, south, east and west of (lat, lon).

    The conversion uses 111.0 km per degree of latitude; the longitude step is
    additionally divided by cos(latitude).

    Returns a dict with keys "N", "S", "E", "W" (in that order), each mapping
    to a (latitude, longitude) tuple.
    """
    km_per_degree = 111.0

    lat_step = distance_km / km_per_degree
    lon_step = distance_km / (km_per_degree * math.cos(math.radians(lat)))

    north = (lat + lat_step, lon)
    south = (lat - lat_step, lon)
    east = (lat, lon + lon_step)
    west = (lat, lon - lon_step)

    return dict(N=north, S=south, E=east, W=west)

In [56]:
# `fetch_daily_weather` is defined once, under "Step 2" above, and is reused
# unchanged for the offset points in the cells below. It is deliberately not
# redefined in this section: a second definition with the same request and
# parsing logic would only shadow the first and let the two drift apart.

In [57]:
def fetch_spatial_weather_75km(
    latitude: float,
    longitude: float,
    start_date: str = "2020-01-01",
) -> pd.DataFrame:
    """
    Fetch daily weather at the four points 75 km N/S/E/W of the sensor.

    One request is made per direction via `fetch_daily_weather`. Each of the
    six weather columns is renamed to ``<column>_<direction>_75km`` (direction
    is the upper-case key "N", "S", "E" or "W"), and the four frames are
    placed side by side on their time index.

    Returns a single DataFrame sorted by time.
    """
    base_columns = (
        "precipitation_sum",
        "snowfall_sum",
        "rain_sum",
        "temperature_2m_mean",
        "wind_speed_10m_mean",
        "surface_pressure_mean",
    )

    per_direction = []
    surrounding_points = offset_coordinates(latitude, longitude, distance_km=75)

    for direction, (point_lat, point_lon) in surrounding_points.items():
        suffix = f"_{direction}_75km"
        point_weather = fetch_daily_weather(point_lat, point_lon, start_date=start_date)
        per_direction.append(
            point_weather.rename(
                columns={column: column + suffix for column in base_columns}
            )
        )

    # Column-wise concat aligns the four frames on their shared time index.
    return pd.concat(per_direction, axis=1).sort_index()

In [58]:
# Fetch daily weather at the four points 75 km around the sensor, from 2020-01-01.
df_weather_75km = fetch_spatial_weather_75km(
    latitude=latitude,
    longitude=longitude,
    start_date="2020-01-01",
)
print(df_weather_75km)

            precipitation_sum_N_75km  snowfall_sum_N_75km  rain_sum_N_75km  \
time                                                                         
2020-01-01                       0.4                 0.00              0.4   
2020-01-02                       0.0                 0.00              0.0   
2020-01-03                       0.1                 0.00              0.1   
2020-01-04                       0.0                 0.00              0.0   
2020-01-05                       2.9                 1.82              0.3   
...                              ...                  ...              ...   
2026-01-02                      11.9                 8.33              0.0   
2026-01-03                      12.5                 8.75              0.0   
2026-01-04                       0.2                 0.14              0.0   
2026-01-05                       0.1                 0.07              0.0   
2026-01-06                      17.2                12.04       

In [59]:
# Inspect the first rows and the full list of column names of the spatial frame.
print(df_weather_75km.head())
print(df_weather_75km.columns)

            precipitation_sum_N_75km  snowfall_sum_N_75km  rain_sum_N_75km  \
time                                                                         
2020-01-01                       0.4                 0.00              0.4   
2020-01-02                       0.0                 0.00              0.0   
2020-01-03                       0.1                 0.00              0.1   
2020-01-04                       0.0                 0.00              0.0   
2020-01-05                       2.9                 1.82              0.3   

            temperature_2m_mean_N_75km  wind_speed_10m_mean_N_75km  \
time                                                                 
2020-01-01                         3.1                        17.3   
2020-01-02                         4.8                        23.2   
2020-01-03                         5.1                        25.4   
2020-01-04                         1.5                        22.5   
2020-01-05                       

### TODO: Add data validation rules

### Store the data in hopsworks feature store

In [60]:
# Feature store handle for the logged-in project; used below to create the feature group.
fs = project.get_feature_store()

In [61]:
# Combine local and 75 km features. The inner join on the date index keeps only
# dates that are present in both frames.
df_all_weather_features = df_weather_full.join(df_weather_75km, how="inner")
print(df_all_weather_features)

# Confirm the joined frame has no gaps in its daily index.
missing = find_missing_dates(df_all_weather_features)

            precipitation_sum  snowfall_sum  rain_sum  temperature_2m_mean  \
time                                                                         
2020-02-29                0.9          0.07       0.8                  0.5   
2020-03-01                9.0          1.19       7.3                  3.0   
2020-03-02                3.8          0.00       3.8                  3.6   
2020-03-03                2.9          0.00       2.9                  3.1   
2020-03-04               16.5          5.25       9.0                  1.7   
...                       ...           ...       ...                  ...   
2026-01-02                9.1          6.16       0.3                 -1.4   
2026-01-03                5.8          4.06       0.0                 -2.2   
2026-01-04                0.0          0.00       0.0                 -5.4   
2026-01-05                4.6          3.22       0.0                 -7.6   
2026-01-06               11.7          8.12       0.1           

In [62]:
# Inspect the first rows and the full list of column names of the combined frame.
print(df_all_weather_features.head())
print(df_all_weather_features.columns)

            precipitation_sum  snowfall_sum  rain_sum  temperature_2m_mean  \
time                                                                         
2020-02-29                0.9          0.07       0.8                  0.5   
2020-03-01                9.0          1.19       7.3                  3.0   
2020-03-02                3.8          0.00       3.8                  3.6   
2020-03-03                2.9          0.00       2.9                  3.1   
2020-03-04               16.5          5.25       9.0                  1.7   

            wind_speed_10m_mean  surface_pressure_mean  precip_sum_3d  \
time                                                                    
2020-02-29                 10.1                  994.1            7.7   
2020-03-01                 16.8                  982.1           10.3   
2020-03-02                 11.6                  991.4           13.7   
2020-03-03                 14.0                 1000.9           15.7   
2020-03-04     

In [64]:
# Build the frame that is written to the feature group:
#   - move the "time" index into a regular column named "date" (the event time),
#   - add the sensor identifier taken from the --sensor-id argument, which
#     together with "date" forms the primary key.
df_fg = (
    df_all_weather_features
    .reset_index()
    .rename(columns={"time": "date"})
    .assign(sensor_id=sensor_id)
)

# Put the key columns first; the remaining columns keep their existing order.
key_columns = ["sensor_id", "date"]
df_fg = df_fg[key_columns + [c for c in df_fg.columns if c not in key_columns]]

# Store the date as a timezone-aware UTC timestamp floored to the day.
df_fg["date"] = pd.to_datetime(df_fg["date"], utc=True).dt.floor("D")

In [68]:
# Verify that the date column is a UTC timestamp floored to the day.
print(df_fg["date"].head(10))

0   2020-02-29 00:00:00+00:00
1   2020-03-01 00:00:00+00:00
2   2020-03-02 00:00:00+00:00
3   2020-03-03 00:00:00+00:00
4   2020-03-04 00:00:00+00:00
5   2020-03-05 00:00:00+00:00
6   2020-03-06 00:00:00+00:00
7   2020-03-07 00:00:00+00:00
8   2020-03-08 00:00:00+00:00
9   2020-03-09 00:00:00+00:00
Name: date, dtype: datetime64[ns, UTC]


In [65]:
# Get the feature group for this sensor, or create its definition if it does
# not exist yet. The name combines the sensor name and id; rows are keyed by
# (sensor_id, date), and "date" is the event-time column.
weather_fg = fs.get_or_create_feature_group(
    name=f"weather_features_{sensor_name}_{sensor_id}",
    description=(
        "Daily weather features for water level prediction, including "
        "local conditions, lagged aggregates, and spatial features at 75 km "
        "in cardinal directions."
    ),
    version=3,
    primary_key=["sensor_id", "date"],
    event_time="date",
)

In [66]:
# Insert the prepared frame into the feature group.
weather_fg.insert(df_fg)


Feature Group created successfully, explore it at 
https://eu-west.cloud.hopsworks.ai:443/p/2184/fs/2136/fg/3339

2026-01-06 10:53:21,144 INFO: Computing insert statistics


(None, None)

In [67]:
# Attach a human-readable description to every feature in the feature group.
# The descriptions live in tables so each feature is documented in one place
# and the update call is written only once.

# Identifiers, local daily weather and local rolling sums.
# The rolling sums are computed with a trailing window that ends on, and
# includes, the row's own date, so they are described that way rather than as
# the "previous N days".
local_feature_descriptions = {
    # Core identifiers
    "date": "Date of the weather observation (daily resolution, UTC)",
    "sensor_id": "Unique identifier of the water level sensor / location",
    # Local weather
    "precipitation_sum": "Total daily precipitation at the sensor location (mm/day)",
    "snowfall_sum": "Total daily snowfall at the sensor location (cm/day)",
    "rain_sum": "Total daily rainfall at the sensor location (mm/day)",
    "temperature_2m_mean": "Mean daily air temperature at 2 meters above ground (°C)",
    "wind_speed_10m_mean": "Mean daily wind speed at 10 meters above ground (km/h)",
    "surface_pressure_mean": "Mean daily surface air pressure (hPa)",
    # Rolling local aggregates
    "precip_sum_3d": "Total precipitation accumulated over the 3-day window ending on this date, inclusive (mm)",
    "precip_sum_7d": "Total precipitation accumulated over the 7-day window ending on this date, inclusive (mm)",
    "precip_sum_14d": "Total precipitation accumulated over the 14-day window ending on this date, inclusive (mm)",
    "snow_sum_14d": "Total snowfall accumulated over the 14-day window ending on this date, inclusive (cm)",
    "snow_sum_30d": "Total snowfall accumulated over the 30-day window ending on this date, inclusive (cm)",
    "snow_sum_60d": "Total snowfall accumulated over the 60-day window ending on this date, inclusive (cm)",
}

for feature_name, feature_description in local_feature_descriptions.items():
    weather_fg.update_feature_description(feature_name, feature_description)

# Spatial features (75 km).
# NOTE(review): the frame columns are built with upper-case N/S/E/W suffixes,
# while the feature names addressed here use lower case, as in the original
# cell. Presumably the feature store lower-cases feature names on insert;
# confirm against the created feature group.
direction_labels = {"n": "north", "s": "south", "e": "east", "w": "west"}

spatial_quantities = {
    "precipitation_sum": "Total daily precipitation (mm)",
    "snowfall_sum": "Total daily snowfall (cm)",
    "rain_sum": "Total daily rainfall (mm)",
    "temperature_2m_mean": "Mean daily air temperature (°C)",
    "wind_speed_10m_mean": "Mean daily wind speed (km/h)",
    "surface_pressure_mean": "Mean daily surface air pressure (hPa)",
}

for direction, direction_label in direction_labels.items():
    for base_name, quantity in spatial_quantities.items():
        weather_fg.update_feature_description(
            f"{base_name}_{direction}_75km",
            f"{quantity} measured {direction_label} of the sensor at approximately 75 km distance",
        )
