In [1]:
import argparse
import pandas as pd
import hopsworks
import os


# Run configuration, overridable from the command line.
# Each entry is (flag, value type, default value).
_CLI_OPTIONS = (
    ("--latitude", float, 59.3284),
    ("--longitude", float, 18.0664),
    ("--sensor-id", int, 20389),
    ("--sensor-name", str, "malaren_w"),
    ("--water-level-fg-version", int, 4),
    ("--weather-fg-version", int, 4),
    ("--fv-version", int, 2),
    ("--model-version", int, 1),
    ("--sensor-url", str, "invalid"),
)

parser = argparse.ArgumentParser()
for _flag, _kind, _fallback in _CLI_OPTIONS:
    parser.add_argument(_flag, type=_kind, default=_fallback)

# parse_known_args collects unrecognised arguments instead of raising on them;
# those extras are discarded here.
args, _ = parser.parse_known_args()

# Only the options this notebook uses are unpacked into plain names.
latitude, longitude = args.latitude, args.longitude
sensor_id, sensor_name = args.sensor_id, args.sensor_name
weather_fg_version = args.weather_fg_version

In [3]:
# Connect to Hopsworks. The API key is read from the environment so it never
# appears in the notebook; a missing or empty key aborts immediately.
api_key = os.getenv("HOPSWORKS_API_KEY")
if api_key is None or api_key == "":
    raise RuntimeError("HOPSWORKS_API_KEY is not set")

# Non-secret connection settings, kept together for readability.
HOPSWORKS_ENDPOINT = {
    "host": "eu-west.cloud.hopsworks.ai",
    "port": 443,
    "project": "ml_project",
}

project = hopsworks.login(api_key_value=api_key, **HOPSWORKS_ENDPOINT)
fs = project.get_feature_store()

2026-01-11 22:15:31,267 INFO: Closing external client and cleaning up certificates.
2026-01-11 22:15:31,280 INFO: Connection closed.
2026-01-11 22:15:31,281 INFO: Initializing external client
2026-01-11 22:15:31,282 INFO: Base URL: https://eu-west.cloud.hopsworks.ai:443
2026-01-11 22:15:32,245 INFO: Python Engine initialized.

Logged in to project, explore it here https://eu-west.cloud.hopsworks.ai:443/p/2184


In [4]:
# Handle to the per-sensor weather feature group. Rows are keyed by
# (sensor_id, date), and "date" also serves as the event time.
weather_fg_name = f"weather_features_{sensor_name}_{sensor_id}"
weather_fg_description = (
    "Daily weather features for water level prediction, including "
    "local conditions, lagged aggregates, and spatial features at 75 km "
    "in cardinal directions."
)

weather_fg = fs.get_or_create_feature_group(
    name=weather_fg_name,
    version=weather_fg_version,
    description=weather_fg_description,
    primary_key=["sensor_id", "date"],
    event_time="date",
)

In [5]:
# Find the most recent day already stored for this sensor.
#
# Results:
#   df_existing      - this sensor's stored rows with "date" floored to the UTC
#                      day, or None when the feature group could not be read.
#   latest_ingested  - the newest stored "date", or None when there are no rows
#                      (or the read failed).
df_existing = None
latest_ingested = None

try:
    df_existing = weather_fg.read(dataframe_type="pandas")
    df_existing = df_existing[df_existing["sensor_id"] == sensor_id].copy()

    if not df_existing.empty:
        df_existing["date"] = pd.to_datetime(df_existing["date"], utc=True).dt.floor("D")
        latest_ingested = df_existing["date"].max()

except Exception as exc:
    # Best effort: if the feature group exists but cannot be read, treat it as
    # empty. The failure is reported instead of being hidden, because falling
    # back silently would turn e.g. a credentials or network problem into an
    # unexplained full re-ingestion.
    df_existing = None
    latest_ingested = None
    print(
        "WARNING: could not read existing weather features "
        f"({type(exc).__name__}: {exc}); treating the feature group as empty."
    )

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.53s) 


In [6]:
# Choose where the weather history fetch starts.
#
# With nothing ingested yet, fetch everything from the fixed default date.
# Otherwise restart three months before the newest stored day - presumably so
# the lagged/aggregated features computed later have enough history to be
# filled in (TODO confirm against add_weather_lagged_aggregated_features).
start_date = "2020-01-01"

# Explicit None/NaT test instead of truthiness: a NaT maximum (every stored date
# unparsable) must fall back to the default rather than propagate as NaT and
# break the later ``.date()`` conversion.
if latest_ingested is not None and not pd.isna(latest_ingested):
    start_date = pd.Timestamp(latest_ingested) - pd.DateOffset(months=3)
print(start_date)

2025-10-24 00:00:00+00:00


In [8]:
from utils import fetch_daily_weather

# Daily variables requested for every weather fetch in this notebook.
OPEN_METEO_WEATHER_FEATURES = [
    "precipitation_sum",
    "snowfall_sum",
    "rain_sum",
    "temperature_2m_mean",
    "wind_speed_10m_mean",
    "surface_pressure_mean",
]

# start_date may be a string or a Timestamp; either way it is passed on as an
# ISO "YYYY-MM-DD" string.
history_start_iso = pd.to_datetime(start_date).date().isoformat()

# Historical daily weather at the sensor location.
df_weather = fetch_daily_weather(
    latitude, longitude, OPEN_METEO_WEATHER_FEATURES, start_date=history_start_iso
)
print(df_weather)

            precipitation_sum  snowfall_sum  rain_sum  temperature_2m_mean  \
time                                                                         
2025-10-24                1.1          0.00       1.1                  7.2   
2025-10-25               23.5          0.00      23.5                  5.6   
2025-10-26               13.7          0.00      13.7                  5.0   
2025-10-27                2.8          0.00       2.8                  4.3   
2025-10-28                1.2          0.00       1.2                  4.2   
...                       ...           ...       ...                  ...   
2026-01-07                2.8          1.96       0.0                -11.7   
2026-01-08                6.0          4.20       0.0                 -7.1   
2026-01-09                0.4          0.28       0.0                 -7.1   
2026-01-10                0.0          0.00       0.0                -12.3   
2026-01-11                0.0          0.00       0.0           

In [9]:
from utils import fetch_daily_weather_forecast

# Daily forecast at the sensor location, for the same variables as the history.
FORECAST_HORIZON_DAYS = 14

df_weather_forecast = fetch_daily_weather_forecast(
    latitude, longitude, OPEN_METEO_WEATHER_FEATURES, forecast_days=FORECAST_HORIZON_DAYS
)
print(df_weather_forecast)

            precipitation_sum  snowfall_sum  rain_sum  temperature_2m_mean  \
time                                                                         
2026-01-11                0.0          0.00       0.0                -19.1   
2026-01-12                0.0          0.00       0.0                -15.7   
2026-01-13                0.0          0.00       0.0                 -7.8   
2026-01-14                4.3          3.01       0.0                 -3.4   
2026-01-15                0.3          0.21       0.0                 -2.1   
2026-01-16                0.0          0.00       0.0                 -0.1   
2026-01-17                0.0          0.00       0.0                 -1.3   
2026-01-18                0.0          0.00       0.0                 -2.6   
2026-01-19                0.0          0.00       0.0                 -1.0   
2026-01-20                0.0          0.00       0.0                 -2.9   
2026-01-21                0.0          0.00       0.0           

In [10]:
# Merge history and forecast into one series indexed by day.
#
# When both sources contain the same day, the forecast row wins because it is
# concatenated last. Duplicates are removed *before* sorting: the default
# sort_index algorithm is not guaranteed to be stable, so sorting first could
# reorder rows that share a date and make keep="last" pick either source.
df_weather_all = pd.concat([df_weather, df_weather_forecast], axis=0)
df_weather_all = df_weather_all[~df_weather_all.index.duplicated(keep="last")].sort_index()
print(df_weather_all)

            precipitation_sum  snowfall_sum  rain_sum  temperature_2m_mean  \
time                                                                         
2025-10-24                1.1          0.00       1.1                  7.2   
2025-10-25               23.5          0.00      23.5                  5.6   
2025-10-26               13.7          0.00      13.7                  5.0   
2025-10-27                2.8          0.00       2.8                  4.3   
2025-10-28                1.2          0.00       1.2                  4.2   
...                       ...           ...       ...                  ...   
2026-01-20                0.0          0.00       0.0                 -2.9   
2026-01-21                0.0          0.00       0.0                 -6.0   
2026-01-22                1.5          1.05       0.0                 -7.6   
2026-01-23                5.8          4.06       0.0                 -4.6   
2026-01-24                1.7          1.19       0.0           

In [11]:
from utils import find_missing_dates

# Check the combined history + forecast frame for gaps in its dates. The helper
# prints its own summary; the returned value is kept in `missing`
# (its exact type is defined in utils - not visible here).
missing = find_missing_dates(df_weather_all)

✅ No missing dates found.


In [12]:
from utils import add_weather_lagged_aggregated_features

# Extend the combined daily frame with the lagged / aggregated weather features
# computed by the utils helper.
df_weather_full = add_weather_lagged_aggregated_features(df_weather_all)
print(df_weather_full)

            precipitation_sum  snowfall_sum  rain_sum  temperature_2m_mean  \
time                                                                         
2025-10-24                1.1          0.00       1.1                  7.2   
2025-10-25               23.5          0.00      23.5                  5.6   
2025-10-26               13.7          0.00      13.7                  5.0   
2025-10-27                2.8          0.00       2.8                  4.3   
2025-10-28                1.2          0.00       1.2                  4.2   
...                       ...           ...       ...                  ...   
2026-01-20                0.0          0.00       0.0                 -2.9   
2026-01-21                0.0          0.00       0.0                 -6.0   
2026-01-22                1.5          1.05       0.0                 -7.6   
2026-01-23                5.8          4.06       0.0                 -4.6   
2026-01-24                1.7          1.19       0.0           

In [13]:
# Keep only rows where every column has a value (a row with any NaN is
# dropped), then re-check the remaining dates for gaps.
df_weather_full = df_weather_full.dropna(axis="index", how="any")
print(df_weather_full)

missing = find_missing_dates(df_weather_full)

            precipitation_sum  snowfall_sum  rain_sum  temperature_2m_mean  \
time                                                                         
2025-12-22                0.0          0.00       0.0                 -5.4   
2025-12-23                0.0          0.00       0.0                 -6.3   
2025-12-24                0.0          0.00       0.0                 -0.8   
2025-12-25                0.0          0.00       0.0                  4.4   
2025-12-26                0.0          0.00       0.0                  3.4   
2025-12-27                0.7          0.28       0.4                  0.0   
2025-12-28                7.9          4.83       1.0                  0.3   
2025-12-29                1.4          0.98       0.0                 -4.0   
2025-12-30                3.6          2.52       0.0                 -5.0   
2025-12-31                2.6          1.82       0.0                 -7.1   
2026-01-01                7.2          5.04       0.0           

In [15]:
from utils import fetch_spatial_weather_75km

# Historical daily weather for the surrounding "75 km" points, starting from
# the same date as the local history. start_date may be a string or a
# Timestamp; it is passed on as an ISO "YYYY-MM-DD" string.
spatial_start_iso = pd.to_datetime(start_date).date().isoformat()

df_weather_75km = fetch_spatial_weather_75km(
    latitude=latitude,
    longitude=longitude,
    features=OPEN_METEO_WEATHER_FEATURES,
    start_date=spatial_start_iso,
)
print(df_weather_75km)

            precipitation_sum_n_75km  snowfall_sum_n_75km  rain_sum_n_75km  \
time                                                                         
2025-10-24                       0.6                 0.00              0.6   
2025-10-25                      24.4                 0.28             24.0   
2025-10-26                      17.3                 0.00             17.3   
2025-10-27                      12.1                 0.00             12.1   
2025-10-28                       2.6                 0.00              2.6   
...                              ...                  ...              ...   
2026-01-07                       1.6                 1.12              0.0   
2026-01-08                      14.6                10.22              0.0   
2026-01-09                       2.3                 1.61              0.0   
2026-01-10                       1.5                 1.19              0.0   
2026-01-11                       0.0                 0.00       

In [16]:
from utils import fetch_spatial_weather_forecast_75km

# Forecast counterpart of the spatial "75 km" history.
# NOTE(review): unlike the local forecast call, no forecast_days is passed, so
# the horizon is whatever the helper defaults to - confirm the two match.
spatial_forecast_request = {
    "latitude": latitude,
    "longitude": longitude,
    "features": OPEN_METEO_WEATHER_FEATURES,
}
df_weather_75km_forecast = fetch_spatial_weather_forecast_75km(**spatial_forecast_request)
print(df_weather_75km_forecast)

            precipitation_sum_n_75km  snowfall_sum_n_75km  rain_sum_n_75km  \
time                                                                         
2026-01-11                       0.0                 0.00              0.0   
2026-01-12                       1.1                 0.77              0.0   
2026-01-13                       0.3                 0.21              0.0   
2026-01-14                       9.3                 6.58              0.0   
2026-01-15                       0.8                 0.56              0.0   
2026-01-16                       3.0                 2.10              0.0   
2026-01-17                       4.3                 3.01              0.0   
2026-01-18                       0.0                 0.00              0.0   
2026-01-19                       0.0                 0.00              0.0   
2026-01-20                       0.0                 0.00              0.0   
2026-01-21                       0.6                 0.42       

In [17]:
# Merge spatial history and spatial forecast into one series indexed by day.
#
# When both sources contain the same day, the forecast row wins because it is
# concatenated last. Duplicates are removed *before* sorting: the default
# sort_index algorithm is not guaranteed to be stable, so sorting first could
# reorder rows that share a date and make keep="last" pick either source.
df_weather_75km_forecast_all = pd.concat([df_weather_75km, df_weather_75km_forecast], axis=0)
df_weather_75km_forecast_all = df_weather_75km_forecast_all[
    ~df_weather_75km_forecast_all.index.duplicated(keep="last")
].sort_index()
print(df_weather_75km_forecast_all)

            precipitation_sum_n_75km  snowfall_sum_n_75km  rain_sum_n_75km  \
time                                                                         
2025-10-24                       0.6                 0.00              0.6   
2025-10-25                      24.4                 0.28             24.0   
2025-10-26                      17.3                 0.00             17.3   
2025-10-27                      12.1                 0.00             12.1   
2025-10-28                       2.6                 0.00              2.6   
...                              ...                  ...              ...   
2026-01-20                       0.0                 0.00              0.0   
2026-01-21                       0.6                 0.42              0.0   
2026-01-22                       0.5                 0.35              0.0   
2026-01-23                       1.8                 1.26              0.0   
2026-01-24                       0.7                 0.49       

In [18]:
# Attach the spatial columns to the local features. The inner join keeps only
# the days present in both frames; the result is then re-checked for gaps.
df_all_weather_features = df_weather_full.join(
    other=df_weather_75km_forecast_all,
    how="inner",
)
print(df_all_weather_features)

missing = find_missing_dates(df_all_weather_features)

            precipitation_sum  snowfall_sum  rain_sum  temperature_2m_mean  \
time                                                                         
2025-12-22                0.0          0.00       0.0                 -5.4   
2025-12-23                0.0          0.00       0.0                 -6.3   
2025-12-24                0.0          0.00       0.0                 -0.8   
2025-12-25                0.0          0.00       0.0                  4.4   
2025-12-26                0.0          0.00       0.0                  3.4   
2025-12-27                0.7          0.28       0.4                  0.0   
2025-12-28                7.9          4.83       1.0                  0.3   
2025-12-29                1.4          0.98       0.0                 -4.0   
2025-12-30                3.6          2.52       0.0                 -5.0   
2025-12-31                2.6          1.82       0.0                 -7.1   
2026-01-01                7.2          5.04       0.0           

In [19]:
from utils import normalize_date_to_utc_day

# Shape the joined features into the feature-group layout:
#   * the "time" index becomes the explicit event-time column "date";
#   * "sensor_id" (the sensor id value, e.g. 20389) is added as the other half
#     of the primary key.
df_fg = (
    df_all_weather_features
    .reset_index()
    .rename(columns={"time": "date"})
    .assign(sensor_id=sensor_id)
)

# Key columns first, remaining feature columns in their existing order.
key_columns = ["sensor_id", "date"]
feature_columns = [col for col in df_fg.columns if col not in key_columns]
df_fg = df_fg[key_columns + feature_columns]

df_fg = normalize_date_to_utc_day(df_fg)

In [20]:
# Sanity check: show the first ten "date" values (and the column dtype) after
# normalisation.
print(df_fg["date"].head(10))

0   2025-12-22 00:00:00+00:00
1   2025-12-23 00:00:00+00:00
2   2025-12-24 00:00:00+00:00
3   2025-12-25 00:00:00+00:00
4   2025-12-26 00:00:00+00:00
5   2025-12-27 00:00:00+00:00
6   2025-12-28 00:00:00+00:00
7   2025-12-29 00:00:00+00:00
8   2025-12-30 00:00:00+00:00
9   2025-12-31 00:00:00+00:00
Name: date, dtype: datetime64[ns, UTC]


In [21]:
# Write the prepared rows to the feature group with operation="upsert"
# (presumably rows matching an existing (sensor_id, date) key are updated and
# the rest inserted - confirm against the Hopsworks documentation).
weather_fg.insert(df_fg, operation="upsert")


2026-01-11 22:19:23,848 INFO: Computing insert statistics


(None, None)