In [1]:
import argparse
import pandas as pd
import hopsworks
import os

from xgboost import XGBRegressor

def _build_cli_parser() -> argparse.ArgumentParser:
    """Create the argument parser holding every tunable of this notebook.

    Each entry of the table below is registered as
    ``add_argument(flag, type=value_type, default=fallback)``.
    """
    option_table = (
        ("--latitude", float, 59.3284),
        ("--longitude", float, 18.0664),
        ("--sensor-id", int, 20389),
        ("--sensor-name", str, "malaren_w"),
        ("--water-level-fg-version", int, 4),
        ("--weather-fg-version", int, 4),
        ("--fv-version", int, 2),
        ("--model-version", int, 1),
        ("--sensor-url", str, "invalid"),
    )
    cli = argparse.ArgumentParser()
    for flag, value_type, fallback in option_table:
        cli.add_argument(flag, type=value_type, default=fallback)
    return cli


parser = _build_cli_parser()

# parse_known_args() returns (namespace, leftover_args) and does not fail on
# arguments it does not recognise; the leftovers are discarded here.
args, _ = parser.parse_known_args()

# Expose each option as a plain module-level name for the cells below.
latitude, longitude = args.latitude, args.longitude
sensor_id, sensor_name = args.sensor_id, args.sensor_name
water_level_fg_version = args.water_level_fg_version
weather_fg_version = args.weather_fg_version
fv_version, model_version = args.fv_version, args.model_version
sensor_url = args.sensor_url

In [2]:
# The API key is taken from the environment so it never lives in the notebook.
api_key = os.getenv("HOPSWORKS_API_KEY")
if api_key is None or api_key == "":
    raise RuntimeError("HOPSWORKS_API_KEY is not set")

# Connection settings for the Hopsworks project used by the cells below.
login_settings = {
    "host": "eu-west.cloud.hopsworks.ai",
    "port": 443,
    "project": "ml_project",
    "api_key_value": api_key,
}
project = hopsworks.login(**login_settings)

2026-01-11 20:43:43,471 INFO: Initializing external client
2026-01-11 20:43:43,472 INFO: Base URL: https://eu-west.cloud.hopsworks.ai:443
2026-01-11 20:43:45,342 INFO: Python Engine initialized.

Logged in to project, explore it here https://eu-west.cloud.hopsworks.ai:443/p/2184


In [3]:
from utils import fetch_smhi_water_level, normalize_date_to_utc_day

# Observations dated before this instant are discarded.
cutoff = pd.Timestamp("2020-01-01", tz="UTC")

# Download the raw series for the configured sensor URL.
raw_observations = fetch_smhi_water_level(sensor_url)

# Keep rows on or after the cutoff and renumber the index from zero.
# NOTE(review): comparing against a tz-aware cutoff assumes the helper returns
# a tz-aware "date" column — confirm in utils.
on_or_after_cutoff = raw_observations["date"] >= cutoff
recent_observations = raw_observations.loc[on_or_after_cutoff].reset_index(drop=True)

# Project helper; presumably aligns "date" to the UTC day — confirm in utils.
df_water_level = normalize_date_to_utc_day(recent_observations)

print(df_water_level)

                          date  water_level_cm
0    2020-01-01 00:00:00+00:00         29237.0
1    2020-01-02 00:00:00+00:00         29239.0
2    2020-01-03 00:00:00+00:00         29243.0
3    2020-01-04 00:00:00+00:00         29247.0
4    2020-01-05 00:00:00+00:00         29250.0
...                        ...             ...
2196 2026-01-05 00:00:00+00:00         29263.0
2197 2026-01-06 00:00:00+00:00         29263.0
2198 2026-01-07 00:00:00+00:00         29262.0
2199 2026-01-08 00:00:00+00:00         29262.0
2200 2026-01-09 00:00:00+00:00         29259.0

[2201 rows x 2 columns]


In [4]:
# Build the lagged feature frame from the daily water-level series.
# add_water_level_lags is a project helper from utils; in the recorded output
# its result carries the extra columns water_level_cm_t_1, _t_3, _t_7 and
# _t_14, which are NaN in the first rows of the series.
from utils import add_water_level_lags

df_water_level_lagged = add_water_level_lags(df_water_level)

# Show the frame, including the leading rows whose lag values are missing.
print(df_water_level_lagged)

                          date  water_level_cm  water_level_cm_t_1  \
0    2020-01-01 00:00:00+00:00         29237.0                 NaN   
1    2020-01-02 00:00:00+00:00         29239.0             29237.0   
2    2020-01-03 00:00:00+00:00         29243.0             29239.0   
3    2020-01-04 00:00:00+00:00         29247.0             29243.0   
4    2020-01-05 00:00:00+00:00         29250.0             29247.0   
...                        ...             ...                 ...   
2196 2026-01-05 00:00:00+00:00         29263.0             29262.0   
2197 2026-01-06 00:00:00+00:00         29263.0             29263.0   
2198 2026-01-07 00:00:00+00:00         29262.0             29263.0   
2199 2026-01-08 00:00:00+00:00         29262.0             29262.0   
2200 2026-01-09 00:00:00+00:00         29259.0             29262.0   

      water_level_cm_t_3  water_level_cm_t_7  water_level_cm_t_14  
0                    NaN                 NaN                  NaN  
1                    Na

In [5]:
# Discard every row that still has a missing value in any column (in the
# recorded run these are the first rows, whose lags are not yet defined),
# then renumber the index from zero.
rows_without_gaps = df_water_level_lagged.dropna()
df_water_level_lagged = rows_without_gaps.reset_index(drop=True)

print(df_water_level_lagged)

                          date  water_level_cm  water_level_cm_t_1  \
0    2020-01-15 00:00:00+00:00         29252.0             29251.0   
1    2020-01-16 00:00:00+00:00         29251.0             29252.0   
2    2020-01-17 00:00:00+00:00         29250.0             29251.0   
3    2020-01-18 00:00:00+00:00         29249.0             29250.0   
4    2020-01-19 00:00:00+00:00         29249.0             29249.0   
...                        ...             ...                 ...   
2182 2026-01-05 00:00:00+00:00         29263.0             29262.0   
2183 2026-01-06 00:00:00+00:00         29263.0             29263.0   
2184 2026-01-07 00:00:00+00:00         29262.0             29263.0   
2185 2026-01-08 00:00:00+00:00         29262.0             29262.0   
2186 2026-01-09 00:00:00+00:00         29259.0             29262.0   

      water_level_cm_t_3  water_level_cm_t_7  water_level_cm_t_14  
0                29253.0             29255.0              29237.0  
1                29251.

In [6]:
# 4) Resolve the project's feature store, then the per-sensor feature group.
fs = project.get_feature_store()

# One feature group per sensor: the name embeds both sensor name and id.
water_level_fg_name = f"water_level_lagged_{sensor_name}_{sensor_id}"

water_level_fg = fs.get_or_create_feature_group(
    name=water_level_fg_name,
    version=water_level_fg_version,
    description="Daily water level observations with lagged features for water level prediction",
    # A row is identified by the sensor and the day it was observed.
    primary_key=["sensor_id", "date"],
    event_time="date",
)

In [7]:
# 5) Find the latest date already stored in the feature group for this sensor.
#
# latest_ingested stays None when nothing is stored yet (or the read fails),
# which makes the next step treat every row as new.
latest_ingested = None
try:
    df_existing = water_level_fg.read(dataframe_type="pandas")
    df_existing = df_existing[df_existing["sensor_id"] == sensor_id].copy()

    if not df_existing.empty:
        # Align stored timestamps to whole UTC days before taking the maximum.
        df_existing["date"] = pd.to_datetime(df_existing["date"], utc=True).dt.floor("D")
        newest_stored = df_existing["date"].max()
        # A NaT maximum (all dates missing) would make every "date > latest"
        # comparison False and silently block ingestion, so treat it as empty.
        if not pd.isna(newest_stored):
            latest_ingested = newest_stored

except Exception as read_error:
    # Best-effort fallback: treat the feature group as empty so ingestion can
    # continue, but report the failure instead of hiding it. Everything is then
    # sent to the upsert step again, which should be visible to the operator.
    print(
        "⚠️ Could not read existing rows from the feature group; "
        f"treating it as empty. Reason: {read_error!r}"
    )

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.94s) 


In [11]:
print(latest_ingested)

# 6) Keep only strictly new rows.
# Both branches hand out an independent copy, so columns added to df_new later
# never leak back into df_water_level_lagged.
if latest_ingested is None:
    # Nothing ingested yet: every lagged row is new.
    df_new = df_water_level_lagged.copy()
else:
    # Strict ">" so the most recently ingested day is not selected again.
    df_new = df_water_level_lagged[df_water_level_lagged["date"] > latest_ingested].copy()

2026-01-09 00:00:00+00:00


In [14]:
# Tag every row with its sensor; "sensor_id" is part of the feature group's
# primary key. assign() returns a new frame, so re-running this cell is safe
# and no other frame that df_new might alias gets an extra column.
df_new = df_new.assign(sensor_id=sensor_id)

print(df_new)

                          date  water_level_cm  water_level_cm_t_1  \
2184 2026-01-07 00:00:00+00:00         29262.0             29263.0   
2185 2026-01-08 00:00:00+00:00         29262.0             29262.0   
2186 2026-01-09 00:00:00+00:00         29259.0             29262.0   

      water_level_cm_t_3  water_level_cm_t_7  water_level_cm_t_14  sensor_id  
2184             29262.0             29266.0              29271.0      20048  
2185             29263.0             29266.0              29270.0      20048  
2186             29263.0             29263.0              29269.0      20048  


In [15]:
# Write to the feature group only when there is something to write.
if not df_new.empty:
    # 7) Upsert new rows (safe if duplicates ever happen)
    # NOTE(review): relies on operation="upsert" updating rows whose primary
    # key (sensor_id, date) already exists — confirm for this feature group's
    # storage format.
    water_level_fg.insert(df_new, operation="upsert")


2026-01-11 20:46:15,706 INFO: Computing insert statistics


In [16]:
# Report what happened. When df_new is empty the insert step is skipped, so
# claiming an insertion (with a NaT "latest" date) would be misleading.
if df_new.empty:
    print(f"ℹ️ No new rows to insert for sensor_id={sensor_id}.")
else:
    print(f"✅ Inserted {len(df_new)} new rows for sensor_id={sensor_id}. Latest inserted: {df_new['date'].max()}")

✅ Inserted 3 new rows for sensor_id=20048. Latest inserted: 2026-01-09 00:00:00+00:00
