TODO: Fetch historical water level data and store it in Hopsworks

In [116]:
import os
import argparse
import pandas as pd
import hopsworks


# URL pattern of the SMHI hydrological observation archive used by this
# notebook; only the station id varies.
SENSOR_URL_TEMPLATE = (
    "https://opendata-download-hydroobs.smhi.se/api/version/1.0/"
    "parameter/3/station/{sensor_id}/period/corrected-archive/data.json"
)


def build_sensor_url(station_id, override=None):
    """Return the data URL for a station.

    Args:
        station_id: Station id that is substituted into SENSOR_URL_TEMPLATE.
        override: Explicit URL; when non-empty it is returned unchanged.

    Returns:
        The URL to download the station's observations from.
    """
    return override or SENSOR_URL_TEMPLATE.format(sensor_id=station_id)


parser = argparse.ArgumentParser()
parser.add_argument("--latitude", type=float, default=59.3284)
parser.add_argument("--longitude", type=float, default=18.0664)
parser.add_argument("--sensor-id", type=int, default=20389)
parser.add_argument("--sensor-name", type=str, default="malaren_w")
parser.add_argument("--water-level-fg-version", type=int, default=4)
# No static default: the URL follows --sensor-id unless given explicitly, so a
# different sensor id can never be paired with station 20389's data by accident.
parser.add_argument("--sensor-url", type=str, default=None)

# parse_known_args tolerates the extra argv entries a Jupyter kernel passes.
args, _ = parser.parse_known_args()

latitude = args.latitude
longitude = args.longitude
sensor_id = args.sensor_id
sensor_name = args.sensor_name
water_level_fg_version = args.water_level_fg_version
sensor_url = build_sensor_url(sensor_id, args.sensor_url)

### Step 1: Log in to Hopsworks

The environment variable `HOPSWORKS_API_KEY` must be set before running the next cell; otherwise the cell raises a `RuntimeError`.

In [117]:
# The key is read from the environment so it never appears in the notebook.
api_key = os.getenv("HOPSWORKS_API_KEY")
if api_key is None or api_key == "":
    raise RuntimeError("HOPSWORKS_API_KEY is not set")

# Connection settings for the Hopsworks cluster and project used here.
login_options = {
    "host": "eu-west.cloud.hopsworks.ai",
    "port": 443,
    "project": "ml_project",
    "api_key_value": api_key,
}
project = hopsworks.login(**login_options)

2026-01-11 18:33:01,263 INFO: Closing external client and cleaning up certificates.
2026-01-11 18:33:01,264 INFO: Connection closed.
2026-01-11 18:33:01,265 INFO: Initializing external client
2026-01-11 18:33:01,266 INFO: Base URL: https://eu-west.cloud.hopsworks.ai:443
2026-01-11 18:33:02,919 INFO: Python Engine initialized.

Logged in to project, explore it here https://eu-west.cloud.hopsworks.ai:443/p/2184


### Step 2: Fetch historical water level data for a water level sensor

In [118]:
from utils import fetch_smhi_water_level

# Observations dated before this UTC timestamp are discarded.
cutoff = pd.Timestamp("2020-01-01", tz="UTC")

# Download the series, keep rows on/after the cutoff, and renumber the index.
df_water_level = (
    fetch_smhi_water_level(sensor_url)
    .loc[lambda frame: frame["date"] >= cutoff]
    .reset_index(drop=True)
)

print(df_water_level)

                          date  water_level_cm
0    2020-01-01 22:00:00+00:00         29237.0
1    2020-01-02 22:00:00+00:00         29239.0
2    2020-01-03 22:00:00+00:00         29243.0
3    2020-01-04 22:00:00+00:00         29247.0
4    2020-01-05 22:00:00+00:00         29250.0
...                        ...             ...
2196 2026-01-05 22:00:00+00:00         29263.0
2197 2026-01-06 22:00:00+00:00         29263.0
2198 2026-01-07 22:00:00+00:00         29262.0
2199 2026-01-08 22:00:00+00:00         29262.0
2200 2026-01-09 22:00:00+00:00         29259.0

[2201 rows x 2 columns]


Note: the cell above keeps only observations dated 2020-01-01 (UTC) or later.

### Add lagged features

In [119]:
from utils import add_water_level_lags

# Derive the lagged water-level columns from the filtered observations.
df_water_level_lagged = df_water_level.pipe(add_water_level_lags)

print(df_water_level_lagged)

                          date  water_level_cm  water_level_cm_t_1  \
0    2020-01-01 22:00:00+00:00         29237.0                 NaN   
1    2020-01-02 22:00:00+00:00         29239.0             29237.0   
2    2020-01-03 22:00:00+00:00         29243.0             29239.0   
3    2020-01-04 22:00:00+00:00         29247.0             29243.0   
4    2020-01-05 22:00:00+00:00         29250.0             29247.0   
...                        ...             ...                 ...   
2196 2026-01-05 22:00:00+00:00         29263.0             29262.0   
2197 2026-01-06 22:00:00+00:00         29263.0             29263.0   
2198 2026-01-07 22:00:00+00:00         29262.0             29263.0   
2199 2026-01-08 22:00:00+00:00         29262.0             29262.0   
2200 2026-01-09 22:00:00+00:00         29259.0             29262.0   

      water_level_cm_t_3  water_level_cm_t_7  water_level_cm_t_14  
0                    NaN                 NaN                  NaN  
1                    Na

In [120]:
# Discard every row with at least one missing value; the earliest rows have
# no lag history yet and therefore contain NaN.
df_water_level_lagged = df_water_level_lagged.dropna(axis=0, how="any")

print(df_water_level_lagged)

                          date  water_level_cm  water_level_cm_t_1  \
14   2020-01-15 22:00:00+00:00         29252.0             29251.0   
15   2020-01-16 22:00:00+00:00         29251.0             29252.0   
16   2020-01-17 22:00:00+00:00         29250.0             29251.0   
17   2020-01-18 22:00:00+00:00         29249.0             29250.0   
18   2020-01-19 22:00:00+00:00         29249.0             29249.0   
...                        ...             ...                 ...   
2196 2026-01-05 22:00:00+00:00         29263.0             29262.0   
2197 2026-01-06 22:00:00+00:00         29263.0             29263.0   
2198 2026-01-07 22:00:00+00:00         29262.0             29263.0   
2199 2026-01-08 22:00:00+00:00         29262.0             29262.0   
2200 2026-01-09 22:00:00+00:00         29259.0             29262.0   

      water_level_cm_t_3  water_level_cm_t_7  water_level_cm_t_14  
14               29253.0             29255.0              29237.0  
15               29251.

### Store the data in the Hopsworks feature store

In [121]:
# Handle to the feature store of the project returned by hopsworks.login().
fs = project.get_feature_store()

In [122]:
from utils import normalize_date_to_utc_day

# Key columns come first; every other column keeps its existing order.
key_columns = ["sensor_id", "date"]
feature_columns = [
    col for col in df_water_level_lagged.columns if col not in key_columns
]

# `sensor_id` is the integer station id from the CLI arguments (e.g. 20389);
# together with `date` it forms the feature group's primary key.
df_fg_level = df_water_level_lagged.assign(sensor_id=sensor_id)[
    key_columns + feature_columns
]

df_fg_level = normalize_date_to_utc_day(df_fg_level)

In [123]:
# Sanity check: show the first ten `date` values after normalize_date_to_utc_day.
print(df_fg_level["date"].head(10))

14   2020-01-15 00:00:00+00:00
15   2020-01-16 00:00:00+00:00
16   2020-01-17 00:00:00+00:00
17   2020-01-18 00:00:00+00:00
18   2020-01-19 00:00:00+00:00
19   2020-01-20 00:00:00+00:00
20   2020-01-21 00:00:00+00:00
21   2020-01-22 00:00:00+00:00
22   2020-01-23 00:00:00+00:00
23   2020-01-24 00:00:00+00:00
Name: date, dtype: datetime64[ns, UTC]


In [124]:
# One feature group per sensor: the name embeds the sensor name and id.
fg_name = f"water_level_lagged_{sensor_name}_{sensor_id}"
fg_description = (
    "Daily water level observations with lagged features "
    "for water level prediction"
)

# Rows are keyed by (sensor_id, date); `date` also serves as the event time.
water_level_fg = fs.get_or_create_feature_group(
    name=fg_name,
    description=fg_description,
    version=water_level_fg_version,
    primary_key=["sensor_id", "date"],
    event_time="date",
)

In [125]:
# Write the prepared frame into the feature group.
# NOTE(review): re-running this cell inserts the same rows again; presumably the
# (sensor_id, date) primary key makes that an upsert rather than a duplicate — confirm.
water_level_fg.insert(df_fg_level)

Feature Group created successfully, explore it at 
https://eu-west.cloud.hopsworks.ai:443/p/2184/fs/2136/fg/3425

2026-01-11 18:33:16,847 INFO: Computing insert statistics


(None, None)

In [126]:
# Human-readable description for every column of the feature group, kept in one
# mapping so a new feature needs a single new entry instead of another call.
feature_descriptions = {
    # Core identifiers
    "date": "Date of the water level observation (daily resolution, UTC)",
    "sensor_id": "Unique identifier of the water level sensor / station",
    # Raw water level
    "water_level_cm": "Observed daily water level measured at the station (cm)",
    # Lagged features
    "water_level_cm_t_1": "Water level measured 1 day before the current date (cm)",
    "water_level_cm_t_3": "Water level measured 3 days before the current date (cm)",
    "water_level_cm_t_7": "Water level measured 7 days before the current date (cm)",
    "water_level_cm_t_14": "Water level measured 14 days before the current date (cm)",
}

# Apply the descriptions in the order listed above; looping also avoids echoing
# the FeatureGroup repr of the last call as cell output.
for feature_name, feature_description in feature_descriptions.items():
    water_level_fg.update_feature_description(feature_name, feature_description)

<hsfs.feature_group.FeatureGroup at 0x78ca38865490>