Backfill: fetch historical water level data and store it in the Hopsworks feature store

In [26]:
import argparse
import os

import hopsworks
import pandas as pd


# URL of the SMHI archive endpoint with the station id left as a placeholder.
# It is the original default URL with "20389" replaced by {sensor_id}, so that
# --sensor-id and the URL can no longer point at different stations.
SENSOR_URL_TEMPLATE = (
    "https://opendata-download-hydroobs.smhi.se/api/version/1.0/"
    "parameter/3/station/{sensor_id}/period/corrected-archive/data.json"
)

# Notebook parameters; every option has a default so the notebook also runs
# without any command-line arguments.
parser = argparse.ArgumentParser()
parser.add_argument("--latitude", type=float, default=59.3284)
parser.add_argument("--longitude", type=float, default=18.0664)
parser.add_argument("--sensor-id", type=int, default=20389)
parser.add_argument("--sensor-name", type=str, default="malaren_w")
parser.add_argument(
    "--sensor-url",
    type=str,
    default=None,
    help="Data URL for the sensor; derived from --sensor-id when omitted.",
)

# parse_known_args() tolerates arguments this parser does not define and
# returns them separately instead of exiting with an error.
args, _unknown_args = parser.parse_known_args()

latitude = args.latitude
longitude = args.longitude
sensor_id = args.sensor_id
sensor_name = args.sensor_name

# An explicitly supplied URL always wins; otherwise build it from the sensor id.
if args.sensor_url is None:
    sensor_url = SENSOR_URL_TEMPLATE.format(sensor_id=sensor_id)
else:
    sensor_url = args.sensor_url

### Step 1: Log in to Hopsworks

The environment variable `HOPSWORKS_API_KEY` must be set before running the next cell.

In [27]:
# Read the API key from the environment so that no credential is stored in the
# notebook itself.
# NOTE(review): an API key used to be hard-coded in this cell; treat that key
# as leaked and revoke it in Hopsworks.
hopsworks_api_key = os.environ.get("HOPSWORKS_API_KEY")
if not hopsworks_api_key:
    raise RuntimeError(
        "HOPSWORKS_API_KEY is not set; export it before running this notebook."
    )

# Connect to the Hopsworks project that holds the feature store.
project = hopsworks.login(
    host="eu-west.cloud.hopsworks.ai",
    port=443,
    project="ml_project",
    api_key_value=hopsworks_api_key,
)

2026-01-10 20:10:35,323 INFO: Closing external client and cleaning up certificates.
2026-01-10 20:10:35,324 INFO: Connection closed.
2026-01-10 20:10:35,325 INFO: Initializing external client
2026-01-10 20:10:35,325 INFO: Base URL: https://eu-west.cloud.hopsworks.ai:443
2026-01-10 20:10:36,745 INFO: Python Engine initialized.

Logged in to project, explore it here https://eu-west.cloud.hopsworks.ai:443/p/2184


### Step 2: Fetch historical water level data for a water level sensor

In [28]:
from utils import fetch_smhi_water_level

# Fetch the sensor's water level series, then keep only observations dated
# 2020-01-01 (UTC) or later, re-indexed from 0.
cutoff = pd.Timestamp("2020-01-01", tz="UTC")
water_level_all = fetch_smhi_water_level(sensor_url)
on_or_after_cutoff = water_level_all["date"] >= cutoff
df_water_level = water_level_all.loc[on_or_after_cutoff].reset_index(drop=True)

print(df_water_level)

                          date  water_level_cm
0    2020-01-01 22:00:00+00:00           110.0
1    2020-01-02 22:00:00+00:00           109.0
2    2020-01-03 22:00:00+00:00           108.0
3    2020-01-04 22:00:00+00:00           108.0
4    2020-01-05 22:00:00+00:00           107.0
...                        ...             ...
2195 2026-01-04 22:00:00+00:00            85.0
2196 2026-01-05 22:00:00+00:00            85.0
2197 2026-01-06 22:00:00+00:00            85.0
2198 2026-01-07 22:00:00+00:00            86.0
2199 2026-01-08 22:00:00+00:00            85.0

[2200 rows x 2 columns]


The cell above keeps only observations from 2020-01-01 (UTC) onwards.

### Add lagged features

In [29]:
from utils import add_water_level_lags

# Append the lagged water-level columns (water_level_cm_t_1, _t_3, _t_7 and
# _t_14 in the printed result); the earliest rows have NaN lags.
df_water_level_lagged = df_water_level.pipe(add_water_level_lags)

print(df_water_level_lagged)

                          date  water_level_cm  water_level_cm_t_1  \
0    2020-01-01 22:00:00+00:00           110.0                 NaN   
1    2020-01-02 22:00:00+00:00           109.0               110.0   
2    2020-01-03 22:00:00+00:00           108.0               109.0   
3    2020-01-04 22:00:00+00:00           108.0               108.0   
4    2020-01-05 22:00:00+00:00           107.0               108.0   
...                        ...             ...                 ...   
2195 2026-01-04 22:00:00+00:00            85.0                86.0   
2196 2026-01-05 22:00:00+00:00            85.0                85.0   
2197 2026-01-06 22:00:00+00:00            85.0                85.0   
2198 2026-01-07 22:00:00+00:00            86.0                85.0   
2199 2026-01-08 22:00:00+00:00            85.0                86.0   

      water_level_cm_t_3  water_level_cm_t_7  water_level_cm_t_14  
0                    NaN                 NaN                  NaN  
1                    Na

In [30]:
# Discard every row that contains a missing value; this removes the earliest
# rows, whose lag columns are still NaN.
df_water_level_lagged = df_water_level_lagged.dropna(axis=0, how="any")

print(df_water_level_lagged)

                          date  water_level_cm  water_level_cm_t_1  \
14   2020-01-15 22:00:00+00:00           102.0               104.0   
15   2020-01-16 22:00:00+00:00           103.0               102.0   
16   2020-01-17 22:00:00+00:00           103.0               103.0   
17   2020-01-18 22:00:00+00:00           102.0               103.0   
18   2020-01-19 22:00:00+00:00           101.0               102.0   
...                        ...             ...                 ...   
2195 2026-01-04 22:00:00+00:00            85.0                86.0   
2196 2026-01-05 22:00:00+00:00            85.0                85.0   
2197 2026-01-06 22:00:00+00:00            85.0                85.0   
2198 2026-01-07 22:00:00+00:00            86.0                85.0   
2199 2026-01-08 22:00:00+00:00            85.0                86.0   

      water_level_cm_t_3  water_level_cm_t_7  water_level_cm_t_14  
14                 103.0               105.0                110.0  
15                 104.

### Store the data in the Hopsworks feature store

In [31]:
# Handle to the project's feature store, used to create the feature group.
fs = project.get_feature_store()

In [32]:
from utils import normalize_date_to_utc_day

# Key columns go first; all remaining columns keep their existing order.
key_columns = ["sensor_id", "date"]

# Tag every row with the numeric sensor id (e.g. 20389); assign() returns a new
# frame, so df_water_level_lagged itself is left untouched.
df_fg_level = df_water_level_lagged.assign(sensor_id=sensor_id)
other_columns = [col for col in df_fg_level.columns if col not in key_columns]
df_fg_level = df_fg_level[key_columns + other_columns]

df_fg_level = normalize_date_to_utc_day(df_fg_level)

In [33]:
# Sanity check: after normalisation the dates should sit at 00:00:00 UTC.
print(df_fg_level["date"].head(10))

14   2020-01-15 00:00:00+00:00
15   2020-01-16 00:00:00+00:00
16   2020-01-17 00:00:00+00:00
17   2020-01-18 00:00:00+00:00
18   2020-01-19 00:00:00+00:00
19   2020-01-20 00:00:00+00:00
20   2020-01-21 00:00:00+00:00
21   2020-01-22 00:00:00+00:00
22   2020-01-23 00:00:00+00:00
23   2020-01-24 00:00:00+00:00
Name: date, dtype: datetime64[ns, UTC]


In [34]:
# One feature group per sensor: the name embeds both the sensor name and id.
fg_name = f"water_level_lagged_{sensor_name}_{sensor_id}"
fg_description = "Daily water level observations with lagged features for water level prediction"

# Rows are keyed by (sensor_id, date), with "date" as the event time.
water_level_fg = fs.get_or_create_feature_group(
    name=fg_name,
    version=4,
    description=fg_description,
    primary_key=["sensor_id", "date"],
    event_time="date",
)

In [35]:
# Write the prepared rows into the feature group.
water_level_fg.insert(df_fg_level)


2026-01-10 20:10:55,698 INFO: Computing insert statistics


(None, None)

In [36]:
# Core identifiers
water_level_fg.update_feature_description(
    "date",
    "Date of the water level observation (daily resolution, UTC)"
)

water_level_fg.update_feature_description(
    "sensor_id",
    "Unique identifier of the water level sensor / station"
)

# Raw water level
water_level_fg.update_feature_description(
    "water_level_cm",
    "Observed daily water level measured at the station (cm)"
)

# Lagged features
water_level_fg.update_feature_description(
    "water_level_cm_t_1",
    "Water level measured 1 day before the current date (cm)"
)

water_level_fg.update_feature_description(
    "water_level_cm_t_3",
    "Water level measured 3 days before the current date (cm)"
)

water_level_fg.update_feature_description(
    "water_level_cm_t_7",
    "Water level measured 7 days before the current date (cm)"
)

water_level_fg.update_feature_description(
    "water_level_cm_t_14",
    "Water level measured 14 days before the current date (cm)"
)

<hsfs.feature_group.FeatureGroup at 0x78ca6c8a7190>