In [43]:
import requests
import datetime
import pandas as pd
import requests
import hopsworks
from pathlib import Path
import json
import re
import os
import warnings
from dotenv import load_dotenv
import openmeteo_requests
import requests_cache
from retry_requests import retry

In [54]:
# Populate the process environment from the local .env file
load_dotenv()

HW_API_KEY = os.getenv("HOPSWORKS_API_KEY")
HW_PROJECT = os.getenv("HOPSWORKS_PROJECT")

# Both values are required before attempting to log in
if not (HW_API_KEY and HW_PROJECT):
    raise ValueError("Missing credentials in .env file")

# Absolute path of the current working directory, kept as a plain string
root_dir = str(Path().absolute())

# Load the full list of configured locations from locations.json.
# NOTE: this cell used to pick a single location (city / latitude / longitude /
# price_area, optionally overridden by the CITY, LATITUDE, LONGITUDE and
# PRICE_AREA environment variables). That selection is disabled, so none of
# those per-location variables are defined by this cell.
with open(f"{root_dir}/locations.json") as f:
    locations = json.load(f)

# Log in to the Hopsworks project and grab a handle to its feature store
project = hopsworks.login(
    host="eu-west.cloud.hopsworks.ai",
    project=HW_PROJECT,
    api_key_value=HW_API_KEY,
)
fs = project.get_feature_store()

2026-01-11 20:58:12,600 INFO: Closing external client and cleaning up certificates.
2026-01-11 20:58:12,602 INFO: Connection closed.
2026-01-11 20:58:12,604 INFO: Initializing external client
2026-01-11 20:58:12,604 INFO: Base URL: https://eu-west.cloud.hopsworks.ai:443
2026-01-11 20:58:13,794 INFO: Python Engine initialized.

Logged in to project, explore it here https://eu-west.cloud.hopsworks.ai:443/p/2186


In [45]:
# Retrieve feature groups
electricity_fg = fs.get_feature_group(
    name='electricity_hourly',
    version=1,
)
weather_fg = fs.get_feature_group(
    name='weather',
    version=1,
)

# Daily weather pipeline

In [46]:
import pandas as pd
import requests_cache
import openmeteo_requests
from retry_requests import retry

# Hourly variables requested from Open-Meteo. The response exposes variables by
# position, in the order they were requested, so this single tuple drives both
# the request parameters and the positional extraction below.
_HOURLY_VARIABLES = ("temperature_2m", "precipitation", "cloud_cover", "wind_speed_10m")


def get_weather_forecast(locations, weather_key="se3_set_v1"):
    """
    Fetch the hourly Open-Meteo forecast for each location and merge into one wide frame.

    locations: non-empty list of dicts with keys:
      - city (sanitized, e.g. "vasteras"); used as the column-name suffix
      - latitude
      - longitude
      - price_area (unused here, but ok to include)
    weather_key: constant value written to the "weather_key" column of every row.

    returns: wide df with one row per UTC timestamp and one column per (variable, city),
             plus the "date" and "weather_key" columns, sorted by date.
    raises: ValueError if `locations` is empty.
    """
    # Fail early with a clear message instead of an IndexError after the loop.
    if not locations:
        raise ValueError("locations must contain at least one location")

    # Cache for 1 hour
    cache_session = requests_cache.CachedSession(".cache", expire_after=3600)
    retry_session = retry(cache_session, retries=5, backoff_factor=0.2)
    openmeteo = openmeteo_requests.Client(session=retry_session)

    url = "https://api.open-meteo.com/v1/forecast"

    wide_parts = []

    for loc in locations:
        city = loc["city"]  # already sanitized

        params = {
            "latitude": loc["latitude"],
            "longitude": loc["longitude"],
            "hourly": list(_HOURLY_VARIABLES),
            "timezone": "UTC",  # important: force UTC timestamps
        }

        # One location per request, so only the first response is relevant.
        response = openmeteo.weather_api(url, params=params)[0]
        hourly = response.Hourly()

        # Rebuild the hourly timestamps from the start/end/interval metadata.
        dates = pd.date_range(
            start=pd.to_datetime(hourly.Time(), unit="s", utc=True),
            end=pd.to_datetime(hourly.TimeEnd(), unit="s", utc=True),
            freq=pd.Timedelta(seconds=hourly.Interval()),
            inclusive="left",
        )

        # Column order: date first, then one column per requested variable.
        columns = {"date": dates}
        for position, variable in enumerate(_HOURLY_VARIABLES):
            columns[f"{variable}_{city}"] = hourly.Variables(position).ValuesAsNumpy()

        # Drop timestamps where any variable for this city is missing.
        wide_parts.append(pd.DataFrame(columns).dropna())

    # Outer merge on date to align all cities
    wide = wide_parts[0]
    for part in wide_parts[1:]:
        wide = wide.merge(part, on="date", how="outer")

    wide = wide.sort_values("date").reset_index(drop=True)
    wide["weather_key"] = weather_key

    return wide


In [47]:
# Build the wide forecast frame for every location loaded from locations.json
weather_forecast_df = get_weather_forecast(locations)

In [48]:
# Display the merged forecast frame (one row per timestamp, one column per variable and city)
weather_forecast_df

Unnamed: 0,date,temperature_2m_stockholm,precipitation_stockholm,cloud_cover_stockholm,wind_speed_10m_stockholm,temperature_2m_uppsala,precipitation_uppsala,cloud_cover_uppsala,wind_speed_10m_uppsala,temperature_2m_vasteras,...,wind_speed_10m_karlstad,temperature_2m_sundsvall,precipitation_sundsvall,cloud_cover_sundsvall,wind_speed_10m_sundsvall,temperature_2m_malmo,precipitation_malmo,cloud_cover_malmo,wind_speed_10m_malmo,weather_key
0,2026-01-11 00:00:00+00:00,-2.9825,0.0,100.0,29.519999,-5.637,0.0,100.0,24.840000,-8.122,...,12.959999,-16.9835,0.0,100.0,11.879999,-4.6955,0.0,0.0,30.239998,se3_set_v1
1,2026-01-11 01:00:00+00:00,-3.0825,0.0,100.0,32.039997,-5.737,0.0,100.0,25.919998,-8.272,...,13.320000,-15.4835,0.0,100.0,11.159999,-4.0955,0.0,0.0,30.960001,se3_set_v1
2,2026-01-11 02:00:00+00:00,-3.3325,0.0,100.0,32.399998,-5.737,0.0,100.0,25.199999,-8.872,...,12.959999,-14.4835,0.0,100.0,11.159999,-4.0455,0.0,0.0,32.399998,se3_set_v1
3,2026-01-11 03:00:00+00:00,-3.6825,0.0,100.0,31.319998,-5.887,0.0,100.0,24.480000,-9.222,...,12.959999,-13.9835,0.0,95.0,11.520000,-4.1955,0.0,0.0,29.879999,se3_set_v1
4,2026-01-11 04:00:00+00:00,-3.8325,0.0,100.0,29.519999,-5.887,0.0,99.0,23.759998,-9.622,...,12.599999,-13.9335,0.0,95.0,10.799999,-3.9955,0.0,0.0,29.519999,se3_set_v1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
163,2026-01-17 19:00:00+00:00,1.2000,0.0,100.0,12.320471,0.850,0.0,100.0,12.514855,0.950,...,11.532650,0.5500,0.0,100.0,10.188700,2.6415,0.0,100.0,11.190442,se3_set_v1
164,2026-01-17 20:00:00+00:00,1.2000,0.0,100.0,11.866355,0.850,0.0,100.0,11.844492,0.950,...,12.065372,0.5000,0.0,100.0,9.693296,2.6415,0.0,100.0,11.503113,se3_set_v1
165,2026-01-17 21:00:00+00:00,1.2500,0.0,100.0,11.503113,0.850,0.0,100.0,11.391313,0.950,...,12.611567,0.4500,0.0,100.0,9.139322,2.6915,0.0,100.0,11.874544,se3_set_v1
166,2026-01-17 22:00:00+00:00,1.3000,0.0,100.0,10.985900,0.900,0.0,100.0,10.867290,0.950,...,13.339445,0.4000,0.0,100.0,8.490582,2.7415,0.0,100.0,12.251904,se3_set_v1


In [49]:
# Write the forecast rows to the `weather` feature group.
# NOTE(review): wait=True presumably blocks until the ingestion job finishes — confirm against the Hopsworks docs.
weather_fg.insert(weather_forecast_df, wait=True)



2026-01-11 19:43:42,206 INFO: Computing insert statistics


(None, None)

# Daily electricity pipeline

In [50]:
def get_prices(date, price_class, timeout=30):
    """
    Download one day of electricity prices from elprisetjustnu.se.

    date: object exposing `year`, `month` and `day` (e.g. a datetime); selects the day.
    price_class: string inserted into the file name after the date (e.g. "SE3").
    timeout: seconds passed to `requests.get`; without it a stalled connection
             would block the pipeline indefinitely.

    returns: the decoded JSON body of the response.
    raises: requests.HTTPError (via `raise_for_status`) on a 4xx/5xx response,
            and whatever `requests.get` raises on connection errors or timeouts.
    """
    url = (
        "https://www.elprisetjustnu.se/api/v1/prices/"
        f"{date.year}/{date.month:02d}-{date.day:02d}_{price_class}.json"
    )
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    return r.json()


In [51]:
# `price_area` is not assigned anywhere above (the per-location selection in the
# configuration cell is commented out), so this cell raised NameError on a fresh
# kernel. Define it here; "SE3" matches the area label written onto the hourly frame.
price_area = "SE3"

# Fetch today's prices for the price area and load the JSON records into a frame
electricity_data = get_prices(datetime.datetime.now(), price_area)
electricity_daily_df = pd.DataFrame(electricity_data)

In [52]:
# --- Parse both interval bounds as UTC timestamps ---
time_start_utc = pd.to_datetime(electricity_daily_df["time_start"], utc=True)
time_end_utc = pd.to_datetime(electricity_daily_df["time_end"], utc=True)

# --- Add helper columns on a new frame: interval length (whole minutes),
# --- event time (interval start, UTC) and the hour bucket used for grouping ---
electricity_daily_df = electricity_daily_df.assign(
    resolution_minutes=(time_end_utc - time_start_utc).dt.total_seconds().astype(int) // 60,
    date=time_start_utc,
    date_hour=time_start_utc.dt.floor("h"),
)

# --- Hourly aggregation spec (handles 60-min and 15-min source rows alike) ---
# A single 60-min row averages to itself; four 15-min rows average to the hourly value.
hourly_aggregations = {
    "SEK_per_kWh": ("SEK_per_kWh", "mean"),
    "EUR_per_kWh": ("EUR_per_kWh", "mean"),
    "EXR": ("EXR", "last"),  # or "mean"
    "n_intervals": ("resolution_minutes", "size"),  # 1 (hourly) or 4 (15-min)
    "resolution_minutes": ("resolution_minutes", "sum"),  # should be 60 for a full hour
}

electricity_hourly = (
    electricity_daily_df.groupby(["date_hour"], as_index=False)
    .agg(**hourly_aggregations)
    .rename(columns={"date_hour": "date"})
    .assign(price_area="SE3")  # tag every row with the price area
)

# Optional sanity checks: how many source rows and minutes went into each hour
for checked_column in ("n_intervals", "resolution_minutes"):
    print(f"{checked_column} distribution:\n", electricity_hourly[checked_column].value_counts().sort_index())

# Final column order
output_columns = [
    "date",
    "price_area",
    "SEK_per_kWh",
    "EUR_per_kWh",
    "EXR",
    "resolution_minutes",
    "n_intervals",
]
electricity_hourly = electricity_hourly[output_columns]

print(electricity_hourly.dtypes)
print(electricity_hourly.head())


n_intervals distribution:
 n_intervals
4    24
Name: count, dtype: int64
resolution_minutes distribution:
 resolution_minutes
60    24
Name: count, dtype: int64
date                  datetime64[ns, UTC]
price_area                         object
SEK_per_kWh                       float64
EUR_per_kWh                       float64
EXR                               float64
resolution_minutes                  int64
n_intervals                         int64
dtype: object
                       date price_area  SEK_per_kWh  EUR_per_kWh        EXR  \
0 2026-01-10 23:00:00+00:00        SE3     0.857200     0.080025  10.711646   
1 2026-01-11 00:00:00+00:00        SE3     0.836633     0.078105  10.711646   
2 2026-01-11 01:00:00+00:00        SE3     0.822037     0.076742  10.711646   
3 2026-01-11 02:00:00+00:00        SE3     0.839337     0.078357  10.711646   
4 2026-01-11 03:00:00+00:00        SE3     0.838965     0.078323  10.711646   

   resolution_minutes  n_intervals  
0                  

In [53]:
# Write the hourly price rows to the `electricity_hourly` feature group.
# NOTE(review): wait=True presumably blocks until the ingestion job finishes — confirm against the Hopsworks docs.
electricity_fg.insert(electricity_hourly, wait=True)



2026-01-11 19:43:55,045 INFO: Computing insert statistics


(None, None)