### Fingrid Data

Fetched dynamically via API (time-window). First lets check if it has pagination as I get only 10 rows of data. 

In [11]:
import requests
import json

dataset_id = 191
url = f"https://data.fingrid.fi/api/datasets/{dataset_id}/data"

params = {
    "startTime": "2024-10-10T00:00:00Z",
    "endTime": "2024-10-17T00:00:00Z",
    "format": "json"
}

headers = {"x-api-key": API_KEY}

response = requests.get(url, params=params, headers=headers, timeout=30)

if response.status_code == 200:
    data = response.json()
    
    # Print entire response structure (pretty printed)
    print(json.dumps(data, indent=2, default=str))

StatementMeta(, ef37cba2-f5d7-4269-a17c-3df827b87511, 13, Finished, Available, Finished)

{
  "data": [
    {
      "datasetId": 191,
      "startTime": "2024-10-16T13:57:00.000Z",
      "endTime": "2024-10-16T14:00:00.000Z",
      "value": 1448.2
    },
    {
      "datasetId": 191,
      "startTime": "2024-10-16T13:54:00.000Z",
      "endTime": "2024-10-16T13:57:00.000Z",
      "value": 1383.5
    },
    {
      "datasetId": 191,
      "startTime": "2024-10-16T13:51:00.000Z",
      "endTime": "2024-10-16T13:54:00.000Z",
      "value": 1374.3
    },
    {
      "datasetId": 191,
      "startTime": "2024-10-16T13:48:00.000Z",
      "endTime": "2024-10-16T13:51:00.000Z",
      "value": 1356
    },
    {
      "datasetId": 191,
      "startTime": "2024-10-16T13:45:00.000Z",
      "endTime": "2024-10-16T13:48:00.000Z",
      "value": 1363.7
    },
    {
      "datasetId": 191,
      "startTime": "2024-10-16T13:42:00.000Z",
      "endTime": "2024-10-16T13:45:00.000Z",
      "value": 1351.9
    },
    {
      "datasetId": 191,
      "startTime": "2024-10-16T13:39:00.000Z",
     

In [19]:
import requests
import pandas as pd
from datetime import datetime, timedelta
from time import sleep, time as current_time
from notebookutils import mssparkutils
from time import sleep  # ✅ Added this





dataset_id = 191
url = f"https://data.fingrid.fi/api/datasets/{dataset_id}/data"

# Get last 7 days of data
end_date = datetime.now()
start_date = end_date - timedelta(days=7)

base_params = {
    "startTime": start_date.strftime("%Y-%m-%dT00:00:00Z"),
    "endTime": end_date.strftime("%Y-%m-%dT00:00:00Z"),
    "format": "json"
    # Don't include perPage - API ignores it
}

print(f"Querying: {base_params['startTime']} to {base_params['endTime']}")
print("=" * 60)

headers = {"x-api-key": API_KEY}

all_data = []
current_page = 1
start_time = current_time()

# Store these from first page
total_records = None
last_page_num = None

print("Fetching data from Fingrid API...")

while True:
    params = base_params.copy()
    params['page'] = current_page
    
    try:
        response = requests.get(url, params=params, headers=headers, timeout=30)
        
        if response.status_code == 200:
            data = response.json()
            records = data.get('data', [])
            pagination = data.get('pagination', {})
            
            # Store total and last page from first response
            if current_page == 1:
                total_records = pagination.get('total')
                last_page_num = pagination.get('lastPage')
                print(f"📊 Total records: {total_records}, Pages: {last_page_num}")
                print(f"⏱️  Estimated time: {last_page_num * 2 / 60:.1f} minutes")
                print("=" * 60)
            
            if not records:
                print(f"⚠️  No records on page {current_page}")
                break
            
            all_data.extend(records)
            
            # Progress update every 20 pages
            if current_page % 20 == 0 or current_page == 1:
                elapsed = current_time() - start_time
                if last_page_num:
                    progress = (current_page / last_page_num * 100)
                    eta = (elapsed / current_page * last_page_num - elapsed)
                    print(f"Page {current_page}/{last_page_num} ({progress:.1f}%) | "
                          f"Rows: {len(all_data)}/{total_records} | "
                          f"ETA: {eta/60:.1f} min")
            
            # Check if done using stored last_page_num
            if last_page_num and current_page >= last_page_num:
                print(f"✅ Reached last page: {current_page}/{last_page_num}")
                break
            
            # Manually increment page (don't trust nextPage after page 1)
            current_page += 1
            sleep(2)
            
        elif response.status_code == 429:
            print(f"⚠️  Rate limited at page {current_page}. Waiting 5 seconds...")
            sleep(5)
            
        else:
            print(f"❌ Error {response.status_code}: {response.text}")
            break
            
    except Exception as e:
        print(f"❌ Exception on page {current_page}: {type(e).__name__}: {e}")
        
        # If we know the last page, try to continue
        if last_page_num and current_page < last_page_num:
            print(f"   Retrying in 3 seconds...")
            sleep(3)
            continue
        else:
            break

elapsed_total = current_time() - start_time
print("=" * 60)
print(f"\n✅ Fetch complete!")
print(f"   Total time: {elapsed_total/60:.1f} minutes")
print(f"   Total rows fetched: {len(all_data)}")
print(f"   Expected rows: {total_records}")

if total_records and len(all_data) < total_records:
    print(f"   ⚠️  Warning: Only got {len(all_data)}/{total_records} rows")

df = pd.DataFrame(all_data)

if len(df) > 0:
    print(f"\n📊 Data Summary:")
    print(f"   Date range: {df['startTime'].min()} to {df['startTime'].max()}")
    print(f"   Shape: {df.shape}")
    print(f"   Columns: {list(df.columns)}")
    
    print(f"\n💾 Saving data...")
    spark_df = spark.createDataFrame(df)
    spark_df.write.mode("overwrite").format("delta").saveAsTable("fingrid_data_complete")
    print("   ✓ Saved to Delta table: fingrid_data_complete")
    
    csv_string = df.to_csv(index=False)
    mssparkutils.fs.put("Files/fingrid_data_7days.csv", csv_string, overwrite=True)
    print("   ✓ Saved to Files/fingrid_data_7days.csv")
    
    print("\n✅ All done!")
    print(f"\n📋 First 5 rows:")
    print(df.head())
else:
    print("⚠️  No data fetched!")

StatementMeta(, ef37cba2-f5d7-4269-a17c-3df827b87511, 21, Finished, Available, Finished)

Querying: 2025-10-10T00:00:00Z to 2025-10-17T00:00:00Z
Fetching data from Fingrid API...
📊 Total records: 3360, Pages: 336
⏱️  Estimated time: 11.2 minutes
Page 1/336 (0.3%) | Rows: 10/3360 | ETA: 1.2 min
Page 20/336 (6.0%) | Rows: 200/3360 | ETA: 13.0 min
❌ Exception on page 31: ReadTimeout: HTTPSConnectionPool(host='data.fingrid.fi', port=443): Read timed out. (read timeout=30)
   Retrying in 3 seconds...
Page 40/336 (11.9%) | Rows: 400/3360 | ETA: 19.4 min
Page 60/336 (17.9%) | Rows: 600/3360 | ETA: 15.3 min
⚠️  Rate limited at page 61. Waiting 5 seconds...
⚠️  Rate limited at page 64. Waiting 5 seconds...
⚠️  Rate limited at page 65. Waiting 5 seconds...
Page 80/336 (23.8%) | Rows: 800/3360 | ETA: 13.7 min
Page 100/336 (29.8%) | Rows: 1000/3360 | ETA: 11.8 min
⚠️  Rate limited at page 105. Waiting 5 seconds...
Page 120/336 (35.7%) | Rows: 1200/3360 | ETA: 10.5 min
Page 140/336 (41.7%) | Rows: 1400/3360 | ETA: 9.3 min
⚠️  Rate limited at page 148. Waiting 5 seconds...
Page 160/336 (

### Zenodo Data

Downloaded once, rarely changes

In [7]:
from notebookutils import mssparkutils
import pandas as pd

# Get the full path to the file
file_path = "Files/JingHydro.csv"
full_path = f"/lakehouse/default/{file_path}"

zenodo_df = pd.read_csv(full_path, index_col=0)

StatementMeta(, 43221a26-c6a7-4903-bd2c-58dd30f140de, 9, Finished, Available, Finished)

In [8]:
zenodo_df.head()

StatementMeta(, 43221a26-c6a7-4903-bd2c-58dd30f140de, 10, Finished, Available, Finished)

Unnamed: 0,T1h,AT_ROR,AT_STO,BE_ROR,BE_STO,BG_ROR,BG_STO,CH_ROR,CH_STO,CZ_ROR,...,PT_ROR,PT_STO,RO_ROR,RO_STO,SE_ROR,SE_STO,SI_ROR,SI_STO,SK_ROR,SK_STO
1,1981-01-01 00:00:00,0.227995,0.228006,0.221667,0.223077,0.2,0.201153,0.156183,0.156181,0.155802,...,0.149087,0.149097,0.336824,0.33678,0.526756,0.526771,0.275335,0.275434,0.194583,0.194709
2,1981-01-01 01:00:00,0.227857,0.227864,0.22,0.223077,0.2,0.201269,0.155962,0.155969,0.155631,...,0.148913,0.148857,0.33705,0.337075,0.526672,0.526692,0.274397,0.274442,0.194444,0.194444
3,1981-01-01 02:00:00,0.22772,0.227723,0.22,0.223077,0.2,0.201326,0.155741,0.155757,0.155461,...,0.148696,0.148736,0.337387,0.337371,0.52663,0.526613,0.273592,0.273449,0.194306,0.194312
4,1981-01-01 03:00:00,0.227583,0.227581,0.22,0.223077,0.2,0.201442,0.155545,0.155544,0.15529,...,0.148478,0.148496,0.337613,0.337648,0.526547,0.526534,0.272654,0.272705,0.194028,0.194048
5,1981-01-01 04:00:00,0.227446,0.227439,0.22,0.223077,0.2,0.201557,0.155324,0.15532,0.155119,...,0.148304,0.148255,0.33795,0.337943,0.526463,0.526456,0.27185,0.271712,0.193889,0.193915


In [9]:
# Save to Fabric Lakehouse as Bronze
spark_df = spark.createDataFrame(zenodo_df)
spark_df.write.mode("overwrite").format("delta").save("Tables/bronze_zenodo_hydro")

StatementMeta(, 43221a26-c6a7-4903-bd2c-58dd30f140de, 11, Finished, Available, Finished)

### Metadata 

Static reference, rarely changes

In [10]:
metadata_url = "https://raw.githubusercontent.com/energy-modelling-toolkit/hydro-power-database/refs/heads/master/data/jrc-hydro-power-plant-database.csv"
metadata_df = pd.read_csv(metadata_url)

# Save to Fabric Lakehouse
spark_df = spark.createDataFrame(metadata_df)
spark_df.write.mode("overwrite").format("delta").save("Tables/bronze_hydro_metadata")


StatementMeta(, 43221a26-c6a7-4903-bd2c-58dd30f140de, 12, Finished, Available, Finished)