In [1]:
import xarray as xr
import fsspec
import logging

# Enable logging inside notebook
logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")

# S3 filesystem (read-only)
fs = fsspec.filesystem("s3", anon=True)

# Months to check: March to December
months_to_check = [f"{m:02}" for m in range(3, 13)]
base_path = "s3://ocf-open-data-pvnet/data/uk/pvlive/v1/"
paths = [f"{base_path}target_data_2023_{month}.zarr" for month in months_to_check]


In [2]:
for path in paths:
    logging.info(f"Inspecting: {path}")
    try:
        mapper = fs.get_mapper(path)
        ds = xr.open_zarr(mapper, consolidated=False)

        # Report structure
        print(f"\n✅ Dataset at {path}")
        print("Data variables:", list(ds.data_vars))
        print("Coordinates:", list(ds.coords))
        print("Dims:", ds.dims)
        print("Attrs:", ds.attrs)

        # Try peeking into one variable if present
        if ds.data_vars:
            var = list(ds.data_vars)[0]
            print(f"\nSample data from variable '{var}':")
            print(ds[var].isel(time=0).values if "time" in ds[var].dims else ds[var].values)

    except Exception as e:
        logging.error(f"❌ Failed to load {path}")
        logging.exception(e)


INFO: Inspecting: s3://ocf-open-data-pvnet/data/uk/pvlive/v1/target_data_2023_03.zarr
INFO: Inspecting: s3://ocf-open-data-pvnet/data/uk/pvlive/v1/target_data_2023_04.zarr



✅ Dataset at s3://ocf-open-data-pvnet/data/uk/pvlive/v1/target_data_2023_03.zarr
Data variables: []
Coordinates: []
Attrs: {}


INFO: Inspecting: s3://ocf-open-data-pvnet/data/uk/pvlive/v1/target_data_2023_05.zarr



✅ Dataset at s3://ocf-open-data-pvnet/data/uk/pvlive/v1/target_data_2023_04.zarr
Data variables: []
Coordinates: []
Attrs: {}


INFO: Inspecting: s3://ocf-open-data-pvnet/data/uk/pvlive/v1/target_data_2023_06.zarr



✅ Dataset at s3://ocf-open-data-pvnet/data/uk/pvlive/v1/target_data_2023_05.zarr
Data variables: []
Coordinates: []
Attrs: {}


INFO: Inspecting: s3://ocf-open-data-pvnet/data/uk/pvlive/v1/target_data_2023_07.zarr



✅ Dataset at s3://ocf-open-data-pvnet/data/uk/pvlive/v1/target_data_2023_06.zarr
Data variables: []
Coordinates: []
Attrs: {}


INFO: Inspecting: s3://ocf-open-data-pvnet/data/uk/pvlive/v1/target_data_2023_08.zarr



✅ Dataset at s3://ocf-open-data-pvnet/data/uk/pvlive/v1/target_data_2023_07.zarr
Data variables: []
Coordinates: []
Attrs: {}


INFO: Inspecting: s3://ocf-open-data-pvnet/data/uk/pvlive/v1/target_data_2023_09.zarr



✅ Dataset at s3://ocf-open-data-pvnet/data/uk/pvlive/v1/target_data_2023_08.zarr
Data variables: []
Coordinates: []
Attrs: {}


INFO: Inspecting: s3://ocf-open-data-pvnet/data/uk/pvlive/v1/target_data_2023_10.zarr



✅ Dataset at s3://ocf-open-data-pvnet/data/uk/pvlive/v1/target_data_2023_09.zarr
Data variables: []
Coordinates: []
Attrs: {}


INFO: Inspecting: s3://ocf-open-data-pvnet/data/uk/pvlive/v1/target_data_2023_11.zarr



✅ Dataset at s3://ocf-open-data-pvnet/data/uk/pvlive/v1/target_data_2023_10.zarr
Data variables: []
Coordinates: []
Attrs: {}


INFO: Inspecting: s3://ocf-open-data-pvnet/data/uk/pvlive/v1/target_data_2023_12.zarr



✅ Dataset at s3://ocf-open-data-pvnet/data/uk/pvlive/v1/target_data_2023_11.zarr
Data variables: []
Coordinates: []
Attrs: {}

✅ Dataset at s3://ocf-open-data-pvnet/data/uk/pvlive/v1/target_data_2023_12.zarr
Data variables: []
Coordinates: []
Attrs: {}


In [3]:
def inspect_zarr_keys(path):
    print(f"\n🗂 Keys inside Zarr store: {path}")
    mapper = fs.get_mapper(path)
    keys = list(mapper.keys())
    
    if keys:
        print(f"Found {len(keys)} keys. Sample keys:")
        for k in keys[:10]:  # Show first 10 keys
            print(f" - {k}")
    else:
        print("❌ No keys found – store appears empty or corrupted.")

# Example: Check March
inspect_zarr_keys("s3://ocf-open-data-pvnet/data/uk/pvlive/v1/target_data_2023_03.zarr")

# You can loop through all months like this:
for month in range(3, 13):
    path = f"s3://ocf-open-data-pvnet/data/uk/pvlive/v1/target_data_2023_{month:02}.zarr"
    inspect_zarr_keys(path)



🗂 Keys inside Zarr store: s3://ocf-open-data-pvnet/data/uk/pvlive/v1/target_data_2023_03.zarr
Found 50 keys. Sample keys:
 - .zgroup
 - target_data_2023_03.zarr/.zattrs
 - target_data_2023_03.zarr/.zgroup
 - target_data_2023_03.zarr/.zmetadata
 - target_data_2023_03.zarr/capacity_mwp/.zarray
 - target_data_2023_03.zarr/capacity_mwp/.zattrs
 - target_data_2023_03.zarr/capacity_mwp/0.0
 - target_data_2023_03.zarr/capacity_mwp/0.1
 - target_data_2023_03.zarr/capacity_mwp/1.0
 - target_data_2023_03.zarr/capacity_mwp/1.1

🗂 Keys inside Zarr store: s3://ocf-open-data-pvnet/data/uk/pvlive/v1/target_data_2023_03.zarr
Found 50 keys. Sample keys:
 - .zgroup
 - target_data_2023_03.zarr/.zattrs
 - target_data_2023_03.zarr/.zgroup
 - target_data_2023_03.zarr/.zmetadata
 - target_data_2023_03.zarr/capacity_mwp/.zarray
 - target_data_2023_03.zarr/capacity_mwp/.zattrs
 - target_data_2023_03.zarr/capacity_mwp/0.0
 - target_data_2023_03.zarr/capacity_mwp/0.1
 - target_data_2023_03.zarr/capacity_mwp/1.0

In [9]:
import xarray as xr
import matplotlib.pyplot as plt
import numpy as np


In [10]:
import xarray as xr
ds = xr.open_zarr("combined_2023_gsp.zarr", consolidated=True)
ds.info


<bound method Dataset.info of <xarray.Dataset> Size: 178MB
Dimensions:                (gsp_id: 318, datetime_gmt: 17521)
Coordinates:
  * datetime_gmt           (datetime_gmt) datetime64[ns] 140kB 2023-01-01 ......
  * gsp_id                 (gsp_id) int64 3kB 0 1 2 3 4 ... 313 314 315 316 317
Data variables:
    capacity_mwp           (gsp_id, datetime_gmt) float64 45MB dask.array<chunksize=(318, 1440), meta=np.ndarray>
    generation_mw          (gsp_id, datetime_gmt) float64 45MB dask.array<chunksize=(318, 1440), meta=np.ndarray>
    index                  (gsp_id, datetime_gmt) int64 45MB dask.array<chunksize=(318, 1440), meta=np.ndarray>
    installedcapacity_mwp  (gsp_id, datetime_gmt) float64 45MB dask.array<chunksize=(318, 1440), meta=np.ndarray>>

In [7]:
import xarray as xr

ds_monthly = xr.open_dataset(
    "s3://ocf-open-data-pvnet/data/uk/pvlive/v1/target_data_2023_01.zarr/",
    engine="zarr",
    backend_kwargs={
        "storage_options": {
            "anon": True}
        }
)
ds_monthly.info

<bound method Dataset.info of <xarray.Dataset> Size: 15MB
Dimensions:                (gsp_id: 318, datetime_gmt: 1489)
Coordinates:
  * datetime_gmt           (datetime_gmt) datetime64[ns] 12kB 2023-01-01 ... ...
  * gsp_id                 (gsp_id) int64 3kB 0 1 2 3 4 ... 313 314 315 316 317
Data variables:
    capacity_mwp           (gsp_id, datetime_gmt) float64 4MB ...
    generation_mw          (gsp_id, datetime_gmt) float64 4MB ...
    index                  (gsp_id, datetime_gmt) int64 4MB ...
    installedcapacity_mwp  (gsp_id, datetime_gmt) float64 4MB ...>

Unnamed: 0,Array,Chunk
Bytes,510.10 MiB,510.10 MiB
Shape,"(12, 318, 17521)","(12, 318, 17521)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 510.10 MiB 510.10 MiB Shape (12, 318, 17521) (12, 318, 17521) Dask graph 1 chunks in 2 graph layers Data type float64 numpy.ndarray",17521  318  12,

Unnamed: 0,Array,Chunk
Bytes,510.10 MiB,510.10 MiB
Shape,"(12, 318, 17521)","(12, 318, 17521)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,510.10 MiB,510.10 MiB
Shape,"(12, 318, 17521)","(12, 318, 17521)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 510.10 MiB 510.10 MiB Shape (12, 318, 17521) (12, 318, 17521) Dask graph 1 chunks in 2 graph layers Data type float64 numpy.ndarray",17521  318  12,

Unnamed: 0,Array,Chunk
Bytes,510.10 MiB,510.10 MiB
Shape,"(12, 318, 17521)","(12, 318, 17521)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,510.10 MiB,510.10 MiB
Shape,"(12, 318, 17521)","(12, 318, 17521)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray
"Array Chunk Bytes 510.10 MiB 510.10 MiB Shape (12, 318, 17521) (12, 318, 17521) Dask graph 1 chunks in 2 graph layers Data type int64 numpy.ndarray",17521  318  12,

Unnamed: 0,Array,Chunk
Bytes,510.10 MiB,510.10 MiB
Shape,"(12, 318, 17521)","(12, 318, 17521)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,510.10 MiB,510.10 MiB
Shape,"(12, 318, 17521)","(12, 318, 17521)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 510.10 MiB 510.10 MiB Shape (12, 318, 17521) (12, 318, 17521) Dask graph 1 chunks in 2 graph layers Data type float64 numpy.ndarray",17521  318  12,

Unnamed: 0,Array,Chunk
Bytes,510.10 MiB,510.10 MiB
Shape,"(12, 318, 17521)","(12, 318, 17521)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray


In [11]:
for var in ds.data_vars:
    print(f"{var}: {ds[var].shape}, dims: {ds[var].dims}")


capacity_mwp: (12, 318, 17521), dims: ('time', 'gsp_id', 'datetime_gmt')
generation_mw: (12, 318, 17521), dims: ('time', 'gsp_id', 'datetime_gmt')
index: (12, 318, 17521), dims: ('time', 'gsp_id', 'datetime_gmt')
installedcapacity_mwp: (12, 318, 17521), dims: ('time', 'gsp_id', 'datetime_gmt')
