# Personal notebook for exploring NetCDF (.nc) files and how their grid points map to NUTS regions

In [None]:
import xarray as xr
import geopandas as gpd
import os

In [None]:
# Input paths, relative to the notebook's working directory.
# NUTS region boundaries, read later with geopandas.
nuts_file = "../data/in/NUTS_RG_20M_2024_4326.shp.zip"
# Processed NetCDF dataset, opened later with xarray.
data_file = (
    "../data/processed/era5_data_2016-2017_allm_2t_tp_monthly_unicoords_adjlon_celsius_mm_05deg_trim_tutorial_B.nc"
)

In [None]:
# Open the NetCDF file and flatten it into a pandas dataframe. The context
# manager closes the dataset once the dataframe has been built.
with xr.open_dataset(data_file, chunks={"time": "auto"}) as ds:
    df = ds.to_dataframe()
    # Turn the index levels produced by to_dataframe() into regular columns.
    df = df.reset_index()
df.head()

In [None]:
# Number of distinct (latitude, longitude) pairs in the dataframe.
len(df.drop_duplicates(subset=["latitude", "longitude"]))

In [None]:
# Load the NUTS regions and count the distinct region identifiers.
nuts = gpd.read_file(nuts_file)
# dropna=False keeps the count identical to len(unique()).
nuts["NUTS_ID"].nunique(dropna=False)

In [None]:
# Preview the first rows of the NUTS table.
nuts.head(n=5)

In [None]:
# Turn the pandas dataframe into a GeoDataFrame: each row gets a point
# geometry built from its longitude/latitude, tagged as EPSG:4326.
grid_points = gpd.points_from_xy(x=df["longitude"], y=df["latitude"])
gpd_ds = gpd.GeoDataFrame(df, geometry=grid_points, crs="EPSG:4326")
gpd_ds.head()

In [None]:
# Number of distinct point geometries in the GeoDataFrame.
len(gpd_ds["geometry"].unique())

In [None]:
# Spatially join the data points with the NUTS regions: a row is kept for
# every (point, region) pair where the point lies within the region.
merged = gpd_ds.sjoin(nuts, how="inner", predicate="within")
merged.head()

In [None]:
# (joined rows, distinct NUTS IDs matched, distinct points matched)
(
    len(merged["NUTS_ID"]),
    merged["NUTS_ID"].nunique(dropna=False),
    len(merged["geometry"].unique()),
)

In [None]:
# check if there are any geometries that map to multiple NUTS_IDs
# and among these NUTS_IDs, there is no hierarchy relationship
from collections import defaultdict

# Map every point geometry to the list of distinct NUTS_IDs it falls in.
# `merged` repeats each (point, region) pair once per remaining dataframe row
# (e.g. once per time step), so the pairs are de-duplicated first; otherwise
# every list would hold repeated IDs and "more than one entry" would not
# mean "more than one region". dict.fromkeys keeps first-seen order.
geom_to_nuts = defaultdict(list)
for geom, nuts_id in dict.fromkeys(zip(merged.geometry, merged.NUTS_ID)):
    geom_to_nuts[geom].append(nuts_id)

In [None]:
# Keep only the points whose NUTS_ID list has at least two entries.
repeated_geoms = {}
for geom, nuts_ids in geom_to_nuts.items():
    if len(nuts_ids) >= 2:
        repeated_geoms[geom] = nuts_ids
len(repeated_geoms)

In [None]:
# Collect the points whose NUTS_IDs are NOT all related by hierarchy.
# This relies on NUTS codes being nested by prefix (country code, then one
# extra character per level, e.g. DE -> DE2 -> DE27 -> DE271), so a point
# is unambiguous when its distinct IDs form a single prefix chain.
# Testing os.path.commonprefix for emptiness is too weak for this: it works
# character by character, so "DE1"/"DK0" share "D" and the siblings
# "DE271"/"DE272" share "DE27" and would both go unreported.
shared_geoms = defaultdict(list)
for geom, nuts_ids in repeated_geoms.items():
    # Distinct IDs, shortest first. Two different IDs of equal length can
    # never be prefixes of each other, so ties correctly break the chain.
    chain = sorted(set(nuts_ids), key=len)
    is_nested = all(
        child.startswith(parent) for parent, child in zip(chain, chain[1:])
    )
    if not is_nested:
        shared_geoms[geom] = nuts_ids
len(shared_geoms)

In [None]:
# Find the NUTS_IDs that received no data point in the spatial join.
all_nuts_ids = set(nuts["NUTS_ID"].unique())
mapped_nuts_ids = set(merged["NUTS_ID"].unique())
unmapped_nuts_ids = all_nuts_ids.difference(mapped_nuts_ids)
len(unmapped_nuts_ids)

In [None]:
# Show up to ten of the unmapped NUTS_IDs (set order, so not sorted).
[*unmapped_nuts_ids][:10]

In [None]:
# Look for level-3 NUTS regions that lie within another level-3 region.
nuts3 = nuts.loc[nuts["LEVL_CODE"] == 3]
nuts3_sjoined = nuts3.sjoin(nuts3, how="inner", predicate="within")
# Drop the pairs where a region was matched with itself.
is_other_region = nuts3_sjoined["NUTS_ID_left"] != nuts3_sjoined["NUTS_ID_right"]
nuts3_sjoined_diff = nuts3_sjoined.loc[is_other_region]
# (NUTS3 regions, joined pairs, pairs of different regions,
#  distinct regions on the left side of the join)
(
    len(nuts3),
    len(nuts3_sjoined),
    len(nuts3_sjoined_diff),
    len(set(nuts3_sjoined["NUTS_ID_left"])),
)

In [None]:
# Look for level-3 NUTS regions that intersect another level-3 region
# (the "intersects" predicate is used, so any shared point counts).
nuts3_sjoined_other = nuts3.sjoin(nuts3, how="inner", predicate="intersects")
# Drop the pairs where a region was matched with itself.
meets_other_region = (
    nuts3_sjoined_other["NUTS_ID_left"] != nuts3_sjoined_other["NUTS_ID_right"]
)
nuts3_sjoined_other_diff = nuts3_sjoined_other.loc[meets_other_region]
# (joined pairs, pairs of different regions,
#  distinct regions that intersect at least one other region)
(
    len(nuts3_sjoined_other),
    len(nuts3_sjoined_other_diff),
    len(set(nuts3_sjoined_other_diff["NUTS_ID_left"])),
)

In [None]:
# Inspect the row(s) for region DE271: ID, level and geometry only.
nuts.loc[nuts["NUTS_ID"] == "DE271", ["NUTS_ID", "LEVL_CODE", "geometry"]]