# Personal Notebook for exploring nc files

In [None]:
import xarray as xr
import geopandas as gpd
import os
import xagg as xa

In [None]:
# Input paths, relative to the notebook's working directory.
# NetCDF file opened with xarray below (the file name suggests monthly ERA5
# 2t/tp data for 2016-2017 on a 0.5 degree grid — TODO confirm).
data_file = "../data/processed/era5_data_2016-2017_allm_2t_tp_monthly_unicoords_adjlon_celsius_mm_05deg_trim_tutorial_B.nc"
# Zipped shapefile read with geopandas below (the file name suggests NUTS
# regions, 2024 edition, EPSG:4326 — TODO confirm).
nuts_file = "../data/in/NUTS_RG_20M_2024_4326.shp.zip"

In [None]:
# Read the NetCDF file and flatten it into a pandas DataFrame whose index
# levels (the dataset coordinates) become ordinary columns.
with xr.open_dataset(data_file, chunks={"time": "auto"}) as ds:
    # `ds` is reused by later cells (NaN checks, xagg aggregation) after this
    # block has closed the file. Pull the lazily-read values into memory while
    # the handle is still open so those cells do not depend on a closed file.
    ds.load()
    df = ds.to_dataframe().reset_index()
df.head()

In [None]:
# Number of distinct (latitude, longitude) pairs in the table
len(df.drop_duplicates(subset=["latitude", "longitude"]))

In [None]:
# Load the NUTS polygons and count the distinct NUTS_ID values
nuts = gpd.read_file(nuts_file)
nuts["NUTS_ID"].nunique(dropna=False)

In [None]:
# Preview the first rows of the NUTS table
nuts.head()

## Using purely geopandas

In [None]:
# Turn the flat pandas table into a GeoDataFrame: one point geometry per row,
# built from the longitude/latitude columns and tagged as EPSG:4326
point_geoms = gpd.points_from_xy(df["longitude"], df["latitude"])
gpd_ds = gpd.GeoDataFrame(df, geometry=point_geoms, crs="EPSG:4326")
gpd_ds.head()

In [None]:
# Number of distinct point geometries
distinct_points = gpd_ds.geometry.unique()
len(distinct_points)

In [None]:
# Spatially join the points with the NUTS polygons, keeping only the
# (point, region) pairs where the point lies within the region
merged = gpd_ds.sjoin(nuts, how="inner", predicate="within")
merged.head()

In [None]:
# (joined rows, distinct NUTS_IDs matched, distinct point geometries matched)
(
    len(merged["NUTS_ID"]),
    merged["NUTS_ID"].nunique(dropna=False),
    len(merged.geometry.unique()),
)

In [None]:
# Group the joined rows by point geometry: for each point, collect every
# NUTS_ID it was matched to. The following cells use this mapping to look for
# points that map to several NUTS_IDs which are unrelated to each other.
from collections import defaultdict

geom_to_nuts = defaultdict(list)
for point, region_id in zip(merged.geometry, merged["NUTS_ID"]):
    geom_to_nuts[point].append(region_id)

In [None]:
# Keep only the points that were matched to more than one NUTS_ID
repeated_geoms = {}
for geom, nuts_ids in geom_to_nuts.items():
    if len(nuts_ids) > 1:
        repeated_geoms[geom] = nuts_ids
len(repeated_geoms)

In [None]:
# Among the points matched to several NUTS_IDs, keep those whose IDs share no
# common leading characters at all (empty common prefix)
shared_geoms = defaultdict(list)
for geom, nuts_ids in repeated_geoms.items():
    if not os.path.commonprefix(nuts_ids):
        shared_geoms[geom] = nuts_ids
len(shared_geoms)

In [None]:
# NUTS_IDs present in the shapefile that no point was joined to
all_nuts_ids = set(nuts["NUTS_ID"].unique())
mapped_nuts_ids = set(merged["NUTS_ID"].unique())
unmapped_nuts_ids = all_nuts_ids.difference(mapped_nuts_ids)
len(unmapped_nuts_ids)

In [None]:
# Show up to ten of the unmapped IDs (a set has no defined order, so which
# ten are shown is arbitrary)
[*unmapped_nuts_ids][:10]

In [None]:
# Self-join the level-3 regions on "within" and isolate the pairs where a
# region lies within a *different* level-3 region
nuts3 = nuts[nuts["LEVL_CODE"] == 3]
nuts3_sjoined = nuts3.sjoin(nuts3, how="inner", predicate="within")
is_other_region = nuts3_sjoined["NUTS_ID_left"] != nuts3_sjoined["NUTS_ID_right"]
nuts3_sjoined_diff = nuts3_sjoined[is_other_region]
# (level-3 regions, joined pairs, pairs with differing IDs,
#  distinct left-hand IDs in the full join)
(
    len(nuts3),
    len(nuts3_sjoined),
    len(nuts3_sjoined_diff),
    len(set(nuts3_sjoined["NUTS_ID_left"])),
)

In [None]:
# Self-join the level-3 regions on "touches" and isolate the pairs made of
# two different regions
nuts3_sjoined_other = nuts3.sjoin(nuts3, how="inner", predicate="touches")
touches_other_region = (
    nuts3_sjoined_other["NUTS_ID_left"] != nuts3_sjoined_other["NUTS_ID_right"]
)
nuts3_sjoined_other_diff = nuts3_sjoined_other[touches_other_region]
# (joined pairs, pairs with differing IDs, distinct left-hand IDs among them)
(
    len(nuts3_sjoined_other),
    len(nuts3_sjoined_other_diff),
    len(set(nuts3_sjoined_other_diff["NUTS_ID_left"])),
)

In [None]:
# Self-join all regions (every level) on "within"
nuts_sjoined = nuts.sjoin(nuts, how="inner", predicate="within")
# Pairs of two different regions ...
nuts_sjoined_diff = nuts_sjoined[
    nuts_sjoined["NUTS_ID_left"] != nuts_sjoined["NUTS_ID_right"]
]
# ... of which we keep those whose IDs have an empty common prefix
shared_nuts = [
    row
    for _, row in nuts_sjoined_diff.iterrows()
    if not os.path.commonprefix([row.NUTS_ID_left, row.NUTS_ID_right])
]
len(shared_nuts), len(nuts_sjoined), len(nuts_sjoined_diff)

In [None]:
# All rows whose country code is "BA"
nuts.loc[nuts["CNTR_CODE"] == "BA"]

In [None]:
# The row(s) for region DE502
nuts.loc[nuts["NUTS_ID"] == "DE502"]

In [None]:
# All level-3 regions with country code "DE"
is_german = nuts["CNTR_CODE"] == "DE"
is_level3 = nuts["LEVL_CODE"] == 3
nuts[is_german & is_level3]

In [None]:
# Collect the IDs of every region named "Bremen"; the shortest ID is taken as
# the root of that group
is_bremen = nuts["NUTS_NAME"] == "Bremen"
bremen_nuts_ids = nuts.loc[is_bremen, "NUTS_ID"].tolist()
bremen_root = sorted(bremen_nuts_ids, key=len)[0]
bremen_root

In [None]:
# Every region whose ID starts with "DE5"
has_de5_prefix = nuts["NUTS_ID"].str.startswith("DE5")
bremen_nuts = nuts.loc[has_de5_prefix]
bremen_nuts

In [None]:
# Is the geometry of DE50 within the geometry of DE5?
de5_geom = nuts.loc[nuts["NUTS_ID"] == "DE5"].geometry
de50_geom = nuts.loc[nuts["NUTS_ID"] == "DE50"].geometry
de50_geom.within(de5_geom.iloc[0])

In [None]:
# Geometries of the two regions DE501 and DE502
de501_geom = nuts.loc[nuts["NUTS_ID"] == "DE501"].geometry
de502_geom = nuts.loc[nuts["NUTS_ID"] == "DE502"].geometry

In [None]:
# Are DE501 and DE502 within DE5?
de5_shape = de5_geom.iloc[0]
(de501_geom.within(de5_shape), de502_geom.within(de5_shape))

In [None]:
# Are DE501 and DE502 within DE50?
de50_shape = de50_geom.iloc[0]
(de501_geom.within(de50_shape), de502_geom.within(de50_shape))

In [None]:
# For every region, check that its geometry lies within the geometry of its
# parent (the ID with the last character removed). Collect the
# (child, parent) pairs for which the "within" join returns no rows.
not_within_cases = []
ctrn_codes = nuts.CNTR_CODE.unique()
for country_code in ctrn_codes:
    in_country = nuts["NUTS_ID"].str.startswith(country_code)
    # shortest IDs first, so parents are visited before their children
    nuts_subset = sorted(nuts.loc[in_country, "NUTS_ID"].tolist(), key=len)
    for child_id in nuts_subset:
        # an ID no longer than the country code has no parent to test
        if len(child_id) <= len(country_code):
            continue
        parent_id = child_id[:-1]
        check_within = nuts[nuts["NUTS_ID"] == child_id].sjoin(
            nuts[nuts["NUTS_ID"] == parent_id],
            how="inner",
            predicate="within",
        )
        if not len(check_within):
            not_within_cases.append((child_id, parent_id))
len(not_within_cases), not_within_cases

## Aggregate data by NUTS using xagg


In [None]:
# IPython magic: install plotting packages into the notebook's environment
# (presumably needed by weightmap.diag_fig further down — TODO confirm)
%pip install cartopy matplotlib cmocean

In [None]:
# Boolean masks marking the missing values of t2m and tp in the gridded
# dataset, taken before any aggregation
nan_t2m_ds, nan_tp_ds = (ds[var_name].isnull() for var_name in ("t2m", "tp"))

In [None]:
# (number of missing t2m values, number of missing tp values)
tuple(mask.sum().values for mask in (nan_t2m_ds, nan_tp_ds))

In [None]:
# Compute the overlap between the dataset's pixels and the NUTS polygons with
# xagg; the result is the weight map passed to xa.aggregate below
weightmap = xa.pixel_overlaps(ds, nuts)
weightmap

In [None]:
# Show the NUTS row at positional index 50 (0-based, regardless of index labels)
nuts.iloc[50]

In [None]:
# Draw xagg's diagnostic figure for the polygon selected by NUTS_ID "BA01"
# (presumably it visualises that polygon's pixel overlaps — see the xagg docs)
weightmap.diag_fig({"NUTS_ID": "BA01"}, ds)

In [None]:
# Aggregate the data in ds onto the polygons in nuts, using the weight map
# computed by xa.pixel_overlaps
agg_ds = xa.aggregate(ds, weightmap)
agg_ds

In [None]:
# Convert the aggregation result into a dataset object
out_ds = agg_ds.to_dataset()
out_ds

In [None]:
# Flatten the aggregated dataset into a table, moving the index levels into
# regular columns
out_df = out_ds.to_dataframe()
out_df = out_df.reset_index()
out_df

In [None]:
# (rows in the aggregated table, distinct NUTS_IDs it contains)
(len(out_df.index), out_df["NUTS_ID"].nunique(dropna=False))

In [None]:
# NUTS_IDs having at least one row with a missing t2m / tp value after
# aggregation
nan_t2m = out_df.loc[out_df["t2m"].isna(), "NUTS_ID"].unique()
nan_tp = out_df.loc[out_df["tp"].isna(), "NUTS_ID"].unique()
# (count for t2m, count for tp, IDs missing only t2m, IDs missing only tp)
(
    len(nan_t2m),
    len(nan_tp),
    set(nan_t2m).difference(nan_tp),
    set(nan_tp).difference(nan_t2m),
)

In [None]:
# Display the NUTS_IDs that have missing tp values
nan_tp