# Spatial join between regions and population grid

Import necessary libraries and modules

In [None]:
from libadalina_core.readers import geopackage_to_dataframe
import pathlib
import os

Read the GeoPackage files containing the regions and the population grid.

In [None]:
# Local directory where the sample data is stored, read from SAMPLES_DIR
# (an unset variable yields an empty, i.e. relative, base path).
base_path = pathlib.Path(os.environ.get("SAMPLES_DIR", ""))

# Population grid: load the "census2021" layer and keep only the 'T' column
# (the value summed as 'population' later on) together with the geometry.
population_file = base_path / "population-north-italy" / "nord-italia.gpkg"
population = geopackage_to_dataframe(str(population_file), "census2021")
population = population[['T', 'geometry']]

# Regions: load the NUTS layer and keep only the listed attribute columns
# together with the geometry.
regions_file = base_path / "regions" / "NUTS_RG_20M_2024_4326.gpkg"
regions = geopackage_to_dataframe(str(regions_file), "NUTS_RG_20M_2024_4326.gpkg")
regions = regions[["LEVL_CODE", "NUTS_NAME", "CNTR_CODE", "geometry"]]

Import libadalina-core spatial operators for performing spatial joins and aggregations.

In [None]:
from libadalina_core.spatial_operators import spatial_join, JoinType, spatial_aggregation, AggregationType, \
    AggregationFunction

For the sake of this example, filter the regions to select only those that correspond to the provinces of Milan and Cremona.
At this step, `regions` and `filtered_regions` are geopandas DataFrames.

In [None]:
# Build one boolean mask per criterion, then keep the rows satisfying all three.
is_level_3 = regions['LEVL_CODE'] == 3
is_italian = regions['CNTR_CODE'] == "IT"
# Case-insensitive regex match on either province name.
name_matches = regions['NUTS_NAME'].str.contains('Milano|Cremona', case=False)

filtered_regions = regions[is_level_3 & is_italian & name_matches]

Join the dataframes on intersecting geometries, then aggregate the population from the grid dataset that falls within each region.

In [None]:

# Step 1: left spatial join of the selected regions with the population grid.
regions_with_cells = spatial_join(filtered_regions, population, join_type=JoinType.LEFT)

# The join operator renames the geometries, adding the suffixes _left and _right
# to avoid conflicts; restore the region geometry under its plain name.
regions_with_cells = regions_with_cells.withColumnRenamed('geometry_left', 'geometry')

# Step 2: sum column "T" into an output column named 'population'.
# NOTE(review): `proportional='geometry_right'` presumably weights each grid
# cell's value by its overlap with the region -- confirm against the
# libadalina-core documentation.
population_sum = AggregationFunction(
    "T", AggregationType.SUM, 'population', proportional='geometry_right'
)

result = spatial_aggregation(regions_with_cells, aggregate_functions=[population_sum])
result.show(truncate=False)

