# GHS Data Download and Aggregation

This notebook describes the process of downloading and aggregating GHS (Global Human Settlement) data from the [GHSL download page](https://ghsl.jrc.ec.europa.eu/download.php?). The notebook computes the following aggregations for the grid:

- Classification of pixels as rural or urban for the raster.
    - Using 21 and above for urban and 13 and below for rural.
- Fraction of each grid pixel that is rural/urban.
- Find population by grid pixel.

The epoch used is the one closest to the date `27th Feb 2023`, which is 2025.

In [1]:
%load_ext jupyter_black
import pandas as pd
import geopandas as gpd
from pathlib import Path
import os
import requests, zipfile, io
import rasterio
import rioxarray as rxr
from rioxarray.merge import merge_arrays

In [2]:
# Root of the project data, taken from the STORM_DATA_DIR environment variable.
# Indexing os.environ raises a KeyError naming the variable when it is unset;
# os.getenv would return None and Path(None) fails with an opaque TypeError.
base_dir = Path(os.environ["STORM_DATA_DIR"]) / "analysis/02_new_model_input/"

# Inputs: GHS downloads, admin boundaries and the pre-computed grid
input_dir = base_dir / "06_settlement/input/"
shp_input_dir = base_dir / "02_housing_damage/input/"
grid_input_dir = base_dir / "02_housing_damage/output/"

# Output: aggregated settlement / population table
output_dir = base_dir / "06_settlement/output/"

In [3]:
# Locations of the two vector inputs
adm3_path = shp_input_dir / "phl_adminboundaries_candidate_adm3.zip"
grid_path = grid_input_dir / "phl_0.1_degree_grid_land_overlap.gpkg"

# administrative boundaries (adm3 candidate archive)
adm3_shp = gpd.read_file(adm3_path)

# grid that all rasters are aggregated to
grid = gpd.read_file(grid_path)

In [4]:
# Both products live under the same GHSL open-data root
ghsl_ftp_root = "https://jeodpp.jrc.ec.europa.eu/ftp/jrc-opendata/GHSL/"

# Settlement model (SMOD): a single global zip archive
smod_link = (
    ghsl_ftp_root
    + "GHS_SMOD_GLOBE_R2022A/"
    + "GHS_SMOD_P2025_GLOBE_R2022A_54009_1000/V1-0/"
    + "GHS_SMOD_P2025_GLOBE_R2022A_54009_1000_V1_0.zip"
)

# Population (POP): folder holding one zip archive per tile
pop_link = (
    ghsl_ftp_root
    + "GHS_POP_GLOBE_R2022A/"
    + "GHS_POP_P2025_GLOBE_R2022A_54009_100/V1-0/tiles/"
)

## Degree of Urbanisation

### Urban-Rural Classification

In [5]:
# Downloading the whole global data set as it is small
smod_dir = input_dir / "SMOD"
smod_dir.mkdir(parents=True, exist_ok=True)  # make sure the target folder exists

# TLS certificate verification is left at the requests default (enabled);
# the population tiles are fetched from the same host without disabling it.
# The body is read in full via `.content`, so streaming is not requested.
req = requests.get(smod_link)
req.raise_for_status()  # stop on an HTTP error instead of a confusing BadZipFile

with zipfile.ZipFile(io.BytesIO(req.content)) as zObj:
    for fileName in zObj.namelist():
        if fileName.endswith("tif"):
            # ZipFile.read / Path.write_bytes close their handles themselves;
            # only the base name is kept so the tif lands directly in smod_dir.
            (smod_dir / Path(fileName).name).write_bytes(zObj.read(fileName))



In [6]:
# Reading in raster
# Only tif files are considered, in sorted order, so that `file_name[0]` is
# deterministic and cannot be a stray non-raster file in the folder.
file_name = sorted(
    f for f in os.listdir(input_dir / "SMOD") if f.endswith("tif")
)
# The dataset is kept open on purpose: later cells read `smod_raster.crs`.
smod_raster = rasterio.open(input_dir / "SMOD" / file_name[0])
smod_array = smod_raster.read(1)  # first band as a numpy array
smod_array

array([[-200, -200, -200, ..., -200, -200, -200],
       [-200, -200, -200, ..., -200, -200, -200],
       [-200, -200, -200, ..., -200, -200, -200],
       ...,
       [-200, -200, -200, ..., -200, -200, -200],
       [-200, -200, -200, ..., -200, -200, -200],
       [-200, -200, -200, ..., -200, -200, -200]], dtype=int16)

In [11]:
# Observations on the raster values:
#   - no-data pixels are set to -200
#   - water seems to be set to 10
# The raster has to be converted to the same CRS as the grid,
# so first inspect the CRS it currently uses.
smod_raster.crs

CRS.from_wkt('PROJCS["World_Mollweide",GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0],UNIT["Degree",0.0174532925199433]],PROJECTION["Mollweide"],PARAMETER["central_meridian",0],PARAMETER["false_easting",0],PARAMETER["false_northing",0],UNIT["metre",1,AUTHORITY["EPSG","9001"]],AXIS["Easting",EAST],AXIS["Northing",NORTH]]')

In [10]:
# Bounding box of the grid; these values are later passed to clip_box
grid.total_bounds

array([114.25,   4.55, 126.65,  21.15])

In [12]:
# Before converting the CRS and clipping,
# check whether the raster and the grid already share the same CRS
smod_raster.crs == grid.crs

False

In [13]:
# Re-open the same SMOD file with rioxarray. This rebinds `smod_raster`
# (a rasterio dataset until now) so that the `.rio` accessor used for
# reprojecting and clipping is available.
smod_raster = rxr.open_rasterio(input_dir / "SMOD" / file_name[0])
smod_raster.rio.crs

CRS.from_wkt('PROJCS["World_Mollweide",GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0],UNIT["Degree",0.0174532925199433]],PROJECTION["Mollweide"],PARAMETER["central_meridian",0],PARAMETER["false_easting",0],PARAMETER["false_northing",0],UNIT["metre",1,AUTHORITY["EPSG","9001"]],AXIS["Easting",EAST],AXIS["Northing",NORTH]]')

In [14]:
# Reproject the settlement raster into the grid's CRS ...
smod_raster_wgs84 = smod_raster.rio.reproject(dst_crs=grid.crs)

# ... then crop it to the bounding box of the grid
grid_minx, grid_miny, grid_maxx, grid_maxy = grid.total_bounds
smod_raster_wgs84_clip = smod_raster_wgs84.rio.clip_box(
    minx=grid_minx, miny=grid_miny, maxx=grid_maxx, maxy=grid_maxy
)

In [15]:
# One row per grid cell; the fraction columns are filled in by the loop below
smod_grid_vals = pd.DataFrame(
    {
        "id": grid["id"],
        "Centroid": grid["Centroid"],
        "urban": None,
        "rural": None,
        "water": None,
    }
)

for centroid_label in grid.Centroid:
    cell = grid[grid.Centroid == centroid_label]
    cell_rows = cell.index.values

    # SMOD pixels touching this grid cell
    cell_smod = smod_raster_wgs84_clip.rio.clip(
        cell["geometry"], all_touched=True
    )
    pixel_total = cell_smod.count().values

    # Class codes: 21 to 30 count as urban, 11 to 13 as rural, 10 as water
    urban_mask = (cell_smod >= 21) & (cell_smod <= 30)
    rural_mask = (cell_smod >= 11) & (cell_smod <= 13)
    water_mask = cell_smod == 10

    # Share of the cell's pixels falling in each class
    smod_grid_vals.loc[cell_rows, ["urban"]] = (
        urban_mask.sum().values / pixel_total
    )
    smod_grid_vals.loc[cell_rows, ["rural"]] = (
        rural_mask.sum().values / pixel_total
    )
    smod_grid_vals.loc[cell_rows, ["water"]] = (
        water_mask.sum().values / pixel_total
    )

In [17]:
# Preview the last ten grid cells of the urban / rural / water fractions
smod_grid_vals.tail(10)

Unnamed: 0,id,Centroid,urban,rural,water
3716,20513,126.5E_7.3N,0.008264,0.991736,0.0
3717,20514,126.5E_7.2N,0.049587,0.85124,0.099174
3718,20515,126.5E_7.1N,0.066116,0.363636,0.570248
3719,20516,126.5E_7.0N,0.041322,0.024793,0.933884
3720,20676,126.6E_7.7N,0.024793,0.099174,0.876033
3721,20677,126.6E_7.6N,0.07438,0.181818,0.743802
3722,20678,126.6E_7.5N,0.0,0.479339,0.520661
3723,20679,126.6E_7.4N,0.0,0.190083,0.809917
3724,20680,126.6E_7.3N,0.033058,0.297521,0.669421
3725,20681,126.6E_7.2N,0.024793,0.099174,0.876033


In [18]:
# Drop the SMOD rasters now that the per-cell fractions are in smod_grid_vals
del smod_raster, smod_array, smod_raster_wgs84, smod_raster_wgs84_clip

## Population

### Total Population by grid

In [58]:
# downloading the population data
# selected from here: https://ghsl.jrc.ec.europa.eu/download.php?ds=pop
pop_dir = input_dir / "POP"
pop_dir.mkdir(parents=True, exist_ok=True)  # make sure the target folder exists

# Tile identifiers to download, and the zip archive name for each of them
phl_boxes = ["R7_C30", "R7_C31", "R8_C30", "R8_C31", "R9_C30", "R9_C31"]
file_list = [
    "GHS_POP_P2025_GLOBE_R2022A_54009_100_V1_0_" + patt + ".zip"
    for patt in phl_boxes
]
for file in file_list:
    req = requests.get(pop_link + file, allow_redirects=True)
    req.raise_for_status()  # stop on an HTTP error instead of a BadZipFile
    with zipfile.ZipFile(io.BytesIO(req.content)) as zObj:
        for fileName in zObj.namelist():
            if fileName.endswith("tif"):
                # ZipFile.read / Path.write_bytes close their handles themselves;
                # only the base name is kept so the tif lands directly in pop_dir.
                (pop_dir / Path(fileName).name).write_bytes(
                    zObj.read(fileName)
                )

In [38]:
# opening files and merging them
# Only tif files are read, in sorted order, so stray files in the folder are
# ignored and the tile order handed to the merge step is reproducible.
fileList = sorted(
    f for f in os.listdir(input_dir / "POP") if f.endswith("tif")
)
mosaic_arrays = []
for file in fileList:
    # Bring each tile into the grid's CRS and crop it to the grid extent
    rast = (
        rxr.open_rasterio(input_dir / "POP" / file)
        .rio.reproject(grid.crs)
        .rio.clip_box(*grid.total_bounds)
    )
    mosaic_arrays.append(rast)

In [39]:
# Merge the per-tile rasters into a single mosaic
mosaic_raster = merge_arrays(mosaic_arrays)

In [40]:
# Check the CRS of the merged raster
mosaic_raster.rio.crs

CRS.from_epsg(4326)

In [41]:
# The individual tiles are no longer needed once they have been merged
del mosaic_arrays

In [42]:
# Copy of the mosaic under the name used by the aggregation loop
# NOTE(review): the copy duplicates the raster in memory; confirm it is needed
pop_raster_wgs84_clip = mosaic_raster.copy()

In [43]:
# One row per grid cell; total_pop is filled in by the loop below
pop_grid_vals = pd.DataFrame(
    {
        "id": grid["id"],
        "Centroid": grid["Centroid"],
        "total_pop": None,
    }
)

for centroid_label in grid.Centroid:
    cell = grid[grid.Centroid == centroid_label]

    # Population pixels touching this grid cell
    cell_pop = pop_raster_wgs84_clip.rio.clip(
        cell["geometry"], all_touched=True, from_disk=True
    ).squeeze()

    # Negative values are masked out before summing
    valid_pop = cell_pop.where(cell_pop >= 0)
    pop_grid_vals.loc[cell.index.values, ["total_pop"]] = (
        valid_pop.sum().values
    )

In [44]:
# Inspect the per-cell population totals
pop_grid_vals

Unnamed: 0,id,Centroid,total_pop
0,101,114.3E_11.1N,0.0
1,4475,116.9E_7.9N,0.0
2,4639,117.0E_8.2N,356.354257
3,4640,117.0E_8.1N,7565.226955
4,4641,117.0E_8.0N,17301.534394
5,4642,117.0E_7.9N,4279.533358
6,4643,117.0E_7.8N,4987.95843
7,4805,117.1E_8.3N,138.250653
8,4806,117.1E_8.2N,31498.222855
9,4807,117.1E_8.1N,5033.615869


In [None]:
# Join the settlement fractions and the population totals per grid cell
merged_ghs_df = pd.merge(
    smod_grid_vals,
    pop_grid_vals,
    how="inner",
    on=["id", "Centroid"],
)

In [None]:
# Save the merged table as CSV, without the index column
ghs_output_path = output_dir / "ghs_rural_urban_pop.csv"
merged_ghs_df.to_csv(ghs_output_path, index=False)