# Prepare land cover for the precursor run


In [None]:
%load_ext dotenv
%dotenv
%load_ext autoreload
%autoreload 2

In [None]:
from src.config import get_config, get_dask_cluster
from src.job_generation import dict_to_intarraymap

# Project configuration object; the data paths used throughout this notebook
# (config.path.data.raw / config.path.data.interim) are read from it.
config = get_config()
# Dask cluster and client handles built from the configuration.
# NOTE(review): presumably this starts (or connects to) the cluster that the
# chunked raster operations below run on — confirm in src.config.
cluster, client = get_dask_cluster(config)

In [None]:
from functools import partial
from pathlib import Path

import dask.array
import numpy as np
import pandas as pd
import xarray as xr
import yaml

## Preprocess Corine 100 m raster dataset


In [None]:
# Open the Corine Land Cover 2018 GeoTIFF lazily through the rasterio engine.
# Chunking: let dask choose the chunk width along x and keep every column
# whole along y (-1 means a single chunk spanning the full dimension).
clc_raster = xr.open_dataset(
    config.path.data.raw + "corine_land_cover/U2018_CLC2018_V2020_20u1.tif",
    engine="rasterio",
    chunks={"x": "auto", "y": -1},
)

We use a section of central Europe as our representative land area (area of interest, AOI).


In [None]:
# Area of interest: a latitude/longitude bounding box in degrees, together
# with the CRS the bounds are expressed in. The keys double as keyword
# arguments when the dict is unpacked into a clipping call.
aoi = dict(
    miny=45.0,
    maxy=55.0,
    minx=0.0,
    maxx=20.0,
    crs="EPSG:4326",
)

In [None]:
# Crop the raster to the AOI bounding box. The dict is unpacked into keyword
# arguments, so its "crs" entry ("EPSG:4326") is handed to clip_box together
# with the min/max bounds.
clc_raster = clc_raster.rio.clip_box(**aoi)

Next, compute the prevalence of each land cover class in the area of interest.


In [None]:
# Replace missing pixels with the sentinel -127 and store the band as int8.
# After this cast the array can no longer hold NaN, so no further fillna is
# needed before counting.
clc_raster["band_data"] = clc_raster["band_data"].fillna(-127).astype(np.int8)

# Unique class values and how often each occurs inside the clipped raster.
# Both outputs are built lazily and then evaluated in ONE dask.compute call,
# so the shared part of the task graph (reading and scanning the raster) is
# executed once instead of once per output / per conversion.
lc_classes, lc_class_counts = dask.compute(
    *dask.array.unique(clc_raster["band_data"].data, return_counts=True)
)

# Downstream cells expect plain NumPy arrays.
lc_classes = np.asarray(lc_classes)
lc_class_counts = np.asarray(lc_class_counts)

Get the classification labels from a legend file. Shift is required to align the data with unique() output.


In [None]:
# Read the header-less legend file: one row per class, holding the class ID,
# its RGBA colour and the class label.
legend_file = (
    config.path.data.raw + "corine_land_cover/CLC2018_CLC2018_V2018_20_QGIS.txt"
)
legend_columns = ["ID", "R", "G", "B", "A", "Class"]
legend = pd.read_csv(legend_file, names=legend_columns)

# Move every row down by one position: row 0 becomes empty and the last
# legend row falls off the end.
legend = legend.shift()

# The freed first row stands for the -127 fill value used for missing pixels.
legend.loc[0, "ID"] = -127
legend["ID"] = legend["ID"].astype(int)

Finally, merge the classes with the counts.


In [None]:
# Put the pixel counts next to the legend rows and index the table by class ID.
# NOTE(review): the two tables are matched purely by row position, not by
# class value — this presumably relies on every legend class occurring in the
# AOI, in the same order as the sorted unique() output; confirm.
counts_column = pd.Series(lc_class_counts, name="Count")
lcs = pd.concat([legend, counts_column], axis="columns").set_index("ID")
lcs

## Land cover class mapping

The Corine classification needs to be mapped to the PALM-LSM classification, which follows that of H-TESSEL (ECMWF's IFS). We read the mapping from a config file, separately for vegetation and water surfaces. The latter is not technically categorized, but is needed for computing surface fractions. Note that the water surfaces in Corine include only inland waters and coastal seawater, thus the real water fraction in our AOI is significantly larger. However, this is fine, as we do not really want to simulate a coastal/marine boundary layer, but rather a representative land boundary layer with some contribution from water surfaces.


In [None]:
# Load the Corine -> LSM class mapping from its YAML file.
mapping_path = config.path.data.raw + "corine_land_cover/corine_to_lsm_mapping.yml"
with open(mapping_path) as stream:
    mapping = yaml.safe_load(stream)

# One lookup per surface type, built from the matching top-level YAML section.
vegetation_mapping, water_mapping = (
    dict_to_intarraymap(mapping[section]) for section in ("vegetation", "water")
)

Apply the mapping and count occurrence and fractions for the target categories.


In [None]:
# Relabel every Corine class with its target vegetation type; classes without
# a mapping entry are collected under the placeholder ID -127.
lcs_vegetation = lcs[["Count"]].copy()
lcs_vegetation.index = lcs_vegetation.index.map(
    lambda clc_id: vegetation_mapping.get(clc_id, -127)
)

# Sum the pixel counts per target type and discard the placeholder bin.
lcs_vegetation = lcs_vegetation.groupby("ID").sum().drop(index=-127)

# Share of each vegetation type among all mapped vegetation pixels.
vegetation_total = lcs_vegetation["Count"].sum()
lcs_vegetation["Fraction"] = lcs_vegetation["Count"] / vegetation_total
lcs_vegetation

In [None]:
# Relabel every Corine class with its target water type; classes without a
# mapping entry are collected under the placeholder ID -127.
lcs_water = lcs[["Count"]].copy()
lcs_water.index = lcs_water.index.map(
    lambda clc_id: water_mapping.get(clc_id, -127)
)

# Sum the pixel counts per target type and discard the placeholder bin.
lcs_water = lcs_water.groupby("ID").sum().drop(index=-127)

# Share of each water type among all mapped water pixels.
water_total = lcs_water["Count"].sum()
lcs_water["Fraction"] = lcs_water["Count"] / water_total
lcs_water

In [None]:
# Total pixel count per surface type and each type's share of the combined
# vegetation + water area.
# The table is built directly from the two totals so that both columns get a
# numeric dtype; filling an empty frame cell by cell would leave them as
# generic `object` columns.
surface_fraction = pd.DataFrame(
    {
        "Count": [
            lcs_vegetation["Count"].sum(),
            lcs_water["Count"].sum(),
        ]
    },
    index=pd.Index(["Vegetation", "Water"], name="Surface type"),
)
surface_fraction["Fraction"] = (
    surface_fraction["Count"] / surface_fraction["Count"].sum()
)
surface_fraction

These tables are stored for later use in job generation (see "Storing the interim data" below).


## Soil type

These are derived from the ERA5 data. We take the same area of interest and compute the prevalence of each soil type.


In [None]:
# Open the ERA5-Land soil type field (GRIB, read through cfgrib).
soil_type_file = config.path.data.raw + "era5-land/era5-land_soil_type.grib"
era5_st = xr.open_dataset(soil_type_file, engine="cfgrib")

# Keep only the grid points inside the AOI bounding box; coordinate labels
# outside the box are dropped rather than left behind as masked values.
era5_lon = era5_st.longitude
era5_lat = era5_st.latitude
inside_aoi = (
    (era5_lon >= aoi["minx"])
    & (era5_lon <= aoi["maxx"])
    & (era5_lat >= aoi["miny"])
    & (era5_lat <= aoi["maxy"])
)
era5_st = era5_st.where(inside_aoi, drop=True)

# Store the soil type codes as small integers.
# NOTE(review): any NaN still present in `slt` at this point has no defined
# int8 value — confirm the field has no missing cells inside the AOI.
era5_st["slt"] = era5_st["slt"].astype(np.int8)

In [None]:
# Count how many grid cells carry each soil type code and tabulate the result
# with the code as index ("ID") and a single "Count" column.
soil_ids, soil_counts = np.unique(era5_st["slt"], return_counts=True)
soil_type = pd.DataFrame(
    {"Count": soil_counts},
    index=pd.Index(soil_ids, name="ID"),
)
soil_type

## Storing the interim data


In [None]:
# Write all derived tables as CSV files into <interim>/land_cover, creating
# the directory (and any missing parents) first.
data_path = Path(config.path.data.interim) / "land_cover"
data_path.mkdir(parents=True, exist_ok=True)

interim_tables = {
    "vegetation_fractions.csv": lcs_vegetation,
    "water_fractions.csv": lcs_water,
    "surface_fractions.csv": surface_fraction,
    "soil_fractions.csv": soil_type,
}
for file_name, table in interim_tables.items():
    table.to_csv(data_path / file_name)