# How to make an LGBM model


In [None]:
from pathlib import Path

import geopandas as gpd
import ray
import requests
import shapely

from eolearn.core import FeatureType

### Preparations

In [None]:
# root directory of the project; all files written by this notebook are saved beneath it
PROJECT_FOLDER = Path("project")

In [None]:
# download reference data
REFERENCE_PATH = PROJECT_FOLDER / "input-data" / "reference.gpkg"
REFERENCE_PATH.parent.mkdir(parents=True, exist_ok=True)

url = "http://eo-learn.sentinel-hub.com.s3.eu-central-1.amazonaws.com/land_use_10class_reference_slovenia_partial.gpkg"
# the timeout keeps the cell from hanging indefinitely on a stalled connection
r = requests.get(url, allow_redirects=True, timeout=60)
# fail loudly on an HTTP error instead of silently saving an error page as the GeoPackage
r.raise_for_status()
REFERENCE_PATH.write_bytes(r.content)

In [None]:
# establish AOI and TOI
reference_gdf = gpd.read_file(REFERENCE_PATH)

# the AOI is the bounding box of the whole reference dataset, expressed in the reference CRS
aoi_bounds = reference_gdf.total_bounds
min_x, min_y, max_x, max_y = aoi_bounds
aoi_geometry = shapely.geometry.box(min_x, min_y, max_x, max_y)
aoi_gdf = gpd.GeoDataFrame(geometry=[aoi_geometry], crs=reference_gdf.crs)

# time of interest given as [start date, end date]
TOI = ["2019-01-01", "2019-12-31"]

In [None]:
# store the AOI as a GeoJSON file inside the input-data folder of the project
aoi_path = PROJECT_FOLDER / "input-data" / "aoi.geojson"
aoi_gdf.to_file(aoi_path)

### Define project specifics

In [None]:
# define managers

# storage manager settings: project root plus a mapping of folder keys to folder names
storage_config = dict(
    manager="eogrow.core.storage.StorageManager",
    project_folder=str(PROJECT_FOLDER),
    structure=dict(
        data="data",
        mosaicked_data="mosaicked_data",
        reference="reference",
        samples="samples",
        training_data="training_data",
        models="models",
        predictions="predictions",
    ),
)

# area manager settings: AOI geometry file name and patch size
area_config = dict(
    manager="eogrow.core.area.UtmZoneAreaManager",
    geometry_filename="aoi.geojson",
    patch=dict(size_x=10000, size_y=10000),  # 10km x 10km, which will be 1000px x 1000px
)

# logging manager settings: logs are both saved and shown
logging_config = dict(
    manager="eogrow.core.logging.LoggingManager",
    save_logs=True,
    show_logs=True,
)

# shared manager section that is unpacked into every pipeline config below
managers = dict(area=area_config, storage=storage_config, logging=logging_config)

### Initialize cluster

In [None]:
# restrict number of CPUs to avoid memory issues; `ignore_reinit_error` lets this cell be
# re-run in the same kernel without raising because Ray is already initialised
ray.init(num_cpus=4, ignore_reinit_error=True)

## Download and process imagery

In [None]:
from eogrow.pipelines.download import DownloadPipeline

# bands to download; keep the order stable, later cells look bands up by their position in this list
BAND_NAMES = ["B02", "B03", "B04", "B08", "B11", "B12"]

# settings for the download pipeline, built on top of the shared manager configs
download_config = {
    **managers,
    "output_folder_key": "data",
    "bands_feature_name": "BANDS",
    "bands": BAND_NAMES,
    "additional_data": [(FeatureType.MASK, "CLM"), (FeatureType.MASK, "dataMask")],
    "data_collection": "SENTINEL2_L2A",
    "resolution": 10,
    "maxcc": 0.2,
    "time_period": TOI,
    "use_dn": True,
    "threads_per_worker": 2,  # to avoid overloading SH
}

download_pipeline = DownloadPipeline.from_raw_config(download_config)

In [None]:
# run the download; the following cells read the data this step writes to the "data" folder
# NOTE(review): presumably requires configured Sentinel Hub credentials - confirm before running
download_pipeline.run()

We want to remove any invalid points from the data series and make the series uniform. For that, `eo-grow` offers mosaicking. We also calculate the NDVI in this step.

In [None]:
from eogrow.pipelines.features import MosaickingFeaturesPipeline

# settings for the mosaicking pipeline: reads "BANDS" from the "data" folder and writes
# the "FEATURES" feature into the "mosaicked_data" folder
mosaicking_config = {
    **managers,
    "input_folder_key": "data",
    "bands_feature_name": "BANDS",
    "output_folder_key": "mosaicked_data",
    "output_feature_name": "FEATURES",
    "data_preparation": {
        "cloud_mask_feature_name": "CLM",
        "valid_data_feature_name": "dataMask",
        "validity_threshold": 0.8,
    },
    # NDVI is defined by the positions of B08 and B04 within the downloaded band list
    "ndis": {"NDVI": [BAND_NAMES.index(band) for band in ("B08", "B04")]},
    "mosaicking": {"time_period": TOI, "n_mosaics": 12},
}

mosaicking_pipeline = MosaickingFeaturesPipeline.from_raw_config(mosaicking_config)

In [None]:
# execute the mosaicking pipeline built from `mosaicking_config`
mosaicking_pipeline.run()

## Reference data

In [None]:
from eogrow.pipelines.rasterize import RasterizePipeline

# settings for rasterizing the reference vector file into the timeless mask "LULC";
# raster values are taken from the "lulcid" column and 0 marks pixels without data
rasterization_config = {
    **managers,
    "input_folder_key": "input_data",
    "output_folder_key": "reference",
    "vector_input": "reference.gpkg",
    "output_feature": (FeatureType.MASK_TIMELESS, "LULC"),
    "raster_values_column": "lulcid",
    "resolution": 10,
    "no_data_value": 0,
}

rasterization_pipeline = RasterizePipeline.from_raw_config(rasterization_config)

In [None]:
# execute the rasterization pipeline built from `rasterization_config`
rasterization_pipeline.run()

## Sampling data for model

In [None]:
from eogrow.pipelines.sampling import FractionSamplingPipeline

# settings for the sampling pipeline; stored under its own name so that the rasterization
# config defined earlier is not overwritten
sampling_config = dict(
    **managers,
    output_folder_key="samples",
    # features to sample, grouped by the storage folder key they are read from
    apply_to={
        "mosaicked_data": {"data": ["FEATURES"]},
        "reference": {"mask_timeless": ["LULC"]},
    },
    seed=42,
    sampling_feature_name="LULC",
    fraction_of_samples=0.5,
    exclude_values=[0],  # 0 is the no-data value used when rasterizing the reference
)

sampling_pipeline = FractionSamplingPipeline.from_raw_config(sampling_config)

In [None]:
# execute the sampling pipeline configured in the previous cell
sampling_pipeline.run()

## Training the model

In [None]:
from eogrow.pipelines.merge_samples import MergeSamplesPipeline

# settings for merging the sampled "FEATURES" and "LULC" features from the "samples"
# folder into the "training_data" folder
merging_config = {
    **managers,
    "input_folder_key": "samples",
    "output_folder_key": "training_data",
    "features_to_merge": [("data", "FEATURES"), ("mask_timeless", "LULC")],
}

merge_pipeline = MergeSamplesPipeline.from_raw_config(merging_config)

In [None]:
# execute the sample-merging pipeline built from `merging_config`
merge_pipeline.run()

In [None]:
from eogrow.pipelines.training import ClassificationTrainingPipeline

# settings for the training pipeline; kept under its own name instead of reusing
# `merging_config`, which belongs to the sample-merging step
training_config = dict(
    **managers,
    input_folder_key="training_data",
    model_folder_key="models",
    model_filename="lulc_model",
    train_features=["FEATURES.npy"],
    train_reference="LULC.npy",
    train_test_split=dict(train_size=0.8),
)

training_pipeline = ClassificationTrainingPipeline.from_raw_config(training_config)

In [None]:
# execute the classification training pipeline configured in the previous cell
training_pipeline.run()

## Sanity check

In [None]:
from eogrow.pipelines.prediction import ClassificationPredictionPipeline

# settings for the prediction pipeline; kept under its own name instead of reusing
# `merging_config`, which belongs to the sample-merging step
prediction_config = dict(
    **managers,
    input_folder_key="mosaicked_data",
    input_features=[("data", "FEATURES")],
    output_folder_key="predictions",
    output_feature_name="predicted_LULC",
    model_folder_key="models",
    model_filename="lulc_model",  # same filename the training step was configured with
)

prediction_pipeline = ClassificationPredictionPipeline.from_raw_config(prediction_config)

In [None]:
# execute the classification prediction pipeline configured in the previous cell
prediction_pipeline.run()

In [None]:
from eolearn.core import EOPatch

# name of the single patch that is inspected in the sanity check
patch_name = "eopatch-id-05-col-1-row-1"

# the same patch is loaded from the reference folder and from the predictions folder
reference_dir = PROJECT_FOLDER / "reference"
predictions_dir = PROJECT_FOLDER / "predictions"
ref_patch = EOPatch.load(reference_dir / patch_name)
predicted_patch = EOPatch.load(predictions_dir / patch_name)

# pull out the two timeless masks that are compared below
reference = ref_patch.mask_timeless["LULC"]
prediction = predicted_patch.mask_timeless["predicted_LULC"]

In [None]:
from matplotlib.colors import BoundaryNorm, ListedColormap

# one colour per class value, indexed by the raster value (11 entries for values 0..10)
lulc_cmap = ListedColormap(
    [
        "#ffffff",
        "#ffff00",
        "#054907",
        "#ffa500",
        "#806000",
        "#069af3",
        "#95d0fc",
        "#967bb6",
        "#dc143c",
        "#a6a6a6",
        "#000000",
    ],
    name="lulc_cmap",
)
# N colours need N + 1 boundaries (-0.5, 0.5, ..., N - 0.5) so that every integer class value
# gets its own bin; with one boundary fewer BoundaryNorm interpolates the colour indices and
# the last classes are drawn with the wrong colour
lulc_norm = BoundaryNorm([x - 0.5 for x in range(lulc_cmap.N + 1)], lulc_cmap.N)

In [None]:
import matplotlib.pyplot as plt

# three panels side by side: reference, prediction, and the pixels where the two differ
figs, axs = plt.subplots(nrows=1, ncols=3, figsize=(15, 6))

# the two class maps share the LULC colour map and norm
for ax, class_map in zip(axs[:2], (reference, prediction)):
    ax.imshow(class_map, cmap=lulc_cmap, norm=lulc_norm, interpolation="none")

# boolean mismatch mask, drawn with matplotlib's default colour map
axs[2].imshow(reference != prediction, interpolation="none")