# Post Processing

This notebook demonstrates how to use the `history.postprocessing` module. For more details on the post-processing, see the [README](README.md).


In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. `history.postprocessing`) are picked up without restarting
# the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import history.postprocessing as pp
from pathlib import Path
import pandas as pd

## ⚙️ General Settings

In [None]:
# Level 1: top-level folders, all located under a single output root.
output_root = Path("/mnt/summer/USERS/DEHECQA/history/output")
raw_dir = output_root / "raw"
extracted_dir = output_root / "extracted"
proc_dir = output_root / "processing"
plot_dir = output_root / "plots"

# Level 2: sub-folders of the processing directory.
symlinks_dir = proc_dir / "symlinks"
raw_dems_dir = proc_dir / "raw_dems"
coreg_dems_dir = proc_dir / "coregistered_dems"
ddems_dir = proc_dir / "ddems"
before_coreg_ddems_dir = ddems_dir / "before_coregistration"
after_coreg_ddems_dir = ddems_dir / "after_coregistration"
std_dems_dir = proc_dir / "std_dems"

# Flags and resources forwarded to the pipeline calls below.
OVERWRITE = False
# Set to True to skip the actual point-cloud -> DEM processing (point2dem).
DRY_RUN = False
PDAL_EXEC_PATH = "/home/godinlu/micromamba/envs/pdal/bin/pdal"
MAX_WORKERS = 4

## ⚙️ Reference Data Settings
For each (site, dataset) pair, assign three reference files: the reference DEM, its mask, and the landcover raster.

In [None]:
# Reference files per site and spatial extent: the "zoom" set is used for the
# aerial dataset, the "large" set for the KH-9 datasets.
casa_grande_zoom_refs = {
    "ref_dem": "/mnt/summer/USERS/DEHECQA/history/output/aux_data/casagrande_ref_dem_zoom_5m.tif",
    "ref_dem_mask": "/mnt/summer/USERS/DEHECQA/history/output/aux_data/casagrande_ref_dem_zoom_5m_mask.tif",
    "landcover": "/mnt/summer/USERS/DEHECQA/history/output/aux_data/casagrande_landcover_zoom.tif",
}
casa_grande_large_refs = {
    "ref_dem": "/mnt/summer/USERS/DEHECQA/history/output/aux_data/casagrande_ref_dem_large.tif",
    "ref_dem_mask": "/mnt/summer/USERS/DEHECQA/history/output/aux_data/casagrande_ref_dem_large_mask.tif",
    "landcover": "/mnt/summer/USERS/DEHECQA/history/output/aux_data/casagrande_landcover_large.tif",
}
iceland_zoom_refs = {
    "ref_dem": "/mnt/summer/USERS/DEHECQA/history/output/aux_data/iceland_ref_dem_zoom_5m.tif",
    "ref_dem_mask": "/mnt/summer/USERS/DEHECQA/history/output/aux_data/iceland_ref_dem_zoom_5m_mask.tif",
    "landcover": "/mnt/summer/USERS/DEHECQA/history/output/aux_data/iceland_landcover_zoom.tif",
}
iceland_large_refs = {
    "ref_dem": "/mnt/summer/USERS/DEHECQA/history/output/aux_data/iceland_ref_dem_large.tif",
    "ref_dem_mask": "/mnt/summer/USERS/DEHECQA/history/output/aux_data/iceland_ref_dem_large_mask.tif",
    "landcover": "/mnt/summer/USERS/DEHECQA/history/output/aux_data/iceland_landcover_large.tif",
}

# Keys are (site, dataset) pairs.
references_data_mapping = {
    ("casa_grande", "aerial"): casa_grande_zoom_refs,
    ("casa_grande", "kh9mc"): casa_grande_large_refs,
    ("iceland", "aerial"): iceland_zoom_refs,
    ("iceland", "kh9mc"): iceland_large_refs,
}

# KH-9 PC reuses the very same reference entry (same dict object) as KH-9 MC.
for ref_site in ("casa_grande", "iceland"):
    references_data_mapping[(ref_site, "kh9pc")] = references_data_mapping[(ref_site, "kh9mc")]

references_data = pp.ReferencesData(references_data_mapping)

### 🧩 Step 1 — Extract archives

In [None]:
# Step 1: extract the submission archives from `raw_dir` into `extracted_dir`.
pp.pipeline.uncompress_all_submissions(
    raw_dir,
    extracted_dir,
    OVERWRITE,
    MAX_WORKERS,
)

### 🧩 Step 2 — Analyse submissions and create symlinks

In [None]:
# Step 2: index the extracted submissions and link their files under `symlinks_dir`.
pp.pipeline.index_submissions_and_link_files(
    extracted_dir,
    symlinks_dir,
)

Visualize all files found for each submission.

In [None]:
# One entry per item directly under the symlinks folder.
symlinks_directories = [*symlinks_dir.iterdir()]
pp.viz.visualize_files_presence_map(symlinks_directories)

### 🧩 Step 4 — Convert point clouds to DEMs

In [None]:
# Step 4: every linked dense point cloud is an input of the DEM conversion.
dense_pointclouds_dir = symlinks_dir / "dense_pointclouds"
pointcloud_files = [*dense_pointclouds_dir.iterdir()]

pp.pipeline.process_pointclouds_to_dems(
    pointcloud_files, raw_dems_dir, references_data,
    PDAL_EXEC_PATH, OVERWRITE, DRY_RUN, MAX_WORKERS,
)

For submissions where the point-cloud-to-DEM conversion does not work, manually add the DEM provided with the submission. Each provided DEM is reprojected onto the corresponding reference DEM.

In [None]:
# DEMs supplied directly with the submissions, used where the point-cloud to
# DEM conversion did not work.
# The paths are built from `symlinks_dir` rather than hard-coded, so they stay
# consistent with the directory settings if the processing root is changed.
provided_dems_dir = symlinks_dir / "dems"
provided_dem_names = [
    "CoSP_CG_PC_PP_CY_GN_PN_MN_dem.tif",
    "CoSP_IL_PC_PP_CY_GN_PN_MN_dem.tif",
    "HIATUS_CG_AI_PP_CY_GY_PN_MN_dem.tif",
    "HIATUS_CG_AI_PP_CY_GY_PN_MY_dem.tif",
    "JB_CG_AI_PP_CY_GN_PN_MN_3m_EPSG4326_zmae_DEM.tif",
    "JB_CG_MC_PP_CY_GN_PN_MN_20m_EPSG4326_zmae_DEM.tif",
    "JB_IL_AI_PP_CY_GN_PN_MN_5m_EPSG4326_zmae_DEM.tif",
    "JB_IL_MC_PP_CY_GN_PN_MN_20m_EPSG4326_zmae_DEM.tif",
]

# Kept as plain strings, exactly like the previous hard-coded list.
dem_files = [str(provided_dems_dir / dem_name) for dem_name in provided_dem_names]

pp.pipeline.add_provided_dems(dem_files, raw_dems_dir, references_data)

### 🧩 Step 5 — Coregister DEMs

In [None]:
# Step 5: coregister every raw DEM (files matching "*-DEM.tif").
pp.pipeline.coregister_dems(
    raw_dems_dir.glob("*-DEM.tif"),
    coreg_dems_dir,
    references_data,
    OVERWRITE,
)

For each submission, visualize all files found as well as the generated raw DEMs and coregistered DEMs.

In [None]:
# Same presence map as before, extended with the raw and coregistered DEM folders.
symlinks_directories = [*symlinks_dir.iterdir(), raw_dems_dir, coreg_dems_dir]
pp.viz.visualize_files_presence_map(symlinks_directories)

### 🧩 Step 6 — Generate Differential DEMs (DDEMs)


In [None]:
# Step 6: DDEMs are generated twice, from the raw DEMs ("before") and from the
# coregistered DEMs ("after"), each into its own output folder.
for dems_source_dir, ddems_output_dir in (
    (raw_dems_dir, before_coreg_ddems_dir),
    (coreg_dems_dir, after_coreg_ddems_dir),
):
    pp.pipeline.generate_ddems(
        dems_source_dir.glob("*-DEM.tif"),
        ddems_output_dir,
        references_data,
        OVERWRITE,
        MAX_WORKERS,
    )

### 🧩 Step 7 — Compute general statistics

| Data Type          | Directory / Source                   | Function Used                     | Prefix                  |
|-------------------|------------------------------------|----------------------------------|------------------------|
| Raw DEMs           | raw_dems                           | compute_dems_statistics_df       | raw_dem_               |
| Coregistered DEMs  | coregistered_dems                   | compute_dems_statistics_df       | coreg_dem_             |
| DDEMs (Before)     | ddems/before_coregistration         | compute_dems_statistics_df       | ddem_before_           |
| DDEMs (After)      | ddems/after_coregistration          | compute_dems_statistics_df       | ddem_after_            |
| Dense Point Clouds | symlinks/dense_pointclouds          | compute_pcs_statistics_df        | dense_pointcloud_      |
| Sparse Point Clouds| symlinks/sparse_pointclouds         | compute_pcs_statistics_df        | sparse_pointcloud_     |
| Coregistration Shifts | coregistered_dems                | get_coregistration_statistics_df |                       |


In [None]:
# Raster inputs, keyed by the column prefix used for their statistics.
dems_dict = {
    "raw_dem_": raw_dems_dir.glob("*-DEM.tif"),
    "coreg_dem_": coreg_dems_dir.glob("*-DEM.tif"),
    "ddem_before_": before_coreg_ddems_dir.glob("*-DDEM.tif"),
    "ddem_after_": after_coreg_ddems_dir.glob("*-DDEM.tif"),
}
# Point-cloud inputs, keyed the same way.
pcs_dict = {
    "dense_pointcloud_": (symlinks_dir / "dense_pointclouds").iterdir(),
    "sparse_pointcloud_": (symlinks_dir / "sparse_pointclouds").iterdir(),
}

dem_dfs = []
for stats_prefix, stats_files in dems_dict.items():
    dem_dfs.append(pp.stats.compute_dems_statistics_df(stats_files, stats_prefix, MAX_WORKERS))

pc_dfs = []
for stats_prefix, stats_files in pcs_dict.items():
    pc_dfs.append(pp.stats.compute_pcs_statistics_df(stats_files, stats_prefix))

shifts_df = pp.stats.get_coregistration_statistics_df(coreg_dems_dir.glob("*-DEM.tif"))

# Stack every statistics table, then collapse rows sharing the same index
# label, keeping the first non-null value per column.
all_stats_frames = dem_dfs + pc_dfs + [shifts_df]
combined_df = pd.concat(all_stats_frames).groupby(level=0).first()

combined_df.to_csv(proc_dir / "global_statistics.csv")


Add an `inliers` flag for each submission. The outlier list below is hard-coded; it is based on the NMAD of the altitude difference from the reference DEM.

In [None]:

# Flag every submission as inlier/outlier from a manually curated outlier list
# and save the flag, indexed by submission code, to `inliers.csv`.
df = pd.read_csv(proc_dir / "global_statistics.csv", index_col="code")

outlier_ids = [
    "HIATUS_CG_AI_PP_CY_GY_PN_MN", "HIATUS_CG_AI_PP_CY_GY_PN_MY",
    "HSfM_CG_AI_PP_CY_GN_PY_MN",
    "fdahle_CG_AI_PP_CN_GY_PN_MN", "fdahle_CG_AI_PP_CY_GN_PN_MN",
    "lkugl_CG_AI_PP_CN_GN_PY_MN_V1", "lkugl_CG_AI_PP_CN_GN_PY_MN_V2", "lkugl_CG_AI_PP_CN_GN_PY_MN_V3",
    "lpierm_CG_AI_PP_CY_GY_PN_MN",
    "fdahle_IL_AI_PP_CY_GN_PN_MN", "fdahle_IL_AI_PP_CY_GY_PN_MY".replace("_MY", "_MN"),
]

# Membership test instead of `df.loc[outlier_ids, "inliers"] = False`: the
# label-based assignment raises KeyError as soon as one listed code is absent
# from the statistics table (e.g. a submission not processed in this run).
df["inliers"] = ~df.index.isin(outlier_ids)

# Report listed codes that are not in the table so typos do not go unnoticed.
missing_outlier_ids = sorted(set(outlier_ids) - set(df.index))
if missing_outlier_ids:
    print(f"Warning: outlier ids not found in global_statistics.csv: {missing_outlier_ids}")

df["inliers"].to_csv(proc_dir / "inliers.csv")

### 🧩 Step 8 — Compute landcover-based statistics

In [None]:
# Step 8: landcover statistics computed on the DDEMs obtained after coregistration.
landcover_df = pp.stats.compute_landcover_statistics(
    after_coreg_ddems_dir.glob("*-DDEM.tif"),
    references_data,
    MAX_WORKERS,
)
# The index is not written to the CSV.
landcover_df.to_csv(proc_dir / "landcover_statistics.csv", index=None)

### 🧩 Step 9 — Generate STD DEMs

Generate, for each (site, dataset) pair, two STD DEMs: one from all DEMs and one from the inlier DEMs only.

In [None]:
# Reload the statistics table and the inlier flags, both indexed by "code".
df: pd.DataFrame = pd.read_csv(proc_dir / "global_statistics.csv", index_col="code")
inliers = pd.read_csv(proc_dir / "inliers.csv", index_col="code")["inliers"]

for (site, dataset), group in df.groupby(["site", "dataset"]):
    # Rows of this pair that are flagged as inliers.
    group_inliers = group.loc[inliers]

    # Coregistered DEM paths; rows without one are dropped.
    dem_files = group["coreg_dem_file"].dropna().to_list()
    dem_files_inliers = group_inliers["coreg_dem_file"].dropna().to_list()

    # Two outputs per pair: all DEMs first, then inliers only.
    for std_inputs, std_suffix in ((dem_files, ""), (dem_files_inliers, "_inliers")):
        pp.pipeline.create_std_dem(
            std_inputs,
            std_dems_dir / f"std_dem_{site}_{dataset}{std_suffix}.tif",
            OVERWRITE,
        )

### 🧩 Step 10 — Compute landcover-based statistics on STD DEMs

In [None]:
# Step 10: landcover statistics for every STD DEM raster in `std_dems_dir`.
std_lc_df = pp.stats.compute_landcover_statistics_on_std_dems(
    std_dems_dir.glob("*.tif"),
    references_data,
    MAX_WORKERS,
)
# The index is not written to the CSV.
std_lc_df.to_csv(proc_dir / "std_landcover_statistics.csv", index=None)

### 🧩 Step 11 — Generate visualizations

In [None]:
# Reload every statistics table written by the previous steps, so the plotting
# cells below can run from the CSV files alone.
df = pd.read_csv(proc_dir / "global_statistics.csv", index_col="code")
lc_df = pd.read_csv(proc_dir / "landcover_statistics.csv")
std_lc_df = pd.read_csv(proc_dir / "std_landcover_statistics.csv")

# Boolean inlier flag, indexed by submission code.
inliers_table = pd.read_csv(proc_dir / "inliers.csv", index_col="code")
inliers = inliers_table["inliers"]

#### General Statistics plots

In [None]:
# One bar plot per (output file name, statistics column, title) entry.
barplot_specs = (
    (
        "pointcloud_point_count.png",
        "dense_pointcloud_point_count",
        "Point count in dense point-cloud file",
    ),
    (
        "nmad_after_coregistration.png",
        "ddem_after_nmad",
        "NMAD of Altitude differences with ref DEM after coregistration by code",
    ),
    (
        "raw_dem_voids.png",
        "raw_dem_percent_nodata",
        "Raw DEM nodata percent",
    ),
)
for barplot_file, barplot_column, barplot_title in barplot_specs:
    pp.viz.barplot_var(df, plot_dir / barplot_file, barplot_column, barplot_title)

# Landcover box plot over every STD DEM row ...
pp.viz.generate_landcover_grouped_boxplot_from_std_dems(
    std_lc_df,
    plot_dir / "landcover_boxplot_from_std_dems.png",
)

# ... and over the rows whose STD DEM file name contains "_inliers".
is_inliers_std_dem = std_lc_df["std_dem_file"].str.contains("_inliers")
std_lc_df_inliers = std_lc_df.loc[is_inliers_std_dem]
pp.viz.generate_landcover_grouped_boxplot_from_std_dems(
    std_lc_df_inliers,
    plot_dir / "landcover_boxplot_from_std_dems_inliers.png",
)

#### Generate plots for each (site, dataset) pair

In [None]:
for (site, dataset), group in df.groupby(["site", "dataset"]):
    # Output folder and title prefix specific to this (site, dataset) pair.
    sub_dir = plot_dir / f"{site}_{dataset}"
    title_prefix = f"({site} - {dataset})"

    nmad_title = f"{title_prefix} NMAD of DEM differences before vs after coregistration"
    shifts_title = f"{title_prefix} Coregistration shifts"

    # Plots over every submission of the pair.
    pp.viz.generate_plot_nmad_before_vs_after(
        group, sub_dir / "nmad_before_vs_after_coregistration.png", title=nmad_title
    )
    pp.viz.generate_plot_coreg_shifts(
        group, sub_dir / "coregistration_shifts.png", title=shifts_title
    )

    # Same plots restricted to inliers, to improve readability.
    group_inliers = group.loc[inliers]

    pp.viz.generate_plot_nmad_before_vs_after(
        group_inliers,
        sub_dir / "nmad_before_vs_after_coregistration_inliers.png",
        title=f"{nmad_title} (inliers only)",
    )
    pp.viz.generate_plot_coreg_shifts(
        group_inliers,
        sub_dir / "coregistration_shifts_inliers.png",
        title=f"{shifts_title} (inliers only)",
    )


#### Generate landcover plots for each (site, dataset) pair

In [None]:
for (site, dataset), group in lc_df.groupby(["site", "dataset"]):
    # Output folder for this (site, dataset) pair.
    sub_dir = plot_dir / f"{site}_{dataset}"

    # Build the title prefix from this group's own keys. It was previously
    # never assigned in this loop, so the value left over from an earlier cell
    # was reused for every group (or NameError was raised on a fresh kernel).
    title_prefix = f"({site} - {dataset})"

    # Plots over every submission of the pair.
    pp.viz.generate_landcover_grouped_boxplot(
        group,
        sub_dir / "landcover_grouped_boxplot.png",
        title=f"{title_prefix} Boxplot of Altitude difference with ref DEM by code/landcover",
    )
    pp.viz.generate_landcover_nmad(
        group,
        sub_dir / "landcover_nmad.png",
        title=f"{title_prefix} NMAD of Altitude difference with ref DEM by code/landcover",
    )

    # Inlier-only variants: keep the rows whose code is flagged True in `inliers`.
    group_inliers = group[group["code"].isin(inliers.index[inliers])]
    pp.viz.generate_landcover_grouped_boxplot(
        group_inliers,
        sub_dir / "landcover_grouped_boxplot_inliers.png",
        title=f"{title_prefix} Boxplot of Altitude difference with ref DEM by code/landcover (inliers only)",
    )
    pp.viz.generate_landcover_nmad(
        group_inliers,
        sub_dir / "landcover_nmad_inliers.png",
        title=f"{title_prefix} NMAD of Altitude difference with ref DEM by code/landcover (inliers only)",
    )

#### Generate STD DEMs plots

In [None]:
# One plot per distinct (STD DEM file, site, dataset) combination; only the
# group keys are needed, not the grouped rows.
for (std_dem_file, site, dataset), _ in std_lc_df.groupby(["std_dem_file", "site", "dataset"]):
    # Same file name as the STD DEM, with a .png extension.
    std_dem_png_name = Path(std_dem_file).with_suffix(".png").name
    output_path = plot_dir / f"{site}_{dataset}" / std_dem_png_name
    pp.viz.generate_std_dem_plots(std_dem_file, output_path)


#### Generate individual Coregistration Plots

In [None]:
for (site, dataset), group in df.groupby(["site", "dataset"]):
    # Individual plots go to a "coregistrations" sub-folder of the pair's folder.
    coregistrations_dir = plot_dir / f"{site}_{dataset}" / "coregistrations"
    pp.viz.generate_coregistration_individual_plots(group, coregistrations_dir)

#### Generate all Mosaic plots

In [None]:
# Mosaic plots are generated from the global statistics table into `plot_dir`.
pp.viz.generate_all_mosaics(
    df,
    plot_dir,
    MAX_WORKERS,
)