# Assignment of building damage data to grids from municipalities

In [None]:
%load_ext jupyter_black
import pandas as pd
import os
from pathlib import Path

# Display floats with five decimals in rendered tables.
pd.set_option("display.float_format", lambda x: "%.5f" % x)

# All input/output locations are resolved relative to STORM_DATA_DIR.
# Fail early with an explicit message: Path(None) would otherwise raise an
# opaque TypeError when the variable is not set.
storm_data_root = os.getenv("STORM_DATA_DIR")
if storm_data_root is None:
    raise RuntimeError(
        "Environment variable STORM_DATA_DIR is not set; "
        "it must point to the root of the storm data directory."
    )
storm_data_dir = Path(storm_data_root)

input_dir = (
    storm_data_dir / "analysis/02_new_model_input/02_housing_damage/input/"
)
baseline_input_dir = storm_data_dir / "analysis/01_baseline_model/input/"
output_dir = (
    storm_data_dir / "analysis/02_new_model_input/02_housing_damage/output/"
)

In [None]:
# Load the CSV inputs used by the rest of the notebook.
# Tables read from the housing-damage input directory:
build_dmg_data = pd.read_csv(
    input_dir.joinpath("IMpact_data_philipines_SEP_2021.csv")
)
adm3_perc_ingrid = pd.read_csv(
    input_dir.joinpath("Adm3_Perc_inGrid_Weight_Matrix.csv")
)
phl_build_weights = pd.read_csv(input_dir.joinpath("phl_bld_weight_matrix.csv"))
phl_build_grid = pd.read_csv(
    input_dir.joinpath("phl_hotosm_bld_centroid_grid_count.csv")
)
phl_build_municip = pd.read_csv(
    input_dir.joinpath("phl_hotosm_bld_centroid_municip_count.csv")
)
# Table read from the baseline model input directory.
# NOTE(review): not referenced again in the visible cells -- confirm it is needed.
perc_build_dmg_data = pd.read_csv(
    baseline_input_dir.joinpath("combined_input_data.csv")
)

In [None]:
# removing duplicates from the building damage data
# The "Id" column is dropped first, so rows differing only in "Id" count as
# duplicates. Rebinding instead of mutating with inplace=True keeps the cell
# re-runnable: errors="ignore" turns the drop into a no-op once "Id" is gone.
build_dmg_data = build_dmg_data.drop(
    columns="Id", errors="ignore"
).drop_duplicates()

## Using Area of Municipality

In [None]:
## Section describing the merging of the north and south buildings from HOTOSM shapefile
# Join phl_build_grid with itself (minus "numbuildings") on "id". Columns
# present on both sides keep their name from the left frame and receive a
# "_y" suffix from the right frame.
# NOTE(review): both sides come from the same table, so this looks like a
# leftover of an earlier north/south merge -- confirm it is still required.
grid_without_counts = phl_build_grid.drop(columns="numbuildings")
merged_df_points = phl_build_grid.merge(
    grid_without_counts, on="id", suffixes=(None, "_y")
)

merged_df_points["numbuildings"].describe()

In [None]:
# Keep only the rows whose Centroid also occurs in the area weight matrix.
centroid_in_weight_matrix = merged_df_points["Centroid"].isin(
    adm3_perc_ingrid["Centroid"]
)
merged_df_points_overlap = merged_df_points[centroid_in_weight_matrix]

In [None]:
# True only if every pcode in the damage data also occurs in the grid weight
# matrix. A set subset test avoids rescanning a list for every pcode.
set(build_dmg_data["pcode"]) <= set(adm3_perc_ingrid["ADM3_PCODE"])
# Note: this does not test the opposite direction (whether every municipality
# in the weight matrix has damage records).
# Original observation: not all municipalities are in the damage data set.

In [None]:
# Show the raw damage records for municipality PH025012000.
is_ph025012000 = build_dmg_data["pcode"] == "PH025012000"
build_dmg_data.loc[is_ph025012000]

**TODO:** It is not yet clear why the building damage data set contains multiple rows 
for the same municipality, typhoon and Year. 
Rows with different totals could be cumulative counts rather than additional reports. 
For now, they are treated as separate values and summed.

In [None]:
## Aggregate the damage records per municipality, typhoon and year.
## Still to confirm: when totals differ for the same typhoon, are the rows
## additional data or cumulative data? Here they are simply summed.
damage_keys = ["pcode", "typhoon", "Year"]
damage_totals = build_dmg_data.groupby(damage_keys).sum()
build_dmg_data_grouped = damage_totals.reset_index()

# Same municipality as inspected before, now after aggregation.
build_dmg_data_grouped.loc[build_dmg_data_grouped["pcode"] == "PH025012000"]

In [None]:
### Section describing the computation of the building damage percentage
# Attach the aggregated damage records to the grid/municipality weight rows.
# The left join keeps every weight row, including those whose municipality
# has no match in the damage data.
merged_total_damage_df = pd.merge(
    adm3_perc_ingrid,
    build_dmg_data_grouped,
    how="left",
    left_on="ADM3_PCODE",
    right_on="pcode",
)

In [None]:
# Allocate totally damaged buildings to each grid row by scaling the
# municipality total with the "Municipality Completeness" weight.
completeness = merged_total_damage_df["Municipality Completeness"]
totally_damaged = merged_total_damage_df["Totally"]
merged_total_damage_df["Totally_Damaged_bygrid"] = (
    completeness * totally_damaged
)

In [None]:
# Inspect the allocated rows for one municipality and one typhoon.
is_target_municip = merged_total_damage_df["pcode"] == "PH025012000"
is_mangkhut = merged_total_damage_df["typhoon"] == "Mangkhut"
merged_total_damage_df.loc[is_target_municip & is_mangkhut]

In [None]:
# Consistency check: per municipality, compare the sum of grid-allocated
# damage with the sum of reported damage.
allocated_by_municip = (
    merged_total_damage_df[["ADM3_PCODE", "Totally_Damaged_bygrid"]]
    .groupby("ADM3_PCODE")
    .sum()
    .reset_index()
)
reported_by_municip = (
    build_dmg_data_grouped[["pcode", "Totally"]]
    .groupby("pcode")
    .sum()
    .reset_index()
)
test_df = allocated_by_municip.merge(
    reported_by_municip,
    how="left",
    left_on="ADM3_PCODE",
    right_on="pcode",
)
# Allocated minus reported.
test_df["Diff"] = test_df["Totally_Damaged_bygrid"] - test_df["Totally"]
test_df["Diff"].describe()

There is a small difference between the allocated and the reported number of totally damaged buildings per municipality. 
This can be explained by the weighting using the area overlap.

In [None]:
# Combine the building counts with the grid-allocated damage on "id".
# The right join keeps every damage row; overlapping column names from the
# damage frame receive a "_y" suffix.
merged_perc_damage_df = pd.merge(
    merged_df_points,
    merged_total_damage_df,
    how="right",
    on="id",
    suffixes=(None, "_y"),
)

# Show the relevant columns, largest allocated damage first.
area_preview_columns = [
    "id",
    "Centroid",
    "ADM3_PCODE",
    "ADM3_EN",
    "typhoon",
    "Year",
    "Municipality Completeness",
    "Totally",
    "numbuildings",
    "Totally_Damaged_bygrid",
]
merged_perc_damage_df[area_preview_columns].sort_values(
    by="Totally_Damaged_bygrid", ascending=False
)

Assigning weights by area creates a problem: 
some grids end up with a higher number of damaged buildings than actual buildings. 
We therefore changed the approach to use the number of buildings in a municipality instead.

In [None]:
# Aggregate to one row per grid cell and typhoon event, summing the numeric
# columns (the text columns ADM3_PCODE / ADM3_EN are excluded from the sum).
# NOTE(review): "numbuildings" is summed over all rows of a group; if it is a
# per-grid count repeated on every municipality row, the sum overstates the
# denominator -- confirm.
grid_event_keys = ["id", "Centroid", "typhoon", "Year"]
area_output_columns = [
    "id",
    "Centroid",
    "ADM3_PCODE",
    "ADM3_EN",
    "typhoon",
    "Year",
    "numbuildings",
    "Totally_Damaged_bygrid",
]
area_subset = merged_perc_damage_df[area_output_columns]
merged_perc_damage_dfout = (
    area_subset.groupby(grid_event_keys).sum(numeric_only=True).reset_index()
)

# Fraction of buildings totally damaged per grid cell and event.
damaged_in_grid = merged_perc_damage_dfout["Totally_Damaged_bygrid"]
buildings_in_grid = merged_perc_damage_dfout["numbuildings"]
merged_perc_damage_dfout["Totally_Damaged_Perc_bygrid"] = (
    damaged_in_grid / buildings_in_grid
)
merged_perc_damage_dfout.sort_values(
    by="Totally_Damaged_Perc_bygrid", ascending=False
)

In [None]:
# Persist the area-weighted result (already grouped per grid cell and event)
# as CSV, without the index column.
area_output_path = output_dir / "building_damage_bygrid_using_area.csv"
merged_perc_damage_dfout.to_csv(area_output_path, index=False)

## Using Number of Buildings

Merging all dataframes, one with number of buildings in municipality, 
one with number of damaged buildings in municipality and 
the last with weights for each grid and municipality.

In [None]:
# Step 1: attach the aggregated damage records to the municipality table.
municip_with_damage = pd.merge(
    phl_build_municip,
    build_dmg_data_grouped,
    how="left",
    left_on="ADM3_PCODE",
    right_on="pcode",
    suffixes=(None, "_y"),
)
# Step 2: attach the weight rows for each municipality. Both joins are left
# joins; overlapping columns from the right-hand frame get a "_y" suffix.
phl_bld_all_merged_df = pd.merge(
    municip_with_damage,
    phl_build_weights,
    how="left",
    on="ADM3_PCODE",
    suffixes=(None, "_y"),
)

In [None]:
# Scale the municipality totals by the weight to obtain per-row values for
# both the building count and the totally damaged count.
row_weight = phl_bld_all_merged_df["weight"]
phl_bld_all_merged_df["numbuildings_bygrid"] = (
    row_weight * phl_bld_all_merged_df["numbuildings"]
)
phl_bld_all_merged_df["Totally_Damaged_bygrid"] = (
    row_weight * phl_bld_all_merged_df["Totally"]
)

# Show the relevant columns, largest allocated damage first.
building_preview_columns = [
    "id",
    "Centroid",
    "ADM3_PCODE",
    "typhoon",
    "Year",
    "weight",
    "Totally",
    "numbuildings",
    "numbuildings_bygrid",
    "Totally_Damaged_bygrid",
]
phl_bld_all_merged_df[building_preview_columns].sort_values(
    by="Totally_Damaged_bygrid", ascending=False
)

In [None]:
# Show the raw damage records for municipality PH045635000.
is_ph045635000 = build_dmg_data["pcode"] == "PH045635000"
build_dmg_data.loc[is_ph045635000]

In [None]:
# Aggregate to one row per grid cell and typhoon event by summing the
# weight, the allocated building count and the allocated damage.
building_event_keys = ["id", "Centroid", "typhoon", "Year"]
building_value_columns = [
    "weight",
    "numbuildings_bygrid",
    "Totally_Damaged_bygrid",
]
phl_bld_all_merged_dfout = (
    phl_bld_all_merged_df[building_event_keys + building_value_columns]
    .groupby(building_event_keys)
    .sum()
    .reset_index()
)

# Fraction of buildings totally damaged per grid cell and event.
damaged_per_grid = phl_bld_all_merged_dfout["Totally_Damaged_bygrid"]
buildings_per_grid = phl_bld_all_merged_dfout["numbuildings_bygrid"]
phl_bld_all_merged_dfout["Totally_Damaged_Perc_bygrid"] = (
    damaged_per_grid / buildings_per_grid
)
phl_bld_all_merged_dfout.sort_values(
    by="Totally_Damaged_Perc_bygrid", ascending=False
)

In [None]:
# Summary statistics of the per-grid damage fraction.
phl_bld_all_merged_dfout.loc[:, "Totally_Damaged_Perc_bygrid"].describe()

In [None]:
# Persist the building-count-weighted result (already grouped per grid cell
# and event) as CSV, without the index column.
building_output_path = output_dir / "building_damage_bygrid.csv"
phl_bld_all_merged_dfout.to_csv(building_output_path, index=False)