# Assignment of building damage data to grids from municipalities

This notebook that uses the weights from the area and the number of buildings.
The rasterisation of damage data is done in this notebook.

In [33]:
%load_ext jupyter_black
import pandas as pd
import os
from pathlib import Path

pd.set_option("display.float_format", lambda x: "%.5f" % x)
input_dir = (
    Path(os.getenv("STORM_DATA_DIR"))
    / "analysis/02_new_model_input/02_housing_damage/input/"
)
baseline_input_dir = (
    Path(os.getenv("STORM_DATA_DIR")) / "analysis/01_baseline_model/input/"
)
output_dir = (
    Path(os.getenv("STORM_DATA_DIR"))
    / "analysis/02_new_model_input/02_housing_damage/output/"
)

The jupyter_black extension is already loaded. To reload it, use:
  %reload_ext jupyter_black


In [34]:
# reading in file with building damage data and adm3 in grid
adm3_perc_ingrid = pd.read_csv(
    input_dir / "Adm3_Perc_inGrid_Weight_Matrix.csv"
)
perc_build_dmg_data = pd.read_csv(
    baseline_input_dir / "combined_input_data.csv"
)
build_dmg_data = pd.read_csv(input_dir / "IMpact_data_philipines_SEP_2021.csv")
phl_build_weights = pd.read_csv(input_dir / "phl_bld_weight_matrix.csv")
phl_build_grid = pd.read_csv(
    input_dir / "phl_hotosm_bld_centroid_grid_count.csv"
)
phl_build_municip = pd.read_csv(
    input_dir / "phl_hotosm_bld_centroid_municip_count.csv"
)

In [35]:
# removing duplicates from the building damage data
# duplicates removed and kep only the first value for each municipality, typhoon and year
build_dmg_data.drop("Id", axis=1, inplace=True)
build_dmg_data.drop_duplicates(
    subset=["pcode", "typhoon", "Year"], inplace=True
)

In [36]:
build_dmg_data[build_dmg_data["pcode"] == "PH025012000"]

Unnamed: 0,pcode,typhoon,Year,Totally,Partially,total
740,PH025012000,Nesat,2011,1.0,5.0,6.0
1227,PH025012000,Utor,2013,1.0,104.0,105.0
1728,PH025012000,Fung-wong,2014,12.0,149.0,161.0
1964,PH025012000,Goni,2015,0.0,20.0,20.0
2114,PH025012000,Koppu,2015,22.0,639.0,661.0
2355,PH025012000,Sarika,2016,1.0,20.0,21.0
2560,PH025012000,Haima,2016,1.0,1.0,2.0
2944,PH025012000,Mangkhut,2018,6.0,211.0,217.0
3524,PH025012000,YUTU,2018,0.0,8.0,8.0
3580,PH025012000,PHANFONE,2019,124.0,1103.0,1227.0


We will come back to this 
as it is not clear why there are multiple rows for the same municipalities 
recorded for the same typhoon and Year in the building damage data set. 
Different total values could be accumulations. For now, treating them as separate values.

In [37]:
## aggregating these values
## To confirm whether they are additional data or cumulative data in the case where the total values are different for same typhoons.

# build_dmg_data_grouped = (
#    build_dmg_data.groupby(["pcode", "typhoon", "Year"]).sum().reset_index()
# )
# build_dmg_data_grouped[build_dmg_data_grouped["pcode"] == "PH025012000"]

In [38]:
all(
    item in list(phl_build_municip["ADM3_PCODE"])
    for item in list(build_dmg_data["pcode"])
)
# Not all municipalities are in the damage data set.
# Not all municipalities in the building damage data can be found in the admin 3 shapefile.
# removing those buildings with incorrect pcode in them
build_dmg_data_grouped = build_dmg_data[
    build_dmg_data["pcode"].isin(list(phl_build_municip["ADM3_PCODE"]))
]
build_dmg_data_grouped["Totally"].sum()

1670866.0

## Using Area of Municipality

In [39]:
## Section describing the merging of the north and south buildings from HOTOSM shapefile
merged_df_points = phl_build_grid.copy()
merged_df_points = pd.merge(
    merged_df_points,
    phl_build_grid.drop("numbuildings", axis=1),
    on="id",
    suffixes=(None, "_y"),
)

merged_df_points["numbuildings"].describe()

count     3352.00000
mean      3084.21002
std      12590.52975
min          1.00000
25%         78.00000
50%        395.00000
75%       1913.00000
max     302704.00000
Name: numbuildings, dtype: float64

In [40]:
merged_df_points_overlap = merged_df_points.loc[
    merged_df_points["Centroid"].isin(adm3_perc_ingrid["Centroid"])
]

In [41]:
### Section describing the computation of the building damage percentage
# combining building damage data and grid information
merged_total_damage_df = adm3_perc_ingrid.merge(
    build_dmg_data_grouped,
    left_on="ADM3_PCODE",
    right_on="pcode",
    how="left",
)

In [42]:
# computing % in each grid
# totally damaged
merged_total_damage_df["Totally_Damaged_bygrid"] = (
    merged_total_damage_df["Municipality Completeness"]
    * merged_total_damage_df["Totally"]
)

In [43]:
merged_total_damage_df[
    (merged_total_damage_df["pcode"] == "PH025012000")
    & (merged_total_damage_df["typhoon"] == "Mangkhut")
]

Unnamed: 0,id,Centroid,ADM3_PCODE,ADM3_EN,Municipality Completeness,pcode,typhoon,Year,Totally,Partially,total,Totally_Damaged_bygrid
25526,10904,120.8E_16.3N,PH025012000,Santa Fe,0.00794,PH025012000,Mangkhut,2018.0,6.0,211.0,217.0,0.04765
25538,10905,120.8E_16.2N,PH025012000,Santa Fe,0.23855,PH025012000,Mangkhut,2018.0,6.0,211.0,217.0,1.43128
25550,10906,120.8E_16.1N,PH025012000,Santa Fe,2e-05,PH025012000,Mangkhut,2018.0,6.0,211.0,217.0,0.00013
25562,11072,120.9E_16.2N,PH025012000,Santa Fe,0.38775,PH025012000,Mangkhut,2018.0,6.0,211.0,217.0,2.32653
25574,11073,120.9E_16.1N,PH025012000,Santa Fe,0.10163,PH025012000,Mangkhut,2018.0,6.0,211.0,217.0,0.6098
25586,11239,121.0E_16.2N,PH025012000,Santa Fe,0.17268,PH025012000,Mangkhut,2018.0,6.0,211.0,217.0,1.0361
25598,11240,121.0E_16.1N,PH025012000,Santa Fe,0.07625,PH025012000,Mangkhut,2018.0,6.0,211.0,217.0,0.45751
25610,11406,121.1E_16.2N,PH025012000,Santa Fe,0.00336,PH025012000,Mangkhut,2018.0,6.0,211.0,217.0,0.02019
25622,11407,121.1E_16.1N,PH025012000,Santa Fe,0.0118,PH025012000,Mangkhut,2018.0,6.0,211.0,217.0,0.07081


In [44]:
test_df = (
    merged_total_damage_df[["ADM3_PCODE", "Totally_Damaged_bygrid"]]
    .groupby("ADM3_PCODE")
    .sum()
    .reset_index()
    .merge(
        build_dmg_data_grouped[["pcode", "Totally"]]
        .groupby("pcode")
        .sum()
        .reset_index(),
        left_on="ADM3_PCODE",
        right_on="pcode",
        how="left",
    )
)
test_df["Diff"] = test_df["Totally_Damaged_bygrid"] - test_df["Totally"]
test_df["Diff"].describe()

count   1263.00000
mean      -0.00004
std        0.00141
min       -0.04138
25%       -0.00000
50%        0.00000
75%        0.00000
max        0.00795
Name: Diff, dtype: float64

A small difference between the number of totally damaged values. 
This can be explained by the weighting using the area overlap.
There may be some rounding when computing the area.

In [45]:
# computing percentage damage
# merging with building damage data
merged_perc_damage_df = merged_df_points.merge(
    merged_total_damage_df, on="id", how="right", suffixes=(None, "_y")
)
merged_perc_damage_df[
    [
        "id",
        "Centroid",
        "ADM3_PCODE",
        "ADM3_EN",
        "typhoon",
        "Year",
        "Municipality Completeness",
        "Totally",
        "numbuildings",
        "Totally_Damaged_bygrid",
    ]
].sort_values(["Totally_Damaged_bygrid"], ascending=False)

Unnamed: 0,id,Centroid,ADM3_PCODE,ADM3_EN,typhoon,Year,Municipality Completeness,Totally,numbuildings,Totally_Damaged_bygrid
19236,15273,123.4E_13.6N,PH051725000,Ocampo,Durian,2006.00000,0.73242,25951.00000,381.00000,19007.06995
9136,16299,124.0E_11.2N,PH072221000,Daanbantayan,Haiyan,2013.00000,0.67517,13660.00000,13825.00000,9222.79095
19836,17969,125.0E_11.2N,PH083739000,Palo,Haiyan,2013.00000,0.51789,13481.00000,48718.00000,6981.66883
3069,15798,123.7E_11.2N,PH072209000,Bantayan,Haiyan,2013.00000,0.59869,10533.00000,10951.00000,6305.98698
27737,17970,125.0E_11.1N,PH083748000,Tanauan,Haiyan,2013.00000,0.90848,6670.00000,10487.00000,6059.56721
...,...,...,...,...,...,...,...,...,...,...
29505,14815,123.1E_9.3N,PH074625000,Zamboanguita,,,0.00045,,39.00000,
29506,14816,123.1E_9.2N,PH074625000,Zamboanguita,,,0.26854,,111.00000,
29507,14817,123.1E_9.1N,PH074625000,Zamboanguita,,,0.09097,,416.00000,
29508,14983,123.2E_9.2N,PH074625000,Zamboanguita,,,0.31341,,127.00000,


We changed the approach to use number of buildings in a municipality instead.
Since we are using HOTOSM data, which is differently sourced from the damage data,
some municipalities and grids have more damaged buildings than total number of buildings.
TODO: Find new building dataset that is more accurate.

In [46]:
merged_perc_damage_dfout = (
    merged_perc_damage_df[
        [
            "id",
            "Centroid",
            "ADM3_PCODE",
            "ADM3_EN",
            "typhoon",
            "Year",
            "numbuildings",
            "Totally_Damaged_bygrid",
        ]
    ]
    .groupby(["id", "Centroid", "typhoon", "Year"])
    .sum(numeric_only=True)
    .reset_index()
)
# computing the percentage damage
merged_perc_damage_dfout["Totally_Damaged_Perc_bygrid"] = (
    merged_perc_damage_dfout["Totally_Damaged_bygrid"]
    / merged_perc_damage_dfout["numbuildings"]
)
merged_perc_damage_dfout.sort_values(
    ["Totally_Damaged_Perc_bygrid"], ascending=False
)

Unnamed: 0,id,Centroid,typhoon,Year,numbuildings,Totally_Damaged_bygrid,Totally_Damaged_Perc_bygrid
12985,20007,126.2E_7.8N,Bopha,2012.00000,8.00000,2280.68599,285.08575
12267,18468,125.3E_11.4N,Fengshen,2008.00000,4.00000,806.32677,201.58169
12992,20009,126.2E_7.6N,Bopha,2012.00000,12.00000,1759.18131,146.59844
12269,18468,125.3E_11.4N,Haiyan,2013.00000,4.00000,573.66872,143.41718
4988,11772,121.3E_13.0N,Melor,2015.00000,24.00000,2916.54911,121.52288
...,...,...,...,...,...,...,...
2732,11048,120.9E_18.6N,VAMCO,2020.00000,1334.00000,0.00000,0.00000
2726,11048,120.9E_18.6N,Kalmaegi,2014.00000,1334.00000,0.00000,0.00000
2717,10940,120.8E_12.7N,NAKRI,2019.00000,28.00000,0.00000,0.00000
2711,10939,120.8E_12.8N,NAKRI,2019.00000,157.00000,0.00000,0.00000


In [47]:
merged_perc_damage_dfout["Totally_Damaged_bygrid"].sum()

1645927.7189275506

In [None]:
# writing output to CSV file
# to write to csv file, group first by grid centroid
merged_perc_damage_dfout.to_csv(
    output_dir / "building_damage_bygrid_using_area.csv", index=False
)

## Using Number of Buildings

Merging all dataframes, one with number of buildings in municipality, 
one with number of damaged buildings in municipality and 
the last with weights for each grid and municipality.

In [48]:
phl_bld_all_merged_df = phl_build_municip.merge(
    build_dmg_data_grouped,
    left_on="ADM3_PCODE",
    right_on="pcode",
    how="left",
    suffixes=(None, "_y"),
).merge(phl_build_weights, on="ADM3_PCODE", how="left", suffixes=(None, "_y"))

In [49]:
phl_bld_all_merged_df[phl_bld_all_merged_df["ADM3_PCODE"] == "PH175301000"]

Unnamed: 0.1,Unnamed: 0,Shape_Leng,Shape_Area,ADM3_EN,ADM3_PCODE,ADM3_REF,ADM3ALT1EN,ADM3ALT2EN,ADM2_EN,ADM2_PCODE,...,typhoon,Year,Totally,Partially,total,Unnamed: 0_y,ADM3_EN_y,id,Centroid,weight
0,0,1.60122,0.0635,Aborlan,PH175301000,,,,Palawan,PH175300000,...,,,,,,0,Aborlan,6795.0,118.3E_9.7N,0.0
1,0,1.60122,0.0635,Aborlan,PH175301000,,,,Palawan,PH175300000,...,,,,,,1,Aborlan,6796.0,118.3E_9.6N,0.00499
2,0,1.60122,0.0635,Aborlan,PH175301000,,,,Palawan,PH175300000,...,,,,,,2,Aborlan,6797.0,118.3E_9.5N,0.00449
3,0,1.60122,0.0635,Aborlan,PH175301000,,,,Palawan,PH175300000,...,,,,,,3,Aborlan,6798.0,118.3E_9.4N,0.0
4,0,1.60122,0.0635,Aborlan,PH175301000,,,,Palawan,PH175300000,...,,,,,,4,Aborlan,6962.0,118.4E_9.7N,0.15254
5,0,1.60122,0.0635,Aborlan,PH175301000,,,,Palawan,PH175300000,...,,,,,,5,Aborlan,6963.0,118.4E_9.6N,0.02742
6,0,1.60122,0.0635,Aborlan,PH175301000,,,,Palawan,PH175300000,...,,,,,,6,Aborlan,6964.0,118.4E_9.5N,0.0
7,0,1.60122,0.0635,Aborlan,PH175301000,,,,Palawan,PH175300000,...,,,,,,7,Aborlan,6965.0,118.4E_9.4N,0.0
8,0,1.60122,0.0635,Aborlan,PH175301000,,,,Palawan,PH175300000,...,,,,,,8,Aborlan,7130.0,118.5E_9.6N,0.0
9,0,1.60122,0.0635,Aborlan,PH175301000,,,,Palawan,PH175300000,...,,,,,,9,Aborlan,7131.0,118.5E_9.5N,0.02692


In [50]:
phl_build_weights.groupby("ADM3_PCODE")["weight"].sum().describe()

count   1647.00000
mean       0.99636
std        0.06027
min        0.00000
25%        1.00000
50%        1.00000
75%        1.00000
max        1.00000
Name: weight, dtype: float64

In [51]:
phl_bld_all_merged_df.groupby(["pcode", "typhoon", "Year"]).first()[
    "Totally"
].sum()

1670866.0

In [52]:
phl_bld_all_merged_df.groupby(["pcode", "typhoon", "Year"]).first()[
    "numbuildings"
].sum()

31001690

In [53]:
phl_build_municip["numbuildings"].sum()

10324006

In [54]:
phl_bld_all_merged_df["numbuildings_bygrid"] = (
    phl_bld_all_merged_df["weight"] * phl_bld_all_merged_df["numbuildings"]
)
phl_bld_all_merged_df["Totally_Damaged_bygrid"] = (
    phl_bld_all_merged_df["weight"] * phl_bld_all_merged_df["Totally"]
)
phl_bld_all_merged_df["Totally_Damaged_bygrid"] = phl_bld_all_merged_df[
    "Totally_Damaged_bygrid"
].fillna(0)
phl_bld_all_merged_df[
    [
        "id",
        "Centroid",
        "ADM3_PCODE",
        "typhoon",
        "Year",
        "weight",
        "Totally",
        "numbuildings",
        "numbuildings_bygrid",
        "Totally_Damaged_bygrid",
    ]
].sort_values(["Totally_Damaged_bygrid"], ascending=False)

Unnamed: 0,id,Centroid,ADM3_PCODE,typhoon,Year,weight,Totally,numbuildings,numbuildings_bygrid,Totally_Damaged_bygrid
19222,15273.00000,123.4E_13.6N,PH051725000,Durian,2006.00000,0.76867,25951.00000,415,319.00000,19947.87711
19834,17969.00000,125.0E_11.2N,PH083739000,Haiyan,2013.00000,0.80034,13481.00000,10588,8474.00000,10789.38364
27352,17969.00000,125.0E_11.2N,PH083747000,Haiyan,2013.00000,0.72317,12270.00000,55565,40183.00000,8873.30892
9144,16299.00000,124.0E_11.2N,PH072221000,Haiyan,2013.00000,0.52432,13660.00000,20701,10854.00000,7162.24530
19398,17303.00000,124.6E_11.0N,PH083738000,Haiyan,2013.00000,0.49538,14132.00000,30865,15290.00000,7000.75425
...,...,...,...,...,...,...,...,...,...,...
18201,15783.00000,123.7E_12.7N,PH054114000,Melor,2015.00000,0.00000,964.00000,1430,0.00000,0.00000
5245,14604.00000,123.0E_13.7N,PH051707000,Rammasun,2014.00000,0.00000,696.00000,1815,0.00000,0.00000
18203,15616.00000,123.6E_12.7N,PH054114000,Haiyan,2013.00000,0.00000,0.00000,1430,0.00000,0.00000
18204,15617.00000,123.6E_12.6N,PH054114000,Haiyan,2013.00000,0.33427,0.00000,1430,478.00000,0.00000


In [55]:
phl_bld_all_merged_df[phl_bld_all_merged_df["ADM3_PCODE"] == "PH025012000"][
    [
        "id",
        "Centroid",
        "ADM3_PCODE",
        "typhoon",
        "Year",
        "weight",
        "Totally",
        "numbuildings",
        "numbuildings_bygrid",
        "Totally_Damaged_bygrid",
    ]
]

Unnamed: 0,id,Centroid,ADM3_PCODE,typhoon,Year,weight,Totally,numbuildings,numbuildings_bygrid,Totally_Damaged_bygrid
25519,11072.00000,120.9E_16.2N,PH025012000,Nesat,2011.00000,0.22449,1.00000,245,55.00000,0.22449
25520,11239.00000,121.0E_16.2N,PH025012000,Nesat,2011.00000,0.05306,1.00000,245,13.00000,0.05306
25521,11406.00000,121.1E_16.2N,PH025012000,Nesat,2011.00000,0.00000,1.00000,245,0.00000,0.00000
25522,11407.00000,121.1E_16.1N,PH025012000,Nesat,2011.00000,0.00000,1.00000,245,0.00000,0.00000
25523,10904.00000,120.8E_16.3N,PH025012000,Nesat,2011.00000,0.00000,1.00000,245,0.00000,0.00000
...,...,...,...,...,...,...,...,...,...,...
25622,10904.00000,120.8E_16.3N,PH025012000,VAMCO,2020.00000,0.00000,2.00000,245,0.00000,0.00000
25623,11073.00000,120.9E_16.1N,PH025012000,VAMCO,2020.00000,0.04082,2.00000,245,10.00000,0.08163
25624,11240.00000,121.0E_16.1N,PH025012000,VAMCO,2020.00000,0.02041,2.00000,245,5.00000,0.04082
25625,10905.00000,120.8E_16.2N,PH025012000,VAMCO,2020.00000,0.66122,2.00000,245,162.00000,1.32245


In [56]:
phl_bld_all_merged_df["Totally_Damaged_bygrid"].sum()

1669643.9999999995

In [57]:
phl_bld_all_merged_df["numbuildings_bygrid"].sum()

32311224.0

In [58]:
phl_bld_all_merged_dfout = phl_bld_all_merged_df[
    [
        "id",
        "Centroid",
        "typhoon",
        "Year",
        "weight",
        "numbuildings_bygrid",
        "Totally_Damaged_bygrid",
    ]
]
phl_bld_all_merged_dfout = (
    phl_bld_all_merged_dfout.groupby(["id", "Centroid", "typhoon", "Year"])
    .sum()
    .reset_index()
)

phl_bld_all_merged_dfout["Totally_Damaged_Perc_bygrid"] = (
    phl_bld_all_merged_dfout["Totally_Damaged_bygrid"]
    / phl_bld_all_merged_dfout["numbuildings_bygrid"]
)
phl_bld_all_merged_dfout.sort_values(
    ["Totally_Damaged_Perc_bygrid"], ascending=False
)

Unnamed: 0,id,Centroid,typhoon,Year,weight,numbuildings_bygrid,Totally_Damaged_bygrid,Totally_Damaged_Perc_bygrid
10095,15273.00000,123.4E_13.6N,Durian,2006.00000,0.88814,381.00000,20770.12374,54.51476
14052,20339.00000,126.4E_8.0N,Bopha,2012.00000,0.23171,19.00000,846.19512,44.53659
14049,20338.00000,126.4E_8.1N,Bopha,2012.00000,0.67995,64.00000,2454.81445,38.35648
5461,11780.00000,121.3E_12.2N,PHANFONE,2019.00000,0.01299,1.00000,34.51948,34.51948
5456,11779.00000,121.3E_12.3N,PHANFONE,2019.00000,0.58442,45.00000,1553.37662,34.51948
...,...,...,...,...,...,...,...,...
14088,20505.00000,126.5E_8.1N,Haiyan,2013.00000,0.00000,0.00000,0.00000,
14089,20505.00000,126.5E_8.1N,Lingling,2014.00000,0.00000,0.00000,0.00000,
14111,20516.00000,126.5E_7.0N,Bopha,2012.00000,0.00000,0.00000,0.00000,
14112,20516.00000,126.5E_7.0N,Haiyan,2013.00000,0.00000,0.00000,0.00000,


In [59]:
phl_bld_all_merged_dfout["numbuildings_bygrid"].sum()

31001690.0

In [60]:
phl_bld_all_merged_dfout[phl_bld_all_merged_dfout["Centroid"] == "126.6E_7.3N"]

Unnamed: 0,id,Centroid,typhoon,Year,weight,numbuildings_bygrid,Totally_Damaged_bygrid,Totally_Damaged_Perc_bygrid
14122,20680.0,126.6E_7.3N,Bopha,2012.0,0.47503,401.0,923.35933,2.30264
14123,20680.0,126.6E_7.3N,Haiyan,2013.0,0.01795,7.0,0.05385,0.00769
14124,20680.0,126.6E_7.3N,Lingling,2014.0,0.47503,401.0,27.02136,0.06738


In [61]:
phl_bld_all_merged_dfout.groupby(["Centroid"])[
    "numbuildings_bygrid"
].sum().reset_index()

Unnamed: 0,Centroid,numbuildings_bygrid
0,117.9E_9.0N,0.00000
1,117.9E_9.1N,0.00000
2,118.0E_8.9N,2.00000
3,118.0E_9.0N,379.00000
4,118.0E_9.1N,0.00000
...,...,...
2989,126.6E_7.3N,809.00000
2990,126.6E_7.4N,26.00000
2991,126.6E_7.5N,88.00000
2992,126.6E_7.6N,346.00000


In [62]:
phl_bld_all_merged_dfout["Totally_Damaged_bygrid"].sum()

1669643.9999999998

In [63]:
phl_bld_all_merged_dfout["Totally_Damaged_Perc_bygrid"].describe()

count   12228.00000
mean        0.25820
std         1.55548
min         0.00000
25%         0.00045
50%         0.00614
75%         0.06833
max        54.51476
Name: Totally_Damaged_Perc_bygrid, dtype: float64

In [None]:
# writing output to CSV file
# to write to csv file, group first by grid centroid
phl_bld_all_merged_dfout.to_csv(
    output_dir / "building_damage_bygrid.csv", index=False
)