# RMSE estimation for region(adm1)

We used a weight and adm3_area files to join weight and adm_1(region) and adm_3(municipality) to our main dataset.
We prepared a dataframe that represents real and damaged value per region(ADM1). Then we train our model(XGBoost Reduced Overfitting) to this input data while we splitted five typhoons(randomly selected) as the test set and the rest of them as the train set.
The final goal is to estimate the difference between real and predicted damage value per region with respect to each typhoon, to check how the model performs for a wide area.

In [1]:
%load_ext jupyter_black

In [55]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from pathlib import Path
import os

from collections import defaultdict
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from xgboost.sklearn import XGBRegressor
from sklearn.metrics import mean_squared_error
from math import sqrt
import statistics

from utils import get_training_dataset, weight_file

In [56]:
data_dir = (
    Path(os.getenv("STORM_DATA_DIR"))
    / "analysis/02_new_model_input/02_housing_damage/input/"
)

In [3]:
# Read csv file and import to df
df = get_training_dataset()

# Move target to be the last column for simplicity
df = df.reindex(
    columns=[col for col in df.columns if col != "percent_houses_damaged"]
    + ["percent_houses_damaged"]
)

df.head()

Unnamed: 0,typhoon_name,typhoon_year,grid_point_id,wind_speed,track_distance,rainfall_max_6h,rainfall_max_24h,total_houses,rwi,strong_roof_strong_wall,...,std_tri,mean_elev,coast_length,with_coast,urban,rural,water,total_pop,percent_houses_damaged_5years,percent_houses_damaged
0,DURIAN,2006,101,0.0,303.180555,0.122917,0.085417,31.0,,22.580645,...,2.699781,5.762712,3445.709753,1,0.0,0.0,1.0,0.0,0.0,0.0
1,DURIAN,2006,4475,0.0,638.027502,0.091667,0.027083,3.30102,-0.527,2.639401,...,4.585088,12.799127,8602.645832,1,0.0,0.0,1.0,0.0,0.0,0.0
2,DURIAN,2006,4639,0.0,603.631997,0.535417,0.146354,12.103741,-0.283,2.639401,...,1.527495,8.833333,5084.012925,1,0.0,0.01,0.99,197.339034,0.0,0.0
3,DURIAN,2006,4640,0.0,614.67527,0.35625,0.101562,645.89966,-0.358889,2.639401,...,11.677657,17.530431,55607.86595,1,0.0,0.31,0.69,4970.477311,0.0,0.0
4,DURIAN,2006,4641,0.0,625.720905,0.202083,0.057812,1071.731293,-0.4628,2.639401,...,17.074011,31.931338,35529.342507,1,0.0,0.77,0.23,12408.594656,0.0,0.0


In [4]:
# df.loc[df["typhoon_name"] == "GONI"]

In [5]:
# Fill the missing values of RWI with mean value
df["rwi"].fillna(df["rwi"].mean(), inplace=True)

In [6]:
# Set any values >100% to 100%,
for i in range(len(df)):
    if df.loc[i, "percent_houses_damaged"] > 100:
        df.at[i, "percent_houses_damaged"] = float(100)

In [7]:
# Remove zeros from wind_speed
df = df[(df[["wind_speed"]] != 0).any(axis=1)]
df.reset_index(drop=True, inplace=True)
df = df.drop(columns=["typhoon_year"])
df.head()

Unnamed: 0,typhoon_name,grid_point_id,wind_speed,track_distance,rainfall_max_6h,rainfall_max_24h,total_houses,rwi,strong_roof_strong_wall,strong_roof_light_wall,...,std_tri,mean_elev,coast_length,with_coast,urban,rural,water,total_pop,percent_houses_damaged_5years,percent_houses_damaged
0,DURIAN,8284,12.460039,275.018491,0.670833,0.313021,0.479848,-0.213039,31.336503,29.117802,...,34.62955,42.21875,5303.65949,1,0.0,0.0,1.0,0.0,0.0,0.0
1,DURIAN,8286,11.428974,297.027578,0.929167,0.343229,55.649739,0.206,23.447758,23.591571,...,25.475388,72.283154,61015.543599,1,0.0,0.14,0.86,276.871504,0.0,0.0
2,DURIAN,8450,13.077471,262.598363,0.716667,0.424479,8.157414,-0.636,31.336503,29.117802,...,54.353996,102.215198,66707.43807,1,0.0,0.11,0.89,448.539453,0.0,0.0
3,DURIAN,8451,12.511864,273.63933,0.56875,0.336979,88.292015,-0.2275,31.336503,29.117802,...,31.814048,58.988877,53841.050168,1,0.0,0.12,0.88,2101.708435,0.0,0.0
4,DURIAN,8452,11.977511,284.680297,0.589583,0.290625,962.766739,-0.299667,23.546053,23.660429,...,25.976413,111.386527,87378.257957,1,0.07,0.46,0.47,11632.726327,0.0,0.0


In [8]:
# Define bins for data stratification
bins2 = [0, 0.00009, 1, 10, 50, 101]
samples_per_bin2, binsP2 = np.histogram(
    df["percent_houses_damaged"], bins=bins2
)

In [9]:
# Check the bins' intervalls
df["percent_houses_damaged"].value_counts(bins=binsP2)

(-0.001, 9e-05]    38901
(9e-05, 1.0]        7232
(1.0, 10.0]         2552
(10.0, 50.0]         925
(50.0, 101.0]        144
Name: percent_houses_damaged, dtype: int64

In [10]:
bin_index2 = np.digitize(df["percent_houses_damaged"], bins=binsP2)

In [11]:
y_input_strat = bin_index2

In [28]:
# Use MinMaxScaler function for data standardization (it normalaize data in range of [0,1] and not negative values)

# Separate typhoon from other features
dfs = np.split(df, [2], axis=1)
dfa = np.split(dfs[1], [27], axis=1)
# print(dfs[0], dfs[1], dfa[0], dfa[1])

# Standardaize data
scaler = MinMaxScaler().fit(dfa[0])
X1 = scaler.transform(dfa[0])
Xnew = pd.DataFrame(X1)
Xnew_per_pred = pd.DataFrame(X1)
display(Xnew)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,17,18,19,20,21,22,23,24,25,26
0,0.168433,0.879044,0.010371,0.009660,2.334511e-07,0.330964,0.314832,0.337448,0.008392,0.015702,...,0.478800,0.413882,0.028344,0.031312,1.0,0.00,0.000000,1.000000,0.000000,0.000000
1,0.154462,0.949392,0.014378,0.010594,9.712553e-05,0.504983,0.233919,0.273404,0.007450,0.035236,...,0.440663,0.304474,0.043049,0.360224,1.0,0.00,0.140000,0.860000,0.000086,0.000000
2,0.176799,0.839346,0.011082,0.013105,1.371717e-05,0.155316,0.314832,0.337448,0.008392,0.015702,...,0.670175,0.649623,0.057690,0.393828,1.0,0.00,0.110000,0.890000,0.000139,0.000000
3,0.169135,0.874636,0.008788,0.010400,1.544535e-04,0.324958,0.314832,0.337448,0.008392,0.015702,...,0.383667,0.380232,0.036547,0.317867,1.0,0.00,0.120000,0.880000,0.000654,0.000000
4,0.161895,0.909926,0.009111,0.008968,1.690249e-03,0.294989,0.234927,0.274202,0.007462,0.034992,...,0.421247,0.310462,0.062176,0.515864,1.0,0.07,0.460000,0.470000,0.003618,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49749,0.108159,0.816770,0.034958,0.032650,7.311641e-05,0.111296,0.351423,0.702693,0.286799,0.005496,...,0.331492,0.433746,0.026779,0.011391,1.0,0.01,0.000000,0.990000,0.000103,0.009097
49750,0.102816,0.852281,0.033634,0.028851,2.854586e-03,0.379331,0.351423,0.702693,0.286799,0.005496,...,0.129081,0.208269,0.019943,0.271370,1.0,0.05,0.190000,0.760000,0.001682,0.009097
49751,0.096754,0.887792,0.029724,0.025744,1.096340e-03,0.498339,0.351423,0.702693,0.286799,0.005496,...,0.138776,0.204722,0.016272,0.087050,1.0,0.01,0.020000,0.970000,0.001673,0.009097
49752,0.092212,0.923300,0.058092,0.035882,3.178534e-05,0.286545,0.351423,0.702693,0.286799,0.005496,...,0.156689,0.151533,0.015221,0.031742,1.0,0.00,0.027273,0.972727,0.000284,0.009097


In [13]:
# All df without target column
dfa[0]

Unnamed: 0,wind_speed,track_distance,rainfall_max_6h,rainfall_max_24h,total_houses,rwi,strong_roof_strong_wall,strong_roof_light_wall,strong_roof_salvage_wall,light_roof_strong_wall,...,mean_tri,std_tri,mean_elev,coast_length,with_coast,urban,rural,water,total_pop,percent_houses_damaged_5years
0,12.460039,275.018491,0.670833,0.313021,0.479848,-0.213039,31.336503,29.117802,0.042261,0.507132,...,74.625539,34.62955,42.21875,5303.65949,1,0.0,0.0,1.0,0.0,0.0
1,11.428974,297.027578,0.929167,0.343229,55.649739,0.206,23.447758,23.591571,0.037516,1.137998,...,68.681417,25.475388,72.283154,61015.543599,1,0.0,0.14,0.86,276.871504,0.0
2,13.077471,262.598363,0.716667,0.424479,8.157414,-0.636,31.336503,29.117802,0.042261,0.507132,...,104.453163,54.353996,102.215198,66707.43807,1,0.0,0.11,0.89,448.539453,0.0
3,12.511864,273.63933,0.56875,0.336979,88.292015,-0.2275,31.336503,29.117802,0.042261,0.507132,...,59.798108,31.814048,58.988877,53841.050168,1,0.0,0.12,0.88,2101.708435,0.0
4,11.977511,284.680297,0.589583,0.290625,962.766739,-0.299667,23.546053,23.660429,0.037576,1.130137,...,65.65528,25.976413,111.386527,87378.257957,1,0.07,0.46,0.47,11632.726327,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49749,8.011792,255.535258,2.25625,1.056771,41.979062,-0.742,34.903986,60.634178,1.444247,0.177505,...,51.666178,36.291573,39.018519,1929.419748,1,0.01,0.0,0.99,330.215768,1.143833
49750,7.61746,266.645258,2.170833,0.933854,1625.734579,-0.096571,34.903986,60.634178,1.444247,0.177505,...,20.11842,17.425889,25.042969,45965.284119,1,0.05,0.19,0.76,5409.607943,1.143833
49751,7.170117,277.755258,1.91875,0.833333,624.597557,0.19,34.903986,60.634178,1.444247,0.177505,...,21.62959,17.129093,17.537129,14744.712453,1,0.01,0.02,0.97,5378.401365,1.143833
49752,6.834925,288.864374,3.747917,1.16131,18.445345,-0.32,34.903986,60.634178,1.444247,0.177505,...,24.42143,12.678785,15.389474,5376.583753,1,0.0,0.027273,0.972727,914.677196,1.143833


In [14]:
dfa[1] = dfa[1].astype(float)

In [15]:
Xnew = pd.concat(
    [Xnew.reset_index(drop=True), dfa[1].reset_index(drop=True)], axis=1
)
Xnew

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,18,19,20,21,22,23,24,25,26,percent_houses_damaged
0,0.168433,0.879044,0.010371,0.009660,2.334511e-07,0.330964,0.314832,0.337448,0.008392,0.015702,...,0.413882,0.028344,0.031312,1.0,0.00,0.000000,1.000000,0.000000,0.000000,0.0
1,0.154462,0.949392,0.014378,0.010594,9.712553e-05,0.504983,0.233919,0.273404,0.007450,0.035236,...,0.304474,0.043049,0.360224,1.0,0.00,0.140000,0.860000,0.000086,0.000000,0.0
2,0.176799,0.839346,0.011082,0.013105,1.371717e-05,0.155316,0.314832,0.337448,0.008392,0.015702,...,0.649623,0.057690,0.393828,1.0,0.00,0.110000,0.890000,0.000139,0.000000,0.0
3,0.169135,0.874636,0.008788,0.010400,1.544535e-04,0.324958,0.314832,0.337448,0.008392,0.015702,...,0.380232,0.036547,0.317867,1.0,0.00,0.120000,0.880000,0.000654,0.000000,0.0
4,0.161895,0.909926,0.009111,0.008968,1.690249e-03,0.294989,0.234927,0.274202,0.007462,0.034992,...,0.310462,0.062176,0.515864,1.0,0.07,0.460000,0.470000,0.003618,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49749,0.108159,0.816770,0.034958,0.032650,7.311641e-05,0.111296,0.351423,0.702693,0.286799,0.005496,...,0.433746,0.026779,0.011391,1.0,0.01,0.000000,0.990000,0.000103,0.009097,0.0
49750,0.102816,0.852281,0.033634,0.028851,2.854586e-03,0.379331,0.351423,0.702693,0.286799,0.005496,...,0.208269,0.019943,0.271370,1.0,0.05,0.190000,0.760000,0.001682,0.009097,0.0
49751,0.096754,0.887792,0.029724,0.025744,1.096340e-03,0.498339,0.351423,0.702693,0.286799,0.005496,...,0.204722,0.016272,0.087050,1.0,0.01,0.020000,0.970000,0.001673,0.009097,0.0
49752,0.092212,0.923300,0.058092,0.035882,3.178534e-05,0.286545,0.351423,0.702693,0.286799,0.005496,...,0.151533,0.015221,0.031742,1.0,0.00,0.027273,0.972727,0.000284,0.009097,0.0


In [16]:
features = [
    "wind_speed",
    "track_distance",
    "total_houses",
    "rainfall_max_6h",
    "rainfall_max_24h",
    "rwi",
    "mean_slope",
    "std_slope",
    "mean_tri",
    "std_tri",
    "mean_elev",
    "coast_length",
    "with_coast",
    "urban",
    "rural",
    "water",
    "total_pop",
    "percent_houses_damaged_5years",
]
features = dfs[1].columns

In [17]:
# Add the features to the columns' headers after standardization
i = 0
for feature in features:
    Xnew = Xnew.rename(columns={i: feature})
    i += 1

Xnew = pd.concat(
    [dfs[0].reset_index(drop=True), Xnew.reset_index(drop=True)], axis=1
)
Xnew

Unnamed: 0,typhoon_name,grid_point_id,wind_speed,track_distance,rainfall_max_6h,rainfall_max_24h,total_houses,rwi,strong_roof_strong_wall,strong_roof_light_wall,...,std_tri,mean_elev,coast_length,with_coast,urban,rural,water,total_pop,percent_houses_damaged_5years,percent_houses_damaged
0,DURIAN,8284,0.168433,0.879044,0.010371,0.009660,2.334511e-07,0.330964,0.314832,0.337448,...,0.413882,0.028344,0.031312,1.0,0.00,0.000000,1.000000,0.000000,0.000000,0.0
1,DURIAN,8286,0.154462,0.949392,0.014378,0.010594,9.712553e-05,0.504983,0.233919,0.273404,...,0.304474,0.043049,0.360224,1.0,0.00,0.140000,0.860000,0.000086,0.000000,0.0
2,DURIAN,8450,0.176799,0.839346,0.011082,0.013105,1.371717e-05,0.155316,0.314832,0.337448,...,0.649623,0.057690,0.393828,1.0,0.00,0.110000,0.890000,0.000139,0.000000,0.0
3,DURIAN,8451,0.169135,0.874636,0.008788,0.010400,1.544535e-04,0.324958,0.314832,0.337448,...,0.380232,0.036547,0.317867,1.0,0.00,0.120000,0.880000,0.000654,0.000000,0.0
4,DURIAN,8452,0.161895,0.909926,0.009111,0.008968,1.690249e-03,0.294989,0.234927,0.274202,...,0.310462,0.062176,0.515864,1.0,0.07,0.460000,0.470000,0.003618,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49749,MOLAVE,19306,0.108159,0.816770,0.034958,0.032650,7.311641e-05,0.111296,0.351423,0.702693,...,0.433746,0.026779,0.011391,1.0,0.01,0.000000,0.990000,0.000103,0.009097,0.0
49750,MOLAVE,19307,0.102816,0.852281,0.033634,0.028851,2.854586e-03,0.379331,0.351423,0.702693,...,0.208269,0.019943,0.271370,1.0,0.05,0.190000,0.760000,0.001682,0.009097,0.0
49751,MOLAVE,19308,0.096754,0.887792,0.029724,0.025744,1.096340e-03,0.498339,0.351423,0.702693,...,0.204722,0.016272,0.087050,1.0,0.01,0.020000,0.970000,0.001673,0.009097,0.0
49752,MOLAVE,19476,0.092212,0.923300,0.058092,0.035882,3.178534e-05,0.286545,0.351423,0.702693,...,0.151533,0.015221,0.031742,1.0,0.00,0.027273,0.972727,0.000284,0.009097,0.0


In [18]:
df["typhoon_name"].unique()

array(['DURIAN', 'FENGSHEN', 'KETSANA', 'CONSON', 'NESAT', 'BOPHA',
       'NARI', 'KROSA', 'HAIYAN', 'USAGI', 'UTOR', 'JANGMI', 'KALMAEGI',
       'RAMMASUN', 'HAGUPIT', 'FUNG-WONG', 'LINGLING', 'MUJIGAE', 'MELOR',
       'NOUL', 'GONI', 'LINFA', 'KOPPU', 'MEKKHALA', 'HAIMA', 'TOKAGE',
       'MERANTI', 'NOCK-TEN', 'SARIKA', 'MANGKHUT', 'YUTU', 'KAMMURI',
       'NAKRI', 'PHANFONE', 'SAUDEL', 'VAMCO', 'VONGFONG', 'MOLAVE'],
      dtype=object)

In [19]:
# Define a test_list (including 5 typhoons) randomly were chosen
test_list_1 = ["FENGSHEN", "DURIAN", "NESAT", "VONGFONG", "MOLAVE"]

test_list_2 = ["YUTU", "KAMMURI", "SARIKA", "TOKAGE", "LINGLING"]

test_list_3 = ["SAUDEL", "MANGKHUT", "HAIMA", "BOPHA", "KETSANA"]

test_list_4 = ["GONI", "LINFA", "NOCK-TEN", "NOUL", "JANGMI"]

test_list_5 = ["NAKRI", "UTOR", "HAIYAN", "RAMMASUN", "CONSON"]

test_list_6 = ["PHANFONE", "VAMCO", "KOPPU", "FUNG-WONG", "HAGUPIT"]

test_list_7 = ["MEKKHALA", "NARI", "KROSA", "USAGI", "KALMAEGI"]

In [20]:
# Extract the column of unique ids
grid_id = df["grid_point_id"]

In [21]:
df_test = pd.DataFrame(
    Xnew,
    columns=[
        "typhoon_name",
        "grid_point_id",
        "wind_speed",
        "track_distance",
        "rainfall_max_6h",
        "rainfall_max_24h",
        "total_houses",
        "rwi",
        "strong_roof_strong_wall",
        "strong_roof_light_wall",
        "strong_roof_salvage_wall",
        "light_roof_strong_wall",
        "light_roof_light_wall",
        "light_roof_salvage_wall",
        "salvaged_roof_strong_wall",
        "salvaged_roof_light_wall",
        "salvaged_roof_salvage_wall",
        "mean_slope",
        "std_slope",
        "mean_tri",
        "std_tri",
        "mean_elev",
        "coast_length",
        "with_coast",
        "urban",
        "rural",
        "water",
        "total_pop",
        "percent_houses_damaged_5years",
        "percent_houses_damaged",
    ],
)

df_test = Xnew[Xnew["typhoon_name"] == test_list_3[4]]
df_test = df_test.append(Xnew[Xnew["typhoon_name"] == test_list_3[3]])
df_test = df_test.append(Xnew[Xnew["typhoon_name"] == test_list_3[2]])
df_test = df_test.append(Xnew[Xnew["typhoon_name"] == test_list_3[1]])
df_test = df_test.append(Xnew[Xnew["typhoon_name"] == test_list_3[0]])

Xnew.drop(Xnew.index[Xnew["typhoon_name"] == test_list_3[4]], inplace=True)
Xnew.drop(Xnew.index[Xnew["typhoon_name"] == test_list_3[3]], inplace=True)
Xnew.drop(Xnew.index[Xnew["typhoon_name"] == test_list_3[2]], inplace=True)
Xnew.drop(Xnew.index[Xnew["typhoon_name"] == test_list_3[1]], inplace=True)
Xnew.drop(Xnew.index[Xnew["typhoon_name"] == test_list_3[0]], inplace=True)

display(df_test)
df_train = Xnew
display(df_train)

  df_test = df_test.append(Xnew[Xnew["typhoon_name"] == test_list_3[3]])
  df_test = df_test.append(Xnew[Xnew["typhoon_name"] == test_list_3[2]])
  df_test = df_test.append(Xnew[Xnew["typhoon_name"] == test_list_3[1]])
  df_test = df_test.append(Xnew[Xnew["typhoon_name"] == test_list_3[0]])


Unnamed: 0,typhoon_name,grid_point_id,wind_speed,track_distance,rainfall_max_6h,rainfall_max_24h,total_houses,rwi,strong_roof_strong_wall,strong_roof_light_wall,...,std_tri,mean_elev,coast_length,with_coast,urban,rural,water,total_pop,percent_houses_damaged_5years,percent_houses_damaged
3804,KETSANA,9233,0.228997,0.250719,0.044070,0.044935,0.000753,0.462625,0.657219,0.284772,...,0.065630,0.020180,0.031687,1.0,0.05,0.03,0.92,0.001211,0.000000,0.0
3805,KETSANA,9234,0.236445,0.215226,0.071856,0.077553,0.005352,0.324779,0.657960,0.278739,...,0.141129,0.060336,0.091281,1.0,0.10,0.60,0.30,0.004117,0.000000,0.0
3806,KETSANA,9235,0.239295,0.179733,0.098866,0.114486,0.006004,0.308338,0.646976,0.266692,...,0.214061,0.058423,0.104104,1.0,0.28,0.59,0.13,0.006658,0.000000,0.0
3807,KETSANA,9236,0.242485,0.144241,0.126329,0.139054,0.008478,0.307074,0.605574,0.265275,...,0.184985,0.033297,0.091221,1.0,0.24,0.62,0.14,0.008454,0.000000,0.0
3808,KETSANA,9237,0.244396,0.108748,0.143194,0.143739,0.005643,0.320598,0.637688,0.231547,...,0.103623,0.028936,0.140126,1.0,0.00,0.89,0.11,0.002965,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43270,SAUDEL,18795,0.061937,0.877058,0.091758,0.077746,0.004009,0.358942,0.292576,0.487645,...,0.158586,0.015329,0.265042,1.0,0.12,0.16,0.72,0.003767,0.009013,0.0
43271,SAUDEL,18796,0.059568,0.907444,0.090530,0.066750,0.008603,0.548934,0.325208,0.513887,...,0.095746,0.012607,0.089733,1.0,0.14,0.06,0.80,0.007193,0.007335,0.0
43272,SAUDEL,18797,0.053634,0.938190,0.076702,0.057364,0.000619,0.329734,0.473382,0.475945,...,0.088262,0.013689,0.077939,1.0,0.03,0.07,0.90,0.000113,0.011737,0.0
43273,SAUDEL,18962,0.062720,0.858897,0.053956,0.054691,0.001009,0.372093,0.293731,0.446105,...,0.117307,0.016159,0.095031,1.0,0.00,0.03,0.97,0.000749,0.010422,0.0


Unnamed: 0,typhoon_name,grid_point_id,wind_speed,track_distance,rainfall_max_6h,rainfall_max_24h,total_houses,rwi,strong_roof_strong_wall,strong_roof_light_wall,...,std_tri,mean_elev,coast_length,with_coast,urban,rural,water,total_pop,percent_houses_damaged_5years,percent_houses_damaged
0,DURIAN,8284,0.168433,0.879044,0.010371,0.009660,2.334511e-07,0.330964,0.314832,0.337448,...,0.413882,0.028344,0.031312,1.0,0.00,0.000000,1.000000,0.000000,0.000000,0.0
1,DURIAN,8286,0.154462,0.949392,0.014378,0.010594,9.712553e-05,0.504983,0.233919,0.273404,...,0.304474,0.043049,0.360224,1.0,0.00,0.140000,0.860000,0.000086,0.000000,0.0
2,DURIAN,8450,0.176799,0.839346,0.011082,0.013105,1.371717e-05,0.155316,0.314832,0.337448,...,0.649623,0.057690,0.393828,1.0,0.00,0.110000,0.890000,0.000139,0.000000,0.0
3,DURIAN,8451,0.169135,0.874636,0.008788,0.010400,1.544535e-04,0.324958,0.314832,0.337448,...,0.380232,0.036547,0.317867,1.0,0.00,0.120000,0.880000,0.000654,0.000000,0.0
4,DURIAN,8452,0.161895,0.909926,0.009111,0.008968,1.690249e-03,0.294989,0.234927,0.274202,...,0.310462,0.062176,0.515864,1.0,0.07,0.460000,0.470000,0.003618,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49749,MOLAVE,19306,0.108159,0.816770,0.034958,0.032650,7.311641e-05,0.111296,0.351423,0.702693,...,0.433746,0.026779,0.011391,1.0,0.01,0.000000,0.990000,0.000103,0.009097,0.0
49750,MOLAVE,19307,0.102816,0.852281,0.033634,0.028851,2.854586e-03,0.379331,0.351423,0.702693,...,0.208269,0.019943,0.271370,1.0,0.05,0.190000,0.760000,0.001682,0.009097,0.0
49751,MOLAVE,19308,0.096754,0.887792,0.029724,0.025744,1.096340e-03,0.498339,0.351423,0.702693,...,0.204722,0.016272,0.087050,1.0,0.01,0.020000,0.970000,0.001673,0.009097,0.0
49752,MOLAVE,19476,0.092212,0.923300,0.058092,0.035882,3.178534e-05,0.286545,0.351423,0.702693,...,0.151533,0.015221,0.031742,1.0,0.00,0.027273,0.972727,0.000284,0.009097,0.0


In [22]:
df_test["typhoon_name"].unique()

array(['KETSANA', 'BOPHA', 'HAIMA', 'MANGKHUT', 'SAUDEL'], dtype=object)

In [23]:
# Split X and y from dataframe features
X_test = df_test[features]
X_train = df_train[features]

y_train = df_train["percent_houses_damaged"]
y_test = df_test["percent_houses_damaged"]

In [24]:
# Define XGBoost Reduced Overfitting model
xgb = XGBRegressor(
    base_score=0.5,
    booster="gbtree",
    colsample_bylevel=0.8,
    colsample_bynode=0.8,
    colsample_bytree=0.8,
    gamma=3,
    eta=0.01,
    importance_type="gain",
    learning_rate=0.1,
    max_delta_step=0,
    max_depth=4,
    min_child_weight=1,
    missing=1,
    n_estimators=100,
    early_stopping_rounds=10,
    n_jobs=1,
    nthread=None,
    objective="reg:squarederror",
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    seed=None,
    silent=None,
    subsample=0.8,
    verbosity=1,
    eval_metric=["rmse", "logloss"],
    random_state=0,
)

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


                              OLS Regression Results                              
Dep. Variable:     percent_houses_damaged   R-squared:                       0.229
Model:                                OLS   Adj. R-squared:                  0.228
Method:                     Least Squares   F-statistic:                     479.1
Date:                    Fri, 09 Jun 2023   Prob (F-statistic):               0.00
Time:                            15:36:34   Log-Likelihood:            -1.2583e+05
No. Observations:                   43643   AIC:                         2.517e+05
Df Residuals:                       43615   BIC:                         2.520e+05
Df Model

In [45]:
eval_set = [(X_test, y_test)]
xgb_model = xgb.fit(X_train, y_train, eval_set=eval_set, verbose=False)

X2 = sm.add_constant(X_train)
est = sm.OLS(y_train, X2)
est2 = est.fit()
print(est2.summary())

                              OLS Regression Results                              
Dep. Variable:     percent_houses_damaged   R-squared:                       1.000
Model:                                OLS   Adj. R-squared:                  1.000
Method:                     Least Squares   F-statistic:                 2.670e+32
Date:                    Wed, 03 May 2023   Prob (F-statistic):               0.00
Time:                            18:43:50   Log-Likelihood:             1.5376e+06
No. Observations:                   43643   AIC:                        -3.075e+06
Df Residuals:                       43615   BIC:                        -3.075e+06
Df Model:                              27                                         
Covariance Type:                nonrobust                                         
                                    coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------

In [25]:
# Make prediction
y_pred_train = xgb.predict(X_train)
y_pred_train_clipped = y_pred_train.clip(0, 100)

y_pred = xgb.predict(X_test)
y_pred_clipped = y_pred.clip(0, 100)

In [26]:
y_pred = y_pred_clipped.tolist()
y_true = df_test["percent_houses_damaged"].tolist()

In [27]:
df_test.reset_index(drop=True, inplace=True)
for i in range(len(df_test)):
    df_test.at[i, "y_pred"] = y_pred[i]
df_test

Unnamed: 0,typhoon_name,grid_point_id,wind_speed,track_distance,rainfall_max_6h,rainfall_max_24h,total_houses,rwi,strong_roof_strong_wall,strong_roof_light_wall,...,mean_elev,coast_length,with_coast,urban,rural,water,total_pop,percent_houses_damaged_5years,percent_houses_damaged,y_pred
0,KETSANA,9233,0.228997,0.250719,0.044070,0.044935,0.000753,0.462625,0.657219,0.284772,...,0.020180,0.031687,1.0,0.05,0.03,0.92,0.001211,0.000000,0.0,0.000000
1,KETSANA,9234,0.236445,0.215226,0.071856,0.077553,0.005352,0.324779,0.657960,0.278739,...,0.060336,0.091281,1.0,0.10,0.60,0.30,0.004117,0.000000,0.0,0.024520
2,KETSANA,9235,0.239295,0.179733,0.098866,0.114486,0.006004,0.308338,0.646976,0.266692,...,0.058423,0.104104,1.0,0.28,0.59,0.13,0.006658,0.000000,0.0,0.042715
3,KETSANA,9236,0.242485,0.144241,0.126329,0.139054,0.008478,0.307074,0.605574,0.265275,...,0.033297,0.091221,1.0,0.24,0.62,0.14,0.008454,0.000000,0.0,0.209506
4,KETSANA,9237,0.244396,0.108748,0.143194,0.143739,0.005643,0.320598,0.637688,0.231547,...,0.028936,0.140126,1.0,0.00,0.89,0.11,0.002965,0.000000,0.0,0.029664
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6106,SAUDEL,18795,0.061937,0.877058,0.091758,0.077746,0.004009,0.358942,0.292576,0.487645,...,0.015329,0.265042,1.0,0.12,0.16,0.72,0.003767,0.009013,0.0,0.000000
6107,SAUDEL,18796,0.059568,0.907444,0.090530,0.066750,0.008603,0.548934,0.325208,0.513887,...,0.012607,0.089733,1.0,0.14,0.06,0.80,0.007193,0.007335,0.0,0.032423
6108,SAUDEL,18797,0.053634,0.938190,0.076702,0.057364,0.000619,0.329734,0.473382,0.475945,...,0.013689,0.077939,1.0,0.03,0.07,0.90,0.000113,0.011737,0.0,0.000000
6109,SAUDEL,18962,0.062720,0.858897,0.053956,0.054691,0.001009,0.372093,0.293731,0.446105,...,0.016159,0.095031,1.0,0.00,0.03,0.97,0.000749,0.010422,0.0,0.000000


In [28]:
# Read a CSV file including grid_id and mun_code and import to a df
df_weight = weight_file("/ggl_grid_to_mun_weights.csv")
df_weight.head()

Unnamed: 0,ADM3_PCODE,id_x,Centroid,numbuildings_x,id,numbuildings,weight
0,PH012801000,11049.0,120.9E_18.5N,1052,11049,1794,0.586399
1,PH012810000,11049.0,120.9E_18.5N,0,11049,1794,0.0
2,PH012815000,11049.0,120.9E_18.5N,742,11049,1794,0.413601
3,PH012801000,11050.0,120.9E_18.4N,193,11050,196,0.984694
4,PH012810000,11050.0,120.9E_18.4N,0,11050,196,0.0


In [29]:
# Change name of column ['id'] to ['grid_point_id'] the same name as in input df
df_weight.rename(columns={"id": "grid_point_id"}, inplace=True)

In [30]:
# join main df to the weight df based on grid_point_id
join_final = df_test.merge(df_weight, on="grid_point_id", how="left")

In [31]:
# Remove all columns between column index 21 to 25
join_final.drop(join_final.iloc[:, 23:27], inplace=True, axis=1)

In [32]:
# Multiply %damg and also %predicted_damg with total_houses and weight
join_final["weight*%damg*houses"] = (
    join_final["weight"]
    * join_final["percent_houses_damaged"]
    * join_final["total_houses"]
) / 100
join_final["weight*%predicted_damg*houses"] = (
    join_final["weight"] * join_final["y_pred"] * join_final["total_houses"]
) / 100

# Multiply total_houses with weight
join_final["weight*houses"] = (join_final["weight"] * join_final["total_houses"]) / 100

join_final

Unnamed: 0,typhoon_name,grid_point_id,wind_speed,track_distance,rainfall_max_6h,rainfall_max_24h,total_houses,rwi,strong_roof_strong_wall,strong_roof_light_wall,...,y_pred,ADM3_PCODE,id_x,Centroid,numbuildings_x,numbuildings,weight,weight*%damg*houses,weight*%predicted_damg*houses,weight*houses
0,KETSANA,9233,0.228997,0.250719,0.044070,0.044935,0.000753,0.462625,0.657219,0.284772,...,0.000000,PH015514000,9233.0,119.8E_16.4N,689,689,1.000000,0.0,0.000000e+00,0.000008
1,KETSANA,9234,0.236445,0.215226,0.071856,0.077553,0.005352,0.324779,0.657960,0.278739,...,0.024520,PH015508000,9234.0,119.8E_16.3N,1844,5089,0.362350,0.0,4.755256e-07,0.000019
2,KETSANA,9234,0.236445,0.215226,0.071856,0.077553,0.005352,0.324779,0.657960,0.278739,...,0.024520,PH015514000,9234.0,119.8E_16.3N,3245,5089,0.637650,0.0,8.368116e-07,0.000034
3,KETSANA,9235,0.239295,0.179733,0.098866,0.114486,0.006004,0.308338,0.646976,0.266692,...,0.042715,PH015501000,9235.0,119.8E_16.2N,1351,6106,0.221258,0.0,5.674695e-07,0.000013
4,KETSANA,9235,0.239295,0.179733,0.098866,0.114486,0.006004,0.308338,0.646976,0.266692,...,0.042715,PH015508000,9235.0,119.8E_16.2N,4755,6106,0.778742,0.0,1.997274e-06,0.000047
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18337,SAUDEL,18796,0.059568,0.907444,0.090530,0.066750,0.008603,0.548934,0.325208,0.513887,...,0.032423,PH082606000,18796.0,125.5E_12.0N,4070,5944,0.684724,0.0,1.909853e-06,0.000059
18338,SAUDEL,18797,0.053634,0.938190,0.076702,0.057364,0.000619,0.329734,0.473382,0.475945,...,0.000000,PH082622000,18797.0,125.5E_11.9N,463,463,1.000000,0.0,0.000000e+00,0.000006
18339,SAUDEL,18962,0.062720,0.858897,0.053956,0.054691,0.001009,0.372093,0.293731,0.446105,...,0.000000,PH082606000,18962.0,125.6E_12.1N,77,461,0.167028,0.0,0.000000e+00,0.000002
18340,SAUDEL,18962,0.062720,0.858897,0.053956,0.054691,0.001009,0.372093,0.293731,0.446105,...,0.000000,PH082617000,18962.0,125.6E_12.1N,384,461,0.832972,0.0,0.000000e+00,0.000008


In [33]:
# Read CSV file which includes regoin name and code
region_df = pd.read_csv(data_dir / "adm3_area.csv", index_col=0)
region_df.head()

Unnamed: 0,Shape_Leng,Shape_Area,ADM3_EN,ADM3_PCODE,ADM3_REF,ADM3ALT1EN,ADM3ALT2EN,ADM2_EN,ADM2_PCODE,ADM1_EN,ADM1_PCODE,ADM0_EN,ADM0_PCODE,date,validOn,validTo,geometry,Area
0,1.601219,0.063496,Aborlan,PH175301000,,,,Palawan,PH175300000,Region IV-B,PH170000000,Philippines (the),PH,2016-06-30,2020-05-29,,MULTIPOLYGON (((13200654.48649568 1032355.1025...,771120600.0
1,1.078749,0.050232,Abra de Ilog,PH175101000,,,,Occidental Mindoro,PH175100000,Region IV-B,PH170000000,Philippines (the),PH,2016-06-30,2020-05-29,,POLYGON ((13423362.387871413 1479551.980005401...,601914600.0
2,0.424301,0.006453,Abucay,PH030801000,,,,Bataan,PH030800000,Region III,PH030000000,Philippines (the),PH,2016-06-30,2020-05-29,,POLYGON ((13413856.918075956 1614138.946940594...,76889030.0
3,0.566053,0.011343,Abulug,PH021501000,,,,Cagayan,PH021500000,Region II,PH020000000,Philippines (the),PH,2016-06-30,2020-05-29,,"POLYGON ((13518031.78157248 2007651.089252317,...",132668200.0
4,1.013649,0.026124,Abuyog,PH083701000,,,,Leyte,PH083700000,Region VIII,PH080000000,Philippines (the),PH,2016-06-30,2020-05-29,,MULTIPOLYGON (((13917924.3505296 1180265.08047...,316175200.0


In [34]:
# join regoin_code column to the main df(join_final) based on mun_code
join_region_df = join_final.merge(
    region_df[["ADM1_EN", "ADM1_PCODE", "ADM3_PCODE"]],
    on="ADM3_PCODE",
    how="left",
)
join_region_df

Unnamed: 0,typhoon_name,grid_point_id,wind_speed,track_distance,rainfall_max_6h,rainfall_max_24h,total_houses,rwi,strong_roof_strong_wall,strong_roof_light_wall,...,id_x,Centroid,numbuildings_x,numbuildings,weight,weight*%damg*houses,weight*%predicted_damg*houses,weight*houses,ADM1_EN,ADM1_PCODE
0,KETSANA,9233,0.228997,0.250719,0.044070,0.044935,0.000753,0.462625,0.657219,0.284772,...,9233.0,119.8E_16.4N,689,689,1.000000,0.0,0.000000e+00,0.000008,Region I,PH010000000
1,KETSANA,9234,0.236445,0.215226,0.071856,0.077553,0.005352,0.324779,0.657960,0.278739,...,9234.0,119.8E_16.3N,1844,5089,0.362350,0.0,4.755256e-07,0.000019,Region I,PH010000000
2,KETSANA,9234,0.236445,0.215226,0.071856,0.077553,0.005352,0.324779,0.657960,0.278739,...,9234.0,119.8E_16.3N,3245,5089,0.637650,0.0,8.368116e-07,0.000034,Region I,PH010000000
3,KETSANA,9235,0.239295,0.179733,0.098866,0.114486,0.006004,0.308338,0.646976,0.266692,...,9235.0,119.8E_16.2N,1351,6106,0.221258,0.0,5.674695e-07,0.000013,Region I,PH010000000
4,KETSANA,9235,0.239295,0.179733,0.098866,0.114486,0.006004,0.308338,0.646976,0.266692,...,9235.0,119.8E_16.2N,4755,6106,0.778742,0.0,1.997274e-06,0.000047,Region I,PH010000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18337,SAUDEL,18796,0.059568,0.907444,0.090530,0.066750,0.008603,0.548934,0.325208,0.513887,...,18796.0,125.5E_12.0N,4070,5944,0.684724,0.0,1.909853e-06,0.000059,Region VIII,PH080000000
18338,SAUDEL,18797,0.053634,0.938190,0.076702,0.057364,0.000619,0.329734,0.473382,0.475945,...,18797.0,125.5E_11.9N,463,463,1.000000,0.0,0.000000e+00,0.000006,Region VIII,PH080000000
18339,SAUDEL,18962,0.062720,0.858897,0.053956,0.054691,0.001009,0.372093,0.293731,0.446105,...,18962.0,125.6E_12.1N,77,461,0.167028,0.0,0.000000e+00,0.000002,Region VIII,PH080000000
18340,SAUDEL,18962,0.062720,0.858897,0.053956,0.054691,0.001009,0.372093,0.293731,0.446105,...,18962.0,125.6E_12.1N,384,461,0.832972,0.0,0.000000e+00,0.000008,Region VIII,PH080000000


In [35]:
# Groupby by municipality with sum as the aggregation function
agg_df = join_region_df.groupby(
    ["ADM3_PCODE", "ADM1_PCODE", "typhoon_name"]
).agg(
    {
        "weight*%damg*houses": "sum",
        "weight*%predicted_damg*houses": "sum",
        "weight": "sum",
        "weight*houses": "sum",
    }
)
agg_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,weight*%damg*houses,weight*%predicted_damg*houses,weight,weight*houses
ADM3_PCODE,ADM1_PCODE,typhoon_name,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
PH012801000,PH010000000,BOPHA,0.000000,5.610564e-07,1.571093,0.000008
PH012801000,PH010000000,HAIMA,0.000003,1.302088e-05,1.571093,0.000008
PH012801000,PH010000000,MANGKHUT,0.000008,1.671826e-05,1.571093,0.000008
PH012801000,PH010000000,SAUDEL,0.000000,5.867593e-07,1.571093,0.000008
PH012802000,PH010000000,BOPHA,0.000000,0.000000e+00,0.724799,0.000135
...,...,...,...,...,...,...
PH175902000,PH170000000,KETSANA,0.000000,3.195157e-06,4.000000,0.000025
PH175905000,PH170000000,KETSANA,0.000000,2.012662e-06,1.000000,0.000018
PH175907000,PH170000000,BOPHA,0.000000,2.687087e-06,0.340199,0.000026
PH175914000,PH170000000,BOPHA,0.000000,3.842557e-06,2.551962,0.000045


In [36]:
# Normalize by the sum of the weights
agg_df["damg_houses_per_mun"] = (
    agg_df["weight*%damg*houses"] / agg_df["weight"]
)
agg_df["predicted_damg_houses_per_mun"] = (
    agg_df["weight*%predicted_damg*houses"] / agg_df["weight"]
)

agg_df["sum_of_weight_mun"] = agg_df["weight*houses"] / agg_df["weight"]

In [37]:
# Keep only %damg_normalized and %pred_damg_normalized columns
agg_df.drop(agg_df.columns[:4], inplace=True, axis=1)

In [38]:
# Groupby by regin with sum as the aggregation function
agg_df_1 = agg_df.groupby(["ADM1_PCODE", "typhoon_name"]).agg(
    {
        "damg_houses_per_mun": "sum",
        "predicted_damg_houses_per_mun": "sum",
        "sum_of_weight_mun": "sum",
    }
)
agg_df_1.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,damg_houses_per_mun,predicted_damg_houses_per_mun,sum_of_weight_mun
ADM1_PCODE,typhoon_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
PH010000000,BOPHA,2e-06,3.4e-05,0.025227
PH010000000,HAIMA,0.008584,0.002912,0.025226
PH010000000,KETSANA,1.6e-05,0.001282,0.025133
PH010000000,MANGKHUT,0.008383,0.004394,0.025226
PH010000000,SAUDEL,0.0,0.000824,0.025226


In [39]:
# Rename columns' names
agg_df_1 = agg_df_1.rename(
    columns={
        "damg_houses_per_mun": "damg_houses_per_Region",
        "predicted_damg_houses_per_mun": "predicted_damg_houses_per_Region",
        "sum_of_weight_mun": "sum_of_weight_region",
    }
)

In [40]:
# reset indexex
agg_df_2 = agg_df_1.reset_index()

In [41]:
# Estimate the percent difference of real and predicted damaged values  (First way)
agg_df_2["Percent_Difference_total_houses_based"] = (
    (
        agg_df_2["damg_houses_per_Region"]
        - agg_df_2["predicted_damg_houses_per_Region"]
    )
    / (
        agg_df_2["sum_of_weight_region"]
    )  # (agg_df_2["damg_houses_per_Region"] + np.finfo(float).eps)
) * 100

In [66]:
# Estimate the percent difference of real and predicted damaged values (Second way)
difference = (
    agg_df_2["damg_houses_per_Region"]
    - agg_df_2["predicted_damg_houses_per_Region"]
)
ave = (
    agg_df_2["damg_houses_per_Region"]
    + agg_df_2["predicted_damg_houses_per_Region"]
) / 2

agg_df_2["Percent_Difference_average_based"] = (difference / ave) * 100

In [67]:
agg_df_2 = agg_df_2[
    [
        "ADM1_PCODE",
        "typhoon_name",
        "Percent_Difference_total_houses_based",
        "Percent_Difference_average_based",
    ]
]

In [68]:
df_sorted = agg_df_2.sort_values(
    by=["typhoon_name"], ascending=-True
).reset_index(drop=True)
df_sorted

Unnamed: 0,ADM1_PCODE,typhoon_name,Percent_Difference_total_houses_based,Percent_Difference_average_based
0,PH010000000,BOPHA,-0.127543,-174.982917
1,PH120000000,BOPHA,-1.253007,-185.686083
2,PH110000000,BOPHA,83.751252,17.553016
3,PH100000000,BOPHA,-355.292351,-185.758949
4,PH170000000,BOPHA,-22.15011,-110.394098
5,PH020000000,BOPHA,-0.248335,-200.0
6,PH090000000,BOPHA,-69.823665,-199.928579
7,PH080000000,BOPHA,-4.429423,-200.0
8,PH070000000,BOPHA,-30.785225,-151.953915
9,PH160000000,BOPHA,211.166748,94.730554
