# RMSE estimation for region(adm1)

We used a weight and adm3_area files to join weight and adm_1(region) and adm_3(municipality) to our main dataset.
We prepared a dataframe that represents real and damaged value per region(ADM1). Then we train our model(XGBoost Reduced Overfitting) to this input data while we splitted five typhoons(randomly selected) as the test set and the rest of them as the train set.
The final goal is to estimate the difference between real and predicted damage value per region with respect to each typhoon, to check how the model performs for a wide area.

In [1]:
%load_ext jupyter_black

In [2]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt

from collections import defaultdict
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from xgboost.sklearn import XGBRegressor
from sklearn.metrics import mean_squared_error
from math import sqrt
import statistics

from utils import get_training_dataset, weight_file

  from pandas import MultiIndex, Int64Index


In [3]:
# Read csv file and import to df
df = get_training_dataset()

# Move target to be the last column for simplicity
df = df.reindex(
    columns=[col for col in df.columns if col != "percent_houses_damaged"]
    + ["percent_houses_damaged"]
)

df

Unnamed: 0,typhoon_name,typhoon_year,grid_point_id,wind_speed,track_distance,rainfall_max_6h,rainfall_max_24h,total_houses,rwi,mean_slope,...,std_tri,mean_elev,coast_length,with_coast,urban,rural,water,total_pop,percent_houses_damaged_5years,percent_houses_damaged
0,DURIAN,2006,101,0.0,303.180555,0.122917,0.085417,31.000000,,1.018526,...,2.699781,5.762712,3445.709753,1,0.00,0.000000,1.000000,0.000000,0.000000,0.0
1,DURIAN,2006,4475,0.0,638.027502,0.091667,0.027083,3.301020,-0.527000,1.579400,...,4.585088,12.799127,8602.645832,1,0.00,0.000000,1.000000,0.000000,0.000000,0.0
2,DURIAN,2006,4639,0.0,603.631997,0.535417,0.146354,12.103741,-0.283000,0.551764,...,1.527495,8.833333,5084.012925,1,0.00,0.010000,0.990000,197.339034,0.000000,0.0
3,DURIAN,2006,4640,0.0,614.675270,0.356250,0.101562,645.899660,-0.358889,2.107949,...,11.677657,17.530431,55607.865950,1,0.00,0.310000,0.690000,4970.477311,0.000000,0.0
4,DURIAN,2006,4641,0.0,625.720905,0.202083,0.057812,1071.731293,-0.462800,3.538881,...,17.074011,31.931338,35529.342507,1,0.00,0.770000,0.230000,12408.594656,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
141253,MOLAVE,2020,20677,0.0,644.575831,2.543750,0.778646,4449.357133,0.508167,3.790141,...,18.012771,36.304688,21559.003490,1,0.08,0.080000,0.840000,17619.701390,0.000000,0.0
141254,MOLAVE,2020,20678,0.0,655.685233,2.558333,0.861458,1521.435795,-0.174100,3.532580,...,13.163042,65.687266,12591.742022,1,0.00,0.420000,0.580000,5623.069564,0.000000,0.0
141255,MOLAVE,2020,20679,0.0,666.794635,2.975000,0.949479,930.647069,-0.244286,4.444498,...,10.901755,37.414996,19740.596834,1,0.00,0.109091,0.890909,5912.671746,0.015207,0.0
141256,MOLAVE,2020,20680,0.0,677.904037,2.889583,1.083333,1800.666044,0.038000,5.816195,...,17.917650,105.812452,26363.303778,1,0.03,0.250000,0.720000,11254.164413,0.020806,0.0


In [4]:
# df.loc[df["typhoon_name"] == "GONI"]

In [5]:
# Fill the missing values of RWI with mean value
df["rwi"].fillna(df["rwi"].mean(), inplace=True)

In [6]:
# Set any values >100% to 100%,
for i in range(len(df)):
    if df.loc[i, "percent_houses_damaged"] > 100:
        df.at[i, "percent_houses_damaged"] = float(100)

In [7]:
# Remove zeros from wind_speed
df = df[(df[["wind_speed"]] != 0).any(axis=1)]
df.reset_index(drop=True, inplace=True)
df = df.drop(columns=["typhoon_year"])
df.head()

Unnamed: 0,typhoon_name,grid_point_id,wind_speed,track_distance,rainfall_max_6h,rainfall_max_24h,total_houses,rwi,mean_slope,std_slope,...,std_tri,mean_elev,coast_length,with_coast,urban,rural,water,total_pop,percent_houses_damaged_5years,percent_houses_damaged
0,DURIAN,8284,12.460039,275.018491,0.670833,0.313021,0.479848,-0.213039,12.896581,7.450346,...,34.62955,42.21875,5303.65949,1,0.0,0.0,1.0,0.0,0.0,0.0
1,DURIAN,8286,11.428974,297.027578,0.929167,0.343229,55.649739,0.206,14.070741,6.514647,...,25.475388,72.283154,61015.543599,1,0.0,0.14,0.86,276.871504,0.0,0.0
2,DURIAN,8450,13.077471,262.598363,0.716667,0.424479,8.157414,-0.636,19.758682,10.9407,...,54.353996,102.215198,66707.43807,1,0.0,0.11,0.89,448.539453,0.0,0.0
3,DURIAN,8451,12.511864,273.63933,0.56875,0.336979,88.292015,-0.2275,11.499097,6.901584,...,31.814048,58.988877,53841.050168,1,0.0,0.12,0.88,2101.708435,0.0,0.0
4,DURIAN,8452,11.977511,284.680297,0.589583,0.290625,962.766739,-0.299667,13.866633,6.528689,...,25.976413,111.386527,87378.257957,1,0.07,0.46,0.47,11632.726327,0.0,0.0


In [8]:
# Define bins for data stratification
bins2 = [0, 0.00009, 1, 10, 50, 101]
samples_per_bin2, binsP2 = np.histogram(df["percent_houses_damaged"], bins=bins2)

In [9]:
# Check the bins' intervalls
df["percent_houses_damaged"].value_counts(bins=binsP2)

(-0.001, 9e-05]    38901
(9e-05, 1.0]        7232
(1.0, 10.0]         2552
(10.0, 50.0]         925
(50.0, 101.0]        144
Name: percent_houses_damaged, dtype: int64

In [10]:
bin_index2 = np.digitize(df["percent_houses_damaged"], bins=binsP2)

In [11]:
y_input_strat = bin_index2

In [12]:
# Use MinMaxScaler function for data standardization (it normalaize data in range of [0,1] and not negative values)

# Separate typhoon from other features
dfs = np.split(df, [2], axis=1)
dfa = np.split(dfs[1], [14], axis=1)
# print(dfs[0], dfs[1], dfa[0], dfa[1])

# Standardaize data
scaler = MinMaxScaler().fit(dfa[0])
X1 = scaler.transform(dfa[0])
Xnew = pd.DataFrame(X1)
Xnew_per_pred = pd.DataFrame(X1)
display(Xnew)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,0.168433,0.879044,0.010371,0.009660,2.334511e-07,0.330964,0.421621,0.526082,0.478800,0.413882,0.028344,0.031312,1.0,0.00
1,0.154462,0.949392,0.014378,0.010594,9.712553e-05,0.504983,0.460007,0.460011,0.440663,0.304474,0.043049,0.360224,1.0,0.00
2,0.176799,0.839346,0.011082,0.013105,1.371717e-05,0.155316,0.645959,0.772542,0.670175,0.649623,0.057690,0.393828,1.0,0.00
3,0.169135,0.874636,0.008788,0.010400,1.544535e-04,0.324958,0.375933,0.487333,0.383667,0.380232,0.036547,0.317867,1.0,0.00
4,0.161895,0.909926,0.009111,0.008968,1.690249e-03,0.294989,0.453334,0.461002,0.421247,0.310462,0.062176,0.515864,1.0,0.07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49749,0.108159,0.816770,0.034958,0.032650,7.311641e-05,0.111296,0.327556,0.563018,0.331492,0.433746,0.026779,0.011391,1.0,0.01
49750,0.102816,0.852281,0.033634,0.028851,2.854586e-03,0.379331,0.129550,0.275946,0.129081,0.208269,0.019943,0.271370,1.0,0.05
49751,0.096754,0.887792,0.029724,0.025744,1.096340e-03,0.498339,0.129227,0.261104,0.138776,0.204722,0.016272,0.087050,1.0,0.01
49752,0.092212,0.923300,0.058092,0.035882,3.178534e-05,0.286545,0.132543,0.177055,0.156689,0.151533,0.015221,0.031742,1.0,0.00


In [13]:
dfa[1] = dfa[1].astype(float)

In [14]:
Xnew = pd.concat([Xnew.reset_index(drop=True), dfa[1].reset_index(drop=True)], axis=1)
Xnew

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,rural,water,total_pop,percent_houses_damaged_5years,percent_houses_damaged
0,0.168433,0.879044,0.010371,0.009660,2.334511e-07,0.330964,0.421621,0.526082,0.478800,0.413882,0.028344,0.031312,1.0,0.00,0.000000,1.000000,0.000000,0.000000,0.0
1,0.154462,0.949392,0.014378,0.010594,9.712553e-05,0.504983,0.460007,0.460011,0.440663,0.304474,0.043049,0.360224,1.0,0.00,0.140000,0.860000,276.871504,0.000000,0.0
2,0.176799,0.839346,0.011082,0.013105,1.371717e-05,0.155316,0.645959,0.772542,0.670175,0.649623,0.057690,0.393828,1.0,0.00,0.110000,0.890000,448.539453,0.000000,0.0
3,0.169135,0.874636,0.008788,0.010400,1.544535e-04,0.324958,0.375933,0.487333,0.383667,0.380232,0.036547,0.317867,1.0,0.00,0.120000,0.880000,2101.708435,0.000000,0.0
4,0.161895,0.909926,0.009111,0.008968,1.690249e-03,0.294989,0.453334,0.461002,0.421247,0.310462,0.062176,0.515864,1.0,0.07,0.460000,0.470000,11632.726327,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49749,0.108159,0.816770,0.034958,0.032650,7.311641e-05,0.111296,0.327556,0.563018,0.331492,0.433746,0.026779,0.011391,1.0,0.01,0.000000,0.990000,330.215768,1.143833,0.0
49750,0.102816,0.852281,0.033634,0.028851,2.854586e-03,0.379331,0.129550,0.275946,0.129081,0.208269,0.019943,0.271370,1.0,0.05,0.190000,0.760000,5409.607943,1.143833,0.0
49751,0.096754,0.887792,0.029724,0.025744,1.096340e-03,0.498339,0.129227,0.261104,0.138776,0.204722,0.016272,0.087050,1.0,0.01,0.020000,0.970000,5378.401365,1.143833,0.0
49752,0.092212,0.923300,0.058092,0.035882,3.178534e-05,0.286545,0.132543,0.177055,0.156689,0.151533,0.015221,0.031742,1.0,0.00,0.027273,0.972727,914.677196,1.143833,0.0


In [15]:
features = [
    "wind_speed",
    "track_distance",
    "total_houses",
    "rainfall_max_6h",
    "rainfall_max_24h",
    "rwi",
    "mean_slope",
    "std_slope",
    "mean_tri",
    "std_tri",
    "mean_elev",
    "coast_length",
    "with_coast",
    "urban",
    "rural",
    "water",
    "total_pop",
    "percent_houses_damaged_5years",
]

In [16]:
# Add the features to the columns' headers after standardization
i = 0
for feature in features:
    Xnew = Xnew.rename(columns={i: feature})
    i += 1

Xnew = pd.concat([dfs[0].reset_index(drop=True), Xnew.reset_index(drop=True)], axis=1)
Xnew

Unnamed: 0,typhoon_name,grid_point_id,wind_speed,track_distance,total_houses,rainfall_max_6h,rainfall_max_24h,rwi,mean_slope,std_slope,...,std_tri,mean_elev,coast_length,with_coast,urban,rural,water,total_pop,percent_houses_damaged_5years,percent_houses_damaged
0,DURIAN,8284,0.168433,0.879044,0.010371,0.009660,2.334511e-07,0.330964,0.421621,0.526082,...,0.413882,0.028344,0.031312,1.0,0.00,0.000000,1.000000,0.000000,0.000000,0.0
1,DURIAN,8286,0.154462,0.949392,0.014378,0.010594,9.712553e-05,0.504983,0.460007,0.460011,...,0.304474,0.043049,0.360224,1.0,0.00,0.140000,0.860000,276.871504,0.000000,0.0
2,DURIAN,8450,0.176799,0.839346,0.011082,0.013105,1.371717e-05,0.155316,0.645959,0.772542,...,0.649623,0.057690,0.393828,1.0,0.00,0.110000,0.890000,448.539453,0.000000,0.0
3,DURIAN,8451,0.169135,0.874636,0.008788,0.010400,1.544535e-04,0.324958,0.375933,0.487333,...,0.380232,0.036547,0.317867,1.0,0.00,0.120000,0.880000,2101.708435,0.000000,0.0
4,DURIAN,8452,0.161895,0.909926,0.009111,0.008968,1.690249e-03,0.294989,0.453334,0.461002,...,0.310462,0.062176,0.515864,1.0,0.07,0.460000,0.470000,11632.726327,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49749,MOLAVE,19306,0.108159,0.816770,0.034958,0.032650,7.311641e-05,0.111296,0.327556,0.563018,...,0.433746,0.026779,0.011391,1.0,0.01,0.000000,0.990000,330.215768,1.143833,0.0
49750,MOLAVE,19307,0.102816,0.852281,0.033634,0.028851,2.854586e-03,0.379331,0.129550,0.275946,...,0.208269,0.019943,0.271370,1.0,0.05,0.190000,0.760000,5409.607943,1.143833,0.0
49751,MOLAVE,19308,0.096754,0.887792,0.029724,0.025744,1.096340e-03,0.498339,0.129227,0.261104,...,0.204722,0.016272,0.087050,1.0,0.01,0.020000,0.970000,5378.401365,1.143833,0.0
49752,MOLAVE,19476,0.092212,0.923300,0.058092,0.035882,3.178534e-05,0.286545,0.132543,0.177055,...,0.151533,0.015221,0.031742,1.0,0.00,0.027273,0.972727,914.677196,1.143833,0.0


In [17]:
df["typhoon_name"].unique()

array(['DURIAN', 'FENGSHEN', 'KETSANA', 'CONSON', 'NESAT', 'BOPHA',
       'NARI', 'KROSA', 'HAIYAN', 'USAGI', 'UTOR', 'JANGMI', 'KALMAEGI',
       'RAMMASUN', 'HAGUPIT', 'FUNG-WONG', 'LINGLING', 'MUJIGAE', 'MELOR',
       'NOUL', 'GONI', 'LINFA', 'KOPPU', 'MEKKHALA', 'HAIMA', 'TOKAGE',
       'MERANTI', 'NOCK-TEN', 'SARIKA', 'MANGKHUT', 'YUTU', 'KAMMURI',
       'NAKRI', 'PHANFONE', 'SAUDEL', 'VAMCO', 'VONGFONG', 'MOLAVE'],
      dtype=object)

In [18]:
# Define a test_list (including 5 typhoons) randomly were chosen
test_list_1 = ["FENGSHEN", "DURIAN", "NESAT", "VONGFONG", "MOLAVE"]

test_list_2 = ["YUTU", "KAMMURI", "SARIKA", "TOKAGE", "LINGLING"]

test_list_3 = ["SAUDEL", "MANGKHUT", "HAIMA", "BOPHA", "KETSANA"]

test_list_4 = ["GONI", "LINFA", "NOCK-TEN", "NOUL", "JANGMI"]

test_list_5 = ["NAKRI", "UTOR", "HAIYAN", "RAMMASUN", "CONSON"]

test_list_6 = ["PHANFONE", "VAMCO", "KOPPU", "FUNG-WONG", "HAGUPIT"]

test_list_7 = ["MEKKHALA", "NARI", "KROSA", "USAGI", "KALMAEGI"]

In [19]:
# Extract the column of unique ids
grid_id = df["grid_point_id"]

In [20]:
df_test = pd.DataFrame(
    Xnew,
    columns=[
        "typhoon_name",
        "grid_point_id",
        "wind_speed",
        "track_distance",
        "total_houses",
        "rainfall_max_6h",
        "rainfall_max_24h",
        "rwi",
        "mean_slope",
        "std_slope",
        "mean_tri",
        "std_tri",
        "mean_elev",
        "coast_length",
        "with_coast",
        "urban",
        "rural",
        "water",
        "total_pop",
        "percent_houses_damaged_5years",
        "percent_houses_damaged",
    ],
)

df_test = Xnew[Xnew["typhoon_name"] == test_list_3[4]]
df_test = df_test.append(Xnew[Xnew["typhoon_name"] == test_list_3[3]])
df_test = df_test.append(Xnew[Xnew["typhoon_name"] == test_list_3[2]])
df_test = df_test.append(Xnew[Xnew["typhoon_name"] == test_list_3[1]])
df_test = df_test.append(Xnew[Xnew["typhoon_name"] == test_list_3[0]])

Xnew.drop(Xnew.index[Xnew["typhoon_name"] == test_list_3[4]], inplace=True)
Xnew.drop(Xnew.index[Xnew["typhoon_name"] == test_list_3[3]], inplace=True)
Xnew.drop(Xnew.index[Xnew["typhoon_name"] == test_list_3[2]], inplace=True)
Xnew.drop(Xnew.index[Xnew["typhoon_name"] == test_list_3[1]], inplace=True)
Xnew.drop(Xnew.index[Xnew["typhoon_name"] == test_list_3[0]], inplace=True)

display(df_test)
df_train = Xnew
display(df_train)

  df_test = df_test.append(Xnew[Xnew["typhoon_name"] == test_list_3[3]])
  df_test = df_test.append(Xnew[Xnew["typhoon_name"] == test_list_3[2]])
  df_test = df_test.append(Xnew[Xnew["typhoon_name"] == test_list_3[1]])
  df_test = df_test.append(Xnew[Xnew["typhoon_name"] == test_list_3[0]])


Unnamed: 0,typhoon_name,grid_point_id,wind_speed,track_distance,total_houses,rainfall_max_6h,rainfall_max_24h,rwi,mean_slope,std_slope,...,std_tri,mean_elev,coast_length,with_coast,urban,rural,water,total_pop,percent_houses_damaged_5years,percent_houses_damaged
3804,KETSANA,9233,0.228997,0.250719,0.044070,0.044935,0.000753,0.462625,0.069010,0.090805,...,0.065630,0.020180,0.031687,1.0,0.05,0.03,0.92,3893.053124,0.000000,0.0
3805,KETSANA,9234,0.236445,0.215226,0.071856,0.077553,0.005352,0.324779,0.084089,0.189581,...,0.141129,0.060336,0.091281,1.0,0.10,0.60,0.30,13238.460497,0.000000,0.0
3806,KETSANA,9235,0.239295,0.179733,0.098866,0.114486,0.006004,0.308338,0.139945,0.291882,...,0.214061,0.058423,0.104104,1.0,0.28,0.59,0.13,21410.246051,0.000000,0.0
3807,KETSANA,9236,0.242485,0.144241,0.126329,0.139054,0.008478,0.307074,0.147618,0.257436,...,0.184985,0.033297,0.091221,1.0,0.24,0.62,0.14,27185.054763,0.000000,0.0
3808,KETSANA,9237,0.244396,0.108748,0.143194,0.143739,0.005643,0.320598,0.088132,0.140876,...,0.103623,0.028936,0.140126,1.0,0.00,0.89,0.11,9535.117048,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43270,SAUDEL,18795,0.061937,0.877058,0.091758,0.077746,0.004009,0.358942,0.094337,0.197095,...,0.158586,0.015329,0.265042,1.0,0.12,0.16,0.72,12112.204272,1.133269,0.0
43271,SAUDEL,18796,0.059568,0.907444,0.090530,0.066750,0.008603,0.548934,0.043329,0.115342,...,0.095746,0.012607,0.089733,1.0,0.14,0.06,0.80,23128.451605,0.922223,0.0
43272,SAUDEL,18797,0.053634,0.938190,0.076702,0.057364,0.000619,0.329734,0.066634,0.119561,...,0.088262,0.013689,0.077939,1.0,0.03,0.07,0.90,361.762983,1.475799,0.0
43273,SAUDEL,18962,0.062720,0.858897,0.053956,0.054691,0.001009,0.372093,0.106989,0.150028,...,0.117307,0.016159,0.095031,1.0,0.00,0.03,0.97,2407.611398,1.310422,0.0


Unnamed: 0,typhoon_name,grid_point_id,wind_speed,track_distance,total_houses,rainfall_max_6h,rainfall_max_24h,rwi,mean_slope,std_slope,...,std_tri,mean_elev,coast_length,with_coast,urban,rural,water,total_pop,percent_houses_damaged_5years,percent_houses_damaged
0,DURIAN,8284,0.168433,0.879044,0.010371,0.009660,2.334511e-07,0.330964,0.421621,0.526082,...,0.413882,0.028344,0.031312,1.0,0.00,0.000000,1.000000,0.000000,0.000000,0.0
1,DURIAN,8286,0.154462,0.949392,0.014378,0.010594,9.712553e-05,0.504983,0.460007,0.460011,...,0.304474,0.043049,0.360224,1.0,0.00,0.140000,0.860000,276.871504,0.000000,0.0
2,DURIAN,8450,0.176799,0.839346,0.011082,0.013105,1.371717e-05,0.155316,0.645959,0.772542,...,0.649623,0.057690,0.393828,1.0,0.00,0.110000,0.890000,448.539453,0.000000,0.0
3,DURIAN,8451,0.169135,0.874636,0.008788,0.010400,1.544535e-04,0.324958,0.375933,0.487333,...,0.380232,0.036547,0.317867,1.0,0.00,0.120000,0.880000,2101.708435,0.000000,0.0
4,DURIAN,8452,0.161895,0.909926,0.009111,0.008968,1.690249e-03,0.294989,0.453334,0.461002,...,0.310462,0.062176,0.515864,1.0,0.07,0.460000,0.470000,11632.726327,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49749,MOLAVE,19306,0.108159,0.816770,0.034958,0.032650,7.311641e-05,0.111296,0.327556,0.563018,...,0.433746,0.026779,0.011391,1.0,0.01,0.000000,0.990000,330.215768,1.143833,0.0
49750,MOLAVE,19307,0.102816,0.852281,0.033634,0.028851,2.854586e-03,0.379331,0.129550,0.275946,...,0.208269,0.019943,0.271370,1.0,0.05,0.190000,0.760000,5409.607943,1.143833,0.0
49751,MOLAVE,19308,0.096754,0.887792,0.029724,0.025744,1.096340e-03,0.498339,0.129227,0.261104,...,0.204722,0.016272,0.087050,1.0,0.01,0.020000,0.970000,5378.401365,1.143833,0.0
49752,MOLAVE,19476,0.092212,0.923300,0.058092,0.035882,3.178534e-05,0.286545,0.132543,0.177055,...,0.151533,0.015221,0.031742,1.0,0.00,0.027273,0.972727,914.677196,1.143833,0.0


In [21]:
df_test["typhoon_name"].unique()

array(['KETSANA', 'BOPHA', 'HAIMA', 'MANGKHUT', 'SAUDEL'], dtype=object)

In [22]:
# Split X and y from dataframe features
X_test = df_test[features]
X_train = df_train[features]

y_train = df_train["percent_houses_damaged"]
y_test = df_test["percent_houses_damaged"]

In [23]:
# Define XGBoost Reduced Overfitting model
xgb = XGBRegressor(
    base_score=0.5,
    booster="gbtree",
    colsample_bylevel=0.8,
    colsample_bynode=0.8,
    colsample_bytree=0.8,
    gamma=3,
    eta=0.01,
    importance_type="gain",
    learning_rate=0.1,
    max_delta_step=0,
    max_depth=4,
    min_child_weight=1,
    missing=1,
    n_estimators=100,
    early_stopping_rounds=10,
    n_jobs=1,
    nthread=None,
    objective="reg:squarederror",
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    seed=None,
    silent=None,
    subsample=0.8,
    verbosity=1,
    eval_metric=["rmse", "logloss"],
    random_state=0,
)

eval_set = [(X_test, y_test)]
xgb_model = xgb.fit(X_train, y_train, eval_set=eval_set, verbose=False)

X2 = sm.add_constant(X_train)
est = sm.OLS(y_train, X2)
est2 = est.fit()
print(est2.summary())

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


                              OLS Regression Results                              
Dep. Variable:     percent_houses_damaged   R-squared:                       0.215
Model:                                OLS   Adj. R-squared:                  0.215
Method:                     Least Squares   F-statistic:                     704.8
Date:                    Thu, 23 Mar 2023   Prob (F-statistic):               0.00
Time:                            18:45:40   Log-Likelihood:            -1.2620e+05
No. Observations:                   43643   AIC:                         2.524e+05
Df Residuals:                       43625   BIC:                         2.526e+05
Df Model

In [24]:
# Make prediction
y_pred_train = xgb.predict(X_train)
y_pred_train_clipped = y_pred_train.clip(0, 100)

y_pred = xgb.predict(X_test)
y_pred_clipped = y_pred.clip(0, 100)

In [25]:
y_pred = y_pred_clipped.tolist()
y_true = df_test["percent_houses_damaged"].tolist()

In [26]:
df_test.reset_index(drop=True, inplace=True)
for i in range(len(df_test)):
    df_test.at[i, "y_pred"] = y_pred[i]
df_test

Unnamed: 0,typhoon_name,grid_point_id,wind_speed,track_distance,total_houses,rainfall_max_6h,rainfall_max_24h,rwi,mean_slope,std_slope,...,mean_elev,coast_length,with_coast,urban,rural,water,total_pop,percent_houses_damaged_5years,percent_houses_damaged,y_pred
0,KETSANA,9233,0.228997,0.250719,0.044070,0.044935,0.000753,0.462625,0.069010,0.090805,...,0.020180,0.031687,1.0,0.05,0.03,0.92,3893.053124,0.000000,0.0,0.025106
1,KETSANA,9234,0.236445,0.215226,0.071856,0.077553,0.005352,0.324779,0.084089,0.189581,...,0.060336,0.091281,1.0,0.10,0.60,0.30,13238.460497,0.000000,0.0,0.062428
2,KETSANA,9235,0.239295,0.179733,0.098866,0.114486,0.006004,0.308338,0.139945,0.291882,...,0.058423,0.104104,1.0,0.28,0.59,0.13,21410.246051,0.000000,0.0,0.087257
3,KETSANA,9236,0.242485,0.144241,0.126329,0.139054,0.008478,0.307074,0.147618,0.257436,...,0.033297,0.091221,1.0,0.24,0.62,0.14,27185.054763,0.000000,0.0,0.166181
4,KETSANA,9237,0.244396,0.108748,0.143194,0.143739,0.005643,0.320598,0.088132,0.140876,...,0.028936,0.140126,1.0,0.00,0.89,0.11,9535.117048,0.000000,0.0,0.567609
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6106,SAUDEL,18795,0.061937,0.877058,0.091758,0.077746,0.004009,0.358942,0.094337,0.197095,...,0.015329,0.265042,1.0,0.12,0.16,0.72,12112.204272,1.133269,0.0,0.000000
6107,SAUDEL,18796,0.059568,0.907444,0.090530,0.066750,0.008603,0.548934,0.043329,0.115342,...,0.012607,0.089733,1.0,0.14,0.06,0.80,23128.451605,0.922223,0.0,0.000000
6108,SAUDEL,18797,0.053634,0.938190,0.076702,0.057364,0.000619,0.329734,0.066634,0.119561,...,0.013689,0.077939,1.0,0.03,0.07,0.90,361.762983,1.475799,0.0,0.008930
6109,SAUDEL,18962,0.062720,0.858897,0.053956,0.054691,0.001009,0.372093,0.106989,0.150028,...,0.016159,0.095031,1.0,0.00,0.03,0.97,2407.611398,1.310422,0.0,0.005853


In [27]:
# Read a CSV file including grid_id and mun_code and import to a df
df_weight = weight_file("/ggl_grid_to_mun_weights.csv")
df_weight.head()

Unnamed: 0,ADM3_PCODE,id_x,Centroid,numbuildings_x,id,numbuildings,weight
0,PH012801000,11049.0,120.9E_18.5N,1052,11049,1794,0.586399
1,PH012810000,11049.0,120.9E_18.5N,0,11049,1794,0.0
2,PH012815000,11049.0,120.9E_18.5N,742,11049,1794,0.413601
3,PH012801000,11050.0,120.9E_18.4N,193,11050,196,0.984694
4,PH012810000,11050.0,120.9E_18.4N,0,11050,196,0.0


In [28]:
# Change name of column ['id'] to ['grid_point_id'] the same name as in input df
df_weight.rename(columns={"id": "grid_point_id"}, inplace=True)
df_weight.head()

Unnamed: 0,ADM3_PCODE,id_x,Centroid,numbuildings_x,grid_point_id,numbuildings,weight
0,PH012801000,11049.0,120.9E_18.5N,1052,11049,1794,0.586399
1,PH012810000,11049.0,120.9E_18.5N,0,11049,1794,0.0
2,PH012815000,11049.0,120.9E_18.5N,742,11049,1794,0.413601
3,PH012801000,11050.0,120.9E_18.4N,193,11050,196,0.984694
4,PH012810000,11050.0,120.9E_18.4N,0,11050,196,0.0


In [29]:
# join main df to the weight df based on grid_point_id
join_final = df_test.merge(df_weight, on="grid_point_id", how="left")
join_final

Unnamed: 0,typhoon_name,grid_point_id,wind_speed,track_distance,total_houses,rainfall_max_6h,rainfall_max_24h,rwi,mean_slope,std_slope,...,total_pop,percent_houses_damaged_5years,percent_houses_damaged,y_pred,ADM3_PCODE,id_x,Centroid,numbuildings_x,numbuildings,weight
0,KETSANA,9233,0.228997,0.250719,0.044070,0.044935,0.000753,0.462625,0.069010,0.090805,...,3893.053124,0.000000,0.0,0.025106,PH015514000,9233.0,119.8E_16.4N,689,689,1.000000
1,KETSANA,9234,0.236445,0.215226,0.071856,0.077553,0.005352,0.324779,0.084089,0.189581,...,13238.460497,0.000000,0.0,0.062428,PH015508000,9234.0,119.8E_16.3N,1844,5089,0.362350
2,KETSANA,9234,0.236445,0.215226,0.071856,0.077553,0.005352,0.324779,0.084089,0.189581,...,13238.460497,0.000000,0.0,0.062428,PH015514000,9234.0,119.8E_16.3N,3245,5089,0.637650
3,KETSANA,9235,0.239295,0.179733,0.098866,0.114486,0.006004,0.308338,0.139945,0.291882,...,21410.246051,0.000000,0.0,0.087257,PH015501000,9235.0,119.8E_16.2N,1351,6106,0.221258
4,KETSANA,9235,0.239295,0.179733,0.098866,0.114486,0.006004,0.308338,0.139945,0.291882,...,21410.246051,0.000000,0.0,0.087257,PH015508000,9235.0,119.8E_16.2N,4755,6106,0.778742
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18337,SAUDEL,18796,0.059568,0.907444,0.090530,0.066750,0.008603,0.548934,0.043329,0.115342,...,23128.451605,0.922223,0.0,0.000000,PH082606000,18796.0,125.5E_12.0N,4070,5944,0.684724
18338,SAUDEL,18797,0.053634,0.938190,0.076702,0.057364,0.000619,0.329734,0.066634,0.119561,...,361.762983,1.475799,0.0,0.008930,PH082622000,18797.0,125.5E_11.9N,463,463,1.000000
18339,SAUDEL,18962,0.062720,0.858897,0.053956,0.054691,0.001009,0.372093,0.106989,0.150028,...,2407.611398,1.310422,0.0,0.005853,PH082606000,18962.0,125.6E_12.1N,77,461,0.167028
18340,SAUDEL,18962,0.062720,0.858897,0.053956,0.054691,0.001009,0.372093,0.106989,0.150028,...,2407.611398,1.310422,0.0,0.005853,PH082617000,18962.0,125.6E_12.1N,384,461,0.832972


In [30]:
# Remove all columns between column index 21 to 25
join_final.drop(join_final.iloc[:, 23:27], inplace=True, axis=1)
join_final

Unnamed: 0,typhoon_name,grid_point_id,wind_speed,track_distance,total_houses,rainfall_max_6h,rainfall_max_24h,rwi,mean_slope,std_slope,...,with_coast,urban,rural,water,total_pop,percent_houses_damaged_5years,percent_houses_damaged,y_pred,ADM3_PCODE,weight
0,KETSANA,9233,0.228997,0.250719,0.044070,0.044935,0.000753,0.462625,0.069010,0.090805,...,1.0,0.05,0.03,0.92,3893.053124,0.000000,0.0,0.025106,PH015514000,1.000000
1,KETSANA,9234,0.236445,0.215226,0.071856,0.077553,0.005352,0.324779,0.084089,0.189581,...,1.0,0.10,0.60,0.30,13238.460497,0.000000,0.0,0.062428,PH015508000,0.362350
2,KETSANA,9234,0.236445,0.215226,0.071856,0.077553,0.005352,0.324779,0.084089,0.189581,...,1.0,0.10,0.60,0.30,13238.460497,0.000000,0.0,0.062428,PH015514000,0.637650
3,KETSANA,9235,0.239295,0.179733,0.098866,0.114486,0.006004,0.308338,0.139945,0.291882,...,1.0,0.28,0.59,0.13,21410.246051,0.000000,0.0,0.087257,PH015501000,0.221258
4,KETSANA,9235,0.239295,0.179733,0.098866,0.114486,0.006004,0.308338,0.139945,0.291882,...,1.0,0.28,0.59,0.13,21410.246051,0.000000,0.0,0.087257,PH015508000,0.778742
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18337,SAUDEL,18796,0.059568,0.907444,0.090530,0.066750,0.008603,0.548934,0.043329,0.115342,...,1.0,0.14,0.06,0.80,23128.451605,0.922223,0.0,0.000000,PH082606000,0.684724
18338,SAUDEL,18797,0.053634,0.938190,0.076702,0.057364,0.000619,0.329734,0.066634,0.119561,...,1.0,0.03,0.07,0.90,361.762983,1.475799,0.0,0.008930,PH082622000,1.000000
18339,SAUDEL,18962,0.062720,0.858897,0.053956,0.054691,0.001009,0.372093,0.106989,0.150028,...,1.0,0.00,0.03,0.97,2407.611398,1.310422,0.0,0.005853,PH082606000,0.167028
18340,SAUDEL,18962,0.062720,0.858897,0.053956,0.054691,0.001009,0.372093,0.106989,0.150028,...,1.0,0.00,0.03,0.97,2407.611398,1.310422,0.0,0.005853,PH082617000,0.832972


In [31]:
# Multiply %damg and also %predicted_damg with total_houses and weight
join_final["weight*%damg*houses"] = (
    join_final["weight"]
    * join_final["percent_houses_damaged"]
    * join_final["total_houses"]
) / 100
join_final["weight*%predicted_damg*houses"] = (
    join_final["weight"] * join_final["y_pred"] * join_final["total_houses"]
) / 100

# Multiply total_houses with weight
join_final["weight*houses"] = (join_final["weight"] * join_final["total_houses"]) / 100

join_final

Unnamed: 0,typhoon_name,grid_point_id,wind_speed,track_distance,total_houses,rainfall_max_6h,rainfall_max_24h,rwi,mean_slope,std_slope,...,water,total_pop,percent_houses_damaged_5years,percent_houses_damaged,y_pred,ADM3_PCODE,weight,weight*%damg*houses,weight*%predicted_damg*houses,weight*houses
0,KETSANA,9233,0.228997,0.250719,0.044070,0.044935,0.000753,0.462625,0.069010,0.090805,...,0.92,3893.053124,0.000000,0.0,0.025106,PH015514000,1.000000,0.0,1.106424e-05,0.000441
1,KETSANA,9234,0.236445,0.215226,0.071856,0.077553,0.005352,0.324779,0.084089,0.189581,...,0.30,13238.460497,0.000000,0.0,0.062428,PH015508000,0.362350,0.0,1.625420e-05,0.000260
2,KETSANA,9234,0.236445,0.215226,0.071856,0.077553,0.005352,0.324779,0.084089,0.189581,...,0.30,13238.460497,0.000000,0.0,0.062428,PH015514000,0.637650,0.0,2.860352e-05,0.000458
3,KETSANA,9235,0.239295,0.179733,0.098866,0.114486,0.006004,0.308338,0.139945,0.291882,...,0.13,21410.246051,0.000000,0.0,0.087257,PH015501000,0.221258,0.0,1.908737e-05,0.000219
4,KETSANA,9235,0.239295,0.179733,0.098866,0.114486,0.006004,0.308338,0.139945,0.291882,...,0.13,21410.246051,0.000000,0.0,0.087257,PH015508000,0.778742,0.0,6.718019e-05,0.000770
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18337,SAUDEL,18796,0.059568,0.907444,0.090530,0.066750,0.008603,0.548934,0.043329,0.115342,...,0.80,23128.451605,0.922223,0.0,0.000000,PH082606000,0.684724,0.0,0.000000e+00,0.000620
18338,SAUDEL,18797,0.053634,0.938190,0.076702,0.057364,0.000619,0.329734,0.066634,0.119561,...,0.90,361.762983,1.475799,0.0,0.008930,PH082622000,1.000000,0.0,6.849179e-06,0.000767
18339,SAUDEL,18962,0.062720,0.858897,0.053956,0.054691,0.001009,0.372093,0.106989,0.150028,...,0.97,2407.611398,1.310422,0.0,0.005853,PH082606000,0.167028,0.0,5.274526e-07,0.000090
18340,SAUDEL,18962,0.062720,0.858897,0.053956,0.054691,0.001009,0.372093,0.106989,0.150028,...,0.97,2407.611398,1.310422,0.0,0.005853,PH082617000,0.832972,0.0,2.630413e-06,0.000449


In [32]:
# Read CSV file which includes regoin name and code
region_df = pd.read_csv("data/adm3_area.csv", index_col=0)
region_df.head()

Unnamed: 0,Shape_Leng,Shape_Area,ADM3_EN,ADM3_PCODE,ADM3_REF,ADM3ALT1EN,ADM3ALT2EN,ADM2_EN,ADM2_PCODE,ADM1_EN,ADM1_PCODE,ADM0_EN,ADM0_PCODE,date,validOn,validTo,geometry,Area
0,1.601219,0.063496,Aborlan,PH175301000,,,,Palawan,PH175300000,Region IV-B,PH170000000,Philippines (the),PH,2016-06-30,2020-05-29,,MULTIPOLYGON (((13200654.48649568 1032355.1025...,771120600.0
1,1.078749,0.050232,Abra de Ilog,PH175101000,,,,Occidental Mindoro,PH175100000,Region IV-B,PH170000000,Philippines (the),PH,2016-06-30,2020-05-29,,POLYGON ((13423362.387871413 1479551.980005401...,601914600.0
2,0.424301,0.006453,Abucay,PH030801000,,,,Bataan,PH030800000,Region III,PH030000000,Philippines (the),PH,2016-06-30,2020-05-29,,POLYGON ((13413856.918075956 1614138.946940594...,76889030.0
3,0.566053,0.011343,Abulug,PH021501000,,,,Cagayan,PH021500000,Region II,PH020000000,Philippines (the),PH,2016-06-30,2020-05-29,,"POLYGON ((13518031.78157248 2007651.089252317,...",132668200.0
4,1.013649,0.026124,Abuyog,PH083701000,,,,Leyte,PH083700000,Region VIII,PH080000000,Philippines (the),PH,2016-06-30,2020-05-29,,MULTIPOLYGON (((13917924.3505296 1180265.08047...,316175200.0


In [33]:
# join regoin_code column to the main df(join_final) based on mun_code
join_region_df = join_final.merge(
    region_df[["ADM1_EN", "ADM1_PCODE", "ADM3_PCODE"]], on="ADM3_PCODE", how="left"
)
join_region_df

Unnamed: 0,typhoon_name,grid_point_id,wind_speed,track_distance,total_houses,rainfall_max_6h,rainfall_max_24h,rwi,mean_slope,std_slope,...,percent_houses_damaged_5years,percent_houses_damaged,y_pred,ADM3_PCODE,weight,weight*%damg*houses,weight*%predicted_damg*houses,weight*houses,ADM1_EN,ADM1_PCODE
0,KETSANA,9233,0.228997,0.250719,0.044070,0.044935,0.000753,0.462625,0.069010,0.090805,...,0.000000,0.0,0.025106,PH015514000,1.000000,0.0,1.106424e-05,0.000441,Region I,PH010000000
1,KETSANA,9234,0.236445,0.215226,0.071856,0.077553,0.005352,0.324779,0.084089,0.189581,...,0.000000,0.0,0.062428,PH015508000,0.362350,0.0,1.625420e-05,0.000260,Region I,PH010000000
2,KETSANA,9234,0.236445,0.215226,0.071856,0.077553,0.005352,0.324779,0.084089,0.189581,...,0.000000,0.0,0.062428,PH015514000,0.637650,0.0,2.860352e-05,0.000458,Region I,PH010000000
3,KETSANA,9235,0.239295,0.179733,0.098866,0.114486,0.006004,0.308338,0.139945,0.291882,...,0.000000,0.0,0.087257,PH015501000,0.221258,0.0,1.908737e-05,0.000219,Region I,PH010000000
4,KETSANA,9235,0.239295,0.179733,0.098866,0.114486,0.006004,0.308338,0.139945,0.291882,...,0.000000,0.0,0.087257,PH015508000,0.778742,0.0,6.718019e-05,0.000770,Region I,PH010000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18337,SAUDEL,18796,0.059568,0.907444,0.090530,0.066750,0.008603,0.548934,0.043329,0.115342,...,0.922223,0.0,0.000000,PH082606000,0.684724,0.0,0.000000e+00,0.000620,Region VIII,PH080000000
18338,SAUDEL,18797,0.053634,0.938190,0.076702,0.057364,0.000619,0.329734,0.066634,0.119561,...,1.475799,0.0,0.008930,PH082622000,1.000000,0.0,6.849179e-06,0.000767,Region VIII,PH080000000
18339,SAUDEL,18962,0.062720,0.858897,0.053956,0.054691,0.001009,0.372093,0.106989,0.150028,...,1.310422,0.0,0.005853,PH082606000,0.167028,0.0,5.274526e-07,0.000090,Region VIII,PH080000000
18340,SAUDEL,18962,0.062720,0.858897,0.053956,0.054691,0.001009,0.372093,0.106989,0.150028,...,1.310422,0.0,0.005853,PH082617000,0.832972,0.0,2.630413e-06,0.000449,Region VIII,PH080000000


In [34]:
# Groupby by municipality with sum as the aggregation function
agg_df = join_region_df.groupby(["ADM3_PCODE", "ADM1_PCODE", "typhoon_name"]).agg(
    {
        "weight*%damg*houses": "sum",
        "weight*%predicted_damg*houses": "sum",
        "weight": "sum",
        "weight*houses": "sum",
    }
)
agg_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,weight*%damg*houses,weight*%predicted_damg*houses,weight,weight*houses
ADM3_PCODE,ADM1_PCODE,typhoon_name,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
PH012801000,PH010000000,BOPHA,0.000000,0.000000,1.571093,0.000338
PH012801000,PH010000000,HAIMA,0.000664,0.004569,1.571093,0.002437
PH012801000,PH010000000,MANGKHUT,0.002289,0.015345,1.571093,0.003863
PH012801000,PH010000000,SAUDEL,0.000000,0.000000,1.571093,0.000650
PH012802000,PH010000000,BOPHA,0.000000,0.000000,0.724799,0.000044
...,...,...,...,...,...,...
PH175902000,PH170000000,KETSANA,0.000000,0.000388,4.000000,0.005455
PH175905000,PH170000000,KETSANA,0.000000,0.000329,1.000000,0.001521
PH175907000,PH170000000,BOPHA,0.000000,0.000000,0.340199,0.000043
PH175914000,PH170000000,BOPHA,0.000000,0.000004,2.551962,0.000365


In [35]:
# Normalize by the sum of the weights
agg_df["damg_houses_per_mun"] = agg_df["weight*%damg*houses"] / agg_df["weight"]
agg_df["predicted_damg_houses_per_mun"] = (
    agg_df["weight*%predicted_damg*houses"] / agg_df["weight"]
)

agg_df["sum_of_weight_mun"] = agg_df["weight*houses"] / agg_df["weight"]

agg_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,weight*%damg*houses,weight*%predicted_damg*houses,weight,weight*houses,damg_houses_per_mun,predicted_damg_houses_per_mun,sum_of_weight_mun
ADM3_PCODE,ADM1_PCODE,typhoon_name,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
PH012801000,PH010000000,BOPHA,0.0,0.0,1.571093,0.000338,0.0,0.0,0.000215
PH012801000,PH010000000,HAIMA,0.000664,0.004569,1.571093,0.002437,0.000423,0.002908,0.001551
PH012801000,PH010000000,MANGKHUT,0.002289,0.015345,1.571093,0.003863,0.001457,0.009767,0.002459
PH012801000,PH010000000,SAUDEL,0.0,0.0,1.571093,0.00065,0.0,0.0,0.000414
PH012802000,PH010000000,BOPHA,0.0,0.0,0.724799,4.4e-05,0.0,0.0,6.1e-05


In [36]:
# Keep only %damg_normalized and %pred_damg_normalized columns
agg_df.drop(agg_df.columns[:4], inplace=True, axis=1)
agg_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,damg_houses_per_mun,predicted_damg_houses_per_mun,sum_of_weight_mun
ADM3_PCODE,ADM1_PCODE,typhoon_name,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
PH012801000,PH010000000,BOPHA,0.000000,0.000000e+00,0.000215
PH012801000,PH010000000,HAIMA,0.000423,2.908288e-03,0.001551
PH012801000,PH010000000,MANGKHUT,0.001457,9.767222e-03,0.002459
PH012801000,PH010000000,SAUDEL,0.000000,0.000000e+00,0.000414
PH012802000,PH010000000,BOPHA,0.000000,0.000000e+00,0.000061
...,...,...,...,...,...
PH175902000,PH170000000,KETSANA,0.000000,9.707278e-05,0.001364
PH175905000,PH170000000,KETSANA,0.000000,3.286993e-04,0.001521
PH175907000,PH170000000,BOPHA,0.000000,0.000000e+00,0.000125
PH175914000,PH170000000,BOPHA,0.000000,1.674685e-06,0.000143


In [37]:
# Groupby by regin with sum as the aggregation function
agg_df_1 = agg_df.groupby(["ADM1_PCODE", "typhoon_name"]).agg(
    {
        "damg_houses_per_mun": "sum",
        "predicted_damg_houses_per_mun": "sum",
        "sum_of_weight_mun": "sum",
    }
)
agg_df_1.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,damg_houses_per_mun,predicted_damg_houses_per_mun,sum_of_weight_mun
ADM1_PCODE,typhoon_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
PH010000000,BOPHA,1e-06,0.000242,0.017613
PH010000000,HAIMA,0.132145,0.067877,0.194996
PH010000000,KETSANA,6e-05,0.006893,0.111819
PH010000000,MANGKHUT,0.127631,0.164637,0.301262
PH010000000,SAUDEL,0.0,0.005398,0.099009


In [38]:
# Rename columns' names
agg_df_1 = agg_df_1.rename(
    columns={
        "damg_houses_per_mun": "damg_houses_per_Region",
        "predicted_damg_houses_per_mun": "predicted_damg_houses_per_Region",
        "sum_of_weight_mun": "sum_of_weight_region",
    }
)

agg_df_1.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,damg_houses_per_Region,predicted_damg_houses_per_Region,sum_of_weight_region
ADM1_PCODE,typhoon_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
PH010000000,BOPHA,1e-06,0.000242,0.017613
PH010000000,HAIMA,0.132145,0.067877,0.194996
PH010000000,KETSANA,6e-05,0.006893,0.111819
PH010000000,MANGKHUT,0.127631,0.164637,0.301262
PH010000000,SAUDEL,0.0,0.005398,0.099009


In [39]:
agg_df_2 = agg_df_1.reset_index()
agg_df_2.head()

Unnamed: 0,ADM1_PCODE,typhoon_name,damg_houses_per_Region,predicted_damg_houses_per_Region,sum_of_weight_region
0,PH010000000,BOPHA,1e-06,0.000242,0.017613
1,PH010000000,HAIMA,0.132145,0.067877,0.194996
2,PH010000000,KETSANA,6e-05,0.006893,0.111819
3,PH010000000,MANGKHUT,0.127631,0.164637,0.301262
4,PH010000000,SAUDEL,0.0,0.005398,0.099009


In [40]:
# Estimate the percent difference of real and predicted damaged values  (First way)
agg_df_2["Percent_Difference_total_houses_based"] = (
    (agg_df_2["damg_houses_per_Region"] - agg_df_2["predicted_damg_houses_per_Region"])
    / (
        agg_df_2["sum_of_weight_region"]
    )  # (agg_df_2["damg_houses_per_Region"] + np.finfo(float).eps)
) * 100

In [41]:
# Estimate the percent difference of real and predicted damaged values (Second way)
difference = (
    agg_df_2["damg_houses_per_Region"] - agg_df_2["predicted_damg_houses_per_Region"]
)
ave = (
    agg_df_2["damg_houses_per_Region"] + agg_df_2["predicted_damg_houses_per_Region"]
) / 2

agg_df_2["Percent_Difference_average_based"] = (abs(difference) / ave) * 100
agg_df_2

Unnamed: 0,ADM1_PCODE,typhoon_name,damg_houses_per_Region,predicted_damg_houses_per_Region,sum_of_weight_region,Percent_Difference_total_houses_based,Percent_Difference_average_based
0,PH010000000,BOPHA,1.094611e-06,0.0002424821,0.017613,-1.370529,198.202436
1,PH010000000,HAIMA,0.132145,0.06787738,0.194996,32.958484,64.260414
2,PH010000000,KETSANA,5.969492e-05,0.006892515,0.111819,-6.110583,196.565413
3,PH010000000,MANGKHUT,0.1276309,0.1646369,0.301262,-12.283663,25.323341
4,PH010000000,SAUDEL,0.0,0.005397968,0.099009,-5.452019,200.0
5,PH020000000,BOPHA,0.0,0.00016255,0.017272,-0.941131,200.0
6,PH020000000,HAIMA,0.6779745,0.3108077,0.180373,203.559265,74.266467
7,PH020000000,KETSANA,0.0,0.005698073,0.107808,-5.285384,200.0
8,PH020000000,MANGKHUT,0.4488461,0.6855844,0.208631,-113.472285,41.736938
9,PH020000000,SAUDEL,0.0,0.01325783,0.102609,-12.92068,200.0


In [42]:
agg_df_2 = agg_df_2[
    [
        "ADM1_PCODE",
        "typhoon_name",
        "Percent_Difference_total_houses_based",
        "Percent_Difference_average_based",
    ]
]

In [43]:
df_sorted = agg_df_2.sort_values(by=["typhoon_name"], ascending=-True).reset_index(
    drop=True
)
df_sorted

Unnamed: 0,ADM1_PCODE,typhoon_name,Percent_Difference_total_houses_based,Percent_Difference_average_based
0,PH010000000,BOPHA,-1.370529,198.202436
1,PH120000000,BOPHA,-0.936847,186.30213
2,PH110000000,BOPHA,1006.103064,92.235226
3,PH100000000,BOPHA,-267.626501,165.767892
4,PH170000000,BOPHA,-63.221954,83.048895
5,PH020000000,BOPHA,-0.941131,200.0
6,PH090000000,BOPHA,-31.262838,199.893267
7,PH080000000,BOPHA,-1.126596,200.0
8,PH070000000,BOPHA,-28.098951,122.750504
9,PH160000000,BOPHA,586.35355,137.971322
