# Naive baseline
#### municipality-based dataset

In [1]:
%load_ext jupyter_black

In [2]:
import matplotlib.pyplot as plt
import statsmodels.api as sm
import xgboost as xgb
import pandas as pd
import numpy as np
import statistics
import os

from math import sqrt
from collections import defaultdict
from sklearn import preprocessing
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

  from pandas import MultiIndex, Int64Index


In [3]:
# Load the merged dataset from CSV into a dataframe.
# NOTE(review): the "Unnamed: 0" column in the output suggests the file was
# saved with its index; only the "y_norm" column is used further down.
df = pd.read_csv("data/df_merged_2.csv")
# Bare expression so the notebook renders a (truncated) preview of the frame
df

Unnamed: 0,Mun_Code,typhoon,HAZ_rainfall_Total,HAZ_rainfall_max_6h,HAZ_rainfall_max_24h,HAZ_v_max,HAZ_dis_track_min,GEN_landslide_per,GEN_stormsurge_per,GEN_Bu_p_inSSA,...,VUL_LightRoof_LightWall,VUL_LightRoof_SalvageWall,VUL_SalvagedRoof_StrongWall,VUL_SalvagedRoof_LightWall,VUL_SalvagedRoof_SalvageWall,VUL_vulnerable_groups,VUL_pantawid_pamilya_beneficiary,DAM_perc_dmg,HAZ_v_max_3,y_norm
0,PH175101000,DURIAN2006,185.828571,14.716071,7.381696,55.032241,2.478142,2.64,6.18,6.18,...,41.892832,1.002088,0.000000,0.027836,0.083507,2.951511,46.931106,3.632568,166667.757548,3.34975
1,PH083701000,DURIAN2006,8.818750,0.455208,0.255319,8.728380,288.358553,0.06,0.00,0.00,...,13.645253,0.549120,0.030089,0.090266,0.112833,3.338873,25.989168,0.000000,664.968323,0.00000
2,PH015501000,DURIAN2006,24.175000,2.408333,0.957639,10.945624,274.953818,1.52,1.28,1.28,...,15.592295,0.075838,0.000000,0.015168,0.075838,2.131755,32.185651,0.000000,1311.358762,0.00000
3,PH015502000,DURIAN2006,14.930000,1.650000,0.586250,12.108701,252.828578,0.00,0.00,0.00,...,7.100454,0.023280,0.011640,0.000000,0.128041,1.589369,29.612385,0.000000,1775.385328,0.00000
4,PH175302000,DURIAN2006,13.550000,1.054167,0.528125,10.660943,258.194381,5.52,0.36,0.36,...,30.354796,0.000000,0.000000,0.032852,0.000000,1.387007,35.052562,0.000000,1211.676901,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8068,PH084823000,NOUL2015,9.700000,0.408333,0.216146,8.136932,277.107823,1.80,6.25,6.25,...,32.492212,0.311526,0.031153,0.155763,0.031153,2.827833,31.308411,0.000000,538.743551,0.00000
8069,PH015547000,NOUL2015,17.587500,1.414583,0.386458,9.818999,305.789817,0.00,0.00,0.00,...,4.703833,0.027875,0.000000,0.034843,0.097561,1.073268,12.766551,0.000000,946.676507,0.00000
8070,PH025014000,NOUL2015,11.487500,0.614583,0.230319,15.791907,210.313249,0.06,0.09,0.09,...,3.063753,0.022528,0.000000,0.067583,0.022528,1.140109,9.348952,0.000000,3938.254316,0.00000
8071,PH140127000,NOUL2015,11.600000,1.400000,0.412766,13.867145,218.189328,0.00,0.00,0.00,...,3.119093,0.000000,0.000000,0.000000,0.000000,2.837537,21.928166,0.000000,2666.620370,0.00000


In [4]:
# Bin edges for the target "y_norm" and the number of samples per bin.
# Nothing is plotted here: the edges (binsP2) are reused below to build the
# stratification labels and to report the errors per bin.
bins2 = [0, 0.00009, 1, 10, 50, 101]
samples_per_bin2, binsP2 = np.histogram(df["y_norm"], bins=bins2)

In [5]:
# Show the per-bin sample counts followed by the bin edges, one per line
print(samples_per_bin2, binsP2, sep="\n")

[3606 2874 1141  388   64]
[0.00e+00 9.00e-05 1.00e+00 1.00e+01 5.00e+01 1.01e+02]


In [6]:
# Assign every sample the 1-based index of the y_norm bin it falls into
bin_index2 = np.digitize(df["y_norm"], bins=binsP2)

In [7]:
# Per-sample bin labels, passed as `stratify` to train_test_split below
y_input_strat = bin_index2

In [8]:
# Split X and y from dataframe features

# The naive baseline ignores the features, so X is only a constant placeholder
# with one entry per row of df. Its length is taken from df instead of a
# hard-coded row count so it stays consistent if the CSV changes.
X = pd.Series([0] * len(df))
y = df["y_norm"]

In [9]:
# Define two dicts of lists to save RMSE and Average Error for every run.
# Keys are "all" (whole test set) and the bin numbers; each value is the list
# of per-run results appended by the evaluation loop.

RMSE = defaultdict(list)
AVE = defaultdict(list)

In [11]:
# Repeat the stratified split + evaluation 20 times. Each repetition is seeded
# with its own index so the 20 splits differ from each other but the reported
# numbers are reproducible across "Restart & Run All".
for i in range(20):
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        stratify=y_input_strat,
        test_size=0.2,
        random_state=i,
    )

    # Naive baseline: a dummy regressor that always predicts the training mean
    dummy_reg = DummyRegressor(strategy="mean")

    # fit it on the training set
    dummy_reg.fit(X_train, y_train)

    # make predictions on the test set and clip them to the [0, 100] range
    y_pred = dummy_reg.predict(X_test)
    y_pred_clipped = y_pred.clip(0, 100)

    # RMSE & Average Error (mean of prediction - truth) over the whole test set
    rmse = sqrt(mean_squared_error(y_test, y_pred_clipped))
    ave = (y_pred_clipped - y_test).mean()

    print(f"RMSE for mun_based model: {rmse:.2f}")
    print(f"Average Error for mun_based model: {ave:.2f}")

    RMSE["all"].append(rmse)
    AVE["all"].append(ave)

    # Same two metrics restricted to the test samples of each y_norm bin
    bin_index = np.digitize(y_test, bins=binsP2)

    # Bin numbers are 1-based; there is one bin per pair of consecutive edges
    for bin_num in range(1, len(binsP2)):
        in_bin = bin_index == bin_num
        y_test_bin = y_test[in_bin]
        y_pred_bin = y_pred_clipped[in_bin]

        # Separate names so the whole-test-set rmse/ave are not overwritten
        rmse_bin = np.sqrt(mean_squared_error(y_test_bin, y_pred_bin))
        ave_bin = (y_pred_bin - y_test_bin).mean()

        RMSE[bin_num].append(rmse_bin)
        AVE[bin_num].append(ave_bin)

RMSE for mun_based model: 8.30
Average Error for mun_based model: -0.12
RMSE for mun_based model: 7.48
Average Error for mun_based model: 0.14
RMSE for mun_based model: 7.96
Average Error for mun_based model: -0.08
RMSE for mun_based model: 7.64
Average Error for mun_based model: 0.01
RMSE for mun_based model: 7.90
Average Error for mun_based model: -0.01
RMSE for mun_based model: 8.38
Average Error for mun_based model: -0.05
RMSE for mun_based model: 8.51
Average Error for mun_based model: -0.09
RMSE for mun_based model: 7.91
Average Error for mun_based model: -0.08
RMSE for mun_based model: 7.86
Average Error for mun_based model: 0.04
RMSE for mun_based model: 8.01
Average Error for mun_based model: -0.03
RMSE for mun_based model: 8.18
Average Error for mun_based model: -0.08
RMSE for mun_based model: 8.24
Average Error for mun_based model: -0.05
RMSE for mun_based model: 7.85
Average Error for mun_based model: -0.00
RMSE for mun_based model: 8.15
Average Error for mun_based model: -

In [12]:
# Define a function to summarise (print) the RMSEs and average errors
def rmse_ave_mean(rmse, ave):
    """Print the mean and sample standard deviation of two metric series.

    Args:
        rmse: sequence of RMSE values, one per run (at least two values,
            as required by ``statistics.stdev``).
        ave: sequence of average-error values, one per run (same constraint).

    Returns:
        None. Four lines are written to stdout, each value formatted with
        two decimals.
    """
    # All four statistics are evaluated before anything is printed
    report = [
        f"mean_RMSE: {statistics.mean(rmse):.2f}",
        f"stdev_RMSE: {statistics.stdev(rmse):.2f}",
        f"mean_average_error: {statistics.mean(ave):.2f}",
        f"stdev_average_error: {statistics.stdev(ave):.2f}",
    ]
    print("\n".join(report))

In [13]:
# Across-run mean and standard deviation of the metrics stored under the "all" key
print("RMSE and Average Error in total", "\n")
rmse_ave_mean(RMSE["all"], AVE["all"])

RMSE and Average Error in total 

mean_RMSE: 8.03
stdev_RMSE: 0.24
mean_average_error: -0.03
stdev_average_error: 0.07


In [14]:
# Across-run mean/stdev of RMSE and average error for every y_norm bin.
# Bin numbers are 1-based and there is one bin per pair of consecutive edges
# in binsP2, so the range is derived from the edges instead of hard-coded.
for bin_num in range(1, len(binsP2)):

    print(f"\n RMSE and Average Error per bin {bin_num}\n")
    rmse_ave_mean(RMSE[bin_num], AVE[bin_num])


 RMSE and Average Error per bin 1

mean_RMSE: 2.22
stdev_RMSE: 0.01
mean_average_error: 2.22
stdev_average_error: 0.01

 RMSE and Average Error per bin 2

mean_RMSE: 2.03
stdev_RMSE: 0.02
mean_average_error: 2.01
stdev_average_error: 0.02

 RMSE and Average Error per bin 3

mean_RMSE: 2.82
stdev_RMSE: 0.16
mean_average_error: -1.42
stdev_average_error: 0.16

 RMSE and Average Error per bin 4

mean_RMSE: 24.81
stdev_RMSE: 1.15
mean_average_error: -21.77
stdev_average_error: 1.18

 RMSE and Average Error per bin 5

mean_RMSE: 60.88
stdev_RMSE: 3.39
mean_average_error: -59.97
stdev_average_error: 3.03
