# Naive baseline
#### Converting from grid-based to municipality-based 

In [1]:
%load_ext jupyter_black

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import shap
import imblearn
import statsmodels.api as sm
import statistics


from math import sqrt
from collections import defaultdict
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import RobustScaler
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import mean_squared_error
from xgboost.sklearn import XGBRegressor
from sklearn.dummy import DummyRegressor
from xgboost import XGBClassifier
from sty import fg, rs

from sklearn.metrics import confusion_matrix
from matplotlib import cm
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler

from utils import get_training_dataset, weight_file

pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.


In [3]:
# Load the prepared dataset into a DataFrame using the project helper
# `get_training_dataset` (imported from `utils`), then display it.
df = get_training_dataset()
df

Unnamed: 0,typhoon_name,typhoon_year,grid_point_id,wind_speed,track_distance,rainfall_max_6h,rainfall_max_24h,total_houses,rwi,strong_roof_strong_wall,...,std_tri,mean_elev,coast_length,with_coast,urban,rural,water,total_pop,percent_houses_damaged,percent_houses_damaged_5years
0,DURIAN,2006,101,0.0,303.180555,0.122917,0.085417,31.000000,,22.580645,...,2.699781,5.762712,3445.709753,1,0.00,0.000000,1.000000,0.000000,0.0,0.000000
1,DURIAN,2006,4475,0.0,638.027502,0.091667,0.027083,3.301020,-0.527000,2.639401,...,4.585088,12.799127,8602.645832,1,0.00,0.000000,1.000000,0.000000,0.0,0.000000
2,DURIAN,2006,4639,0.0,603.631997,0.535417,0.146354,12.103741,-0.283000,2.639401,...,1.527495,8.833333,5084.012925,1,0.00,0.010000,0.990000,197.339034,0.0,0.000000
3,DURIAN,2006,4640,0.0,614.675270,0.356250,0.101562,645.899660,-0.358889,2.639401,...,11.677657,17.530431,55607.865950,1,0.00,0.310000,0.690000,4970.477311,0.0,0.000000
4,DURIAN,2006,4641,0.0,625.720905,0.202083,0.057812,1071.731293,-0.462800,2.639401,...,17.074011,31.931338,35529.342507,1,0.00,0.770000,0.230000,12408.594656,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
141253,MOLAVE,2020,20677,0.0,644.575831,2.543750,0.778646,4449.357133,0.508167,44.762048,...,18.012771,36.304688,21559.003490,1,0.08,0.080000,0.840000,17619.701390,0.0,0.000000
141254,MOLAVE,2020,20678,0.0,655.685233,2.558333,0.861458,1521.435795,-0.174100,44.762048,...,13.163042,65.687266,12591.742022,1,0.00,0.420000,0.580000,5623.069564,0.0,0.000000
141255,MOLAVE,2020,20679,0.0,666.794635,2.975000,0.949479,930.647069,-0.244286,25.078318,...,10.901755,37.414996,19740.596834,1,0.00,0.109091,0.890909,5912.671746,0.0,0.015207
141256,MOLAVE,2020,20680,0.0,677.904037,2.889583,1.083333,1800.666044,0.038000,16.796996,...,17.917650,105.812452,26363.303778,1,0.03,0.250000,0.720000,11254.164413,0.0,0.020806


In [4]:
# Set any values >100% to 100%.
# A boolean mask replaces the row-by-row loop: it is vectorized and does not
# depend on `df` having a default 0..n-1 index (label lookups with
# `range(len(df))` raise KeyError once rows have been filtered out).
df.loc[df["percent_houses_damaged"] > 100, "percent_houses_damaged"] = 100.0

In [5]:
# Fill NaNs in 'rwi' with the mean of the column.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selection `df["rwi"]`: with pandas copy-on-write
# the chained in-place call modifies a temporary object and leaves `df` unchanged.
df["rwi"] = df["rwi"].fillna(df["rwi"].mean())

In [6]:
# Read the grid-to-municipality weight CSV into a DataFrame using the project
# helper `weight_file` (imported from `utils`), then preview the first rows.
df_weight = weight_file("/ggl_grid_to_mun_weights.csv")
df_weight.head()

Unnamed: 0,ADM3_PCODE,id_x,Centroid,numbuildings_x,id,numbuildings,weight
0,PH012801000,11049.0,120.9E_18.5N,1052,11049,1794,0.586399
1,PH012810000,11049.0,120.9E_18.5N,0,11049,1794,0.0
2,PH012815000,11049.0,120.9E_18.5N,742,11049,1794,0.413601
3,PH012801000,11050.0,120.9E_18.4N,193,11050,196,0.984694
4,PH012810000,11050.0,120.9E_18.4N,0,11050,196,0.0


In [7]:
# Align the weights table with the input df: its 'id' column becomes
# 'grid_point_id', the key used later to join both tables.
df_weight = df_weight.rename(columns={"id": "grid_point_id"})
df_weight.head()

Unnamed: 0,ADM3_PCODE,id_x,Centroid,numbuildings_x,grid_point_id,numbuildings,weight
0,PH012801000,11049.0,120.9E_18.5N,1052,11049,1794,0.586399
1,PH012810000,11049.0,120.9E_18.5N,0,11049,1794,0.0
2,PH012815000,11049.0,120.9E_18.5N,742,11049,1794,0.413601
3,PH012801000,11050.0,120.9E_18.4N,193,11050,196,0.984694
4,PH012810000,11050.0,120.9E_18.4N,0,11050,196,0.0


### The following steps convert the grid-based model into a municipality-based one

In [8]:
# Keep only the rows whose wind_speed is different from zero
df = df.loc[df["wind_speed"].ne(0)]
# Modelling frame: the same rows without the grid id and the typhoon year
df_data = df.drop(columns=["grid_point_id", "typhoon_year"])

In [9]:
# Preview the filtered frame and the modelling frame, in that order
display(df.head(), df_data.head())

Unnamed: 0,typhoon_name,typhoon_year,grid_point_id,wind_speed,track_distance,rainfall_max_6h,rainfall_max_24h,total_houses,rwi,strong_roof_strong_wall,...,std_tri,mean_elev,coast_length,with_coast,urban,rural,water,total_pop,percent_houses_damaged,percent_houses_damaged_5years
138,DURIAN,2006,8284,12.460039,275.018491,0.670833,0.313021,0.479848,-0.213039,31.336503,...,34.62955,42.21875,5303.65949,1,0.0,0.0,1.0,0.0,0.0,0.0
139,DURIAN,2006,8286,11.428974,297.027578,0.929167,0.343229,55.649739,0.206,23.447758,...,25.475388,72.283154,61015.543599,1,0.0,0.14,0.86,276.871504,0.0,0.0
148,DURIAN,2006,8450,13.077471,262.598363,0.716667,0.424479,8.157414,-0.636,31.336503,...,54.353996,102.215198,66707.43807,1,0.0,0.11,0.89,448.539453,0.0,0.0
149,DURIAN,2006,8451,12.511864,273.63933,0.56875,0.336979,88.292015,-0.2275,31.336503,...,31.814048,58.988877,53841.050168,1,0.0,0.12,0.88,2101.708435,0.0,0.0
150,DURIAN,2006,8452,11.977511,284.680297,0.589583,0.290625,962.766739,-0.299667,23.546053,...,25.976413,111.386527,87378.257957,1,0.07,0.46,0.47,11632.726327,0.0,0.0


Unnamed: 0,typhoon_name,wind_speed,track_distance,rainfall_max_6h,rainfall_max_24h,total_houses,rwi,strong_roof_strong_wall,strong_roof_light_wall,strong_roof_salvage_wall,...,std_tri,mean_elev,coast_length,with_coast,urban,rural,water,total_pop,percent_houses_damaged,percent_houses_damaged_5years
138,DURIAN,12.460039,275.018491,0.670833,0.313021,0.479848,-0.213039,31.336503,29.117802,0.042261,...,34.62955,42.21875,5303.65949,1,0.0,0.0,1.0,0.0,0.0,0.0
139,DURIAN,11.428974,297.027578,0.929167,0.343229,55.649739,0.206,23.447758,23.591571,0.037516,...,25.475388,72.283154,61015.543599,1,0.0,0.14,0.86,276.871504,0.0,0.0
148,DURIAN,13.077471,262.598363,0.716667,0.424479,8.157414,-0.636,31.336503,29.117802,0.042261,...,54.353996,102.215198,66707.43807,1,0.0,0.11,0.89,448.539453,0.0,0.0
149,DURIAN,12.511864,273.63933,0.56875,0.336979,88.292015,-0.2275,31.336503,29.117802,0.042261,...,31.814048,58.988877,53841.050168,1,0.0,0.12,0.88,2101.708435,0.0,0.0
150,DURIAN,11.977511,284.680297,0.589583,0.290625,962.766739,-0.299667,23.546053,23.660429,0.037576,...,25.976413,111.386527,87378.257957,1,0.07,0.46,0.47,11632.726327,0.0,0.0


In [10]:
# Define bins
# Bin edges for `percent_houses_damaged`; each pair of consecutive edges
# delimits one damage bin (five bins in total).
bins2 = [0, 0.00009, 1, 10, 50, 101]
# np.histogram returns the number of samples per bin and the bin edges as an array.
samples_per_bin2, binsP2 = np.histogram(df_data["percent_houses_damaged"], bins=bins2)

In [11]:
# Assign every row to a bin: np.digitize returns, for each value x, the index i
# such that binsP2[i-1] <= x < binsP2[i].
bin_index2 = np.digitize(df_data["percent_houses_damaged"], bins=binsP2)

In [12]:
# The bin indices serve as stratification labels for train_test_split below.
y_input_strat = bin_index2

In [13]:
# Define X and y data
# The naive baseline uses no real features: X is an all-zero placeholder with
# one entry per row of df_data. Its length and index are taken from df_data
# instead of a hard-coded row count, so the cell keeps working when the
# dataset or the filtering above changes.
X = pd.Series(0, index=df_data.index)
y = df_data["percent_houses_damaged"]

In [14]:
# Define two dictionaries of lists to collect RMSE and Average Error per run.
# Keys are "all" (whole test set) or a bin number; each value is the list of
# results appended by the evaluation loop below.

RMSE = defaultdict(list)
AVE = defaultdict(list)

In [15]:
# Evaluate the naive (mean-predicting) baseline at municipality level.
# Each run: stratified train/test split -> fit DummyRegressor -> predict on the
# test grid cells -> join with the grid-to-municipality weights -> aggregate to
# (municipality, typhoon) -> compute RMSE and average error overall and per bin.
n_runs = 20

# Only these weighted quantities are needed after aggregation. Summing every
# column of the joined frame would also try to "sum" text columns.
weighted_cols = ["weight*y_pred*houses", "weight*y*houses", "weight*houses"]

for i in range(n_runs):
    # random_state=i gives a different split per run while keeping the whole
    # experiment reproducible across kernel restarts.
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        df_data["percent_houses_damaged"],
        stratify=y_input_strat,
        test_size=0.2,
        random_state=i,
    )

    # create a dummy regressor that always predicts the training mean
    dummy_reg = DummyRegressor(strategy="mean")

    # fit it on the training set
    dummy_reg.fit(X_train, y_train)

    # make predictions on the test set and keep them in the valid 0-100 range
    y_pred = dummy_reg.predict(X_test)
    y_pred_clipped = y_pred.clip(0, 100)

    # y_test carries the original row index, which is reused for the joins below
    pred_df = pd.DataFrame({"y_all": y_test, "y_pred_all": y_pred_clipped})

    # Join data with y_all and y_all_pred
    df_data_w_pred = pd.merge(pred_df, df_data, left_index=True, right_index=True)
    # Join data with grid_point_id typhoon_year
    df_data_w_pred_grid = pd.merge(
        df[["grid_point_id", "typhoon_year"]],
        df_data_w_pred,
        left_index=True,
        right_index=True,
    )

    # join with weights df
    join_df = df_data_w_pred_grid.merge(df_weight, on="grid_point_id", how="left")

    # Keep only rows that have a weight; copy so the column assignments below
    # act on an independent frame rather than on a slice
    join_df = join_df.loc[join_df["weight"].notna()].copy()

    # Multiply weight by y_all and y_pred_all
    join_df["weight*y_pred*houses"] = (
        join_df["y_pred_all"] * join_df["weight"] * join_df["total_houses"] / 100
    )
    join_df["weight*y*houses"] = (
        join_df["y_all"] * join_df["weight"] * join_df["total_houses"] / 100
    )
    join_df["weight*houses"] = join_df["weight"] * join_df["total_houses"]

    # Groupby by municipality and typhoon with sum as the aggregation function,
    # restricted to the weighted columns used for the normalization
    agg_df = join_df.groupby(["ADM3_PCODE", "typhoon_name", "typhoon_year"])[
        weighted_cols
    ].sum()

    # Normalize by the sum of the weights
    agg_df["y_pred_norm"] = (
        agg_df["weight*y_pred*houses"] / agg_df["weight*houses"] * 100
    )
    agg_df["y_norm"] = agg_df["weight*y*houses"] / agg_df["weight*houses"] * 100

    # Remove rows with NaN after normalization (groups whose weighted house count is zero)
    final_df = agg_df.dropna()

    # Calculate RMSE & Average Error in total for converted grid_based model to Mun_based
    rmse = sqrt(mean_squared_error(final_df["y_norm"], final_df["y_pred_norm"]))
    ave = (final_df["y_pred_norm"] - final_df["y_norm"]).sum() / len(final_df["y_norm"])

    print(f"RMSE for grid_based model: {rmse:.2f}")
    print(f"Average Error for grid_based model: {ave:.2f}")

    RMSE["all"].append(rmse)
    AVE["all"].append(ave)

    # Bin the municipality-level true values with the same edges as the grid-level data
    bin_index = np.digitize(final_df["y_norm"], bins=binsP2)

    # Bin k lies between edges k-1 and k, so bins run from 1 to len(edges) - 1
    for bin_num in range(1, len(binsP2)):

        # Select the rows of this bin once and reuse the selection
        in_bin = bin_index == bin_num
        y_norm_bin = final_df["y_norm"][in_bin]
        y_pred_norm_bin = final_df["y_pred_norm"][in_bin]

        mse_idx = mean_squared_error(y_norm_bin, y_pred_norm_bin)
        rmse = np.sqrt(mse_idx)

        ave = (y_pred_norm_bin - y_norm_bin).sum() / len(y_norm_bin)

        RMSE[bin_num].append(rmse)
        AVE[bin_num].append(ave)

RMSE for grid_based model: 4.39
Average Error for grid_based model: 0.15
RMSE for grid_based model: 4.73
Average Error for grid_based model: 0.10
RMSE for grid_based model: 4.55
Average Error for grid_based model: 0.12
RMSE for grid_based model: 4.54
Average Error for grid_based model: 0.12
RMSE for grid_based model: 4.57
Average Error for grid_based model: 0.11
RMSE for grid_based model: 4.60
Average Error for grid_based model: 0.13
RMSE for grid_based model: 4.72
Average Error for grid_based model: 0.09
RMSE for grid_based model: 4.57
Average Error for grid_based model: 0.13
RMSE for grid_based model: 4.73
Average Error for grid_based model: 0.11
RMSE for grid_based model: 4.62
Average Error for grid_based model: 0.11
RMSE for grid_based model: 4.46
Average Error for grid_based model: 0.15
RMSE for grid_based model: 4.97
Average Error for grid_based model: 0.09
RMSE for grid_based model: 4.38
Average Error for grid_based model: 0.13
RMSE for grid_based model: 4.48
Average Error for g

In [16]:
# Define a function to print the mean and standard deviation of RMSE and average error
def rmse_ave_mean(rmse, ave):
    """Print mean and sample standard deviation of two result series.

    Parameters
    ----------
    rmse : sequence of numbers
        RMSE values, one per run.
    ave : sequence of numbers
        Average error values, one per run.

    Returns
    -------
    None
        The four statistics are printed with two decimals.

    Raises
    ------
    statistics.StatisticsError
        If either sequence is too short for ``statistics.mean`` or
        ``statistics.stdev``.
    """
    # Compute every statistic before printing anything
    rmse_mean, rmse_stdev = statistics.mean(rmse), statistics.stdev(rmse)
    ave_mean, ave_stdev = statistics.mean(ave), statistics.stdev(ave)

    report = [
        f"mean_RMSE: {rmse_mean:.2f}",
        f"stdev_RMSE: {rmse_stdev:.2f}",
        f"mean_average_error: {ave_mean:.2f}",
        f"stdev_average_error: {ave_stdev:.2f}",
    ]
    for line in report:
        print(line)

In [17]:
# Summarize the runs over the whole test set: the "all" entries hold one
# RMSE / average error value per run of the evaluation loop.
print("RMSE and Average Error in total", "\n")
rmse_ave_mean(RMSE["all"], AVE["all"])

RMSE and Average Error in total 

mean_RMSE: 4.64
stdev_RMSE: 0.21
mean_average_error: 0.11
stdev_average_error: 0.03


In [18]:
# Summarize the runs separately for every damage bin.
# Bin numbers are derived from the bin edges (bin k lies between edges k-1 and
# k) instead of being hard-coded, so the report follows the bin definition.
for bin_num in range(1, len(binsP2)):

    print(f"\n RMSE and Average Error per bin {bin_num}\n")
    rmse_ave_mean(RMSE[bin_num], AVE[bin_num])


 RMSE and Average Error per bin 1

mean_RMSE: 0.83
stdev_RMSE: 0.00
mean_average_error: 0.83
stdev_average_error: 0.00

 RMSE and Average Error per bin 2

mean_RMSE: 0.72
stdev_RMSE: 0.00
mean_average_error: 0.69
stdev_average_error: 0.00

 RMSE and Average Error per bin 3

mean_RMSE: 3.69
stdev_RMSE: 0.11
mean_average_error: -2.79
stdev_average_error: 0.09

 RMSE and Average Error per bin 4

mean_RMSE: 25.42
stdev_RMSE: 0.95
mean_average_error: -22.68
stdev_average_error: 0.80

 RMSE and Average Error per bin 5

mean_RMSE: 62.41
stdev_RMSE: 3.37
mean_average_error: -61.26
stdev_average_error: 3.02
