# Combined  Model (XGBoost Undersampling + XGBoost Regression)


In [1]:
%load_ext jupyter_black

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import shap
import imblearn
import statsmodels.api as sm

from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import RobustScaler
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import mean_squared_error
from xgboost.sklearn import XGBRegressor
from sklearn.dummy import DummyRegressor
from xgboost import XGBClassifier

# from mlxtend.plotting import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
from matplotlib import cm

# from mlxtend.plotting import plot_confusion_matrix
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler

from utils import get_training_dataset

pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.


In [3]:
# Read csv file and import to df
df = get_training_dataset()
df.head()

Unnamed: 0,typhoon_name,typhoon_year,grid_point_id,wind_speed,track_distance,rainfall_max_6h,rainfall_max_24h,total_houses,rwi,mean_slope,...,std_tri,mean_elev,coast_length,with_coast,urban,rural,water,total_pop,percent_houses_damaged,percent_houses_damaged_5years
0,DURIAN,2006,101,0.0,303.180555,0.122917,0.085417,31.0,,1.018526,...,2.699781,5.762712,3445.709753,1,0.0,0.0,1.0,0.0,0.0,0.0
1,DURIAN,2006,4475,0.0,638.027502,0.091667,0.027083,3.30102,-0.527,1.5794,...,4.585088,12.799127,8602.645832,1,0.0,0.0,1.0,0.0,0.0,0.0
2,DURIAN,2006,4639,0.0,603.631997,0.535417,0.146354,12.103741,-0.283,0.551764,...,1.527495,8.833333,5084.012925,1,0.0,0.01,0.99,197.339034,0.0,0.0
3,DURIAN,2006,4640,0.0,614.67527,0.35625,0.101562,645.89966,-0.358889,2.107949,...,11.677657,17.530431,55607.86595,1,0.0,0.31,0.69,4970.477311,0.0,0.0
4,DURIAN,2006,4641,0.0,625.720905,0.202083,0.057812,1071.731293,-0.4628,3.538881,...,17.074011,31.931338,35529.342507,1,0.0,0.77,0.23,12408.594656,0.0,0.0


In [4]:
# Fill NaNs with average estimated value of 'rwi'
df["rwi"].fillna(df["rwi"].mean(), inplace=True)

# Set any values >100% to 100%,
for i in range(len(df)):
    if df.loc[i, "percent_houses_damaged"] > 100:
        df.at[i, "percent_houses_damaged"] = float(100)

In [5]:
# Remove zeros from wind_speed
df = (df[(df[["wind_speed"]] != 0).any(axis=1)]).reset_index(drop=True)
df = df.drop(columns=["grid_point_id", "typhoon_year"])
df.head()

Unnamed: 0,typhoon_name,wind_speed,track_distance,rainfall_max_6h,rainfall_max_24h,total_houses,rwi,mean_slope,std_slope,mean_tri,std_tri,mean_elev,coast_length,with_coast,urban,rural,water,total_pop,percent_houses_damaged,percent_houses_damaged_5years
0,DURIAN,12.460039,275.018491,0.670833,0.313021,0.479848,-0.213039,12.896581,7.450346,74.625539,34.62955,42.21875,5303.65949,1,0.0,0.0,1.0,0.0,0.0,0.0
1,DURIAN,11.428974,297.027578,0.929167,0.343229,55.649739,0.206,14.070741,6.514647,68.681417,25.475388,72.283154,61015.543599,1,0.0,0.14,0.86,276.871504,0.0,0.0
2,DURIAN,13.077471,262.598363,0.716667,0.424479,8.157414,-0.636,19.758682,10.9407,104.453163,54.353996,102.215198,66707.43807,1,0.0,0.11,0.89,448.539453,0.0,0.0
3,DURIAN,12.511864,273.63933,0.56875,0.336979,88.292015,-0.2275,11.499097,6.901584,59.798108,31.814048,58.988877,53841.050168,1,0.0,0.12,0.88,2101.708435,0.0,0.0
4,DURIAN,11.977511,284.680297,0.589583,0.290625,962.766739,-0.299667,13.866633,6.528689,65.65528,25.976413,111.386527,87378.257957,1,0.07,0.46,0.47,11632.726327,0.0,0.0


In [6]:
# Define bins for data stratification
bins2 = [0, 0.00009, 1, 10, 50, 101]
bins_eval = [0, 1, 10, 20, 50, 101]
samples_per_bin2, binsP2 = np.histogram(df["percent_houses_damaged"], bins=bins2)

In [7]:
# Check the bins' intervalls (first bin means all zeros, second bin means 0 < values <= 1)
df["percent_houses_damaged"].value_counts(bins=binsP2)

(-0.001, 9e-05]    38901
(9e-05, 1.0]        7232
(1.0, 10.0]         2552
(10.0, 50.0]         925
(50.0, 101.0]        144
Name: percent_houses_damaged, dtype: int64

In [8]:
print(samples_per_bin2)
print(binsP2)

[38901  7232  2552   925   144]
[0.00e+00 9.00e-05 1.00e+00 1.00e+01 5.00e+01 1.01e+02]


In [9]:
bin_index2 = np.digitize(df["percent_houses_damaged"], bins=binsP2)

In [10]:
y_input_strat = bin_index2

In [11]:
features = [
    "wind_speed",
    "track_distance",
    "total_houses",
    "rainfall_max_6h",
    "rainfall_max_24h",
    "rwi",
    "mean_slope",
    "std_slope",
    "mean_tri",
    "std_tri",
    "mean_elev",
    "coast_length",
    "with_coast",
    "urban",
    "rural",
    "water",
    "total_pop",
    "percent_houses_damaged_5years",
]

# Split X and y from dataframe features
X = df[features]
display(X.columns)
y = df["percent_houses_damaged"]

Index(['wind_speed', 'track_distance', 'total_houses', 'rainfall_max_6h',
       'rainfall_max_24h', 'rwi', 'mean_slope', 'std_slope', 'mean_tri',
       'std_tri', 'mean_elev', 'coast_length', 'with_coast', 'urban', 'rural',
       'water', 'total_pop', 'percent_houses_damaged_5years'],
      dtype='object')

In [12]:
# Define train and test data
X_train, X_test, y_train, y_test = train_test_split(
    X,
    df["percent_houses_damaged"],
    test_size=0.2,
    stratify=y_input_strat,
)

## First step is to train XGBoost Regression model for train data

In [13]:
# XGBoost Reduced Overfitting
xgb = XGBRegressor(
    base_score=0.5,
    booster="gbtree",
    colsample_bylevel=0.8,
    colsample_bynode=0.8,
    colsample_bytree=0.8,
    gamma=3,
    eta=0.01,
    importance_type="gain",
    learning_rate=0.1,
    max_delta_step=0,
    max_depth=4,
    min_child_weight=1,
    missing=1,
    n_estimators=100,
    early_stopping_rounds=10,
    n_jobs=1,
    nthread=None,
    objective="reg:squarederror",
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    seed=None,
    silent=None,
    subsample=0.8,
    verbosity=1,
    eval_metric=["rmse", "logloss"],
    random_state=0,
)

eval_set = [(X_test, y_test)]
xgb_model = xgb.fit(X_train, y_train, eval_set=eval_set, verbose=False)

pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.


Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




In [14]:
# Make prediction on train and test data
y_pred_train = xgb.predict(X_train)
y_pred = xgb.predict(X_test)

In [15]:
# Calculate RMSE in total

mse_train_idx = mean_squared_error(y_train, y_pred_train)
rmse_train = np.sqrt(mse_train_idx)

mse_idx = mean_squared_error(y_test, y_pred)
rmseM1 = np.sqrt(mse_idx)

print(f"RMSE_test_in_total: {rmseM1:.2f}")
print(f"RMSE_train_in_total: {rmse_train:.2f}")

RMSE_test_in_total: 3.34
RMSE_train_in_total: 2.55


In [16]:
# Calculate RMSE per bins

bin_index_test = np.digitize(y_test, bins=bins_eval)
bin_index_train = np.digitize(y_train, bins=bins_eval)

RSME_test_model1 = np.zeros(len(bins_eval) - 1)

for bin_num in range(1, len(bins_eval)):

    # Estimation of RMSE for train data
    mse_train_idx = mean_squared_error(
        y_train[bin_index_train == bin_num], y_pred_train[bin_index_train == bin_num]
    )
    rmse_train = np.sqrt(mse_train_idx)

    # Estimation of RMSE for test data
    mse_idx = mean_squared_error(
        y_test[bin_index_test == bin_num], y_pred[bin_index_test == bin_num]
    )
    RSME_test_model1[bin_num - 1] = np.sqrt(mse_idx)

    print(
        f"RMSE_test  [{bins_eval[bin_num-1]:.0f},{bins_eval[bin_num]:.0f}): {RSME_test_model1[bin_num-1]:.2f}"
    )
    print(
        f"RMSE_train [{bins_eval[bin_num-1]:.0f},{bins_eval[bin_num]:.0f}): {rmse_train:.2f}"
    )

RMSE_test  [0,1): 0.90
RMSE_train [0,1): 0.94
RMSE_test  [1,10): 5.20
RMSE_train [1,10): 3.75
RMSE_test  [10,20): 10.72
RMSE_train [10,20): 8.87
RMSE_test  [20,50): 20.22
RMSE_train [20,50): 15.46
RMSE_test  [50,101): 37.61
RMSE_train [50,101): 26.77


## Second step is to train XGBoost Binary model for same train data

In [17]:
# Define a threshold to separate target into damaged and not_damaged
thres = 10.0
y_test_bool = y_test >= thres
y_train_bool = y_train >= thres
y_test_bin = (y_test_bool) * 1
y_train_bin = (y_train_bool) * 1

In [18]:
sum(y_train_bin)

855

In [19]:
print(Counter(y_train_bin))

Counter({0: 38948, 1: 855})


In [20]:
# Undersampling

# Define undersampling strategy
under = RandomUnderSampler(sampling_strategy=0.1)
# Fit and apply the transform
X_train_us, y_train_us = under.fit_resample(X_train, y_train_bin)

print(Counter(y_train_us))

Counter({0: 8550, 1: 855})


In [21]:
# Use XGBClassifier as a Machine Learning model to fit the data
xgb_model = XGBClassifier(eval_metric=["error", "logloss"])

# eval_set = [(X_train, y_train), (X_train, y_train)]
eval_set = [(X_test, y_test_bin)]
xgb_model.fit(
    X_train_us,
    y_train_us,
    eval_set=eval_set,
    verbose=False,
)

pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.


In [22]:
# Make prediction on test data
y_pred_test = xgb_model.predict(X_test)

In [23]:
# Print Confusion Matrix
cm = confusion_matrix(y_test_bin, y_pred_test)
cm

array([[9582,  155],
       [  58,  156]])

In [24]:
# Classification Report
print(metrics.classification_report(y_test_bin, y_pred_test))
print(metrics.confusion_matrix(y_test_bin, y_pred_test))

              precision    recall  f1-score   support

           0       0.99      0.98      0.99      9737
           1       0.50      0.73      0.59       214

    accuracy                           0.98      9951
   macro avg       0.75      0.86      0.79      9951
weighted avg       0.98      0.98      0.98      9951

[[9582  155]
 [  58  156]]


In [25]:
# Make prediction on train data
y_pred_train = xgb_model.predict(X_train)

In [26]:
# Print Confusion Matrix
cm = confusion_matrix(y_train_bin, y_pred_train)
cm

array([[38448,   500],
       [    0,   855]])

In [27]:
# Classification Report
print(metrics.classification_report(y_train_bin, y_pred_train))
print(metrics.confusion_matrix(y_train_bin, y_pred_train))

              precision    recall  f1-score   support

           0       1.00      0.99      0.99     38948
           1       0.63      1.00      0.77       855

    accuracy                           0.99     39803
   macro avg       0.82      0.99      0.88     39803
weighted avg       0.99      0.99      0.99     39803

[[38448   500]
 [    0   855]]


In [28]:
reduced_df = X_train.copy()

In [29]:
reduced_df["percent_houses_damaged"] = y_train.values
reduced_df["predicted_value"] = y_pred_train

In [30]:
fliterd_df = reduced_df[reduced_df.predicted_value == 1]

In [31]:
fliterd_df

Unnamed: 0,wind_speed,track_distance,total_houses,rainfall_max_6h,rainfall_max_24h,rwi,mean_slope,std_slope,mean_tri,std_tri,mean_elev,coast_length,with_coast,urban,rural,water,total_pop,percent_houses_damaged_5years,percent_houses_damaged,predicted_value
2424,43.404880,39.300549,136.509920,14.543750,5.632813,-0.213039,11.587330,7.742965,55.924508,33.619439,61.583113,3215.455186,1,0.000000,0.020000,0.980000,445.287454,0.000000,0.624047,1
11808,51.981443,51.273522,11.825838,6.583333,3.422917,-0.213039,5.119779,3.506823,30.751538,15.735356,17.721088,9890.217046,1,0.000000,0.030000,0.970000,335.178579,0.000000,43.801797,1
9800,52.017777,26.073507,1184.016735,11.856250,5.596354,-0.529111,17.338755,9.759110,84.438034,48.084050,652.712178,0.000000,0,0.000000,1.000000,0.000000,5673.509570,0.000000,22.832118,1
18542,54.427176,28.922879,2739.821502,13.029167,5.319271,-0.510267,5.498124,4.433683,28.857387,19.775172,137.849501,1320.235327,1,0.090000,0.910000,0.000000,16112.793584,0.002058,10.630033,1
30741,35.033509,19.887724,155.098442,24.895833,13.113021,-0.213039,1.317428,0.748526,7.141522,3.153878,8.732759,2265.541300,1,0.000000,0.010000,0.990000,600.897850,1.603432,0.000000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9720,60.120015,14.675643,10691.046869,15.212500,6.747396,-0.228625,3.111621,4.252950,16.707319,19.176553,128.296291,0.000000,0,0.350000,0.650000,0.000000,74487.422879,0.000000,58.276689,1
31364,50.119185,24.116342,30377.746762,19.164583,9.050521,0.346333,2.180806,2.734827,11.950723,12.408409,44.045489,0.000000,0,0.450000,0.550000,0.000000,183264.795053,0.000080,18.290481,1
1023,48.625808,44.288739,5257.178894,14.029167,6.522396,-0.444120,5.478355,3.424823,28.970639,13.638941,131.080527,0.000000,0,0.150000,0.850000,0.000000,24350.754488,0.000000,22.472332,1
31365,49.942580,34.602375,2631.416409,17.995833,8.657813,-0.282789,2.510584,2.320038,13.625045,10.835400,67.762516,0.000000,0,0.040000,0.960000,0.000000,11430.877979,0.011154,20.425508,1


### Third step is to train XGBoost regression model for this reduced train data (including damg>10.0%)

In [32]:
# Define bins for data stratification in regression model
bins2 = [0, 1, 10, 20, 50, 101]
samples_per_bin2, binsP2 = np.histogram(
    fliterd_df["percent_houses_damaged"], bins=bins2
)

print(samples_per_bin2)
print(binsP2)

[204 296 389 351 115]
[  0   1  10  20  50 101]


In [33]:
bin_index2 = np.digitize(fliterd_df["percent_houses_damaged"], bins=binsP2)

In [34]:
y_input_strat = bin_index2

In [35]:
# Split X and y from dataframe features
X_r = fliterd_df[features]
display(X.columns)
y_r = fliterd_df["percent_houses_damaged"]

Index(['wind_speed', 'track_distance', 'total_houses', 'rainfall_max_6h',
       'rainfall_max_24h', 'rwi', 'mean_slope', 'std_slope', 'mean_tri',
       'std_tri', 'mean_elev', 'coast_length', 'with_coast', 'urban', 'rural',
       'water', 'total_pop', 'percent_houses_damaged_5years'],
      dtype='object')

In [36]:
# XGBoost Reduced Overfitting
xgbR = XGBRegressor(
    base_score=0.5,
    booster="gbtree",
    colsample_bylevel=0.8,
    colsample_bynode=0.8,
    colsample_bytree=0.8,
    gamma=3,
    eta=0.01,
    importance_type="gain",
    learning_rate=0.1,
    max_delta_step=0,
    max_depth=4,
    min_child_weight=1,
    missing=1,
    n_estimators=100,
    early_stopping_rounds=10,
    n_jobs=1,
    nthread=None,
    objective="reg:squarederror",
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    seed=None,
    silent=None,
    subsample=0.8,
    verbosity=1,
    eval_metric=["rmse", "logloss"],
    random_state=0,
)

eval_set = [(X_r, y_r)]
xgbR_model = xgbR.fit(X_r, y_r, eval_set=eval_set, verbose=False)

Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.


In [37]:
# Make prediction on train and global test data
y_pred_r = xgbR.predict(X_r)
y_pred_test_total = xgbR.predict(X_test)

In [38]:
# Calculate RMSE in total

mse_train_idxR = mean_squared_error(y_r, y_pred_r)
rmse_trainR = np.sqrt(mse_train_idxR)


mse_idxR = mean_squared_error(y_test, y_pred_test_total)
rmseR = np.sqrt(mse_idxR)

print(f"RMSE_test_in_total MR: {rmseR:.2f}")
print(f"RMSE_test_in_total M1: {rmseM1:.2f}")
print(f"RMSE_train_in_reduced: {rmse_trainR:.2f}")

RMSE_test_in_total MR: 14.93
RMSE_test_in_total M1: 3.34
RMSE_train_in_reduced: 9.67


In [39]:
# Calculate RMSE per bins
bin_index_r = np.digitize(y_r, bins=bins_eval)

RSME_test_model1R = np.zeros(len(bins_eval) - 1)
for bin_num in range(1, len(bins_eval)):

    # Estimation of RMSE for train data
    mse_train_idxR = mean_squared_error(
        y_r[bin_index_r == bin_num], y_pred_r[bin_index_r == bin_num]
    )
    rmse_trainR = np.sqrt(mse_train_idxR)

    # Estimation of RMSE for test data
    mse_idxR = mean_squared_error(
        y_test[bin_index_test == bin_num], y_pred_test_total[bin_index_test == bin_num]
    )
    RSME_test_model1R[bin_num - 1] = np.sqrt(mse_idxR)

    # print(f"RMSE_test: {rmse:.2f}")
    print(
        f"RMSE_train_reduced [{bins_eval[bin_num-1]:.0f},{bins_eval[bin_num]:.0f}): {rmse_trainR:.2f}"
    )
    print(
        f"RMSE_test_total_MR [{bins_eval[bin_num-1]:.0f},{bins_eval[bin_num]:.0f}): {RSME_test_model1R[bin_num-1]:.2f}"
    )
    print(
        f"RMSE_test_total_M1 [{bins_eval[bin_num-1]:.0f},{bins_eval[bin_num]:.0f}): {RSME_test_model1[bin_num-1]:.2f}"
    )
    RSME_test_model1
    # print(f"RMSE_train: {rmse_train:.2f}")

RMSE_train_reduced [0,1): 10.06
RMSE_test_total_MR [0,1): 15.04
RMSE_test_total_M1 [0,1): 0.90
RMSE_train_reduced [1,10): 7.43
RMSE_test_total_MR [1,10): 11.56
RMSE_test_total_M1 [1,10): 5.20
RMSE_train_reduced [10,20): 4.91
RMSE_test_total_MR [10,20): 8.16
RMSE_test_total_M1 [10,20): 10.72
RMSE_train_reduced [20,50): 9.77
RMSE_test_total_MR [20,50): 15.19
RMSE_test_total_M1 [20,50): 20.22
RMSE_train_reduced [50,101): 20.15
RMSE_test_total_MR [50,101): 35.27
RMSE_test_total_M1 [50,101): 37.61


## Fourth step is to add model combination (model M1 with model MR)

In [40]:
# Check the result of classifier for test set
reduced_test_df = X_test.copy()

In [41]:
# joined X_test with countinous target and binary predicted values
reduced_test_df["percent_houses_damaged"] = y_test.values
reduced_test_df["predicted_value"] = y_pred_test

reduced_test_df

Unnamed: 0,wind_speed,track_distance,total_houses,rainfall_max_6h,rainfall_max_24h,rwi,mean_slope,std_slope,mean_tri,std_tri,mean_elev,coast_length,with_coast,urban,rural,water,total_pop,percent_houses_damaged_5years,percent_houses_damaged,predicted_value
34270,20.428453,134.242300,41.116529,5.595833,1.646354,-0.309000,18.960296,8.814788,93.816338,42.001652,758.826173,0.000000,0,0.00,1.00,0.00,5.879550,0.005518,0.000000,0
4190,9.566616,229.560373,277.683599,5.712500,2.329687,-0.646800,17.791017,6.517432,84.427430,24.539067,978.878219,0.000000,0,0.00,1.00,0.00,1124.463072,0.000000,0.000000,0
14915,4.757212,256.185586,1680.011892,2.656250,1.415625,-0.469500,1.562203,1.241087,9.050527,5.375291,23.931885,24070.356760,1,0.01,0.58,0.41,6952.314116,0.000000,0.000000,0
37146,29.367331,100.354717,260.448841,11.068750,5.540104,0.205000,7.489558,6.030367,35.465675,26.617130,22.149068,6067.070800,1,0.01,0.01,0.98,1422.207942,0.000000,0.665112,0
47061,10.864263,70.988173,141.088539,15.179167,6.753125,-0.535571,19.381871,8.895792,93.345980,38.334789,674.835667,0.000000,0,0.00,1.00,0.00,2593.765141,0.099862,0.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30462,30.469774,24.447650,6675.827725,20.450000,13.766667,0.012385,4.827060,4.194534,24.402339,17.696554,44.853130,39381.167938,1,0.28,0.32,0.40,42229.461943,0.129482,9.937483,0
32482,25.160468,72.277974,5919.014823,5.758333,3.278125,0.401333,5.520873,3.822856,29.597745,17.048274,46.567797,27487.153232,1,0.19,0.04,0.77,35127.903703,0.053485,0.000000,0
27745,44.228354,34.131057,31760.347517,28.589583,13.023438,0.387500,0.792990,0.475002,5.531668,2.283653,13.575780,0.000000,0,0.89,0.11,0.00,198217.003425,0.000000,0.775316,0
14431,12.981903,241.809086,2914.660427,2.212500,0.968750,0.221750,5.202626,6.094444,25.560472,26.955514,40.089247,22565.833362,1,0.18,0.20,0.62,24592.702085,0.000000,0.000000,0


In [42]:
# damaged prediction
fliterd_test_df1 = reduced_test_df[reduced_test_df.predicted_value == 1]

# not damaged prediction
fliterd_test_df0 = reduced_test_df[reduced_test_df.predicted_value == 0]

In [43]:
# Use X0 and X1 for the M1 and MR models' predictions
X1 = fliterd_test_df1[features]
X0 = fliterd_test_df0[features]

In [44]:
# For the output equal to 1 apply MR to evaluate the performance
y1_pred = xgbR.predict(X1)
y1 = fliterd_test_df1["percent_houses_damaged"]

In [45]:
# For the output equal to 0 apply M1 to evaluate the performance
y0_pred = xgb.predict(X0)
y0 = fliterd_test_df0["percent_houses_damaged"]

In [46]:
## Combined the two outputs

In [47]:
fliterd_test_df0["predicted_percent_damage"] = y0_pred
fliterd_test_df0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,wind_speed,track_distance,total_houses,rainfall_max_6h,rainfall_max_24h,rwi,mean_slope,std_slope,mean_tri,std_tri,...,coast_length,with_coast,urban,rural,water,total_pop,percent_houses_damaged_5years,percent_houses_damaged,predicted_value,predicted_percent_damage
34270,20.428453,134.242300,41.116529,5.595833,1.646354,-0.309000,18.960296,8.814788,93.816338,42.001652,...,0.000000,0,0.00,1.00,0.00,5.879550,0.005518,0.000000,0,0.022122
4190,9.566616,229.560373,277.683599,5.712500,2.329687,-0.646800,17.791017,6.517432,84.427430,24.539067,...,0.000000,0,0.00,1.00,0.00,1124.463072,0.000000,0.000000,0,0.027847
14915,4.757212,256.185586,1680.011892,2.656250,1.415625,-0.469500,1.562203,1.241087,9.050527,5.375291,...,24070.356760,1,0.01,0.58,0.41,6952.314116,0.000000,0.000000,0,0.048114
37146,29.367331,100.354717,260.448841,11.068750,5.540104,0.205000,7.489558,6.030367,35.465675,26.617130,...,6067.070800,1,0.01,0.01,0.98,1422.207942,0.000000,0.665112,0,0.154815
47061,10.864263,70.988173,141.088539,15.179167,6.753125,-0.535571,19.381871,8.895792,93.345980,38.334789,...,0.000000,0,0.00,1.00,0.00,2593.765141,0.099862,0.000000,0,0.190091
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30462,30.469774,24.447650,6675.827725,20.450000,13.766667,0.012385,4.827060,4.194534,24.402339,17.696554,...,39381.167938,1,0.28,0.32,0.40,42229.461943,0.129482,9.937483,0,2.131492
32482,25.160468,72.277974,5919.014823,5.758333,3.278125,0.401333,5.520873,3.822856,29.597745,17.048274,...,27487.153232,1,0.19,0.04,0.77,35127.903703,0.053485,0.000000,0,0.085587
27745,44.228354,34.131057,31760.347517,28.589583,13.023438,0.387500,0.792990,0.475002,5.531668,2.283653,...,0.000000,0,0.89,0.11,0.00,198217.003425,0.000000,0.775316,0,2.241902
14431,12.981903,241.809086,2914.660427,2.212500,0.968750,0.221750,5.202626,6.094444,25.560472,26.955514,...,22565.833362,1,0.18,0.20,0.62,24592.702085,0.000000,0.000000,0,-0.039751


In [48]:
fliterd_test_df1["predicted_percent_damage"] = y1_pred
fliterd_test_df1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,wind_speed,track_distance,total_houses,rainfall_max_6h,rainfall_max_24h,rwi,mean_slope,std_slope,mean_tri,std_tri,...,coast_length,with_coast,urban,rural,water,total_pop,percent_houses_damaged_5years,percent_houses_damaged,predicted_value,predicted_percent_damage
25350,58.975139,21.788474,1806.668474,20.591667,7.233854,-0.557857,6.808584,4.651499,33.110546,19.857770,...,0.000000,0,0.03,0.97,0.00,8927.704942,0.533701,30.709679,1,28.920584
37495,59.331946,14.076546,3044.190724,11.577083,7.779167,-0.408667,2.742017,2.553250,15.183953,11.260273,...,0.000000,0,0.05,0.95,0.00,14204.887223,0.000000,10.378913,1,14.332630
13360,69.584266,3.881618,121.484861,16.379167,6.406771,-0.251000,5.477664,4.594031,29.381264,21.274151,...,9700.043352,1,0.00,0.02,0.98,1466.117288,0.380829,80.748749,1,51.169895
18583,52.557720,35.001779,2768.705357,11.506250,5.149479,-0.532154,4.798060,4.080212,26.078265,18.326510,...,0.000000,0,0.09,0.91,0.00,20308.130049,0.008957,5.152960,1,10.997349
6176,38.898742,16.971961,72.329307,9.147917,3.418750,-0.133000,3.455609,1.916692,18.728310,7.918151,...,7001.816584,1,0.01,0.02,0.97,456.687146,0.747455,0.572519,1,12.519022
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12711,70.940246,15.061252,717.234684,4.606250,3.053125,0.140000,0.766802,0.469404,4.945972,2.301954,...,4576.179345,1,0.02,0.01,0.97,3786.637436,3.274273,75.002139,1,61.161118
30746,34.856245,29.799421,1869.841048,14.243750,10.509375,-0.110000,2.308159,1.660397,12.791179,7.281515,...,16194.639177,1,0.04,0.13,0.83,2478.622712,1.613880,0.000000,1,11.027788
18520,55.375813,25.883428,4149.311534,12.350000,5.307812,-0.217500,3.701815,2.912204,19.335230,12.668360,...,29003.110275,1,0.16,0.37,0.47,25445.957874,0.001643,6.189017,1,9.590966
30511,31.047022,4.340869,5268.625381,22.381250,15.601042,-0.398385,4.827781,5.074387,24.933197,21.827975,...,57028.054958,1,0.16,0.75,0.09,27462.321684,0.973516,8.215567,1,28.737001


In [49]:
# Join two dataframes together

join_test_dfs = pd.concat([fliterd_test_df0, fliterd_test_df1])
join_test_dfs

Unnamed: 0,wind_speed,track_distance,total_houses,rainfall_max_6h,rainfall_max_24h,rwi,mean_slope,std_slope,mean_tri,std_tri,...,coast_length,with_coast,urban,rural,water,total_pop,percent_houses_damaged_5years,percent_houses_damaged,predicted_value,predicted_percent_damage
34270,20.428453,134.242300,41.116529,5.595833,1.646354,-0.309000,18.960296,8.814788,93.816338,42.001652,...,0.000000,0,0.00,1.00,0.00,5.879550,0.005518,0.000000,0,0.022122
4190,9.566616,229.560373,277.683599,5.712500,2.329687,-0.646800,17.791017,6.517432,84.427430,24.539067,...,0.000000,0,0.00,1.00,0.00,1124.463072,0.000000,0.000000,0,0.027847
14915,4.757212,256.185586,1680.011892,2.656250,1.415625,-0.469500,1.562203,1.241087,9.050527,5.375291,...,24070.356760,1,0.01,0.58,0.41,6952.314116,0.000000,0.000000,0,0.048114
37146,29.367331,100.354717,260.448841,11.068750,5.540104,0.205000,7.489558,6.030367,35.465675,26.617130,...,6067.070800,1,0.01,0.01,0.98,1422.207942,0.000000,0.665112,0,0.154815
47061,10.864263,70.988173,141.088539,15.179167,6.753125,-0.535571,19.381871,8.895792,93.345980,38.334789,...,0.000000,0,0.00,1.00,0.00,2593.765141,0.099862,0.000000,0,0.190091
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12711,70.940246,15.061252,717.234684,4.606250,3.053125,0.140000,0.766802,0.469404,4.945972,2.301954,...,4576.179345,1,0.02,0.01,0.97,3786.637436,3.274273,75.002139,1,61.161118
30746,34.856245,29.799421,1869.841048,14.243750,10.509375,-0.110000,2.308159,1.660397,12.791179,7.281515,...,16194.639177,1,0.04,0.13,0.83,2478.622712,1.613880,0.000000,1,11.027788
18520,55.375813,25.883428,4149.311534,12.350000,5.307812,-0.217500,3.701815,2.912204,19.335230,12.668360,...,29003.110275,1,0.16,0.37,0.47,25445.957874,0.001643,6.189017,1,9.590966
30511,31.047022,4.340869,5268.625381,22.381250,15.601042,-0.398385,4.827781,5.074387,24.933197,21.827975,...,57028.054958,1,0.16,0.75,0.09,27462.321684,0.973516,8.215567,1,28.737001


In [50]:
# join_test_dfs = join_test_dfs.reset_index(drop=True)

In [51]:
# Calculate RMSE in total

mse_combined_model = mean_squared_error(
    join_test_dfs["percent_houses_damaged"], join_test_dfs["predicted_percent_damage"]
)
rmse_combined_model = np.sqrt(mse_combined_model)


print(f"RMSE_in_total: {rmse_combined_model:.2f}")

RMSE_in_total: 3.42


In [52]:
y_join = join_test_dfs["percent_houses_damaged"]
y_pred_join = join_test_dfs["predicted_percent_damage"]

bin_index_test = np.digitize(y_join, bins=bins_eval)

RSME_combined_model = np.zeros(len(bins_eval) - 1)

for bin_num in range(1, len(bins_eval)):

    mse_combined_model = mean_squared_error(
        y_join[bin_index_test == bin_num],
        y_pred_join[bin_index_test == bin_num],
    )
    RSME_combined_model[bin_num - 1] = np.sqrt(mse_combined_model)

    print(
        f"RMSE_total [{bins_eval[bin_num-1]:.0f},{bins_eval[bin_num]:.0f}): {RSME_combined_model[bin_num-1]:.2f}"
    )

RMSE_total [0,1): 1.30
RMSE_total [1,10): 6.00
RMSE_total [10,20): 9.96
RMSE_total [20,50): 18.03
RMSE_total [50,101): 38.64
