# Combined  Model (XGBoost Undersampling + XGBoost Regression)

We developed a hybrid model using both xgboost regression and xgboost classification(while undersampling technique was implemented to enhance its performance). Subsequently, we evaluated the performance of this combined model on the test dataset and compared it with the result of the simple xgboost regression model.


In [1]:
%load_ext jupyter_black

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import shap
import imblearn
import statsmodels.api as sm

from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import RobustScaler
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import mean_squared_error
from xgboost.sklearn import XGBRegressor
from sklearn.dummy import DummyRegressor
from xgboost import XGBClassifier
from sty import fg, rs

from sklearn.metrics import confusion_matrix
from matplotlib import cm
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler

from utils import get_training_dataset

pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.


In [3]:
# Read csv file and import to df
df = get_training_dataset()
df.head()

Unnamed: 0,typhoon_name,typhoon_year,grid_point_id,wind_speed,track_distance,rainfall_max_6h,rainfall_max_24h,total_houses,rwi,mean_slope,...,std_tri,mean_elev,coast_length,with_coast,urban,rural,water,total_pop,percent_houses_damaged,percent_houses_damaged_5years
0,DURIAN,2006,101,0.0,303.180555,0.122917,0.085417,31.0,,1.018526,...,2.699781,5.762712,3445.709753,1,0.0,0.0,1.0,0.0,0.0,0.0
1,DURIAN,2006,4475,0.0,638.027502,0.091667,0.027083,3.30102,-0.527,1.5794,...,4.585088,12.799127,8602.645832,1,0.0,0.0,1.0,0.0,0.0,0.0
2,DURIAN,2006,4639,0.0,603.631997,0.535417,0.146354,12.103741,-0.283,0.551764,...,1.527495,8.833333,5084.012925,1,0.0,0.01,0.99,197.339034,0.0,0.0
3,DURIAN,2006,4640,0.0,614.67527,0.35625,0.101562,645.89966,-0.358889,2.107949,...,11.677657,17.530431,55607.86595,1,0.0,0.31,0.69,4970.477311,0.0,0.0
4,DURIAN,2006,4641,0.0,625.720905,0.202083,0.057812,1071.731293,-0.4628,3.538881,...,17.074011,31.931338,35529.342507,1,0.0,0.77,0.23,12408.594656,0.0,0.0


In [4]:
# Fill NaNs with average estimated value of 'rwi'
df["rwi"].fillna(df["rwi"].mean(), inplace=True)

# Set any values >100% to 100%,
for i in range(len(df)):
    if df.loc[i, "percent_houses_damaged"] > 100:
        df.at[i, "percent_houses_damaged"] = float(100)

In [5]:
# Remove zeros from wind_speed
df = (df[(df[["wind_speed"]] != 0).any(axis=1)]).reset_index(drop=True)
df = df.drop(columns=["grid_point_id", "typhoon_year"])
df.head()

Unnamed: 0,typhoon_name,wind_speed,track_distance,rainfall_max_6h,rainfall_max_24h,total_houses,rwi,mean_slope,std_slope,mean_tri,std_tri,mean_elev,coast_length,with_coast,urban,rural,water,total_pop,percent_houses_damaged,percent_houses_damaged_5years
0,DURIAN,12.460039,275.018491,0.670833,0.313021,0.479848,-0.213039,12.896581,7.450346,74.625539,34.62955,42.21875,5303.65949,1,0.0,0.0,1.0,0.0,0.0,0.0
1,DURIAN,11.428974,297.027578,0.929167,0.343229,55.649739,0.206,14.070741,6.514647,68.681417,25.475388,72.283154,61015.543599,1,0.0,0.14,0.86,276.871504,0.0,0.0
2,DURIAN,13.077471,262.598363,0.716667,0.424479,8.157414,-0.636,19.758682,10.9407,104.453163,54.353996,102.215198,66707.43807,1,0.0,0.11,0.89,448.539453,0.0,0.0
3,DURIAN,12.511864,273.63933,0.56875,0.336979,88.292015,-0.2275,11.499097,6.901584,59.798108,31.814048,58.988877,53841.050168,1,0.0,0.12,0.88,2101.708435,0.0,0.0
4,DURIAN,11.977511,284.680297,0.589583,0.290625,962.766739,-0.299667,13.866633,6.528689,65.65528,25.976413,111.386527,87378.257957,1,0.07,0.46,0.47,11632.726327,0.0,0.0


In [6]:
# Define bins for data stratification
bins2 = [0, 0.00009, 1, 10, 50, 101]
bins_eval = [0, 1, 10, 20, 50, 101]
samples_per_bin2, binsP2 = np.histogram(df["percent_houses_damaged"], bins=bins2)

In [7]:
# Check the bins' intervalls (first bin means all zeros, second bin means 0 < values <= 1)
df["percent_houses_damaged"].value_counts(bins=binsP2)

(-0.001, 9e-05]    38901
(9e-05, 1.0]        7232
(1.0, 10.0]         2552
(10.0, 50.0]         925
(50.0, 101.0]        144
Name: percent_houses_damaged, dtype: int64

In [8]:
print(samples_per_bin2)
print(binsP2)

[38901  7232  2552   925   144]
[0.00e+00 9.00e-05 1.00e+00 1.00e+01 5.00e+01 1.01e+02]


In [9]:
bin_index2 = np.digitize(df["percent_houses_damaged"], bins=binsP2)

In [10]:
y_input_strat = bin_index2

In [11]:
features = [
    "wind_speed",
    "track_distance",
    "total_houses",
    "rainfall_max_6h",
    "rainfall_max_24h",
    "rwi",
    "mean_slope",
    "std_slope",
    "mean_tri",
    "std_tri",
    "mean_elev",
    "coast_length",
    "with_coast",
    "urban",
    "rural",
    "water",
    "total_pop",
    "percent_houses_damaged_5years",
]

# Split X and y from dataframe features
X = df[features]
display(X.columns)
y = df["percent_houses_damaged"]

Index(['wind_speed', 'track_distance', 'total_houses', 'rainfall_max_6h',
       'rainfall_max_24h', 'rwi', 'mean_slope', 'std_slope', 'mean_tri',
       'std_tri', 'mean_elev', 'coast_length', 'with_coast', 'urban', 'rural',
       'water', 'total_pop', 'percent_houses_damaged_5years'],
      dtype='object')

In [12]:
# Define train and test data
X_train, X_test, y_train, y_test = train_test_split(
    X,
    df["percent_houses_damaged"],
    test_size=0.2,
    stratify=y_input_strat,
)

## First step is to train XGBoost Regression model for train data

In [13]:
# XGBoost Reduced Overfitting
xgb = XGBRegressor(
    base_score=0.5,
    booster="gbtree",
    colsample_bylevel=0.8,
    colsample_bynode=0.8,
    colsample_bytree=0.8,
    gamma=3,
    eta=0.01,
    importance_type="gain",
    learning_rate=0.1,
    max_delta_step=0,
    max_depth=4,
    min_child_weight=1,
    missing=1,
    n_estimators=100,
    early_stopping_rounds=10,
    n_jobs=1,
    nthread=None,
    objective="reg:squarederror",
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    seed=None,
    silent=None,
    subsample=0.8,
    verbosity=1,
    eval_metric=["rmse", "logloss"],
    random_state=0,
)

eval_set = [(X_test, y_test)]
xgb_model = xgb.fit(X_train, y_train, eval_set=eval_set, verbose=False)

pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.


Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




In [14]:
# Make prediction on train and test data
y_pred_train = xgb.predict(X_train)
y_pred = xgb.predict(X_test)

In [15]:
# Calculate RMSE in total

mse_train_idx = mean_squared_error(y_train, y_pred_train)
rmse_train = np.sqrt(mse_train_idx)

mse_idx = mean_squared_error(y_test, y_pred)
rmseM1 = np.sqrt(mse_idx)

print(f"RMSE_test_in_total: {rmseM1:.2f}")
print(f"RMSE_train_in_total: {rmse_train:.2f}")

RMSE_test_in_total: 3.08
RMSE_train_in_total: 2.65


In [16]:
# Calculate RMSE per bins

bin_index_test = np.digitize(y_test, bins=bins_eval)
bin_index_train = np.digitize(y_train, bins=bins_eval)

RSME_test_model1 = np.zeros(len(bins_eval) - 1)

for bin_num in range(1, len(bins_eval)):

    # Estimation of RMSE for train data
    mse_train_idx = mean_squared_error(
        y_train[bin_index_train == bin_num], y_pred_train[bin_index_train == bin_num]
    )
    rmse_train = np.sqrt(mse_train_idx)

    # Estimation of RMSE for test data
    mse_idx = mean_squared_error(
        y_test[bin_index_test == bin_num], y_pred[bin_index_test == bin_num]
    )
    RSME_test_model1[bin_num - 1] = np.sqrt(mse_idx)

    print(
        f"RMSE_test  [{bins_eval[bin_num-1]:.0f},{bins_eval[bin_num]:.0f}): {RSME_test_model1[bin_num-1]:.2f}"
    )
    print(
        f"RMSE_train [{bins_eval[bin_num-1]:.0f},{bins_eval[bin_num]:.0f}): {rmse_train:.2f}"
    )

RMSE_test  [0,1): 1.17
RMSE_train [0,1): 0.94
RMSE_test  [1,10): 4.54
RMSE_train [1,10): 3.93
RMSE_test  [10,20): 9.31
RMSE_train [10,20): 9.03
RMSE_test  [20,50): 19.75
RMSE_train [20,50): 15.83
RMSE_test  [50,101): 33.02
RMSE_train [50,101): 28.50


## Second step is to train XGBoost Binary model for same train data

In [17]:
# Define a threshold to separate target into damaged and not_damaged
thres = 10.0
y_test_bool = y_test >= thres
y_train_bool = y_train >= thres
y_test_bin = (y_test_bool) * 1
y_train_bin = (y_train_bool) * 1

In [18]:
sum(y_train_bin)

855

In [19]:
print(Counter(y_train_bin))

Counter({0: 38948, 1: 855})


In [20]:
# Undersampling

# Define undersampling strategy
under = RandomUnderSampler(sampling_strategy=0.1)
# Fit and apply the transform
X_train_us, y_train_us = under.fit_resample(X_train, y_train_bin)

print(Counter(y_train_us))

Counter({0: 8550, 1: 855})


In [21]:
# Use XGBClassifier as a Machine Learning model to fit the data
xgb_model = XGBClassifier(eval_metric=["error", "logloss"])

# eval_set = [(X_train, y_train), (X_train, y_train)]
eval_set = [(X_test, y_test_bin)]
xgb_model.fit(
    X_train_us,
    y_train_us,
    eval_set=eval_set,
    verbose=False,
)

pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.


In [22]:
# Make prediction on test data
y_pred_test = xgb_model.predict(X_test)

In [23]:
# Print Confusion Matrix
cm = confusion_matrix(y_test_bin, y_pred_test)
cm

array([[9601,  136],
       [  61,  153]])

In [24]:
# Classification Report
print(metrics.classification_report(y_test_bin, y_pred_test))
print(metrics.confusion_matrix(y_test_bin, y_pred_test))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      9737
           1       0.53      0.71      0.61       214

    accuracy                           0.98      9951
   macro avg       0.76      0.85      0.80      9951
weighted avg       0.98      0.98      0.98      9951

[[9601  136]
 [  61  153]]


In [25]:
# Make prediction on train data
y_pred_train = xgb_model.predict(X_train)

In [26]:
# Print Confusion Matrix
cm = confusion_matrix(y_train_bin, y_pred_train)
cm

array([[38510,   438],
       [    0,   855]])

In [27]:
# Classification Report
print(metrics.classification_report(y_train_bin, y_pred_train))
print(metrics.confusion_matrix(y_train_bin, y_pred_train))

              precision    recall  f1-score   support

           0       1.00      0.99      0.99     38948
           1       0.66      1.00      0.80       855

    accuracy                           0.99     39803
   macro avg       0.83      0.99      0.90     39803
weighted avg       0.99      0.99      0.99     39803

[[38510   438]
 [    0   855]]


In [28]:
reduced_df = X_train.copy()

In [29]:
reduced_df["percent_houses_damaged"] = y_train.values
reduced_df["predicted_value"] = y_pred_train

In [30]:
fliterd_df = reduced_df[reduced_df.predicted_value == 1]

In [31]:
fliterd_df

Unnamed: 0,wind_speed,track_distance,total_houses,rainfall_max_6h,rainfall_max_24h,rwi,mean_slope,std_slope,mean_tri,std_tri,mean_elev,coast_length,with_coast,urban,rural,water,total_pop,percent_houses_damaged_5years,percent_houses_damaged,predicted_value
13145,72.251930,31.753148,2383.683635,9.260417,5.607813,0.308800,8.698314,6.900810,42.025400,29.846938,196.562202,20400.179150,1,0.14,0.57,0.29,13763.519461,1.508420,57.235598,1
40740,45.274487,13.634354,930.303668,16.427083,7.480208,-0.274000,3.423450,2.192708,16.258848,9.069180,54.330210,13168.851697,1,0.12,0.05,0.83,12719.089291,0.368513,19.528704,1
11787,58.300001,13.592497,962.193200,12.337500,4.972396,-0.368250,11.041480,7.713488,55.112382,33.720203,75.541652,89295.865888,1,0.00,0.61,0.39,2064.177740,0.000000,43.801797,1
13078,73.259273,27.797975,45673.266226,7.850000,4.606771,0.575067,5.679837,7.339446,29.279100,33.668380,37.761544,36068.688998,1,0.58,0.20,0.22,270572.544877,0.066457,40.204053,1
40912,45.000398,22.089574,12261.516957,13.060417,7.175000,0.054200,3.019431,3.551144,16.342650,15.550940,22.023757,94201.780356,1,0.65,0.26,0.09,65401.137000,2.043719,5.417064,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12863,59.637022,46.784325,3408.319457,8.012500,4.380208,-0.303818,5.977114,3.937252,29.124319,16.237819,69.981072,25551.064830,1,0.04,0.55,0.41,10457.396786,0.020811,33.660164,1
40886,45.636009,2.664347,2576.956387,10.502083,6.399479,0.090667,1.593387,1.504150,9.580483,7.237524,9.295925,8426.045863,1,0.15,0.01,0.84,14043.023422,0.489396,6.892182,1
707,58.454575,22.214850,2531.702542,15.068750,7.422396,0.032500,8.040434,5.532480,39.034626,23.157728,94.637186,16279.288402,1,0.14,0.22,0.64,11078.798147,0.000000,3.223924,1
11773,54.906015,43.292512,183.611479,8.879167,3.808333,0.055000,6.953282,6.640077,35.870349,30.475411,53.573066,13977.710894,1,0.00,0.15,0.85,1077.304325,0.000000,31.399046,1


### Third step is to train XGBoost regression model for this reduced train data (including damg>10.0%)

In [32]:
# Define bins for data stratification in regression model
bins2 = [0, 1, 10, 20, 50, 101]
samples_per_bin2, binsP2 = np.histogram(
    fliterd_df["percent_houses_damaged"], bins=bins2
)

print(samples_per_bin2)
print(binsP2)

[168 270 373 367 115]
[  0   1  10  20  50 101]


In [33]:
bin_index2 = np.digitize(fliterd_df["percent_houses_damaged"], bins=binsP2)

In [34]:
y_input_strat = bin_index2

In [35]:
# Split X and y from dataframe features
X_r = fliterd_df[features]
display(X.columns)
y_r = fliterd_df["percent_houses_damaged"]

Index(['wind_speed', 'track_distance', 'total_houses', 'rainfall_max_6h',
       'rainfall_max_24h', 'rwi', 'mean_slope', 'std_slope', 'mean_tri',
       'std_tri', 'mean_elev', 'coast_length', 'with_coast', 'urban', 'rural',
       'water', 'total_pop', 'percent_houses_damaged_5years'],
      dtype='object')

In [36]:
# XGBoost Reduced Overfitting
xgbR = XGBRegressor(
    base_score=0.5,
    booster="gbtree",
    colsample_bylevel=0.8,
    colsample_bynode=0.8,
    colsample_bytree=0.8,
    gamma=3,
    eta=0.01,
    importance_type="gain",
    learning_rate=0.1,
    max_delta_step=0,
    max_depth=4,
    min_child_weight=1,
    missing=1,
    n_estimators=100,
    early_stopping_rounds=10,
    n_jobs=1,
    nthread=None,
    objective="reg:squarederror",
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    seed=None,
    silent=None,
    subsample=0.8,
    verbosity=1,
    eval_metric=["rmse", "logloss"],
    random_state=0,
)

eval_set = [(X_r, y_r)]
xgbR_model = xgbR.fit(X_r, y_r, eval_set=eval_set, verbose=False)

Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.


In [37]:
# Make prediction on train and global test data
y_pred_r = xgbR.predict(X_r)
y_pred_test_total = xgbR.predict(X_test)

In [38]:
# Calculate RMSE in total

mse_train_idxR = mean_squared_error(y_r, y_pred_r)
rmse_trainR = np.sqrt(mse_train_idxR)


mse_idxR = mean_squared_error(y_test, y_pred_test_total)
rmseR = np.sqrt(mse_idxR)

print(f"RMSE_test_in_total MR: {rmseR:.2f}")
print(f"RMSE_test_in_total M1: {rmseM1:.2f}")
print(f"RMSE_train_in_reduced: {rmse_trainR:.2f}")

RMSE_test_in_total MR: 15.44
RMSE_test_in_total M1: 3.08
RMSE_train_in_reduced: 10.01


In [39]:
# Calculate RMSE per bins
bin_index_r = np.digitize(y_r, bins=bins_eval)

RSME_test_model1R = np.zeros(len(bins_eval) - 1)
for bin_num in range(1, len(bins_eval)):

    # Estimation of RMSE for train data
    mse_train_idxR = mean_squared_error(
        y_r[bin_index_r == bin_num], y_pred_r[bin_index_r == bin_num]
    )
    rmse_trainR = np.sqrt(mse_train_idxR)

    # Estimation of RMSE for test data
    mse_idxR = mean_squared_error(
        y_test[bin_index_test == bin_num], y_pred_test_total[bin_index_test == bin_num]
    )
    RSME_test_model1R[bin_num - 1] = np.sqrt(mse_idxR)

    # print(f"RMSE_test: {rmse:.2f}")
    print(
        f"RMSE_train_reduced [{bins_eval[bin_num-1]:.0f},{bins_eval[bin_num]:.0f}): {rmse_trainR:.2f}"
    )
    print(
        f"RMSE_test_total_MR [{bins_eval[bin_num-1]:.0f},{bins_eval[bin_num]:.0f}): {RSME_test_model1R[bin_num-1]:.2f}"
    )
    print(
        f"RMSE_test_total_M1 [{bins_eval[bin_num-1]:.0f},{bins_eval[bin_num]:.0f}): {RSME_test_model1[bin_num-1]:.2f}"
    )
    RSME_test_model1
    # print(f"RMSE_train: {rmse_train:.2f}")

RMSE_train_reduced [0,1): 11.16
RMSE_test_total_MR [0,1): 15.60
RMSE_test_total_M1 [0,1): 1.17
RMSE_train_reduced [1,10): 8.32
RMSE_test_total_MR [1,10): 12.26
RMSE_test_total_M1 [1,10): 4.54
RMSE_train_reduced [10,20): 4.63
RMSE_test_total_MR [10,20): 6.80
RMSE_test_total_M1 [10,20): 9.31
RMSE_train_reduced [20,50): 9.69
RMSE_test_total_MR [20,50): 15.19
RMSE_test_total_M1 [20,50): 19.75
RMSE_train_reduced [50,101): 20.31
RMSE_test_total_MR [50,101): 30.13
RMSE_test_total_M1 [50,101): 33.02


## Last step is to add model combination (model M1 with model MR)

In [40]:
# Check the result of classifier for test set
reduced_test_df = X_test.copy()

In [41]:
# joined X_test with countinous target and binary predicted values
reduced_test_df["percent_houses_damaged"] = y_test.values
reduced_test_df["predicted_value"] = y_pred_test

reduced_test_df

Unnamed: 0,wind_speed,track_distance,total_houses,rainfall_max_6h,rainfall_max_24h,rwi,mean_slope,std_slope,mean_tri,std_tri,mean_elev,coast_length,with_coast,urban,rural,water,total_pop,percent_houses_damaged_5years,percent_houses_damaged,predicted_value
49601,10.396554,199.979958,96.158352,6.631250,4.363542,-0.380000,9.022485,3.787582,50.970692,13.488631,34.924242,4280.863151,1,0.00,0.00,1.00,1284.250123,1.667437,0.000000,0
33863,12.159087,146.270346,152.360892,7.714583,5.008854,-0.637500,11.081625,6.365845,54.018911,26.881018,568.237415,0.000000,0,0.00,1.00,0.00,126.263593,0.007442,0.000000,0
23361,14.495280,152.151368,3001.479050,5.568750,3.439062,-0.490250,11.923229,11.107064,57.110345,55.768360,568.612936,0.000000,0,0.08,0.92,0.00,15023.231141,0.086431,0.000000,0
868,46.227039,45.016424,8.198027,22.633333,9.579688,-0.213039,0.848832,0.441849,5.880668,2.697199,5.285714,2068.237335,1,0.00,0.02,0.98,109.011122,0.000000,28.078216,0
13086,54.615895,59.662645,11131.247660,11.627083,8.856771,-0.046429,12.245994,8.618815,60.532437,39.613202,215.824860,18705.584002,1,0.37,0.55,0.08,52800.638758,0.003004,1.054973,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37057,14.582867,272.365088,7.528400,2.612500,1.492708,-0.268000,8.548323,6.229575,41.852405,26.515616,293.939758,0.000000,0,0.00,1.00,0.00,0.000000,2.361292,0.000000,0
34995,28.151356,98.926726,3887.811319,7.581250,3.271875,-0.488308,6.347557,4.674013,31.508331,18.730070,80.469435,0.000000,0,0.06,0.94,0.00,26563.024972,0.006633,0.000000,0
14737,21.640331,139.918881,1736.128056,9.329167,4.877083,0.244600,6.914429,4.618878,35.696393,19.693002,44.132684,25684.186927,1,0.13,0.11,0.76,14250.107287,0.048113,0.000000,0
11627,18.899248,169.396725,809.229447,4.327083,2.413021,-0.108000,9.301184,9.389671,45.058957,42.537441,68.646985,7531.806226,1,0.00,0.12,0.88,3481.665415,0.000000,0.000000,0


In [42]:
# damaged prediction
fliterd_test_df1 = reduced_test_df[reduced_test_df.predicted_value == 1]

# not damaged prediction
fliterd_test_df0 = reduced_test_df[reduced_test_df.predicted_value == 0]

In [43]:
# Use X0 and X1 for the M1 and MR models' predictions
X1 = fliterd_test_df1[features]
X0 = fliterd_test_df0[features]

In [44]:
# For the output equal to 1 apply MR to evaluate the performance
y1_pred = xgbR.predict(X1)
y1 = fliterd_test_df1["percent_houses_damaged"]

In [45]:
# For the output equal to 0 apply M1 to evaluate the performance
y0_pred = xgb.predict(X0)
y0 = fliterd_test_df0["percent_houses_damaged"]

In [46]:
## Combined the two outputs

In [47]:
fliterd_test_df0["predicted_percent_damage"] = y0_pred
fliterd_test_df0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,wind_speed,track_distance,total_houses,rainfall_max_6h,rainfall_max_24h,rwi,mean_slope,std_slope,mean_tri,std_tri,...,coast_length,with_coast,urban,rural,water,total_pop,percent_houses_damaged_5years,percent_houses_damaged,predicted_value,predicted_percent_damage
49601,10.396554,199.979958,96.158352,6.631250,4.363542,-0.380000,9.022485,3.787582,50.970692,13.488631,...,4280.863151,1,0.00,0.00,1.00,1284.250123,1.667437,0.000000,0,0.077989
33863,12.159087,146.270346,152.360892,7.714583,5.008854,-0.637500,11.081625,6.365845,54.018911,26.881018,...,0.000000,0,0.00,1.00,0.00,126.263593,0.007442,0.000000,0,0.067659
23361,14.495280,152.151368,3001.479050,5.568750,3.439062,-0.490250,11.923229,11.107064,57.110345,55.768360,...,0.000000,0,0.08,0.92,0.00,15023.231141,0.086431,0.000000,0,0.031464
868,46.227039,45.016424,8.198027,22.633333,9.579688,-0.213039,0.848832,0.441849,5.880668,2.697199,...,2068.237335,1,0.00,0.02,0.98,109.011122,0.000000,28.078216,0,4.383256
21558,10.797003,194.379571,212.663162,3.683333,1.591667,-0.695750,21.473339,9.728256,103.875000,45.545172,...,0.000000,0,0.00,1.00,0.00,1950.745470,0.532006,0.000000,0,-0.021949
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37057,14.582867,272.365088,7.528400,2.612500,1.492708,-0.268000,8.548323,6.229575,41.852405,26.515616,...,0.000000,0,0.00,1.00,0.00,0.000000,2.361292,0.000000,0,-0.010648
34995,28.151356,98.926726,3887.811319,7.581250,3.271875,-0.488308,6.347557,4.674013,31.508331,18.730070,...,0.000000,0,0.06,0.94,0.00,26563.024972,0.006633,0.000000,0,0.118362
14737,21.640331,139.918881,1736.128056,9.329167,4.877083,0.244600,6.914429,4.618878,35.696393,19.693002,...,25684.186927,1,0.13,0.11,0.76,14250.107287,0.048113,0.000000,0,0.022514
11627,18.899248,169.396725,809.229447,4.327083,2.413021,-0.108000,9.301184,9.389671,45.058957,42.537441,...,7531.806226,1,0.00,0.12,0.88,3481.665415,0.000000,0.000000,0,0.077200


In [48]:
fliterd_test_df1["predicted_percent_damage"] = y1_pred
fliterd_test_df1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,wind_speed,track_distance,total_houses,rainfall_max_6h,rainfall_max_24h,rwi,mean_slope,std_slope,mean_tri,std_tri,...,coast_length,with_coast,urban,rural,water,total_pop,percent_houses_damaged_5years,percent_houses_damaged,predicted_value,predicted_percent_damage
13086,54.615895,59.662645,11131.247660,11.627083,8.856771,-0.046429,12.245994,8.618815,60.532437,39.613202,...,18705.584002,1,0.370000,0.550000,0.080000,52800.638758,0.003004,1.054973,1,12.963193
25327,57.914400,23.967321,3777.509865,23.145833,7.442708,-0.255571,6.624408,5.388203,33.055565,23.298431,...,0.000000,0,0.200000,0.800000,0.000000,34375.595370,0.220207,8.235989,1,24.898096
24441,57.065985,23.656993,6.359082,29.652083,11.751563,-0.291000,7.761711,4.006668,39.335665,20.571214,...,7614.883914,1,0.000000,0.010000,0.990000,26.183816,0.005252,0.312617,1,6.329030
18752,56.759480,14.205862,2866.363044,6.739583,5.038542,-0.069727,14.809156,7.506341,71.822364,31.962377,...,13153.653463,1,0.180000,0.560000,0.260000,18630.720582,0.001808,12.142118,1,8.230263
35334,59.161392,11.245539,5498.293421,19.520833,7.477604,-0.221333,11.949656,8.944561,58.190334,39.587512,...,20812.246604,1,0.260000,0.430000,0.310000,30647.367865,0.001631,27.527417,1,22.746412
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18519,50.307346,36.569580,2835.828430,11.170833,4.689063,-0.568571,5.250933,4.907627,26.505635,21.275108,...,0.000000,0,0.010000,0.990000,0.000000,15827.071106,0.005740,6.484813,1,10.533318
12748,67.845083,26.702615,6098.570466,16.964583,6.516667,-0.270462,5.565469,3.809935,33.778871,19.031124,...,23962.089446,1,0.250000,0.400000,0.350000,22459.643218,0.012958,34.585345,1,31.269945
44339,52.180348,11.334142,14719.075948,19.141667,7.269792,-0.060727,2.374585,3.056920,12.969910,13.675511,...,2056.872703,1,0.572727,0.418182,0.009091,90515.935382,0.709506,12.554129,1,11.210395
12273,64.733917,23.859533,680.145259,8.412500,5.266667,-0.100167,4.016969,4.098531,21.618080,18.588007,...,14323.633429,1,0.060000,0.040000,0.900000,4659.437934,0.986445,55.374130,1,32.997231


In [49]:
# Join two dataframes together

join_test_dfs = pd.concat([fliterd_test_df0, fliterd_test_df1])
join_test_dfs

Unnamed: 0,wind_speed,track_distance,total_houses,rainfall_max_6h,rainfall_max_24h,rwi,mean_slope,std_slope,mean_tri,std_tri,...,coast_length,with_coast,urban,rural,water,total_pop,percent_houses_damaged_5years,percent_houses_damaged,predicted_value,predicted_percent_damage
49601,10.396554,199.979958,96.158352,6.631250,4.363542,-0.380000,9.022485,3.787582,50.970692,13.488631,...,4280.863151,1,0.000000,0.000000,1.000000,1284.250123,1.667437,0.000000,0,0.077989
33863,12.159087,146.270346,152.360892,7.714583,5.008854,-0.637500,11.081625,6.365845,54.018911,26.881018,...,0.000000,0,0.000000,1.000000,0.000000,126.263593,0.007442,0.000000,0,0.067659
23361,14.495280,152.151368,3001.479050,5.568750,3.439062,-0.490250,11.923229,11.107064,57.110345,55.768360,...,0.000000,0,0.080000,0.920000,0.000000,15023.231141,0.086431,0.000000,0,0.031464
868,46.227039,45.016424,8.198027,22.633333,9.579688,-0.213039,0.848832,0.441849,5.880668,2.697199,...,2068.237335,1,0.000000,0.020000,0.980000,109.011122,0.000000,28.078216,0,4.383256
21558,10.797003,194.379571,212.663162,3.683333,1.591667,-0.695750,21.473339,9.728256,103.875000,45.545172,...,0.000000,0,0.000000,1.000000,0.000000,1950.745470,0.532006,0.000000,0,-0.021949
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18519,50.307346,36.569580,2835.828430,11.170833,4.689063,-0.568571,5.250933,4.907627,26.505635,21.275108,...,0.000000,0,0.010000,0.990000,0.000000,15827.071106,0.005740,6.484813,1,10.533318
12748,67.845083,26.702615,6098.570466,16.964583,6.516667,-0.270462,5.565469,3.809935,33.778871,19.031124,...,23962.089446,1,0.250000,0.400000,0.350000,22459.643218,0.012958,34.585345,1,31.269945
44339,52.180348,11.334142,14719.075948,19.141667,7.269792,-0.060727,2.374585,3.056920,12.969910,13.675511,...,2056.872703,1,0.572727,0.418182,0.009091,90515.935382,0.709506,12.554129,1,11.210395
12273,64.733917,23.859533,680.145259,8.412500,5.266667,-0.100167,4.016969,4.098531,21.618080,18.588007,...,14323.633429,1,0.060000,0.040000,0.900000,4659.437934,0.986445,55.374130,1,32.997231


In [50]:
# join_test_dfs = join_test_dfs.reset_index(drop=True)

### Compare performance of M1 with combined model

In [51]:
# Calculate RMSE in total

mse_combined_model = mean_squared_error(
    join_test_dfs["percent_houses_damaged"], join_test_dfs["predicted_percent_damage"]
)
rmse_combined_model = np.sqrt(mse_combined_model)


print(fg.red + f"RMSE_in_total(combined_model): {rmse_combined_model:.2f}" + fg.rs)
print(f"RMSE_in_total(M1_model): {rmseM1:.2f}")

[31mRMSE_in_total(combined_model): 3.20[39m
RMSE_in_total(M1_model): 3.08


In [52]:
# Calculate RMSE per bin

y_join = join_test_dfs["percent_houses_damaged"]
y_pred_join = join_test_dfs["predicted_percent_damage"]

bin_index_test = np.digitize(y_join, bins=bins_eval)

RSME_combined_model = np.zeros(len(bins_eval) - 1)

for bin_num in range(1, len(bins_eval)):

    mse_combined_model = mean_squared_error(
        y_join[bin_index_test == bin_num],
        y_pred_join[bin_index_test == bin_num],
    )
    RSME_combined_model[bin_num - 1] = np.sqrt(mse_combined_model)

    print(
        fg.red
        + f"RMSE_combined_model [{bins_eval[bin_num-1]:.0f},{bins_eval[bin_num]:.0f}): {RSME_combined_model[bin_num-1]:.2f}"
        + fg.rs
    )

    print(
        f"RMSE_M1_model       [{bins_eval[bin_num-1]:.0f},{bins_eval[bin_num]:.0f}): {RSME_test_model1[bin_num-1]:.2f}"
    )
    print("\n")

[31mRMSE_combined_model [0,1): 1.55[39m
RMSE_M1_model       [0,1): 1.17


[31mRMSE_combined_model [1,10): 5.56[39m
RMSE_M1_model       [1,10): 4.54


[31mRMSE_combined_model [10,20): 9.10[39m
RMSE_M1_model       [10,20): 9.31


[31mRMSE_combined_model [20,50): 17.79[39m
RMSE_M1_model       [20,50): 19.75


[31mRMSE_combined_model [50,101): 32.35[39m
RMSE_M1_model       [50,101): 33.02


