# Notebook to obtain model performance

In this notebook, all the different models are trained and tested to obtain optimal selected features, hyperparameters and performance scores. The models to obtain the selected features and performance estimate are called from separate scripts. For all models, Recursive Feature Elimination with Cross Validation (to find the optimal number of features) is applied on the full dataset. With the selected features, the performance is estimated (using Nested CV). In addition to comparing the performance scores of the models trained, the scores are also compared to a benchmark model.

This notebook consists of three main sections:



Binary Classification <br>
The models to perform a binary classification with threshold of 30% damage are trained and tested. 

Multiclass Classification <br>
The models to perform multiclass classification with three classes are trained and tests.
- 0 - 30%
- 30% - 80%
- 80% - 100%

Regression <br>
The models to obtain a continous prediction are trained and tested.

## General Libraries

In [122]:
%load_ext autoreload
%autoreload 2

import numpy as np
import random
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import f1_score, precision_score, recall_score
from xgboost import XGBClassifier
import os
from sklearn.feature_selection import RFECV
import pandas as pd
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    StratifiedKFold,
    KFold,
)
from sklearn.metrics import f1_score, mean_squared_error, mean_absolute_error
import numpy as np
from numpy.lib.function_base import average
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import (
    recall_score,
    f1_score,
    precision_score,
    confusion_matrix,
    make_scorer,
)
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    StratifiedKFold,
    KFold,
)
from sklearn.feature_selection import SelectKBest, SequentialFeatureSelector
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
import importlib
import os
from sklearn.feature_selection import (
    SelectKBest,
    RFE,
    mutual_info_regression,
    f_regression,
    mutual_info_classif,
)
import eli5
from eli5.sklearn import PermutationImportance
from sklearn.inspection import permutation_importance
import xgboost as xgb
import random
import pickle
import openpyxl
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
import pickle
from sklearn.linear_model import LinearRegression


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Local libraries

In [123]:
# Setting path to the initial folder
os.chdir("C:\\Users\\Marieke\\GitHub\\Typhoon_IBF_Rice_Damage_Model")
cdir = os.getcwd()
import importlib

# Binary classification functions
from IBF_typhoon_model.models.binary_classification.xgb_binary import (
    xgb_binary_features,
    xgb_binary_performance,
)
from IBF_typhoon_model.models.binary_classification.rf_binary import (
    rf_binary_features,
    rf_binary_performance,
)

# Multiclass classification functions
from IBF_typhoon_model.models.multiclass_classification.rf_multi import (
    rf_multi_features,
    rf_multi_performance,
)
from IBF_typhoon_model.models.multiclass_classification.xgb_multi import (
    xgb_multi_features,
    xgb_multi_performance,
)

# Regression functions
from IBF_typhoon_model.models.regression.rf_regression import (
    rf_regression_features,
    rf_regression_performance,
)
from IBF_typhoon_model.models.regression.xgb_regression import (
    xgb_regression_features,
    xgb_regression_performance,
)

# Utility functions
from IBF_typhoon_model.models.utility_functions.splitting_train_test import (
    splitting_train_test,
)
from IBF_typhoon_model.models.utility_functions.determine_class import determine_class
from IBF_typhoon_model.models.utility_functions.unweighted_random import (
    unweighted_random,
)
from IBF_typhoon_model.models.utility_functions.weighted_random import weighted_random


## Loading the Dataset

In [124]:
# Input data: the sheet that contains all the processed input data
name = "IBF_typhoon_model\\data\\restricted_data\\combined_input_data\\input_data_05.xlsx"
path = os.path.join(cdir, name)
df = pd.read_excel(path, engine="openpyxl")
display(df.head(5))

# Typhoon overview
file_name = "IBF_typhoon_model\\data\\data_overview.xlsx"
path = os.path.join(cdir, file_name)
df_typh_overview = pd.read_excel(path, sheet_name="typhoon_overview", engine="openpyxl")
display(df_typh_overview.head(5))

Unnamed: 0,mun_code,typhoon,area_affected,storm_id,year,reg_code,prov_code,rice_area,perc_loss,mean_slope,...,glat,glon,coast_peri_ratio,rainfall_max_6h,rainfall_max_24h,vmax,dis_track_min,perc_loss_new,damage_above_30,class_old
0,PH142708000,goni2015,0.0,2015226N12151,2015,PH140000000,PH142700000,124.72,0.0,10.35,...,7.475,124.58,0.0,2.863333,2.347917,11.295801,271.221492,0.0,False,0.0
1,PH142708000,mangkhut2018,104.31,2018250N12170,2018,PH140000000,PH142700000,236.24,0.441542,6.89,...,7.483,124.71,0.0,5.861667,2.93125,22.718248,111.246866,0.441542,True,1.0
2,PH142708000,molave2020,,2020298N13131,2020,PH140000000,PH142700000,143.32,,5.48,...,7.313,124.76,0.0,6.805833,2.076875,3.590794,400.835034,,False,
3,PH142708000,usagi2013,,2013259N17132,2013,PH140000000,PH142700000,126.36,,7.21,...,6.71,124.46,0.0,3.519167,1.859583,4.850429,389.636727,,False,
4,PH142708000,vamco2020,89.73,2020314N12131,2020,PH140000000,PH142700000,135.4,0.662703,13.79,...,5.785,125.34,0.458986,4.8675,2.0575,10.782503,199.648355,0.662703,True,1.0


Unnamed: 0,pagasa_name,unofficial_name,year,unofficial_name_year,name_year,start_date,end_date,landfall_date,landfall_time,storm_id,Unnamed: 10
0,aere,Bebeng,2011,Bebeng2011,aere2011,2011-05-05,2011-05-15,2011-05-07,21:00:00,2011126N11129,
1,atsani,siony,2020,siony2020,atsani2020,2020-10-29,2020-11-07,2020-11-06,00:00:00,2020304N08148,no landfall
2,bopha,pablo,2012,pablo2012,bopha2012,2012-11-25,2012-12-09,2012-12-03,21:00:00,2012331N03157,
3,danas,falcon,2019,falcon2019,danas2019,2019-07-14,2019-07-23,2019-07-17,00:00:00,2019195N13136,no landfall in PH
4,durian,reming,2006,reming2006,durian2006,2006-11-24,2006-12-09,2006-11-30,06:00:00,2006329N06150,


In [125]:
# Selecting the features to be used: should be available for historical and future typhoons
features = [
    "mean_slope",
    "mean_elevation_m",
    "ruggedness_stdev",
    "mean_ruggedness",
    "slope_stdev",
    "area_km2",
    "poverty_perc",
    "with_coast",
    "coast_length",
    "perimeter",
    "glat",
    "glon",
    "coast_peri_ratio",
    "rainfall_max_6h",
    "rainfall_max_24h",
    "dis_track_min",
    "vmax",
]

# Binary Classification

This section obtain the optimal Binary Classification models and the performance estimates, for a 30% threshold. Two models are implemented: Random Forest Classifier, XGBoost Classifier. First, the model is trained on the full dataset to obtain the optimal features followed by a model that obtains the performance estimate using Nested Cross Validation. 

- Performance Metric
- Nested Cross Validation
- Benchmark Models
- Main findings

In [59]:
# Setting the general input variables: for the dataframe with threshold 30
# Contorplot threshold was used to create damage_above_30 variable
df_binary = df[df['damage_above_30'].notnull()]
df_binary["class_value_binary"] = [
    1 if df_binary["damage_above_30"][i] == True else 0 for i in range(len(df_binary))
]

# Setting for feature selection on full data set
X = df_binary[features]
y = df_binary["class_value_binary"]
y = y.astype(int)

# Setting the train and the test sets for obtaining performance estimate
df_train_list, df_test_list = splitting_train_test(df_binary)


## Random Forest

### Obtain features and optimal hyperparameters

Number of selected features RF Binary: 3

Selected features RF Binary:
- rainfall_max_6h
- dis_track_min
- vmax


Selected parameters RF Binary:
- max_depth = 20
- min_samples_leaf = 5
- min_samples_split = 15
- n_estimators = 250


In [18]:
# Setting the random forest search grid
rf_search_space = [
    {
        "estimator__n_estimators": [100, 250],
        "estimator__max_depth": [20, None],
        "estimator__min_samples_split": [2, 8, 10, 15],
        "estimator__min_samples_leaf": [1, 3, 5],
    }
]

# Obtaining the selected features based on the full dataset
selected_features_rf_binary, selected_params_rf_binary_full = rf_binary_features(
    X=X,
    y=y,
    features=features,
    search_space=rf_search_space,
    cv_splits=5,
    class_weight="balanced",
    min_features_to_select=1,
    GS_score="f1",
    GS_randomized=False,
    GS_n_iter=10,
    verbose=10,
)

print(f"Number of selected features RF Binary: {len(selected_features_rf_binary)}")
print(f"Selected features RF Binary: {selected_features_rf_binary}")
print(f"Selected Parameters RF Binary {selected_params_rf_binary_full}")


Fitting 5 folds for each of 48 candidates, totalling 240 fits
[CV 1/5; 1/48] START estimator__max_depth=20, estimator__min_samples_leaf=1, estimator__min_samples_split=2, estimator__n_estimators=100
[CV 1/5; 1/48] END estimator__max_depth=20, estimator__min_samples_leaf=1, estimator__min_samples_split=2, estimator__n_estimators=100;, score=(train=1.000, test=0.597) total time=  56.3s
[CV 2/5; 1/48] START estimator__max_depth=20, estimator__min_samples_leaf=1, estimator__min_samples_split=2, estimator__n_estimators=100
[CV 2/5; 1/48] END estimator__max_depth=20, estimator__min_samples_leaf=1, estimator__min_samples_split=2, estimator__n_estimators=100;, score=(train=1.000, test=0.588) total time=  56.0s
[CV 3/5; 1/48] START estimator__max_depth=20, estimator__min_samples_leaf=1, estimator__min_samples_split=2, estimator__n_estimators=100
[CV 3/5; 1/48] END estimator__max_depth=20, estimator__min_samples_leaf=1, estimator__min_samples_split=2, estimator__n_estimators=100;, score=(train=0

### Obtaining performance estimate

In [7]:
# Setting the selected features for RF --> based on outcome in previous cell
selected_features_rf_binary = [
    "rice_area",
    "mean_slope",
    "mean_elevation_m",
    "ruggedness_stdev",
    "mean_ruggedness",
    "slope_stdev",
    "area_km2",
    "poverty_perc",
    "with_coast",
    "coast_length",
    "perimeter",
    "glat",
    "glon",
    "coast_peri_ratio",
    "rainfall_sum",
    "rainfall_max",
    "dis_track_min",
    "vmax_sust",
]


In [19]:
# Setting the random forest search grid
rf_search_space = [
    {
        "rf__n_estimators": [100, 250],
        "rf__max_depth": [20, None],
        "rf__min_samples_split": [2, 8, 15],
        "rf__min_samples_leaf": [1, 3, 5],
    }
]

# Obtaining the performance estimate
df_predicted_rf_binary, selected_params_rf_binary = rf_binary_performance(
    df_train_list=df_train_list,
    df_test_list=df_test_list,
    features=selected_features_rf_binary,
    search_space=rf_search_space,
    stratK=True,
    cv_splits=5,
    class_weight="balanced",
    GS_score="f1",
    GS_randomized=False,
    GS_n_iter=10,
    verbose=10,
)


Running for 1 out of a total of 5
Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV 1/5; 1/36] START rf__max_depth=20, rf__min_samples_leaf=1, rf__min_samples_split=2, rf__n_estimators=100
[CV 1/5; 1/36] END rf__max_depth=20, rf__min_samples_leaf=1, rf__min_samples_split=2, rf__n_estimators=100;, score=(train=1.000, test=0.598) total time=   0.2s
[CV 2/5; 1/36] START rf__max_depth=20, rf__min_samples_leaf=1, rf__min_samples_split=2, rf__n_estimators=100
[CV 2/5; 1/36] END rf__max_depth=20, rf__min_samples_leaf=1, rf__min_samples_split=2, rf__n_estimators=100;, score=(train=1.000, test=0.599) total time=   0.2s
[CV 3/5; 1/36] START rf__max_depth=20, rf__min_samples_leaf=1, rf__min_samples_split=2, rf__n_estimators=100
[CV 3/5; 1/36] END rf__max_depth=20, rf__min_samples_leaf=1, rf__min_samples_split=2, rf__n_estimators=100;, score=(train=1.000, test=0.638) total time=   0.2s
[CV 4/5; 1/36] START rf__max_depth=20, rf__min_samples_leaf=1, rf__min_samples_split=2, rf__n_est

In [20]:
file_name = "IBF_typhoon_model\\models\\output\\02\\selected_params_rf_binary.p"
path = os.path.join(cdir, file_name)
pickle.dump(selected_params_rf_binary, open(path, "wb"))

file_name = "IBF_typhoon_model\\models\\output\\02\\df_predicted_rf_binary.csv"
path = os.path.join(cdir, file_name)
df_predicted_rf_binary.to_csv(path, index=False)




## XGBoost

### Obtaining optimal features and hyperparamters

Number of selected features XGGBoost Binary: 

Selected features XGBoost Binary:


Selected parameters XGBoost Binary:

- Rainfall in the pipeline only looks at predictions of rainfall prior to making landfall —> rain mostly follows the typhoon
- Distance and Wind data are collected only for municipalities that are close to the track
    - this is too limiting for rice damage predictions as historical data shows damage can also occur further away from the track. Rainfall is a very important variable here
- In wind script: find bug & correct for municipalities that are consistently missing
- There is a filter on windspeed intensity and distance in the Climada package, but in the windfield grid excel sheet that is generated, there are also values outside of the threshold
    - how is this possible —> general follow-up on the Climada package
- Rice area is currently not included in the model (same holds for the growing stage)
    - This is expected to improve model performs, but can only be included if the data is also available for new typhoons
    - Discuss the type of data that is available and how this can be obtained with PRiSM
    - If near-real-time rice area estimates are available: the rice area and growing stage can be included

In [48]:
# Setting the XGBoost search grid for full dataset
xgb_search_space = [
    {
        "estimator__learning_rate": [0.1, 0.5, 1],
        "estimator__gamma": [0.1, 0.5, 2],
        "estimator__max_depth": [6, 8],
        "estimator__reg_lambda": [0.001, 0.1, 1],
        "estimator__n_estimators": [100, 200],
        "estimator__colsample_bytree": [0.5, 0.7],
    }
]

# Obtaining the selected features based on the full dataset
selected_features_xgb_binary, selected_params_xgb_binary_full = xgb_binary_features(
    X=X,
    y=y,
    features=features,
    search_space=xgb_search_space,
    objective="binary:hinge",
    cv_splits=5,
    min_features_to_select=1,
    GS_score="f1",
    GS_n_iter=50,
    GS_randomized=True,
    verbose=10,
)

print(f"Number of selected features XGBoost Binary {len(selected_features_xgb_binary)}")
print(f"Selected features XGBoost Binary: {selected_features_xgb_binary}")
print(f"Selected parameters XGBoost Binary: {selected_params_xgb_binary_full}")


Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV 1/5; 1/50] START estimator__colsample_bytree=0.5, estimator__gamma=2, estimator__learning_rate=0.5, estimator__max_depth=8, estimator__n_estimators=200, estimator__reg_lambda=0.001
[CV 1/5; 1/50] END estimator__colsample_bytree=0.5, estimator__gamma=2, estimator__learning_rate=0.5, estimator__max_depth=8, estimator__n_estimators=200, estimator__reg_lambda=0.001;, score=(train=1.000, test=0.471) total time= 1.3min
[CV 2/5; 1/50] START estimator__colsample_bytree=0.5, estimator__gamma=2, estimator__learning_rate=0.5, estimator__max_depth=8, estimator__n_estimators=200, estimator__reg_lambda=0.001
[CV 2/5; 1/50] END estimator__colsample_bytree=0.5, estimator__gamma=2, estimator__learning_rate=0.5, estimator__max_depth=8, estimator__n_estimators=200, estimator__reg_lambda=0.001;, score=(train=1.000, test=0.511) total time= 1.2min
[CV 3/5; 1/50] START estimator__colsample_bytree=0.5, estimator__gamma=2, estimator__learning_ra

### Obtaining performance estimate

In [None]:
# Setting the selected features for XGB --> based on outcome previous cell
selected_features_xgb_binary = [
    'rice_area', 
    'mean_slope', 
    'mean_elevation_m', 
    'ruggedness_stdev', 
    'mean_ruggedness', 
    'slope_stdev', 
    'area_km2', 
    'poverty_perc', 
    'with_coast', 
    'coast_length', 
    'perimeter', 
    'glat', 
    'glon', 
    'coast_peri_ratio', 
    'rainfall_sum', 
    'rainfall_max', 
    'dis_track_min', 
    'vmax_sust'
]

In [49]:
# Setting the XGBoost search grid
xgb_search_space = [
    {
        "xgb__learning_rate": [0.1, 0.5, 1],
        "xgb__gamma": [0.1, 0.5, 2],
        "xgb__max_depth": [6, 8],
        "xgb__reg_lambda": [0.001, 0.1, 1],
        "xgb__n_estimators": [100, 200],
        "xgb__colsample_bytree": [0.5, 0.7],
    }
]

# Obtaining the performance estimate
df_predicted_xgb_binary, selected_params_xgb_binary = xgb_binary_performance(
    df_train_list=df_train_list,
    df_test_list=df_test_list,
    features=selected_features_xgb_binary,
    search_space=xgb_search_space,
    stratK=True,
    cv_splits=5,
    objective="binary:hinge",
    GS_score="f1",
    GS_randomized=True,
    GS_n_iter=50,
    verbose=10,
)


Running for 1 out of a total of 5
Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV 1/5; 1/50] START xgb__colsample_bytree=0.5, xgb__gamma=0.1, xgb__learning_rate=0.5, xgb__max_depth=6, xgb__n_estimators=200, xgb__reg_lambda=0.1
[CV 1/5; 1/50] END xgb__colsample_bytree=0.5, xgb__gamma=0.1, xgb__learning_rate=0.5, xgb__max_depth=6, xgb__n_estimators=200, xgb__reg_lambda=0.1;, score=(train=0.994, test=0.510) total time=   0.3s
[CV 2/5; 1/50] START xgb__colsample_bytree=0.5, xgb__gamma=0.1, xgb__learning_rate=0.5, xgb__max_depth=6, xgb__n_estimators=200, xgb__reg_lambda=0.1
[CV 2/5; 1/50] END xgb__colsample_bytree=0.5, xgb__gamma=0.1, xgb__learning_rate=0.5, xgb__max_depth=6, xgb__n_estimators=200, xgb__reg_lambda=0.1;, score=(train=0.996, test=0.509) total time=   0.2s
[CV 3/5; 1/50] START xgb__colsample_bytree=0.5, xgb__gamma=0.1, xgb__learning_rate=0.5, xgb__max_depth=6, xgb__n_estimators=200, xgb__reg_lambda=0.1
[CV 3/5; 1/50] END xgb__colsample_bytree=0.5, xgb__gamma=

In [50]:
file_name = "IBF_typhoon_model\\models\\output\\02\\selected_params_xgb_binary.p"
path = os.path.join(cdir, file_name)
pickle.dump(selected_params_xgb_binary, open(path, "wb"))

file_name = "IBF_typhoon_model\\models\\output\\02\\df_predicted_xgb_binary.csv"
path = os.path.join(cdir, file_name)
df_predicted_xgb_binary.to_csv(path, index=False)




## Benchmark

In [60]:
# Random unweighted predictions
df_predicted_random = pd.DataFrame(columns=["year", "actual", "predicted"])

for i in range(len(df_train_list)):

    train = df_train_list[i]
    test = df_test_list[i]

    y_train = train["class_value_binary"]
    y_test = test["class_value_binary"]

    y_pred_test = unweighted_random(y_train, y_test)
    df_predicted_temp = pd.DataFrame(
        {"year": test["year"], "actual": y_test, "predicted": y_pred_test}
    )

    df_predicted_random = pd.concat([df_predicted_random, df_predicted_temp])



In [61]:
# Random Weighted Predictions
df_predicted_random_weighted = pd.DataFrame(columns=["year", "actual", "predicted"])
for i in range(len(df_train_list)):

    train = df_train_list[i]
    test = df_test_list[i]

    y_train = train["class_value_binary"]
    y_test = test["class_value_binary"]

    y_pred_test = weighted_random(y_train, y_test)
    df_predicted_temp = pd.DataFrame(
        {"year": test["year"], "actual": y_test, "predicted": y_pred_test}
    )

    df_predicted_random_weighted = pd.concat(
        [df_predicted_random_weighted, df_predicted_temp]
    )



## Results

In [72]:
models = {
    "Random Fores": df_predicted_rf_binary,
    "XGBoost": df_predicted_xgb_binary,
    "Random": df_predicted_random,
    "Weighted Random": df_predicted_random_weighted,
}

f1 = []
precision = []
recall = []

# add 'list' if error
for df_temp in models.values():
    f1.append(f1_score(list(df_temp["actual"]), list(df_temp["predicted"])))
    precision.append(precision_score(list(df_temp["actual"]), list(df_temp["predicted"])))
    recall.append(recall_score(list(df_temp["actual"]), list(df_temp["predicted"])))

df_results_binary = pd.DataFrame(
    {"Models": list(models.keys()), "F1 score": f1, "Recall": recall, "Precision": precision}
)

display(df_results_binary)



Unnamed: 0,Models,F1 score,Recall,Precision
0,Random Fores,0.527349,0.535613,0.519337
1,XGBoost,0.518862,0.558405,0.484549
2,Random,0.377255,0.491453,0.306122
3,Weighted Random,0.329497,0.387464,0.286617


## Training the Optimal Model

In [32]:
### Training the optimal model
rf = RandomForestClassifier(
    class_weight="balanced",
    n_estimators=250,
    max_depth=20,
    min_samples_leaf=5,
    min_samples_split=15,
)

selected_features_rf_binary = [
    "rainfall_max_6h",
    "dis_track_min",
    "vmax",
]

rf_fitted = rf.fit(X[selected_features_rf_binary], y)

file_name = "IBF_typhoon_model\\models\\saved_models\\trained_binary_rf.sav"
path = os.path.join(cdir, file_name)
pickle.dump(rf_fitted, open(path, "wb"))

# # Display feature importance
# importances = rf_fitted.feature_importances_
# forest_importances = pd.Series(importances, index=selected_features_rf_binary)
# fig, ax = plt.subplots()
# forest_importances.plot.bar(ax=ax)
# ax.set_title("Feature importances using MDI")
# ax.set_ylabel("Mean decrease in impurity")
# fig.tight_layout()
# plt.show()




# Multiclass Classification

This section obtains the optimal Multiclass Classification models and the performance estimates, with three classes. Two models are implemented: Random Forest Classifier, XGBoost Classifier. First, the model is trained on the full dataset to obtain the optimal features followed by a model that obtains the performance estimate using Nested Cross Validation. The classes are:
- 0 - 30%
- 30% - 80%
- 80% - 100% <br> <br>


- Performance Metric
- Nested Cross Validation
- Benchmark Models
- Main findings

In [126]:
# Setting class value
# Set final boundary slightly over 1 so 1's are included as well
df_multi = df[df['perc_loss'].notnull()]
classes = {"0": [0, 0.3], "1": [0.3, 0.8], "2": [0.8, 1.1]}
df_multi["class_value_multi"] = df_multi["perc_loss"].apply(
    lambda x: determine_class(x, classes=classes)
)

# Setting for feature seleciton on full data set
X = df_multi[features]
y = df_multi["class_value_multi"]

# Setting train and test set for obtaining performance estimate
df_train_list, df_test_list = splitting_train_test(df_multi)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


## Random Forest

### Selecting the optimal hyperparameters and features

Number of features selected in RF multiclass: 14

The selected features ares:
- mean_slope
- mean_elevation_m
- ruggedness_stdev
- mean_ruggedness
- slope_stdev
- area_km2
- poverty_perc
- perimeter
- glat
- glon
- rainfall_max_6h
- rainfall_max_24h
- dis_track_min
- vmax

Selected Parameters in RF multiclass: 
- max_depth = None
- min_samples_leaf = 3
- min_samples_split = 15
- n_estimators = 50

In [22]:
# Setting the random forest search grid
rf_search_space = [
    {
        "estimator__n_estimators": [50, 100, 150],
        "estimator__max_depth": [20, None],
        "estimator__min_samples_split": [2, 10, 15],
        "estimator__min_samples_leaf": [1, 3, 5],
    }
]

selected_features_rf_multi, selected_params_rf_multi_full = rf_multi_features(
    X=X,
    y=y,
    features=features,
    search_space=rf_search_space,
    cv_splits=5,
    class_weight="balanced",
    min_features_to_select=1,
    GS_score="f1_macro",
    GS_randomized=False,
    GS_n_iter=10,
    verbose=10,
)

print(
    f"Number of features selected in RF multiclass: {len(selected_features_rf_multi)}"
)
print(f"Selected features RF multiclass: {selected_features_rf_multi}")
print(f"Selected Parameters in RF multiclass: {selected_params_rf_multi_full}")


Fitting 5 folds for each of 54 candidates, totalling 270 fits
[CV 1/5; 1/54] START estimator__max_depth=20, estimator__min_samples_leaf=1, estimator__min_samples_split=2, estimator__n_estimators=50
[CV 1/5; 1/54] END estimator__max_depth=20, estimator__min_samples_leaf=1, estimator__min_samples_split=2, estimator__n_estimators=50;, score=(train=0.999, test=0.420) total time=  25.2s
[CV 2/5; 1/54] START estimator__max_depth=20, estimator__min_samples_leaf=1, estimator__min_samples_split=2, estimator__n_estimators=50
[CV 2/5; 1/54] END estimator__max_depth=20, estimator__min_samples_leaf=1, estimator__min_samples_split=2, estimator__n_estimators=50;, score=(train=1.000, test=0.403) total time=  24.9s
[CV 3/5; 1/54] START estimator__max_depth=20, estimator__min_samples_leaf=1, estimator__min_samples_split=2, estimator__n_estimators=50
[CV 3/5; 1/54] END estimator__max_depth=20, estimator__min_samples_leaf=1, estimator__min_samples_split=2, estimator__n_estimators=50;, score=(train=1.000, 

### Obtaining the performance estimate

In [None]:
# Setting the selected features for RF --> based on output previous cell
selected_features_rf_multi = [
    "rice_area",
    "mean_slope",
    "mean_elevation_m",
    "ruggedness_stdev",
    "mean_ruggedness",
    "slope_stdev",
    "area_km2",
    "poverty_perc",
    "with_coast",
    "coast_length",
    "perimeter",
    "glat",
    "glon",
    "coast_peri_ratio",
    "rainfall_sum",
    "rainfall_max",
    "dis_track_min",
    "vmax_sust",
]


In [23]:
# Obtain the performance estimate
rf_search_space = [
    {
        "rf__n_estimators": [50, 100, 150],
        "rf__max_depth": [20, None],
        "rf__min_samples_split": [2, 10, 15],
        "rf__min_samples_leaf": [1, 3, 5],
    }
]

df_predicted_rf_multi, selected_params_rf_multi = rf_multi_performance(
    df_train_list=df_train_list,
    df_test_list=df_test_list,
    features=selected_features_rf_multi,
    search_space=rf_search_space,
    stratK=True,
    cv_splits=5,
    class_weight="balanced",
    GS_score="f1_macro",
    GS_randomized=False,
    GS_n_iter=10,
    verbose=10,
)



Running for 1 out of a total of 5
Fitting 5 folds for each of 54 candidates, totalling 270 fits
[CV 1/5; 1/54] START rf__max_depth=20, rf__min_samples_leaf=1, rf__min_samples_split=2, rf__n_estimators=50
[CV 1/5; 1/54] END rf__max_depth=20, rf__min_samples_leaf=1, rf__min_samples_split=2, rf__n_estimators=50;, score=(train=0.999, test=0.416) total time=   0.1s
[CV 2/5; 1/54] START rf__max_depth=20, rf__min_samples_leaf=1, rf__min_samples_split=2, rf__n_estimators=50
[CV 2/5; 1/54] END rf__max_depth=20, rf__min_samples_leaf=1, rf__min_samples_split=2, rf__n_estimators=50;, score=(train=1.000, test=0.404) total time=   0.1s
[CV 3/5; 1/54] START rf__max_depth=20, rf__min_samples_leaf=1, rf__min_samples_split=2, rf__n_estimators=50
[CV 3/5; 1/54] END rf__max_depth=20, rf__min_samples_leaf=1, rf__min_samples_split=2, rf__n_estimators=50;, score=(train=1.000, test=0.408) total time=   0.1s
[CV 4/5; 1/54] START rf__max_depth=20, rf__min_samples_leaf=1, rf__min_samples_split=2, rf__n_estimator

In [24]:
#Saving the results
file_name = "IBF_typhoon_model\\models\\output\\02\\selected_params_rf_multi.p"
path = os.path.join(cdir, file_name)
pickle.dump(selected_params_rf_multi, open(path, "wb"))

file_name = "IBF_typhoon_model\\models\\output\\02\\df_predicted_rf_multi.csv"
path = os.path.join(cdir, file_name)
df_predicted_rf_multi.to_csv(path)




## XGBoost

### Obtaining the optimal features and hyperparameters

Number of selected features:

Selected features:

Selected hyperparameters:

In [127]:
# Setting class value
# Set final boundary slightly over 1 so 1's are included as well
df_multi = df[df['perc_loss'].notnull()]
classes = {0: [0, 0.3], 1: [0.3, 0.8], 2: [0.8, 1.1]}
df_multi["class_value_multi"] = df_multi["perc_loss"].apply(
    lambda x: determine_class(x, classes=classes)
)

# Setting for feature seleciton on full data set
X = df_multi[features]
y = df_multi["class_value_multi"]

# Setting train and test set for obtaining performance estimate
df_train_list, df_test_list = splitting_train_test(df_multi)

# Setting the XGBoost search grid
xgb_search_space = [
    {
        "estimator__learning_rate": [0.1, 0.5, 1],
        "estimator__gamma": [0.1, 0.5, 2],
        "estimator__max_depth": [6, 8],
        "estimator__reg_lambda": [0.001, 0.1, 1],
        "estimator__n_estimators": [100, 200],
        "estimator__colsample_bytree": [0.5, 0.7],
    }
]

selected_features_xgb_multi, selected_params_xgb_multi_full = xgb_multi_features(
    X=X,
    y=y,
    features=features,
    num_class=len(classes),
    search_space=xgb_search_space,
    objective="multi:softmax",
    cv_splits=5,
    min_features_to_select=1,
    GS_score="f1_macro",
    GS_randomized=True,
    GS_n_iter=50,
    verbose=10,
)

print(
    f"Number of features selected in RF multiclass: {len(selected_features_xgb_multi)}"
)
print(f"Selected features RF multiclass: {selected_features_xgb_multi}")
print(f"Selected Parameters in RF multiclass: {selected_params_xgb_multi_full}")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV 1/5; 1/50] START estimator__colsample_bytree=0.7, estimator__gamma=2, estimator__learning_rate=0.1, estimator__max_depth=8, estimator__n_estimators=200, estimator__reg_lambda=0.1
[CV 1/5; 1/50] END estimator__colsample_bytree=0.7, estimator__gamma=2, estimator__learning_rate=0.1, estimator__max_depth=8, estimator__n_estimators=200, estimator__reg_lambda=0.1;, score=(train=0.360, test=0.336) total time= 3.6min
[CV 2/5; 1/50] START estimator__colsample_bytree=0.7, estimator__gamma=2, estimator__learning_rate=0.1, estimator__max_depth=8, estimator__n_estimators=200, estimator__reg_lambda=0.1
[CV 2/5; 1/50] END estimator__colsample_bytree=0.7, estimator__gamma=2, estimator__learning_rate=0.1, estimator__max_depth=8, estimator__n_estimators=200, estimator__reg_lambda=0.1;, score=(train=0.375, test=0.315) total time= 3.6min
[CV 3/5; 1/50] START estimator__colsample_bytree=0.7, estimator__gamma=2, estimator__learning_rate=0.1, 

### Obtaining model performance

In [None]:
# Setting the selected features for XGB
selected_features_xgb_multi = [
    "rice_area",
    "mean_slope",
    "mean_elevation_m",
    "ruggedness_stdev",
    "mean_ruggedness",
    "slope_stdev",
    "area_km2",
    "poverty_perc",
    "with_coast",
    "coast_length",
    "perimeter",
    "glat",
    "glon",
    "coast_peri_ratio",
    "rainfall_sum",
    "rainfall_max",
    "dis_track_min",
    "vmax_sust",
]


In [None]:
# Setting the XGBoost search grid
xgb_search_space = [
    {
        "xgb__learning_rate": [0.1, 0.5, 1],
        "xgb__gamma": [0.1, 0.5, 2],
        "xgb__max_depth": [6, 8],
        "xgb__reg_lambda": [0.001, 0.1, 1],
        "xgb__n_estimators": [100, 200],
        "xgb__colsample_bytree": [0.5, 0.7],
    }
]

df_predicted_xgb_multi, selected_params_xgb_multi = xgb_multi_performance(
    df_train_list=df_train_list,
    df_test_list=df_test_list,
    num_class=len(classes),
    features=selected_features_xgb_multi,
    search_space=xgb_search_space,
    stratK=True,
    cv_splits=5,
    objective="multi:softmax",
    GS_score="f1_macro",
    GS_randomized=True,
    GS_n_iter=50,
    verbose=10,
)


In [None]:
# Saving the results
file_name = "IBF_typhoon_model\\models\\output\\02\\selected_params_xgb_multi.p"
path = os.path.join(cdir, file_name)
pickle.dump(selected_params_xgb_multi, open(path, "wb"))

file_name = "IBF_typhoon_model\\models\\output\\02\\df_predicted_xgb_multi.csv"
path = os.path.join(cdir, file_name)
df_predicted_xgb_multi.to_csv(path)


## Benchmark

In [78]:
# Random unweighted predictions
df_predicted_random_multi = pd.DataFrame(columns=["year", "actual", "predicted"])

for i in range(len(df_train_list)):

    train = df_train_list[i]
    test = df_test_list[i]

    y_train = train["class_value_multi"]
    y_test = test["class_value_multi"]

    y_pred_test = unweighted_random(y_train, y_test)
    df_predicted_temp = pd.DataFrame(
        {"year": test["year"], "actual": y_test, "predicted": y_pred_test}
    )

    df_predicted_random_multi = pd.concat(
        [df_predicted_random_multi, df_predicted_temp]
    )


In [79]:
# Random Weighted Predictions
df_predicted_random_weighted_multi = pd.DataFrame(
    columns=["year", "actual", "predicted"]
)
for i in range(len(df_train_list)):

    train = df_train_list[i]
    test = df_test_list[i]

    y_train = train["class_value_multi"]
    y_test = test["class_value_multi"]

    y_pred_test = weighted_random(y_train, y_test)
    df_predicted_temp = pd.DataFrame(
        {"year": test["year"], "actual": y_test, "predicted": y_pred_test}
    )

    df_predicted_random_weighted_multi = pd.concat(
        [df_predicted_random_weighted_multi, df_predicted_temp]
    )



## Results

In [82]:
models = {
    "Random Forest": df_predicted_rf_multi,
    # "XGBoost": df_predicted_xgb_multi,
    "Random": df_predicted_random_multi,
    "Weighted Random": df_predicted_random_weighted_multi,
}

f1 = []
precision = []
recall = []

# add 'list' if error
for df_temp in models.values():
    f1.append(f1_score(df_temp["actual"], df_temp["predicted"], average="macro"))
    precision.append(precision_score(df_temp["actual"], df_temp["predicted"], average="macro"))
    recall.append(recall_score(df_temp["actual"], df_temp["predicted"], average="macro"))

df_results_multi = pd.DataFrame(
    {"Models": list(models.keys()), "F1 score": f1, "Recall": recall, "Precision": precision}
)

display(df_results_multi)


Unnamed: 0,Models,F1 score,Recall,Precision
0,Random Forest,0.409386,0.410419,0.409806
1,Random,0.298415,0.330347,0.329676
2,Weighted Random,0.335421,0.340306,0.339631


## Training the optimal model

In [34]:
### Training the optimal model
rf = RandomForestClassifier(
    class_weight="balanced",
    n_estimators=50,
    max_depth=None,
    min_samples_leaf=3,
    min_samples_split=15,
)

selected_features_rf_multi = [
    "mean_slope",
    "mean_elevation_m",
    "ruggedness_stdev",
    "mean_ruggedness",
    "slope_stdev",
    "area_km2",
    "poverty_perc",
    "perimeter",
    "glat",
    "glon",
    "rainfall_max_6h",
    "rainfall_max_24h",
    "dis_track_min",
    "vmax",
]

rf_fitted = rf.fit(X[selected_features_rf_multi], y)

file_name = "IBF_typhoon_model\\models\\saved_models\\trained_multi_rf.sav"
path = os.path.join(cdir, file_name)
pickle.dump(rf_fitted, open(path, "wb"))




# Regression

This sections contains the Regression models that are trained and tested to obtain the optimal model, hyperparameter settings and features. First the model is trained on the full dataset to obtain the optimal features followed by a model that obtains the performance estimate using Nested Cross Validation.


- Performance metrics
- Nested Cross Validation
- Benchmark Models
- Main finding

In [None]:
# Full dataset for feature selection
df_regr = df[df['perc_loss'].notnull()]

X = df_regr[features]
y = df_regr["perc_loss"]

# Setting the train and the test sets for obtaining performance estimate
df_train_list, df_test_list = splitting_train_test(df_regr)


## Random Forest

### Training the optimal model

Number of selected features RF Regression: 12

Selected features RF Regression:
- mean_slope
- mean_elevation_m
- ruggedness_stdev
- mean_ruggedness
- area_km2
- coast_length
- poverty_perc
- perimeter
- glat
- glon
- coast_peri_ratio
- rainfall_max_6h
- rainfall_max_24h
- dis_track_min
- vmax


Selected Parameters RF Regression: 
- max_depth = 20
- min_samples_leaf = 1
- min_samples_split = 8
- n_estimators = 100



In [26]:
#%% Setting input varialbes
rf_search_space = [
    {
        "estimator__n_estimators": [100, 250],
        "estimator__max_depth": [20, None],
        "estimator__min_samples_split": [2, 8, 10],
        "estimator__min_samples_leaf": [1, 3, 5],
    }
]

(
    selected_features_rf_regr,
    selected_params_rf_regr_full,
) = rf_regression_features(
    X=X,
    y=y,
    features=features,
    search_space=rf_search_space,
    min_features_to_select=1,
    cv_splits=5,
    GS_score="neg_root_mean_squared_error",
    GS_randomized=False,
    GS_n_iter=10,
    verbose=10,
)

print(
    f"Number of selected features RF Regression {len(selected_features_rf_regr)}"
)
print(f"Selected features RF Regression: {selected_features_rf_regr}")
print(f"Selected Parameters RF Regression: {selected_params_rf_regr_full}")


Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV 1/5; 1/36] START estimator__max_depth=20, estimator__min_samples_leaf=1, estimator__min_samples_split=2, estimator__n_estimators=100
[CV 1/5; 1/36] END estimator__max_depth=20, estimator__min_samples_leaf=1, estimator__min_samples_split=2, estimator__n_estimators=100;, score=(train=-0.123, test=-0.324) total time= 1.9min
[CV 2/5; 1/36] START estimator__max_depth=20, estimator__min_samples_leaf=1, estimator__min_samples_split=2, estimator__n_estimators=100
[CV 2/5; 1/36] END estimator__max_depth=20, estimator__min_samples_leaf=1, estimator__min_samples_split=2, estimator__n_estimators=100;, score=(train=-0.121, test=-0.307) total time= 2.2min
[CV 3/5; 1/36] START estimator__max_depth=20, estimator__min_samples_leaf=1, estimator__min_samples_split=2, estimator__n_estimators=100
[CV 3/5; 1/36] END estimator__max_depth=20, estimator__min_samples_leaf=1, estimator__min_samples_split=2, estimator__n_estimators=100;, score=(tra

### Obtaining the performance estimate

In [None]:
# Based on output previous cell
selected_features_rf_regr = [
    "rice_area",
    "mean_slope",
    "mean_elevation_m",
    "ruggedness_stdev",
    "mean_ruggedness",
    "slope_stdev",
    "area_km2",
    "poverty_perc",
    "with_coast",
    "coast_length",
    "perimeter",
    "glat",
    "glon",
    "coast_peri_ratio",
    "rainfall_sum",
    "rainfall_max",
    "dis_track_min",
    "vmax_sust",
]


In [27]:
#%% Setting input varialbes
rf_search_space = [
    {
        "rf__n_estimators": [100, 250],
        "rf__max_depth": [20, None],
        "rf__min_samples_split": [2, 8, 10],
        "rf__min_samples_leaf": [1, 3, 5],
    }
]

df_predicted_rf_regr, selected_params_rf_regr = rf_regression_performance(
    df_train_list=df_train_list,
    df_test_list=df_test_list,
    features=selected_features_rf_regr,
    search_space=rf_search_space,
    cv_splits=5,
    GS_score="neg_root_mean_squared_error",
    GS_randomized=False,
    GS_n_iter=10,
    verbose=10,
)


Running for 1 out of a total of 5
Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV 1/5; 1/36] START rf__max_depth=20, rf__min_samples_leaf=1, rf__min_samples_split=2, rf__n_estimators=100
[CV 1/5; 1/36] END rf__max_depth=20, rf__min_samples_leaf=1, rf__min_samples_split=2, rf__n_estimators=100;, score=(train=-0.127, test=-0.333) total time=   2.1s
[CV 2/5; 1/36] START rf__max_depth=20, rf__min_samples_leaf=1, rf__min_samples_split=2, rf__n_estimators=100
[CV 2/5; 1/36] END rf__max_depth=20, rf__min_samples_leaf=1, rf__min_samples_split=2, rf__n_estimators=100;, score=(train=-0.126, test=-0.326) total time=   2.1s
[CV 3/5; 1/36] START rf__max_depth=20, rf__min_samples_leaf=1, rf__min_samples_split=2, rf__n_estimators=100
[CV 3/5; 1/36] END rf__max_depth=20, rf__min_samples_leaf=1, rf__min_samples_split=2, rf__n_estimators=100;, score=(train=-0.124, test=-0.342) total time=   2.2s
[CV 4/5; 1/36] START rf__max_depth=20, rf__min_samples_leaf=1, rf__min_samples_split=2, rf_

In [30]:
file_name = "IBF_typhoon_model\\models\\output\\02\\selected_params_rf_regr.p"
path = os.path.join(cdir, file_name)
pickle.dump(selected_params_rf_regr, open(path, "wb"))

file_name = "IBF_typhoon_model\\models\\output\\02\\df_predicted_rf_regr.csv"
path = os.path.join(cdir, file_name)
df_predicted_rf_regr.to_csv(path)




## XGBoost Regression

### Obtaining the optimal model

In [None]:
xgb_search_space = [
    {
        "estimator__learning_rate": [0.1, 0.5, 1],
        "estimator__gamma": [0.1, 0.5, 2],
        "estimator__max_depth": [6, 8],
        "estimator__reg_lambda": [0.001, 0.1, 1],
        "estimator__n_estimators": [100, 200],
        "estimator__colsample_bytree": [0.5, 0.7],
    }
]

selected_features_xgb_regr, selected_params_xgb_regr_full = xgb_regression_features(
    X=X,
    y=y,
    features=features,
    search_space=xgb_search_space,
    min_features_to_select=1,
    cv_splits=5,
    GS_score="neg_root_mean_squared_error",
    objective='"reg:squarederror"',
    GS_randomized=True,
    GS_n_iter=50,
    verbose=10,
)


print(f"Number of selected features XGBoost Regression {len(selected_features_xgb_regr)}")
print(f"Selected features XGBoost Regression: {selected_features_xgb_regr}")
print(f"Selected Parameters XGBoost Regression: {selected_params_xgb_regr_full}")



### Obtaining the performance estimate

In [None]:
# Setting the selected features for XGB
selected_features_xgb_regr = [
    'rice_area', 
    'mean_slope', 
    'mean_elevation_m', 
    'ruggedness_stdev', 
    'mean_ruggedness', 
    'slope_stdev', 
    'area_km2', 
    'poverty_perc', 
    'with_coast', 
    'coast_length', 
    'perimeter', 
    'glat', 
    'glon', 
    'coast_peri_ratio', 
    'rainfall_sum', 
    'rainfall_max', 
    'dis_track_min', 
    'vmax_sust'
]

In [None]:
xgb_search_space = [
    {
        "xgb__learning_rate": [0.1, 0.5, 1],
        "xgb__gamma": [0.1, 0.5, 2],
        "xgb__max_depth": [6, 8],
        "xgb__reg_lambda": [0.001, 0.1, 1],
        "xgb__n_estimators": [100, 200],
        "xgb__colsample_bytree": [0.5, 0.7],
    }
]

df_predicted_xgb_regr, selected_params_xgb_regr = xgb_regression_performance(
    df_train_list=df_train_list,
    df_test_list=df_test_list,
    features=selected_features_xgb_regr,
    search_space=xgb_search_space,
    cv_splits=5,
    objective="reg:squarederror",
    GS_score="neg_root_mean_squared_error",
    GS_randomized=True,
    GS_n_iter=50,
    verbose=10,
)


In [None]:
file_name = "IBF_typhoon_model\\models\\output\\02\\selected_params_xgb_regr.p"
path = os.path.join(cdir, file_name)
pickle.dump(selected_params_xgb_regr, open(path, "wb"))

file_name = "IBF_typhoon_model\\models\\output\\02\\df_predicted_xgb_regr.csv"
path = os.path.join(cdir, file_name)
df_predicted_xgb_regr.to_csv(path)


## Benchmark

In [97]:
# Predict the average
df_predicted_mean = pd.DataFrame(columns=["year", "actual", "predicted"])

for i in range(len(df_train_list)):

    train = df_train_list[i]
    test = df_test_list[i]

    y_train = train["perc_loss"]
    y_test = test["perc_loss"]

    y_test_pred = [np.mean(y_train)] * len(y_test)

    df_predicted_temp = pd.DataFrame(
        {"year": test["year"], "actual": y_test, "predicted": y_test_pred}
    )

    df_predicted_mean = pd.concat([df_predicted_mean, df_predicted_temp])



In [119]:
# Simle Linear Regression with Wind Speed
input_variable = "vmax"
df_predicted_lr = pd.DataFrame(columns=["year", "actual", "predicted"])

for i in range(len(df_train_list)):

    train = df_train_list[i]
    test = df_test_list[i]

    x_train = train[input_variable].values.reshape(-1, 1)
    y_train = train["perc_loss"].values.reshape(-1, 1)

    x_test = test[input_variable].values.reshape(-1, 1)
    y_test = test["perc_loss"]

    model = LinearRegression()
    lr_fitted = model.fit(x_train, y_train)

    y_pred_train = lr_fitted.predict(x_train)
    y_pred_test = lr_fitted.predict(x_test)
    y_pred_test = y_pred_test.tolist()
    y_pred_test = [val for sublist in y_pred_test for val in sublist]

    df_predicted_temp = pd.DataFrame(
        {"year": test["year"], "actual": y_test, "predicted": y_pred_test}
    )

    df_predicted_lr = pd.concat([df_predicted_lr, df_predicted_temp])


## Results

In [121]:
models = {
    "Random Forest": df_predicted_rf_regr,
    # "XGBoost": df_predicted_xgb_regr,
    "Average": df_predicted_mean,
    "Simple Linear Regression": df_predicted_lr,
}

mae = []
rmse = []

# add 'list' if error
for df_temp in models.values():
    mae.append(mean_absolute_error(df_temp["actual"], df_temp["predicted"]))
    rmse.append(mean_squared_error(df_temp["actual"], df_temp["predicted"], squared=False))

df_results_regr = pd.DataFrame({"Models": list(models.keys()), "MAE": mae, "RMSE": rmse})

display(df_results_regr)


Unnamed: 0,Models,MAE,RMSE
0,Random Forest,0.281097,0.334649
1,Average,0.31881,0.354945
2,Simple Linear Regression,0.292208,0.33328


## Training the optimal model

In [41]:
rf = RandomForestRegressor(
    max_depth=20, min_samples_leaf=1, min_samples_split=8, n_estimators=100,
)

selected_features_rf_regr = [
    "mean_slope",
    "mean_elevation_m",
    "ruggedness_stdev",
    "mean_ruggedness",
    "area_km2",
    "poverty_perc",
    "coast_length",
    "perimeter",
    "glat",
    "glon",
    "coast_peri_ratio",
    "rainfall_max_6h",
    "rainfall_max_24h",
    "dis_track_min",
    "vmax",
]

rf_fitted = rf.fit(X[selected_features_rf_regr], y)

file_name = "IBF_typhoon_model\\models\\saved_models\\trained_regr_rf.sav"
path = os.path.join(cdir, file_name)
pickle.dump(rf_fitted, open(path, "wb"))


