In [1]:
# =============================
# Imports gerais
# =============================
import os
import math
import warnings
import pickle

import numpy as np
import pandas as pd
import geopandas as gpd

import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

import shap
import xgboost as xgb

from sklearn.metrics import (
    r2_score,
    mean_absolute_error,
    mean_squared_error
)
from sklearn.model_selection import (
    KFold,
    RepeatedKFold
)
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingRandomSearchCV

# Silence every warning category for the whole session (blanket "ignore" filter).
warnings.simplefilter("ignore")
# Seed NumPy's legacy global RNG so np.random.* calls are repeatable.
np.random.seed(seed=42)


In [2]:
# =============================
# Funções utilitárias
# =============================
def safe_log(y):
    """Signed log transform: ``sign(y) * log(1 + |y|)``, applied element-wise.

    The magnitude is compressed with ``log1p`` while the sign of the input is
    kept, so negative values and zero are accepted (zero maps to zero).
    """
    magnitude = np.log1p(np.abs(y))
    return np.sign(y) * magnitude

def inverse_safe_log(y_log):
    """Undo the signed log transform: ``sign(y_log) * (exp(|y_log|) - 1)``.

    Applied element-wise; the sign of the input is kept and zero maps to zero.
    """
    magnitude = np.expm1(np.abs(y_log))
    return np.sign(y_log) * magnitude


In [3]:
print("Loading dataset...")

# Relative path: it is resolved against the current working directory.
shp_path = '../../Data/Processed/PT-FireSprd_v2.1/L2_FireBehavior/PT-FireProg_v2.1_L2_model.shp'

try:
    df = gpd.read_file(shp_path)
except Exception as e:
    # Chain explicitly so the underlying read error is recorded as the cause
    # of the RuntimeError instead of only as implicit context.
    raise RuntimeError(f"Error loading dataset: {e}") from e
else:
    # Reached only when the read succeeded.
    print("Dataset loaded")


Loading dataset...
Dataset loaded


In [5]:
# Drop rows whose target ('ros_p') is missing and report how many were removed.
n_before = len(df)
df = df.dropna(subset=['ros_p'])
n_after = len(df)

print(f"Removed {n_before - n_after} rows with NaN in 'ros_p'")

# Cast the categorical features to pandas 'category' dtype (via str).
# NOTE(review): astype(str) turns missing values into the literal string 'nan',
# so they end up as a regular category level -- confirm this is intended.
cat_cols = ["fuel_model", "landform", "land_use", "Haines_av"]
for col in cat_cols:
    df[col] = df[col].astype(str).astype("category")

# Drop unused columns. Rebinding with errors='ignore' (instead of an in-place
# drop) keeps this cell re-runnable: a second execution no longer raises
# KeyError because 'ros_p_lg1' has already been removed.
df = df.drop(columns=['ros_p_lg1'], errors='ignore')

print("Columns disponíveis:")
for col in df.columns:
    print(col)


Removed 0 rows with NaN in 'ros_p'
Columns disponíveis:
ros_p
duration_p
elev_av
aspect_sin
aspect_cos
landform
land_use
1_3y_fir_p
3_8y_fir_p
8_ny_fir_p
fuel_model
f_load_av
sW_1m_av
sW_3m_av
sW_7_av
sW_28_av
sW_100_av
sW_289_av
t_2m_C_av
d_2m_C_av
rh_2m_av
VPD_Pa_av
sP_hPa_av
gp_m2s2_av
dfmc_av
HDW_av
Haines_av
FWI_12h_av
DC_12h_av
FFMC_12h_a
wv10_kh_av
wsin10_av
wcos10_av
wv100_k_av
wsin100_av
wcos100_av
Recirc
CircVar
t_950_av
t_850_av
t_700_av
t_500_av
t_300_av
rh_950_av
rh_850_av
rh_700_av
rh_500_av
rh_300_av
wv_950_av
wv_850_av
wv_700_av
wv_500_av
wv_300_av
wsi_950_av
wco_950_av
wsi_850_av
wco_850_av
wsi_700_av
wco_700_av
wsi_500_av
wco_500_av
wsi_300_av
wco_300_av
vwv_950_av
vwv_850_av
vwv_700_av
vwv_500_av
vwv_300_av
gp_950_av
gp_850_av
gp_700_av
gp_500_av
gp_300_av
gT_s_9_av
gT_9_8_av
gT_8_7_av
gT_7_5_av
gT_5_3_av
wSv_9_av
wSsin_9_av
wScos_9_av
wSv_7_av
wSsin_7_av
wScos_7_av
wSv_5_av
wSsin_5_av
wScos_5_av
wSv_1_av
wSsin_1_av
wScos_1_av
CBH_m_av
HigCC_p_av
LowCC_p_av
MidCC_p_a

In [7]:
# Predictors: every column except the target and the geometry column,
# kept in their original column order.
train_features = df.columns[~df.columns.isin(['ros_p', 'geometry'])].tolist()

X = df[train_features]

# Target: raw 'ros_p' values plus their safe_log-transformed counterpart.
y_raw = df['ros_p']
y = safe_log(y_raw)

print(f"N amostras: {len(y)}")

N amostras: 851
