In [15]:
import os

import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

from skopt import BayesSearchCV
from skopt.space import Integer, Categorical

In [16]:
# Directory holding the merged train/test parquet files (relative path, resolved
# against the notebook's working directory).
merged_dir = "../../../data/merged"
# Not referenced by the cells visible here — presumably the output directory for
# decision-tree figures; confirm before relying on it.
image_dir = "../../../images/DT"

# Original training data

In [17]:
# Resolve both parquet locations first, then read the two splits.
train_file_path = os.path.join(merged_dir, "train_data.parquet")
test_file_path = os.path.join(merged_dir, "test_data.parquet")

train_data = pd.read_parquet(train_file_path)
test_data = pd.read_parquet(test_file_path)

In [18]:
# Columns kept out of the feature matrix: the 'fire' target and the two coordinates.
non_feature_cols = ['fire', 'longitude', 'latitude']

# Feature matrices (everything except the excluded columns) as NumPy arrays.
X_train = train_data.drop(columns=non_feature_cols).to_numpy()
X_test = test_data.drop(columns=non_feature_cols).to_numpy()

# Target vectors taken from the 'fire' column.
y_train = train_data['fire'].to_numpy()
y_test = test_data['fire'].to_numpy()

In [19]:
# Hyper-parameter ranges explored by the Bayesian optimiser.
dt_space = dict(
    max_depth=Integer(3, 30),
    min_samples_split=Integer(2, 20),
    min_samples_leaf=Integer(1, 10),
    criterion=Categorical(["gini", "entropy"]),
)

# Base estimator: balanced class weights, fixed seed.
dt = DecisionTreeClassifier(class_weight="balanced", random_state=42)

# 50 sampled configurations, each scored by macro-F1 with 5-fold CV on the training set.
dt_bayes = BayesSearchCV(
    estimator=dt,
    search_spaces=dt_space,
    scoring="f1_macro",
    cv=5,
    n_iter=50,
    return_train_score=True,
    random_state=42,
    n_jobs=-1,
    verbose=2,
)
dt_bayes.fit(X_train, y_train)

# Winning configuration and its mean cross-validated score.
print("Best DT params:", dt_bayes.best_params_)
print("Best CV f1_macro:", dt_bayes.best_score_)

# Predict on the held-out test set with the fitted search object and score it.
y_pred_dt = dt_bayes.predict(X_test)
test_f1_macro = f1_score(y_test, y_pred_dt, average="macro")
print("Test f1_macro:", test_f1_macro)

# Rank every evaluated configuration (best mean CV score first) and save it.
# NOTE(review): the SMOTE-Tomek search cell further down writes to this same
# "dt_bayes_results.csv", so on Run All this file ends up holding that run's
# results; a run-specific copy is written by a later cell.
results_dt = pd.DataFrame(dt_bayes.cv_results_)
results_dt = results_dt.sort_values(by="mean_test_score", ascending=False)
results_dt.to_csv("dt_bayes_results.csv", index=False)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi



Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi

In [20]:
# Per-class precision / recall / F1 on the test set for the original-data model.
# classification_report lays out its rows in sorted label order, and target_names
# is matched to those rows positionally. Iterating a bare set has no ordering
# guarantee, so the labels are sorted explicitly to keep each name on its class.
class_names = [str(label) for label in sorted(set(y_test))]
report = classification_report(y_test, y_pred_dt, target_names=class_names, digits=4)
print(report)

              precision    recall  f1-score   support

           0     0.9719    0.9504    0.9610      7576
           1     0.6422    0.7644    0.6980       883

    accuracy                         0.9310      8459
   macro avg     0.8071    0.8574    0.8295      8459
weighted avg     0.9375    0.9310    0.9336      8459



In [21]:
# All evaluated configurations as a table, ordered by mean CV score (best first).
results_dt = pd.DataFrame(dt_bayes.cv_results_).sort_values(
    by="mean_test_score", ascending=False
)

# Score statistics plus the four tuned hyper-parameters.
summary_columns = [
    "mean_test_score",
    "std_test_score",
    "param_max_depth",
    "param_min_samples_split",
    "param_min_samples_leaf",
    "param_criterion",
]

# Print the five best configurations.
print(results_dt[summary_columns].head())


    mean_test_score  std_test_score  param_max_depth  param_min_samples_split  \
29         0.822016        0.004197               29                        2   
21         0.821311        0.002833               30                        2   
10         0.819900        0.008531               30                        2   
16         0.818431        0.008856               28                        2   
23         0.818290        0.006884               30                        3   

    param_min_samples_leaf param_criterion  
29                       1            gini  
21                       1            gini  
10                       1         entropy  
16                       1         entropy  
23                       1         entropy  


In [22]:
# Save the ranked CV results of the original-data run under a run-specific name.
original_results_csv = "dt_bayes_original_results.csv"
results_dt.to_csv(original_results_csv, index=False)

# SMOTE-Tomek resampled training data

In [23]:
# Training split comes from the SMOTE-Tomek parquet; the test split is read from
# the same "test_data.parquet" file used for the original-data run.
train_file_path = os.path.join(merged_dir, "train_smote_tomek.parquet")
test_file_path = os.path.join(merged_dir, "test_data.parquet")

train_data = pd.read_parquet(train_file_path)
test_data = pd.read_parquet(test_file_path)

In [24]:
# Columns kept out of the feature matrix: the 'fire' target and the two coordinates.
non_feature_cols = ['fire', 'longitude', 'latitude']

# Feature matrices (everything except the excluded columns) as NumPy arrays.
X_train = train_data.drop(columns=non_feature_cols).to_numpy()
X_test = test_data.drop(columns=non_feature_cols).to_numpy()

# Target vectors taken from the 'fire' column.
y_train = train_data['fire'].to_numpy()
y_test = test_data['fire'].to_numpy()

In [25]:
# Hyper-parameter ranges explored by the Bayesian optimiser.
dt_space = dict(
    max_depth=Integer(3, 30),
    min_samples_split=Integer(2, 20),
    min_samples_leaf=Integer(1, 10),
    criterion=Categorical(["gini", "entropy"]),
)

# Base estimator: balanced class weights, fixed seed.
dt = DecisionTreeClassifier(class_weight="balanced", random_state=42)

# 50 sampled configurations, each scored by macro-F1 with 5-fold CV on the training set.
# NOTE(review): X_train here is read from "train_smote_tomek.parquet", i.e. the
# folds are cut from data that was presumably resampled before this notebook ran.
# If so, validation folds contain resampled rows and the CV score is optimistic
# compared with the untouched test set — consider resampling inside each fold
# (e.g. an imbalanced-learn Pipeline) before comparing CV and test numbers.
dt_bayes = BayesSearchCV(
    estimator=dt,
    search_spaces=dt_space,
    scoring="f1_macro",
    cv=5,
    n_iter=50,
    return_train_score=True,
    random_state=42,
    n_jobs=-1,
    verbose=2,
)
dt_bayes.fit(X_train, y_train)

# Winning configuration and its mean cross-validated score.
print("Best DT params:", dt_bayes.best_params_)
print("Best CV f1_macro:", dt_bayes.best_score_)

# Predict on the held-out test set with the fitted search object and score it.
y_pred_dt = dt_bayes.predict(X_test)
test_f1_macro = f1_score(y_test, y_pred_dt, average="macro")
print("Test f1_macro:", test_f1_macro)

# Rank every evaluated configuration (best mean CV score first) and save it.
# NOTE(review): the original-data search cell writes to this same
# "dt_bayes_results.csv", so this call overwrites that run's file; a
# run-specific copy is written by a later cell.
results_dt = pd.DataFrame(dt_bayes.cv_results_)
results_dt = results_dt.sort_values(by="mean_test_score", ascending=False)
results_dt.to_csv("dt_bayes_results.csv", index=False)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi

In [26]:
# Per-class precision / recall / F1 on the test set for the SMOTE-Tomek model.
# classification_report lays out its rows in sorted label order, and target_names
# is matched to those rows positionally. Iterating a bare set has no ordering
# guarantee, so the labels are sorted explicitly to keep each name on its class.
class_names = [str(label) for label in sorted(set(y_test))]
report = classification_report(y_test, y_pred_dt, target_names=class_names, digits=4)
print(report)

              precision    recall  f1-score   support

           0     0.9677    0.9538    0.9607      7576
           1     0.6472    0.7271    0.6848       883

    accuracy                         0.9301      8459
   macro avg     0.8075    0.8404    0.8228      8459
weighted avg     0.9343    0.9301    0.9319      8459



In [27]:
# All evaluated configurations as a table, ordered by mean CV score (best first).
results_dt = pd.DataFrame(dt_bayes.cv_results_).sort_values(
    by="mean_test_score", ascending=False
)

# Score statistics plus the four tuned hyper-parameters.
summary_columns = [
    "mean_test_score",
    "std_test_score",
    "param_max_depth",
    "param_min_samples_split",
    "param_min_samples_leaf",
    "param_criterion",
]

# Print the five best configurations.
print(results_dt[summary_columns].head())


    mean_test_score  std_test_score  param_max_depth  param_min_samples_split  \
10         0.957393        0.010655               30                        2   
23         0.957211        0.010230               28                        2   
29         0.957144        0.010141               27                        2   
27         0.957110        0.010269               29                        2   
21         0.956548        0.009506               30                        2   

    param_min_samples_leaf param_criterion  
10                       1         entropy  
23                       1         entropy  
29                       1         entropy  
27                       1         entropy  
21                       1            gini  


In [28]:
# Save the ranked CV results of the SMOTE-Tomek run under a run-specific name.
smote_tomek_results_csv = "dt_bayes_sTomek_results.csv"
results_dt.to_csv(smote_tomek_results_csv, index=False)