In [1]:
from skopt import BayesSearchCV
from skopt.space import Integer, Categorical
import pandas as pd
import os

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import f1_score

In [2]:
# Directory with the merged parquet datasets that the loading cells read from.
merged_dir = "../../../data/merged"
# Not referenced in the cells shown here; the "DT" suffix presumably comes from
# a decision-tree notebook — TODO confirm whether this path is still needed.
image_dir = "../../../images/DT"

# Original data

In [3]:
# Resolve both parquet locations first, then read the train/test splits
# of the original dataset.
train_file_path = os.path.join(merged_dir, "train_data.parquet")
test_file_path = os.path.join(merged_dir, "test_data.parquet")

train_data = pd.read_parquet(train_file_path)
test_data = pd.read_parquet(test_file_path)

In [4]:
# Separate the target from the predictors for both splits.
# 'fire' is the label; the coordinate columns are left out of the features.
y_train = train_data['fire'].to_numpy()
X_train = train_data.drop(['fire', 'longitude', 'latitude'], axis=1).to_numpy()

y_test = test_data['fire'].to_numpy()
X_test = test_data.drop(['fire', 'longitude', 'latitude'], axis=1).to_numpy()

In [5]:
# Random forest tuned on the original training data.
# class_weight="balanced" makes the forest weight classes inversely to their
# frequency in the training labels.
rf = RandomForestClassifier(
    random_state=42,
    class_weight="balanced",
)

# Hyper-parameter ranges explored by the Bayesian optimiser.
rf_space = {
    "n_estimators": Integer(100, 300),
    "max_depth": Integer(5, 25),
    "min_samples_split": Integer(2, 20),
    "min_samples_leaf": Integer(1, 10),
    "max_features": Categorical(["sqrt", "log2"]),
}

# 50 candidates, each scored with 5-fold cross-validation on macro-F1.
# return_train_score=True keeps the per-fold training scores in cv_results_.
rf_bayes = BayesSearchCV(
    estimator=rf,
    search_spaces=rf_space,
    n_iter=50,
    cv=5,
    scoring="f1_macro",
    random_state=42,
    n_jobs=-1,
    verbose=2,
    return_train_score=True,
)

# Run the search (this is the expensive step of the cell).
rf_bayes.fit(X_train, y_train)

# Best configuration found and its mean cross-validated score.
print("Best RF params:", rf_bayes.best_params_)
print("Best CV f1_macro:", rf_bayes.best_score_)

# Evaluate the best configuration on the held-out test split.
y_pred_rf = rf_bayes.predict(X_test)
print("Test f1_macro:", f1_score(y_test, y_pred_rf, average="macro"))

# Persist the full search history right after the expensive fit. Use an
# experiment-specific file name so that results obtained from different
# training sets never share (and silently overwrite) one CSV.
results_rf = pd.DataFrame(rf_bayes.cv_results_).sort_values(by="mean_test_score", ascending=False)
results_rf.to_csv("rf_bayes_original_results.csv", index=False)


Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi

In [6]:
# Generate classification report
from sklearn.metrics import classification_report

# classification_report lists classes in sorted label order, so the display
# names must come from a sorted sequence; iterating a bare set() carries no
# ordering guarantee. Labels are collected from both the ground truth and the
# predictions so the name list always matches the classes being reported.
class_labels = sorted(set(y_test) | set(y_pred_rf))
report = classification_report(
    y_test,
    y_pred_rf,
    labels=class_labels,
    target_names=[str(c) for c in class_labels],
    digits=4,
)
print(report)

              precision    recall  f1-score   support

           0     0.9711    0.9624    0.9667      7576
           1     0.7003    0.7542    0.7263       883

    accuracy                         0.9407      8459
   macro avg     0.8357    0.8583    0.8465      8459
weighted avg     0.9428    0.9407    0.9416      8459



In [7]:
# Tabulate every configuration tried by the search, best mean CV score first.
results_rf = (
    pd.DataFrame(rf_bayes.cv_results_)
    .sort_values(by="mean_test_score", ascending=False)
)

# Preview the five best configurations: score summary plus tuned parameters.
print(
    results_rf.loc[
        :,
        [
            "mean_test_score",
            "std_test_score",
            "param_max_depth",
            "param_min_samples_split",
            "param_min_samples_leaf",
            "param_max_features",
            "param_n_estimators",
        ],
    ].head()
)


    mean_test_score  std_test_score  param_max_depth  param_min_samples_split  \
10         0.843122        0.003272               25                        2   
28         0.842858        0.003249               24                        2   
24         0.842769        0.003510               25                        2   
44         0.842710        0.002878               22                        2   
23         0.842543        0.003845               24                        2   

    param_min_samples_leaf param_max_features  param_n_estimators  
10                       1               sqrt                 300  
28                       1               sqrt                 300  
24                       1               log2                 300  
44                       1               log2                 154  
23                       1               log2                 107  


In [8]:
# Write the sorted search history for the original-data run to disk,
# leaving the DataFrame index out of the file.
results_rf.to_csv(path_or_buf="rf_bayes_original_results.csv", index=False)

# SMOTE-Tomek

In [9]:
# Resolve both parquet locations first, then read them. Only the training
# file differs from the original-data section; the test file is the same.
train_file_path = os.path.join(merged_dir, "train_smote_tomek.parquet")
test_file_path = os.path.join(merged_dir, "test_data.parquet")

train_data = pd.read_parquet(train_file_path)
test_data = pd.read_parquet(test_file_path)

In [10]:
# Separate the target from the predictors for both splits.
# 'fire' is the label; the coordinate columns are left out of the features.
y_train = train_data['fire'].to_numpy()
X_train = train_data.drop(['fire', 'longitude', 'latitude'], axis=1).to_numpy()

y_test = test_data['fire'].to_numpy()
X_test = test_data.drop(['fire', 'longitude', 'latitude'], axis=1).to_numpy()

In [11]:
# Random forest tuned on the SMOTE-Tomek training file loaded above.
# class_weight="balanced" makes the forest weight classes inversely to their
# frequency in the training labels.
rf = RandomForestClassifier(
    random_state=42,
    class_weight="balanced",
)

# Hyper-parameter ranges explored by the Bayesian optimiser.
rf_space = {
    "n_estimators": Integer(100, 300),
    "max_depth": Integer(5, 25),
    "min_samples_split": Integer(2, 20),
    "min_samples_leaf": Integer(1, 10),
    "max_features": Categorical(["sqrt", "log2"]),
}

# 50 candidates, each scored with 5-fold cross-validation on macro-F1.
# return_train_score=True keeps the per-fold training scores in cv_results_.
# NOTE(review): the CV folds are drawn from train_smote_tomek.parquet, which
# presumably was resampled before splitting into folds; if so, the CV score is
# optimistic relative to the untouched test split — confirm before comparing
# it with the score of the original-data run.
rf_bayes = BayesSearchCV(
    estimator=rf,
    search_spaces=rf_space,
    n_iter=50,
    cv=5,
    scoring="f1_macro",
    random_state=42,
    n_jobs=-1,
    verbose=2,
    return_train_score=True,
)

# Run the search (this is the expensive step of the cell).
rf_bayes.fit(X_train, y_train)

# Best configuration found and its mean cross-validated score.
print("Best RF params:", rf_bayes.best_params_)
print("Best CV f1_macro:", rf_bayes.best_score_)

# Evaluate the best configuration on the held-out test split.
y_pred_rf = rf_bayes.predict(X_test)
print("Test f1_macro:", f1_score(y_test, y_pred_rf, average="macro"))

# Persist the full search history right after the expensive fit. Use an
# experiment-specific file name so that results obtained from different
# training sets never share (and silently overwrite) one CSV.
results_rf = pd.DataFrame(rf_bayes.cv_results_).sort_values(by="mean_test_score", ascending=False)
results_rf.to_csv("rf_bayes_sTomek_results.csv", index=False)


Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi

In [12]:
# classification_report lists classes in sorted label order, so the display
# names must come from a sorted sequence; iterating a bare set() carries no
# ordering guarantee. Labels are collected from both the ground truth and the
# predictions so the name list always matches the classes being reported.
class_labels = sorted(set(y_test) | set(y_pred_rf))
report = classification_report(
    y_test,
    y_pred_rf,
    labels=class_labels,
    target_names=[str(c) for c in class_labels],
    digits=4,
)
print(report)

              precision    recall  f1-score   support

           0     0.9724    0.9568    0.9645      7576
           1     0.6743    0.7667    0.7175       883

    accuracy                         0.9370      8459
   macro avg     0.8233    0.8618    0.8410      8459
weighted avg     0.9413    0.9370    0.9388      8459



In [13]:
# Tabulate every configuration tried by the search, best mean CV score first.
results_rf = (
    pd.DataFrame(rf_bayes.cv_results_)
    .sort_values(by="mean_test_score", ascending=False)
)

# Preview the five best configurations: score summary plus tuned parameters.
print(
    results_rf.loc[
        :,
        [
            "mean_test_score",
            "std_test_score",
            "param_max_depth",
            "param_min_samples_split",
            "param_min_samples_leaf",
            "param_max_features",
            "param_n_estimators",
        ],
    ].head()
)


    mean_test_score  std_test_score  param_max_depth  param_min_samples_split  \
12         0.970060        0.010297               25                        2   
24         0.969661        0.010251               25                        2   
17         0.969544        0.010486               25                        2   
22         0.969444        0.010246               24                        2   
10         0.969045        0.010378               25                        3   

    param_min_samples_leaf param_max_features  param_n_estimators  
12                       1               sqrt                 300  
24                       1               log2                 300  
17                       1               log2                 203  
22                       1               sqrt                 300  
10                       1               sqrt                 177  


In [14]:
# Write the sorted search history for the SMOTE-Tomek run to disk,
# leaving the DataFrame index out of the file.
results_rf.to_csv(path_or_buf="rf_bayes_sTomek_results.csv", index=False)