In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
import matplotlib.pyplot as plt

In [2]:
# Load the region-by-year tree-cover-loss table; the relative path is resolved
# against the kernel's current working directory.
# NOTE(review): the rendered frame shows an "Unnamed: 0" column, which suggests the
# CSV was written together with its index — consider index_col=0 if it is not needed.
df = pd.read_csv(
    filepath_or_buffer="../data/Wild_Fire/Regions with most tree cover loss due to fires in Thailand/treecover_loss_from_fires_by_region__ha.csv",
)

In [3]:
# Inspect the loaded frame; pandas abbreviates the middle rows in the rendered output.
df

Unnamed: 0,iso,adm1,umd_tree_cover_loss__year,umd_tree_cover_loss__ha,umd_tree_cover_loss_from_fires__ha
0,THA,1,2001,774.644388,2.294186
1,THA,2,2001,3.054396,0.074540
2,THA,3,2001,16.749405,0.747823
3,THA,4,2001,308.611957,0.585485
4,THA,5,2001,46.036478,1.485205
...,...,...,...,...,...
1763,THA,73,2023,349.300848,0.660672
1764,THA,74,2023,174.385196,0.370912
1765,THA,75,2023,779.747852,1.979667
1766,THA,76,2023,4068.073065,1.224001


In [5]:
# Encode the region identifier for the model in two ordered steps:
#   1. cast `adm1` to a pandas categorical,
#   2. expose its integer category codes as `adm1_cat`.
# `assign` evaluates keyword arguments in order, so the second callable already
# sees the categorical `adm1` produced by the first. `df` is rebound to the
# resulting frame.
df = df.assign(
    adm1=lambda frame: frame["adm1"].astype("category"),
    adm1_cat=lambda frame: frame["adm1"].cat.codes,
)

# Show the frame with the new `adm1_cat` column appended.
df

Unnamed: 0,iso,adm1,umd_tree_cover_loss__year,umd_tree_cover_loss__ha,umd_tree_cover_loss_from_fires__ha,adm1_cat
0,THA,1,2001,774.644388,2.294186,0
1,THA,2,2001,3.054396,0.074540,1
2,THA,3,2001,16.749405,0.747823,2
3,THA,4,2001,308.611957,0.585485,3
4,THA,5,2001,46.036478,1.485205,4
...,...,...,...,...,...,...
1763,THA,73,2023,349.300848,0.660672,72
1764,THA,74,2023,174.385196,0.370912,73
1765,THA,75,2023,779.747852,1.979667,74
1766,THA,76,2023,4068.073065,1.224001,75


In [6]:
# Column the model is asked to predict.
target = "umd_tree_cover_loss__ha"

# Predictor columns: loss year, integer-coded region, and the fire-attributed loss.
features = [
    "umd_tree_cover_loss__year",
    "adm1_cat",
    "umd_tree_cover_loss_from_fires__ha",
]

# Response vector and design matrix, both selected from the same frame.
y = df[target]
X = df[features]


In [9]:
# Baseline gradient-boosting regressor with default hyperparameters and a fixed seed.
model = GradientBoostingRegressor(random_state=42)

# Hold out 20% of the rows as a test set; the seed fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [10]:
# 5-fold cross-validation of the baseline model on the training split only.
# The "neg_mean_squared_error" scorer reports MSE with its sign flipped, so the
# printed values are negative and values closer to zero are better.
cv_scores = cross_val_score(
    estimator=model,
    X=X_train,
    y=y_train,
    scoring="neg_mean_squared_error",
    cv=5,
)
print(f"Cross-Validation Scores (5 folds): {cv_scores}")
print(f"Mean CV Score: {cv_scores.mean():.4f}")

Cross-Validation Scores (5 folds): [-3947091.47103087 -1113923.41771056 -1935290.41942052 -2144273.93791946
 -1826322.43076314]
Mean CV Score: -2193380.3354


In [11]:
# Hyperparameter search space: three candidate values for each of five settings,
# i.e. 3**5 = 243 combinations in total.
param_grid = {
    'n_estimators': [50, 100, 200],      # how many boosting stages to fit
    'learning_rate': [0.01, 0.1, 0.2],   # shrinkage applied to each stage
    'max_depth': [3, 4, 5],              # depth limit of each individual tree
    'min_samples_split': [2, 5, 10],     # samples required before a node may split
    'min_samples_leaf': [1, 2, 4],       # samples required at every leaf
}

# Exhaustive 5-fold search over the grid, scored on negated MSE (same scorer as the
# baseline cross-validation). A fresh estimator with the same seed is used here.
grid_search = GridSearchCV(
    estimator=GradientBoostingRegressor(random_state=42),
    param_grid=param_grid,
    scoring="neg_mean_squared_error",
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Report the winning combination and its mean cross-validated score.
print(f"Best Hyperparameters: {grid_search.best_params_}")
print(f"Best Cross-Validation Score: {grid_search.best_score_:.4f}")

Fitting 5 folds for each of 243 candidates, totalling 1215 fits
Best Hyperparameters: {'learning_rate': 0.2, 'max_depth': 4, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 200}
Best Cross-Validation Score: -1241752.1414
