Script Description
This script loads a pre-processed dataset, prepares the features and the target variable for predicting soil water content (SWCT_015, column `SWCT_1_015`), and tunes the hyperparameters of an Extreme Gradient Boosting (XGBoost) regression model with a grid search evaluated by grouped (Leave-One-Group-Out) cross-validation.

File Name: 02_01_Hyperparameter_Tuning.ipynb

Date: 2025

Created by: Rob Alamgir

Version: 1.0

References:

#### Import the relevant packages

In [1]:
import warnings

import matplotlib.cm as cm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from matplotlib.colors import Normalize
from plotly.subplots import make_subplots
from sklearn.base import clone
from sklearn.exceptions import FitFailedWarning
from sklearn.linear_model import LinearRegression
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV, LeaveOneGroupOut, ParameterGrid
from sklearn.pipeline import Pipeline
from tqdm import tqdm
from xgboost.sklearn import XGBRegressor
#help(XGBRegressor)
#help(RandomForestRegressor)

### Step 1: Load Data & prep the dataset

In [2]:
# Path to the pre-processed CSV. Going by their file names, the commented-out
# alternatives point at the summer-only and winter-only subsets.
data_path = "C:/Data_MSc_Thesis/Pre_Processed_Data/Pre_Processed_Data_All_Locations_Updated_6.csv"
#data_path = "C:/Data_MSc_Thesis/Pre_Processed_Data/Pre_Processed_Data_All_Locations_Updated_6_Summer_Data.csv"
#data_path = "C:/Data_MSc_Thesis/Pre_Processed_Data/Pre_Processed_Data_All_Locations_Updated_6_Winter_Data.csv"

# Read the file, parse 'Date' as a datetime and derive an integer ID per
# 'Source' value (pandas category code + 1).
Complete_Data = pd.read_csv(data_path).assign(
    Date=lambda frame: pd.to_datetime(frame['Date'], format='%Y-%m-%d'),
    Source_ID=lambda frame: frame['Source'].astype('category').cat.codes + 1,
)
print(f"Rows and columns before removing NaNs: {Complete_Data.shape}")

# Keep only rows with an observed target; missing feature values are left in place.
# The soil-unit column is stored as a pandas categorical.
filtered_df = (
    Complete_Data
    .dropna(subset=['SWCT_1_015'])
    .astype({'BOFEK_2020_Physical Units': 'category'})
)
print(f"Rows and columns after removing NaNs: {filtered_df.shape}")

# Columns used as model inputs
RS_GSD_Features = [
    'S1_Backscatter', 'S2_NDVI', 'S2_EVI', 'S2_NDMI',
    'STMP_1_015', 'ATMP_f', 'WTMP_f', 'WLEV_f', 'VPD_f',
    'NEE_CO2_kg_day_ha_DAv_NT', 'NEE_CH4_kg_day_ha_DAv_NT',
    'BD_0_5_values', 'Peat_Thickness_2022',
]

X = filtered_df[RS_GSD_Features]    # Feature matrix
# Target, grouping key for Leave-One-Group-Out, and observation dates
y, groups, dates = (filtered_df[column] for column in ('SWCT_1_015', 'Source_ID', 'Date'))

print(f"Features (X): {X.shape}, Target (y): {y.shape}, Groups: {groups.shape}, Date: {dates.shape}")

Rows and columns before removing NaNs: (36222, 109)
Rows and columns after removing NaNs: (8576, 109)
Features (X): (8576, 13), Target (y): (8576,), Groups: (8576,), Date: (8576,)


### Step 2: Split the dataset into training and testing sets

In [3]:
# Hold out the last 10% of the rows, in their current order, as the test set.
# NOTE(review): shuffle=False only gives a temporal split if the rows are
# already sorted by date -- confirm the ordering of the input file.
(X_train, X_test,
 y_train, y_test,
 groups_train, groups_test,
 dates_train, dates_test) = train_test_split(X, y, groups, dates, test_size=0.1, shuffle=False)

# Training partition: inner-join features and target on their index, then
# re-select the groups and dates on that same index.
X_train, y_train = X_train.align(y_train, join='inner', axis=0)
groups_train, dates_train = groups_train.loc[X_train.index], dates_train.loc[X_train.index]

# Test partition: same treatment.
X_test, y_test = X_test.align(y_test, join='inner', axis=0)
groups_test, dates_test = groups_test.loc[X_test.index], dates_test.loc[X_test.index]

# Every companion series must share the exact index of its feature matrix.
train_aligned = all(X_train.index.equals(part.index) for part in (y_train, groups_train, dates_train))
assert train_aligned, \
    "Rows in X_train, y_train, groups_train, and dates_train are misaligned!"

test_aligned = all(X_test.index.equals(part.index) for part in (y_test, groups_test, dates_test))
assert test_aligned, \
    "Rows in X_test, y_test, groups_test, and dates_test are misaligned!"

# Report the resulting shapes
print("Data shapes after splitting and alignment:")
print(f"Train set: X_train: {X_train.shape}, y_train: {y_train.shape}, groups_train: {groups_train.shape}, dates_train: {dates_train.shape}")
print(f"Test set: X_test: {X_test.shape}, y_test: {y_test.shape}, groups_test: {groups_test.shape}, dates_test: {dates_test.shape}")

Data shapes after splitting and alignment:
Train set: X_train: (7718, 13), y_train: (7718,), groups_train: (7718,), dates_train: (7718,)
Test set: X_test: (858, 13), y_test: (858,), groups_test: (858,), dates_test: (858,)


### Step 3: Define the model, hyperparameter grid and run the Grid Search

In [4]:
# Base learner with a fixed seed, wrapped in a single-step pipeline so that
# hyperparameters are addressed as "xgbregressor__<name>".
xgb = XGBRegressor(enable_categorical=True, random_state=0)
pipeline = Pipeline(steps=[("xgbregressor", xgb)])

# Candidate values per XGBoost hyperparameter (3*3*3*3*3*2 = 486 combinations).
# NOTE(review): XGBoost documents scale_pos_weight as a control for unbalanced
# classes; confirm it has any effect on this regression target, since it
# doubles the size of the grid.
xgb_search_space = {
    "n_estimators": [800, 900, 1000],
    "max_depth": [5, 6, 7],
    "learning_rate": [0.05, 0.1, 0.2],
    "subsample": [0.5, 0.7, 0.9],
    "colsample_bytree": [0.5, 0.6, 0.7],
    "scale_pos_weight": [1, 5],
}
param_grid = {f"xgbregressor__{name}": candidates
              for name, candidates in xgb_search_space.items()}

# Cross-validation splitter: each fold holds out one group.
logo = LeaveOneGroupOut()
# Escalate scikit-learn's FitFailedWarning to an exception.
warnings.simplefilter(action="error", category=FitFailedWarning)

# Define custom scoring functions
def mae_scorer(y_true, y_pred):
    """Return the negated mean absolute error, so that a larger value is better."""
    mae = mean_absolute_error(y_true, y_pred)
    return -mae
def mse_scorer(y_true, y_pred):
    """Return the negated mean squared error, so that a larger value is better."""
    mse = mean_squared_error(y_true, y_pred)
    return -mse
def bias_scorer(y_true, y_pred):
    """Return the mean residual (observed minus predicted).

    A positive value means the predictions are, on average, below the
    observations; a negative value means they are above.
    """
    residuals = y_true - y_pred
    return np.mean(residuals)

# Metrics evaluated on every cross-validation fold. 'r2' uses scikit-learn's
# built-in scorer; the other three wrap the custom functions with make_scorer.
# The 'mae' and 'mse' scorers return negated errors (larger is better).
scoring = {'r2': 'r2',
           'mae': make_scorer(mae_scorer),
           'mse': make_scorer(mse_scorer),
           'bias': make_scorer(bias_scorer)}

# Materialise every hyperparameter combination (its length is the number of search steps)
param_combinations = list(ParameterGrid(param_grid))

# Configure the search: Leave-One-Group-Out folds, all CPU cores, model
# selection on R^2.
# error_score is NaN rather than 0: a score of 0 is a perfect result for the
# negated MAE/MSE and for the bias, so a failed fold recorded as 0 would
# silently flatter the averaged metrics. NaN propagates into the mean and
# makes the failure visible.
grid_search = GridSearchCV(pipeline,
                           param_grid=param_grid,
                           cv=logo,
                           scoring=scoring,
                           refit='r2',
                           n_jobs=-1,
                           error_score=np.nan,
                           verbose=0)  # Avoid excessive logging

# Run the search one hyperparameter combination at a time so that tqdm can
# report progress and a failing combination does not abort the whole search.
best_score = -np.inf
best_params = None
best_model = None
results_list = []

for params in tqdm(param_combinations, desc="Grid Search Progress"):
    try:
        # GridSearchCV expects a list of candidates per key, so wrap each single value.
        formatted_param_grid = {key: [value] for key, value in params.items()}

        # refit=False: do not refit on the full training set for every
        # combination; only the winning combination is refitted, once,
        # after the loop.
        grid_search.set_params(param_grid=formatted_param_grid, refit=False)
        grid_search.fit(X_train, y_train, groups=groups_train)

        # Exactly one candidate was evaluated, so index 0 holds its fold-averaged metrics.
        cv_results = grid_search.cv_results_
        mean_r2 = cv_results['mean_test_r2'][0]

        # Track the best combination by mean cross-validated R^2.
        # A NaN score never compares greater, so it cannot become the best.
        if mean_r2 > best_score:
            best_score = mean_r2
            best_params = dict(params)

        # Store the metrics; MAE and MSE are flipped back to positive errors.
        results_list.append({
            'n_estimators': params['xgbregressor__n_estimators'],
            'max_depth': params['xgbregressor__max_depth'],
            'learning_rate': params['xgbregressor__learning_rate'],
            'subsample': params['xgbregressor__subsample'],
            'colsample_bytree': params['xgbregressor__colsample_bytree'],
            'scale_pos_weight': params['xgbregressor__scale_pos_weight'],
            'mean_test_r2': mean_r2,
            'mean_test_mae': -cv_results['mean_test_mae'][0],
            'mean_test_mse': -cv_results['mean_test_mse'][0],
            'mean_test_bias': cv_results['mean_test_bias'][0]
        })

    except Exception as e:
        # Best-effort search: report the failing combination and continue.
        # It will be absent from results_df.
        print(f"Error encountered for params {params}: {e}")

# Fit the winning configuration once on the full training set.
if best_params is not None:
    best_model = clone(pipeline).set_params(**best_params)
    best_model.fit(X_train, y_train)

results_df = pd.DataFrame(results_list)                      # Convert results to a DataFrame
print("\nBest Parameters:", best_params)                     # Print best parameters and score
print("Best Cross-Validation Score (R²):", best_score)

Grid Search Progress: 100%|████████████████████████████████████████████████████████| 486/486 [1:37:49<00:00, 12.08s/it]


Best Parameters: {'xgbregressor__colsample_bytree': 0.5, 'xgbregressor__learning_rate': 0.1, 'xgbregressor__max_depth': 7, 'xgbregressor__n_estimators': 800, 'xgbregressor__scale_pos_weight': 1, 'xgbregressor__subsample': 0.9}
Best Cross-Validation Score (R²): 0.5101173328404583





### Step 4: Save the Grid Search Results

In [5]:
# Persist the per-combination metrics as a CSV file (row index omitted)
results_csv_path = "C:/Data_MSc_Thesis/Results/HP_3_grid_search_results.csv"
results_df.to_csv(results_csv_path, index=False)