**Script Description:** This script loads a pre-processed dataset, prepares the reduced set of selected features for predicting SENTEK Soil Moisture Content (SENTEK_SMC), and optimizes an Extreme Gradient Boosting (XGBoost) regression model using grouped (leave-one-group-out) cross-validation.

**File Name:** 02_05_Hyperparameter_Tuning_Run_2.ipynb

**Date:** 2025

**Created by:** Rob Alamgir

##### Import the relevant packages

In [1]:
import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import train_test_split, GridSearchCV, LeaveOneGroupOut, ParameterGrid
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import make_scorer
from sklearn.exceptions import FitFailedWarning
from xgboost.sklearn import XGBRegressor
from sklearn.preprocessing import StandardScaler
#help(XGBRegressor)

### Step 1: Load Data & prep the dataset

In [2]:
# Input file and the column bookkeeping applied right after loading
data_path = "C:/Data_MSc_Thesis/Pre_Processed_Data_Final/Pre_Processed_Data_All_Locations_V6.csv"
column_to_remove = 'Porosity'
rename_dict = {
    "Porosity_BIS4D_SOM": "Porosity",
    "BOFEK_2020_Physical Units": "BOFEK_PU",
    "Peat_Thickness_2022": "Peat_Thickness",
}

# Load, drop the original 'Porosity' column (its name is reused by 'Porosity_BIS4D_SOM'
# through the rename), parse the dates and derive a 1-based integer ID per site.
complete_dataset = (
    pd.read_csv(data_path)
    .drop(columns=[column_to_remove])
    .rename(columns=rename_dict)
    .assign(
        Date=lambda frame: pd.to_datetime(frame['Date'], format='%Y-%m-%d'),
        Source_ID=lambda frame: frame['Site_ID'].astype('category').cat.codes + 1,
    )
)
print(f"Rows and columns before removing NaNs: {complete_dataset.shape}")

# Keep only the rows that have a target measurement; BOFEK_PU is stored as a categorical
filtered_df = (
    complete_dataset
    .dropna(subset=['SENTEK_SMC'])
    .astype({'BOFEK_PU': 'category'})
)
print(f"Rows and columns after removing NaNs: {filtered_df.shape}")

# The 10 selected features
Features = [
    'Sentinel_1_SMC', 'S1_Backscatter', 'S1_Backscatter_SD',
    'S2_NDVI', 'S2_NDMI', 'L8_9_LST', 'WLEV_f', 'WTMP_f', 'PET', 'Peat_Thickness',
]

X = filtered_df[Features]          # Feature matrix
y = filtered_df['SENTEK_SMC']      # Target variable
groups = filtered_df["Source_ID"]  # Group labels for Leave-One-Group-Out
dates = filtered_df['Date']        # Observation dates
print(f"Features (X): {X.shape}, Target (y): {y.shape}, Groups: {groups.shape}, Date: {dates.shape}")

Rows and columns before removing NaNs: (36056, 109)
Rows and columns after removing NaNs: (8368, 109)
Features (X): (8368, 10), Target (y): (8368,), Groups: (8368,), Date: (8368,)


### Step 2: Split the dataset

In [3]:
# Chronological hold-out split: the most recent 10% of the rows become the test set.
# train_test_split(shuffle=False) only cuts the rows by position, so the split is
# "temporal" only when the rows are in date order. Order them explicitly instead of
# relying on the row order of the CSV file; the sort is stable, so rows that are
# already chronological keep exactly the same positions.
chronological_order = np.argsort(dates.to_numpy(), kind="stable")

X_train, X_test, y_train, y_test, groups_train, groups_test, dates_train, dates_test = train_test_split(
    X.iloc[chronological_order],
    y.iloc[chronological_order],
    groups.iloc[chronological_order],
    dates.iloc[chronological_order],
    test_size=0.1,      # Reserve 10% for the test set
    shuffle=False)      # Cut by position so the chronological order is kept

#### Perform a couple of checks regarding the data split

In [4]:
# Show the date range of each partition so the temporal separation can be checked by eye
for date_label, date_value in [
    ("\nEarliest date in training set:", dates_train.min()),
    ("Latest date in training set:", dates_train.max()),
    ("Earliest date in test set:", dates_test.min()),
    ("Latest date in test set:", dates_test.max()),
]:
    print(date_label, date_value)

# Training partition: keep the rows shared by features and target, then select the
# group labels and dates with the surviving row labels
X_train, y_train = X_train.align(y_train, join='inner', axis=0)
train_rows = X_train.index
groups_train, dates_train = groups_train.loc[train_rows], dates_train.loc[train_rows]

# Test partition: same procedure
X_test, y_test = X_test.align(y_test, join='inner', axis=0)
test_rows = X_test.index
groups_test, dates_test = groups_test.loc[test_rows], dates_test.loc[test_rows]

# Every companion object must carry exactly the same index as its feature matrix
train_aligned = all(X_train.index.equals(part.index) for part in (y_train, groups_train, dates_train))
assert train_aligned, "Rows in X_train, y_train, groups_train, and dates_train are misaligned!"

test_aligned = all(X_test.index.equals(part.index) for part in (y_test, groups_test, dates_test))
assert test_aligned, "Rows in X_test, y_test, groups_test, and dates_test are misaligned!"

# Shapes of both partitions
print("Data shapes after splitting and alignment:")
print(f"Train set: X_train: {X_train.shape}, y_train: {y_train.shape}, groups_train: {groups_train.shape}, dates_train: {dates_train.shape}")
print(f"Test set: X_test: {X_test.shape}, y_test: {y_test.shape}, groups_test: {groups_test.shape}, dates_test: {dates_test.shape}")

# Number of rows contributed by each group to each partition
for distribution_header, partition_groups in [
    ("Group distribution in training set:", groups_train),
    ("\nGroup distribution in test set:", groups_test),
]:
    print(distribution_header)
    print(partition_groups.value_counts())

# Number of distinct groups present in each partition
unique_groups_train, unique_groups_test = groups_train.nunique(), groups_test.nunique()
print(f"\nUnique groups in train: {unique_groups_train}, in test: {unique_groups_test}")


Earliest date in training set: 2021-09-01 00:00:00
Latest date in training set: 2023-12-03 00:00:00
Earliest date in test set: 2023-12-04 00:00:00
Latest date in test set: 2024-05-13 00:00:00
Data shapes after splitting and alignment:
Train set: X_train: (7531, 10), y_train: (7531,), groups_train: (7531,), dates_train: (7531,)
Test set: X_test: (837, 10), y_test: (837,), groups_test: (837,), dates_test: (837,)
Group distribution in training set:
Source_ID
11    824
12    823
17    820
7     804
4     778
8     773
16    735
2     716
1     698
3     560
Name: count, dtype: int64

Group distribution in test set:
Source_ID
1     162
12    162
11    149
16    142
17    140
2      82
Name: count, dtype: int64

Unique groups in train: 10, in test: 6


In [5]:
# Standardise the numeric feature columns. The scaler is fitted on the training
# partition only and then applied unchanged to both partitions.
numeric_features = list(X_train.select_dtypes(include=[np.number]).columns)
scaler = StandardScaler().fit(X_train[numeric_features])

# Work on copies so the unscaled partitions stay available
X_train_scaled, X_test_scaled = X_train.copy(), X_test.copy()
X_train_scaled[numeric_features] = scaler.transform(X_train[numeric_features])
X_test_scaled[numeric_features] = scaler.transform(X_test[numeric_features])

### Step 3: Define the model and hyperparameter grid

In [6]:
# Candidate values per XGBoost hyperparameter (3 values each, 3**5 = 243 combinations)
xgb_search_space = {
    "n_estimators": [800, 900, 1000],
    "max_depth": [5, 6, 7],
    "learning_rate": [0.01, 0.05, 0.1],
    "subsample": [0.5, 0.7, 0.8],
    "colsample_bytree": [0.5, 0.6, 0.7],
}

# Pipeline parameters are addressed as "<step name>__<parameter>"
param_grid = {
    f"xgbregressor__{hyperparameter}": candidates
    for hyperparameter, candidates in xgb_search_space.items()
}

# Define custom scoring functions
def mae_scorer(y_true, y_pred):
    """Return the negated mean absolute error, so that a larger value means a better fit."""
    absolute_error = mean_absolute_error(y_true, y_pred)
    return -absolute_error

def mse_scorer(y_true, y_pred):
    """Return the negated mean squared error, so that a larger value means a better fit."""
    squared_error = mean_squared_error(y_true, y_pred)
    return -squared_error

def bias_scorer(y_true, y_pred):
    """Return the mean signed error, ``mean(y_true - y_pred)``.

    A positive value means the predictions are on average below the observed
    values; a negative value means they are on average above them.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, in the same order as ``y_true``.

    Returns
    -------
    numpy.float64
        Mean of the element-wise differences.
    """
    # Convert to plain arrays first so the subtraction is always positional:
    # lists are accepted, and two pandas Series with different indexes are not
    # silently re-aligned by label.
    residuals = np.asarray(y_true, dtype=float) - np.asarray(y_pred, dtype=float)
    return np.mean(residuals)

# Metrics recorded for every candidate. The MAE and MSE scorers return negated
# errors (higher is better); 'bias' is the mean of (y_true - y_pred) and is
# reported only -- model selection uses R² (see refit below).
scoring = {'r2': 'r2',
           'mae': make_scorer(mae_scorer),
           'mse': make_scorer(mse_scorer),
           'bias': make_scorer(bias_scorer)}

# Initialize model and pipeline (a single step, named to match the
# "xgbregressor__" prefix used by the parameter grid)
xgb = XGBRegressor(random_state=42, enable_categorical=True)
pipeline = Pipeline([("xgbregressor", xgb)])

# Set up cross-validation: each fold holds out all rows of one group (Source_ID)
logo = LeaveOneGroupOut()
warnings.simplefilter("error", FitFailedWarning)

# Perform grid search.
# error_score='raise': a fit that crashes stops the search with the original
# exception. A numeric error_score of 0 would instead be recorded for every
# metric of the failed fit -- a "perfect" negated MAE/MSE and an R² that can
# beat genuinely negative scores -- so a broken candidate could look like a
# good one in cv_results_.
grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=logo,
                           scoring=scoring, refit='r2', n_jobs=-1, error_score='raise', verbose=3)

grid_search.fit(X_train_scaled, y_train, groups=groups_train)

# Flatten the cross-validation results into one record per hyperparameter combination
cv_results = grid_search.cv_results_
candidate_metrics = zip(
    cv_results['params'],
    cv_results['mean_test_r2'],
    cv_results['mean_test_mae'],
    cv_results['mean_test_mse'],
    cv_results['mean_test_bias'],
)

results_list = [
    {
        'n_estimators': candidate['xgbregressor__n_estimators'],
        'max_depth': candidate['xgbregressor__max_depth'],
        'learning_rate': candidate['xgbregressor__learning_rate'],
        'subsample': candidate['xgbregressor__subsample'],
        'colsample_bytree': candidate['xgbregressor__colsample_bytree'],
        'mean_test_r2': mean_r2,
        # The MAE/MSE scorers return negated errors; flip the sign back for reporting
        'mean_test_mae': -negated_mae,
        'mean_test_mse': -negated_mse,
        'mean_test_bias': mean_bias,
    }
    for candidate, mean_r2, negated_mae, negated_mse, mean_bias in candidate_metrics
]

results_df = pd.DataFrame(results_list)

Fitting 10 folds for each of 243 candidates, totalling 2430 fits


### Step 4: Print and save best hyperparameters and score

In [8]:
# Report the winning hyperparameter combination and its mean cross-validated R²
# (R² is the metric the grid search was configured to refit on)
best_params = grid_search.best_params_
best_r2 = grid_search.best_score_
print("\nBest Parameters:", best_params)
print("Best Cross-Validation Score (R²):", best_r2)

# Keep the full table of candidates on disk for reference
results_csv_path = "C:/Data_MSc_Thesis/Results/04_05_HP_2_grid_search_results.csv"
results_df.to_csv(results_csv_path, index=False)


Best Parameters: {'xgbregressor__colsample_bytree': 0.5, 'xgbregressor__learning_rate': 0.01, 'xgbregressor__max_depth': 5, 'xgbregressor__n_estimators': 800, 'xgbregressor__subsample': 0.8}
Best Cross-Validation Score (R²): 0.42066701368232007
