#### Script Description
This notebook loads a pre-processed dataset, prepares the features and the target variable for predicting soil water content (SWCT_015), and tunes the hyperparameters of a RandomForestRegressor model using a grid search with Leave-One-Group-Out cross-validation.

*File Name:* 03_01_RF_Hyperparameter_Tuning.ipynb

*Date:* 2024

*Created by:* Rob Alamgir  

*Version:* 1.0

*References:*

#### Import the relevant packages

In [3]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import train_test_split, GridSearchCV, LeaveOneGroupOut
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.exceptions import FitFailedWarning

#### Step 1: Data Preprocessing 

In [4]:
# Read the pre-processed dataset from disk
data_path = "C:/Data_MSc_Thesis/Pre_Processed_Data/Pre_Processed_Data_All_Locaions_Updated_1.csv"
Complete_Data = pd.read_csv(data_path)

# Parse 'Date' (YYYY-MM-DD) as datetime and give each 'Source' category a 1-based integer code
Complete_Data['Date'] = pd.to_datetime(Complete_Data['Date'], format='%Y-%m-%d')
Complete_Data['Source_ID'] = Complete_Data['Source'].astype('category').cat.codes + 1

# Build the analysis frame in a single chain:
#   1. drop the columns that are not used in this analysis,
#   2. keep only rows dated 2020-01-01 or later,
#   3. divide 'SWCT_015' by 100 (presumably percent -> fraction; TODO confirm units)
filtered_df = (
    Complete_Data
    .drop(columns=[
        'Day_of_Year', 'Source',
        'SWCT_005', 'STMP_005', 'NEE_CO2', 'NEE_CO2_MDS',
        'NEE_CH4', 'NEE_CH4_MDS', 'NEE_H2O'])
    .loc[lambda frame: frame['Date'] >= '2020-01-01']
    .assign(SWCT_015=lambda frame: frame['SWCT_015'] / 100)
)

# Rows without a target value cannot be used; features, target and groups are
# all taken from the same NaN-free frame so they share one index.
non_na_data = filtered_df.dropna(subset=['SWCT_015'])
X = non_na_data.drop(columns=["SWCT_015", "Date"])  # features: everything except the target and Date
y = non_na_data["SWCT_015"]                         # target variable
groups = non_na_data["Source_ID"]                   # group labels for Leave-One-Group-Out CV
# NOTE(review): X still contains 'Source_ID', which is also the CV group label --
# confirm that using it as a predictor is intended.

# Random 80/20 hold-out split; the group labels are split alongside X and y
X_train, X_test, y_train, y_test, groups_train, groups_test = train_test_split(
    X, y, groups, test_size=0.2, random_state=42)

# Report the resulting shapes
print("Data shapes after NaN handling and splitting:")
print(f"X_train: {X_train.shape}")
print(f"y_train: {y_train.shape}")
print(f"X_test: {X_test.shape}")
print(f"y_test: {y_test.shape}")

# Keep only the row labels shared by X_train and y_train, then re-index the groups to match
X_train, y_train = X_train.align(y_train, join='inner', axis=0)
groups_train = groups_train.loc[X_train.index]

# Report whether the three training objects share exactly the same index
print(
    "\nRows in X_train, y_train, and groups_train are aligned."
    if X_train.index.equals(y_train.index) and X_train.index.equals(groups_train.index)
    else "\nWarning: Rows in X_train, y_train, and groups_train are misaligned!"
)

Data shapes after NaN handling and splitting:
X_train: (6858, 12)
y_train: (6858,)
X_test: (1715, 12)
y_test: (1715,)

Rows in X_train, y_train, and groups_train are aligned.


#### Step 2: Primary Hyperparameter Tuning

In [5]:
# Define the RandomForestRegressor model and wrap it in a single-step pipeline.
# The step name "randomforestregressor" is the prefix used in the parameter grid keys.
rf = RandomForestRegressor(random_state=0)
pipeline = Pipeline([("randomforestregressor", rf)])

# Define parameter grid for GridSearchCV (5 x 3 x 3 x 3 = 135 candidate combinations)
param_grid = {
    "randomforestregressor__n_estimators": [300, 400, 500, 600, 700],
    "randomforestregressor__max_depth": [5, 10, 15],
    "randomforestregressor__max_features": ["sqrt", "log2", None],
    "randomforestregressor__min_samples_leaf": [1, 2, 4]}

# Escalate FitFailedWarning to an exception so the handler below can report it.
# This does not silence any other warning category, and it modifies the global
# warning filters, so it stays in effect for subsequently executed cells.
warnings.simplefilter("error", FitFailedWarning)

# Initialize GridSearchCV with Leave-One-Group-Out cross-validation.
# The splitter object is passed (rather than a pre-built `.split(...)` generator)
# and the group labels are supplied at fit time: the folds are the same, but the
# search object is not tied to a one-shot generator and can be re-fit.
grid_search = GridSearchCV(
    pipeline,
    param_grid=param_grid,
    cv=LeaveOneGroupOut(),
    scoring="neg_mean_squared_error",  # higher (closer to 0) is better
    n_jobs=-1,
    error_score=np.nan)  # Set failed fits to NaN

# Fit GridSearchCV with error handling
try:
    grid_search.fit(X_train, y_train, groups=groups_train)
    best_params = grid_search.best_params_  # Best parameters found by GridSearchCV
    best_score = grid_search.best_score_    # Best cross-validation score (negative MSE)
    print("Best Parameters:", best_params)
    print("Best Cross-Validation Score:", best_score)
except FitFailedWarning as e:
    # Raised as an exception because of the warning filter set above
    print("FitFailedWarning encountered during fitting:")
    print(str(e))
except ValueError as ve:
    print("ValueError encountered during fitting:")
    print(str(ve))
except AttributeError:
    # best_params_ / best_score_ are missing when the search did not finish
    print("GridSearchCV did not complete successfully; best parameters and score are unavailable.")

  _data = np.array(data, dtype=dtype, copy=copy,


Best Parameters: {'randomforestregressor__max_depth': 15, 'randomforestregressor__max_features': None, 'randomforestregressor__min_samples_leaf': 1, 'randomforestregressor__n_estimators': 400}
Best Cross-Validation Score: -0.008283352181404255
