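Script Description: This script loads a pre-processed dataset, removes rows with missing values, fits three candidate non-linear models of NEE (two sigmoid soil-moisture responses and one Gaussian bell curve) from multiple starting values, and compares the fits using AIC, residual plots and R-squared.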

File Name: 03_03_Fitting_NEE_SWC_non_linear_bell.ipynb

Date: 2025

Created by: Rob Alamgir

Version: 1.0

References:

#### Import the relevant packages

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
from scipy.stats import norm
from itertools import product

### Step 1: Load and preprocess data

In [None]:
# Read the pre-processed CSV and parse its 'Date' column (YYYY-MM-DD text)
# into pandas datetimes.
data_path = "C:/Data_MSc_Thesis/Pre_Processed_Data/Pre_Processed_Data_All_Locations_Updated_.csv"
complete_dataset = pd.read_csv(data_path).assign(
    Date=lambda frame: pd.to_datetime(frame['Date'], format='%Y-%m-%d')
)
# Report the table size before any rows are dropped.
print(f"Rows and columns before removing NaNs: {complete_dataset.shape}")

In [None]:
# Keep only the rows in which every column listed below has a value.
filtered_df = complete_dataset.dropna(
    subset=[
        'SWCT_1_015',
        'Planet_SWC',
        'NEE_CO2_kg_day_ha_DAv_NT',
        'GPP_CO2_kg_day_ha_DAv_NT',
        'STMP_1_015',
        'ATMP_f',
    ],
    how='any',
).copy()
print(f"Rows and columns after removing NaNs: {filtered_df.shape}")

# Predictors for the three-input models, in the order the models unpack
# them: (GPP, VSM, T_air).
X_data_1 = tuple(
    filtered_df[column].values
    for column in ('GPP_CO2_kg_day_ha_DAv_NT', 'Planet_SWC', 'ATMP_f')
)

# Same predictors plus soil temperature (T_soil) as a fourth entry for model 3.
X_data_3 = tuple(
    filtered_df[column].values
    for column in ('GPP_CO2_kg_day_ha_DAv_NT', 'Planet_SWC', 'ATMP_f', 'STMP_1_015')
)

# Target variable for every fit.
y_data = filtered_df['NEE_CO2_kg_day_ha_DAv_NT'].values

### Step 2: Define Models

#### Model Option 1: Sigmoid Dependence with VSM instead of WL

In [None]:
def model_1(X, alpha, beta, gamma, delta):
    """Sigmoid soil-moisture model.

    Evaluates
    ``alpha * GPP + beta / (1 + exp(gamma * VSM)) * exp(delta * T_air)``.

    Args:
        X: Sequence unpacked as ``(GPP, VSM, T_air)``.
        alpha: Multiplier of the GPP term.
        beta: Numerator of the sigmoid term.
        gamma: Coefficient of VSM inside the sigmoid exponent.
        delta: Coefficient of T_air in the exponential factor.

    Returns:
        The model value for each element of the inputs.
    """
    gpp, vsm, air_temp = X
    moisture_response = beta / (1 + np.exp(gamma * vsm))
    temperature_response = np.exp(delta * air_temp)
    return alpha * gpp + moisture_response * temperature_response

#### Model Option 2: Bell curve based on the Gaussian distribution function

In [1]:
def model_2(X, alpha, beta, gamma, delta, epsilon):
    """Gaussian bell-curve soil-moisture model.

    Evaluates
    ``alpha * GPP + beta * exp(-0.5 * ((VSM - gamma) / delta)**2) * exp(epsilon * T_air)``.

    Args:
        X: Sequence unpacked as ``(GPP, VSM, T_air)``.
        alpha: Multiplier of the GPP term.
        beta: Height of the bell curve.
        gamma: VSM value at which the bell curve peaks.
        delta: Width of the bell curve; VSM - gamma is divided by it.
        epsilon: Coefficient of T_air in the exponential factor.

    Returns:
        The model value for each element of the inputs.
    """
    gpp, vsm, air_temp = X
    standardized_vsm = (vsm - gamma) / delta
    bell = np.exp(-0.5 * standardized_vsm**2)
    return alpha * gpp + beta * bell * np.exp(epsilon * air_temp)

#### Model Option 3: Sigmoid Dependence with VSM and other additional parameters & exponents

In [None]:
def model_3(X, alpha, beta, gamma, delta, v):
    """Sigmoid soil-moisture model with separate air and soil temperature terms.

    Evaluates
    ``alpha * GPP * exp(beta * T_air) + delta / (1 + exp(gamma * VSM)) * exp(v * T_soil)``.

    Args:
        X: Sequence unpacked as ``(GPP, VSM, T_air, T_soil)``.
        alpha: Multiplier of the GPP term.
        beta: Coefficient of T_air in the exponential scaling the GPP term.
        gamma: Coefficient of VSM inside the sigmoid exponent.
        delta: Numerator of the sigmoid term.
        v: Coefficient of T_soil in the exponential scaling the sigmoid term.

    Returns:
        The model value for each element of the inputs.
    """
    gpp, vsm, air_temp, soil_temp = X
    gpp_term = alpha * gpp * np.exp(beta * air_temp)
    moisture_response = delta / (1 + np.exp(gamma * vsm))
    return gpp_term + moisture_response * np.exp(v * soil_temp)

### Step 3: Define Grid Search & AIC Calculation

In [None]:
# Function to compute AIC
def calculate_aic(y_true, y_pred, num_params):
    """Return the least-squares AIC, ``n * ln(RSS / n) + 2 * k``.

    Args:
        y_true: Observed values.
        y_pred: Model predictions for the same observations.
        num_params: Number of fitted parameters ``k``.

    Returns:
        The AIC value, where ``n`` is ``len(y_true)`` and ``RSS`` is the sum
        of squared differences between ``y_true`` and ``y_pred``.
    """
    n_obs = len(y_true)
    sum_sq_resid = np.sum(np.square(y_true - y_pred))
    penalty = 2 * num_params
    return n_obs * np.log(sum_sq_resid / n_obs) + penalty

# Fit model with multiple starting values (grid search)
def fit_model_with_grid_search(model, X_data, y_data, param_grid):
    """Fit ``model`` from every starting point and keep the lowest-AIC result.

    Args:
        model: Callable ``model(X, *params)`` in the form ``curve_fit`` expects.
        X_data: Predictor data handed to ``model`` unchanged.
        y_data: Observed target values.
        param_grid: Iterable of initial parameter guesses, each used as ``p0``.

    Returns:
        Tuple ``(best_params, best_aic, best_y_pred)``. If no start yields an
        AIC below infinity the tuple is ``(None, np.inf, None)``.
    """
    winner = (None, np.inf, None)  # (parameters, AIC, fitted values)

    for start in param_grid:
        try:
            fitted, _cov = curve_fit(model, X_data, y_data, p0=start, maxfev=20000)
            predictions = model(X_data, *fitted)
            score = calculate_aic(y_data, predictions, len(fitted))
        except RuntimeError:
            # This start did not produce a fit; move on to the next one.
            continue

        if score < winner[1]:
            winner = (fitted, score, predictions)

    return winner

In [None]:
# Starting-value grids: each one is the Cartesian product of five evenly
# spaced values per parameter, listed in the order of the model's signature.
# NOTE(review): param_grid_2 includes delta = 0 as a starting value, and
# model_2 divides by delta -- confirm that this start is intended.
param_grid_1 = list(product(
    np.linspace(0, 2, num=5),    # alpha
    np.linspace(0, 2, num=5),    # beta
    np.linspace(-2, 2, num=5),   # gamma
    np.linspace(0, 1, num=5),    # delta
))

param_grid_2 = list(product(
    np.linspace(0, 2, num=5),    # alpha
    np.linspace(0, 2, num=5),    # beta
    np.linspace(-2, 2, num=5),   # gamma
    np.linspace(0, 1, num=5),    # delta
    np.linspace(0, 1, num=5),    # epsilon
))

param_grid_3 = list(product(
    np.linspace(0, 2, num=5),    # alpha
    np.linspace(0, 1, num=5),    # beta
    np.linspace(-2, 2, num=5),   # gamma
    np.linspace(0, 1, num=5),    # delta
    np.linspace(0, 1, num=5),    # v
))

# Run the multi-start fit for each model; model 3 takes the four-input tuple.
popt_1, aic_1, y_pred_1 = fit_model_with_grid_search(
    model_1, X_data_1, y_data, param_grid_1
)
popt_2, aic_2, y_pred_2 = fit_model_with_grid_search(
    model_2, X_data_1, y_data, param_grid_2
)
popt_3, aic_3, y_pred_3 = fit_model_with_grid_search(
    model_3, X_data_3, y_data, param_grid_3
)

# Report the AIC and fitted parameters of each model.
print(f"Model 1 AIC: {aic_1}, Parameters: {popt_1}")
print(f"Model 2 AIC: {aic_2}, Parameters: {popt_2}")
print(f"Model 3 AIC: {aic_3}, Parameters: {popt_3}")

# The (AIC, label) pair with the smallest AIC identifies the preferred model.
best_model = min([
    (aic_1, "Model 1"),
    (aic_2, "Model 2"),
    (aic_3, "Model 3"),
])
print(f"Best model: {best_model[1]} with AIC = {best_model[0]}")

#### Model Residuals and Goodness-of-Fit Check

In [None]:
# Left panel: overlaid histograms of the residuals (observed - predicted)
# of the three fitted models.
plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.hist(y_data - y_pred_1, bins=20, alpha=0.5, label="Model 1")
plt.hist(y_data - y_pred_2, bins=20, alpha=0.5, label="Model 2")
plt.hist(y_data - y_pred_3, bins=20, alpha=0.5, label="Model 3")
plt.xlabel("Residuals")
plt.ylabel("Frequency")
plt.legend()
plt.title("Residuals Distribution")

# Right panel: predicted against observed values for the three models.
plt.subplot(1, 2, 2)
plt.scatter(y_data, y_pred_1, alpha=0.5, label="Model 1")
plt.scatter(y_data, y_pred_2, alpha=0.5, label="Model 2")
plt.scatter(y_data, y_pred_3, alpha=0.5, label="Model 3")
# 1:1 reference line. It is drawn between the two extremes of the
# observations rather than through every (unsorted) data point: a path that
# runs back and forth along y = x overlaps its own dashes and hides the
# dashed style.
plt.plot([np.min(y_data), np.max(y_data)], [np.min(y_data), np.max(y_data)], 'k--')
plt.xlabel("Observed")
plt.ylabel("Predicted")
plt.legend()
plt.title("Observed vs. Predicted")
plt.show()

In [None]:
# Random multi-start fit of the bell-curve model (model_2, parameters
# alpha, beta, gamma, delta, epsilon). It works on the objects defined in the
# earlier cells: filtered_df / X_data_1 / y_data for the data and
# calculate_aic(y_true, y_pred, num_params) for the score.
# NOTE(review): model_2, X_data_1 and the 'Planet_SWC' column were chosen
# because they are the five-parameter model and the VSM input defined in this
# notebook -- confirm they are the intended model and data.
num_starts = 50  # Number of random initializations
vsm = filtered_df['Planet_SWC']  # soil-moisture predictor the models receive as VSM

# One random draw per parameter and start; gamma (bell centre) is drawn from
# the observed VSM range and delta (bell width) from 0.01 up to the VSM
# standard deviation.
alpha_vals = np.random.uniform(-1, 1, num_starts)
beta_vals = np.random.uniform(np.min(y_data), 0, num_starts)
gamma_vals = np.random.uniform(np.min(vsm), np.max(vsm), num_starts)
delta_vals = np.random.uniform(0.01, np.std(vsm), num_starts)
epsilon_vals = np.random.uniform(-0.1, 0.1, num_starts)

# calculate_aic is deliberately NOT redefined here: a second definition with
# a different signature would replace the one fit_model_with_grid_search
# relies on and break that function when the notebook cells are re-run.

best_params = None
best_aic = np.inf

# Multi-start loop: fit from each random start and keep the lowest-AIC result
for alpha_init, beta_init, gamma_init, delta_init, epsilon_init in zip(alpha_vals, beta_vals, gamma_vals, delta_vals, epsilon_vals):
    p0 = [alpha_init, beta_init, gamma_init, delta_init, epsilon_init]
    try:
        popt, _ = curve_fit(model_2, X_data_1, y_data, p0=p0, maxfev=10000)
    except RuntimeError:
        # The fit failed from this start; a bare `except:` would also have
        # swallowed KeyboardInterrupt and genuine programming errors.
        continue

    y_pred = model_2(X_data_1, *popt)
    aic = calculate_aic(y_data, y_pred, len(popt))

    if aic < best_aic:
        best_aic = aic
        best_params = popt

# Fail with a clear message instead of a confusing unpacking error below.
if best_params is None:
    raise RuntimeError("None of the random starting values produced a usable fit.")

# Extract best-fit parameters
alpha_fit, beta_fit, gamma_fit, delta_fit, epsilon_fit = best_params
print(f"Best Fit Parameters (based on AIC):\n alpha = {alpha_fit}\n beta = {beta_fit}\n gamma = {gamma_fit}\n delta = {delta_fit}\n epsilon = {epsilon_fit}")

# Calculate fitted values
y_pred = model_2(X_data_1, *best_params)

In [None]:
# Scatter of the fitted values in y_pred against the observations in y_data,
# with a dashed red 1:1 line spanning the observed range.
plt.scatter(y_data, y_pred, alpha=0.5)
plt.plot(
    [y_data.min(), y_data.max()],
    [y_data.min(), y_data.max()],
    'r--',
)  # 1:1 line
plt.title("Model Fit: Actual vs Predicted")
plt.xlabel("Actual NEE_CO2_night")
plt.ylabel("Predicted NEE_CO2_night")
plt.show()

# Compute R-squared
def r_squared(y_true, y_pred):
    """Return the coefficient of determination, ``1 - SS_res / SS_tot``.

    Args:
        y_true: Observed values (any array-like, e.g. a list or NumPy array).
        y_pred: Predicted values corresponding to ``y_true``.

    Returns:
        The R-squared value, or ``nan`` when ``y_true`` has zero spread
        around its mean (``SS_tot == 0``), where R-squared is undefined.
    """
    # Convert up front so plain Python lists work as well as arrays.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    ss_res = np.sum((y_true - y_pred) ** 2)
    ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)
    if ss_tot == 0:
        # Avoid a division by zero (and its -inf / nan result with a warning).
        return np.nan
    return 1 - (ss_res / ss_tot)

# R-squared of the fitted values in y_pred, printed to four decimal places.
r2 = r_squared(y_true=y_data, y_pred=y_pred)
print(f"R-squared: {r2:.4f}")