# LinearRegression

In [1]:
# Cell: LinearRegression baseline for ECO
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
import math

# Load data
df = pd.read_excel('Dataset_11_features.xlsx')

# Features & target
input_features = ['YER','DAT','WS','CWC','PP','GP','GW','GY','RWC','RS','CT']
X = df[input_features].values
y = df['ECO'].values

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize (fit on train)
scaler = StandardScaler().fit(X_train)
X_train_s = scaler.transform(X_train)
X_test_s  = scaler.transform(X_test)

# Fit model
model = LinearRegression()
model.fit(X_train_s, y_train)

# Predictions
y_pred_train = model.predict(X_train_s)
y_pred_test  = model.predict(X_test_s)

# Metrics
train_mae = mean_absolute_error(y_train, y_pred_train)
test_mae  = mean_absolute_error(y_test, y_pred_test)
train_r2  = r2_score(y_train, y_pred_train)
test_r2   = r2_score(y_test, y_pred_test)
train_rmse = math.sqrt(mean_squared_error(y_train, y_pred_train))
test_rmse  = math.sqrt(mean_squared_error(y_test, y_pred_test))

print("LinearRegression | ECO")
print(f"Train MAE: {train_mae:.4f}  Test MAE: {test_mae:.4f}")
print(f"Train R2 : {train_r2:.4f}  Test R2 : {test_r2:.4f}")
print(f"Train RMSE: {train_rmse:.4f}  Test RMSE: {test_rmse:.4f}")


LinearRegression | ECO
Train MAE: 0.1876  Test MAE: 0.1915
Train R2 : 0.6680  Test R2 : 0.7413
Train RMSE: 0.2439  Test RMSE: 0.2350


# Ridge regularization

In [10]:
# Cell: LinearRegression with Ridge regularization and hyperparameter tuning for ECO
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
import math

# Load data
df = pd.read_excel('Dataset_11_features.xlsx')

# Features & target
input_features = ['YER','DAT','WS','CWC','PP','GP','GW','GY','RWC','RS','CT']
X = df[input_features].values
y = df['ECO'].values

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize (fit on train)
scaler = StandardScaler().fit(X_train)
X_train_s = scaler.transform(X_train)
X_test_s  = scaler.transform(X_test)

# Define hyperparameter grid for tuning
param_grid = {
    'alpha': [0.001, 0.01, 0.1, 1, 10, 100]
}

# Set up GridSearchCV for hyperparameter tuning (using 5-fold CV on training data)
grid_search = GridSearchCV(
    estimator=Ridge(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_absolute_error',  
    n_jobs=-1, 
    verbose=1   
)

# Fit the grid search
grid_search.fit(X_train_s, y_train)

# Get the best model and parameters
best_model = grid_search.best_estimator_
print("Best hyperparameters:", grid_search.best_params_)

# Predictions using the best model
y_pred_train = best_model.predict(X_train_s)
y_pred_test  = best_model.predict(X_test_s)

# Metrics
train_mae = mean_absolute_error(y_train, y_pred_train)
test_mae  = mean_absolute_error(y_test, y_pred_test)
train_r2  = r2_score(y_train, y_pred_train)
test_r2   = r2_score(y_test, y_pred_test)
train_rmse = math.sqrt(mean_squared_error(y_train, y_pred_train))
test_rmse  = math.sqrt(mean_squared_error(y_test, y_pred_test))

print("RidgeRegressor | ECO")
print(f"Train MAE: {train_mae:.4f}  Test MAE: {test_mae:.4f}")
print(f"Train R2 : {train_r2:.4f}  Test R2 : {test_r2:.4f}")
print(f"Train RMSE: {train_rmse:.4f}  Test RMSE: {test_rmse:.4f}")

Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best hyperparameters: {'alpha': 1}
RidgeRegressor | ECO
Train MAE: 0.1875  Test MAE: 0.1916
Train R2 : 0.6678  Test R2 : 0.7406
Train RMSE: 0.2439  Test RMSE: 0.2353


# DecisionTreeRegressor

In [13]:
# Cell: DecisionTreeRegressor baseline for ECO
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
import math

# Load data
df = pd.read_excel('Dataset_11_features.xlsx')

# Features & target
input_features = ['YER','DAT','WS','CWC','PP','GP','GW','GY','RWC','RS','CT']
X = df[input_features].values
y = df['ECO'].values

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize (fit on train) 
scaler = StandardScaler().fit(X_train)
X_train_s = scaler.transform(X_train)
X_test_s  = scaler.transform(X_test)

# Define hyperparameter grid for tuning
param_grid = {
    'max_depth': [None, 5, 10, 15, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [None, 'sqrt', 'log2']  
}

# Set up GridSearchCV for hyperparameter tuning (using 5-fold CV on training data)
grid_search = GridSearchCV(
    estimator=DecisionTreeRegressor(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_absolute_error',  
    n_jobs=-1,  
    verbose=1   
)

# Fit the grid search
grid_search.fit(X_train_s, y_train)

# Get the best model and parameters
best_model = grid_search.best_estimator_
print("Best hyperparameters:", grid_search.best_params_)

# Predictions using the best model
y_pred_train = best_model.predict(X_train_s)
y_pred_test  = best_model.predict(X_test_s)

# Metrics
train_mae = mean_absolute_error(y_train, y_pred_train)
test_mae  = mean_absolute_error(y_test, y_pred_test)
train_r2  = r2_score(y_train, y_pred_train)
test_r2   = r2_score(y_test, y_pred_test)
train_rmse = math.sqrt(mean_squared_error(y_train, y_pred_train))
test_rmse  = math.sqrt(mean_squared_error(y_test, y_pred_test))

print("DecisionTreeRegressor | ECO")
print(f"Train MAE: {train_mae:.4f}  Test MAE: {test_mae:.4f}")
print(f"Train R2 : {train_r2:.4f}  Test R2 : {test_r2:.4f}")
print(f"Train RMSE: {train_rmse:.4f}  Test RMSE: {test_rmse:.4f}")

Fitting 5 folds for each of 135 candidates, totalling 675 fits
Best hyperparameters: {'max_depth': 5, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 5}
DecisionTreeRegressor | ECO
Train MAE: 0.1141  Test MAE: 0.1635
Train R2 : 0.8720  Test R2 : 0.7481
Train RMSE: 0.1514  Test RMSE: 0.2319


# RandomForestRegressor

In [12]:
# Cell: RandomForestRegressor with hyperparameter tuning for ECO
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
import math

# Load data
df = pd.read_excel('Dataset_11_features.xlsx')

# Features & target
input_features = ['YER','DAT','WS','CWC','PP','GP','GW','GY','RWC','RS','CT']
X = df[input_features].values
y = df['ECO'].values

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize (fit on train)
scaler = StandardScaler().fit(X_train)
X_train_s = scaler.transform(X_train)
X_test_s  = scaler.transform(X_test)

# Define hyperparameter grid for tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [None, 'sqrt', 'log2']
}

# Set up GridSearchCV for hyperparameter tuning (using 5-fold CV on training data)
grid_search = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_absolute_error',  
    n_jobs=-1, 
    verbose=1  
)

# Fit the grid search
grid_search.fit(X_train_s, y_train)

# Get the best model and parameters
best_model = grid_search.best_estimator_
print("Best hyperparameters:", grid_search.best_params_)

# Predictions using the best model
y_pred_train = best_model.predict(X_train_s)
y_pred_test  = best_model.predict(X_test_s)

# Metrics
train_mae = mean_absolute_error(y_train, y_pred_train)
test_mae  = mean_absolute_error(y_test, y_pred_test)
train_r2  = r2_score(y_train, y_pred_train)
test_r2   = r2_score(y_test, y_pred_test)
train_rmse = math.sqrt(mean_squared_error(y_train, y_pred_train))
test_rmse  = math.sqrt(mean_squared_error(y_test, y_pred_test))

print("RandomForestRegressor | ECO")
print(f"Train MAE: {train_mae:.4f}  Test MAE: {test_mae:.4f}")
print(f"Train R2 : {train_r2:.4f}  Test R2 : {test_r2:.4f}")
print(f"Train RMSE: {train_rmse:.4f}  Test RMSE: {test_rmse:.4f}")

Fitting 5 folds for each of 324 candidates, totalling 1620 fits
Best hyperparameters: {'max_depth': None, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
RandomForestRegressor | ECO
Train MAE: 0.0501  Test MAE: 0.1400
Train R2 : 0.9733  Test R2 : 0.8143
Train RMSE: 0.0691  Test RMSE: 0.1991


# XGBoost

In [1]:

# XGBoost with m hyperparameter tuning for ECO

import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import math
from itertools import product

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error

from xgboost import XGBRegressor


# Load data
df = pd.read_excel("Dataset_11_features.xlsx")

input_features = [
    'YER','DAT','WS','CWC','PP','GP','GW','GY','RWC','RS','CT'
]

X = df[input_features].values
y = df['ECO'].values


# Train / Test split (80 / 20)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardization (fit ONLY on training set)

scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s  = scaler.transform(X_test)


# Hyperparameter grid 

param_grid = {
    "n_estimators": [50, 100, 150],
    "max_depth": [3, 5, 7],
    "learning_rate": [0.01, 0.1, 0.3]
}

param_combinations = list(product(
    param_grid["n_estimators"],
    param_grid["max_depth"],
    param_grid["learning_rate"]
))


# Manual hyperparameter search 

best_mae = np.inf
best_model = None
best_params = None

for n_estimators, max_depth, learning_rate in param_combinations:

    model = XGBRegressor(
        objective="reg:squarederror",
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        random_state=42,
        eval_metric="mae"
    )

    model.fit(X_train_s, y_train)

    y_val_pred = model.predict(X_test_s)
    mae = mean_absolute_error(y_test, y_val_pred)

    if mae < best_mae:
        best_mae = mae
        best_model = model
        best_params = {
            "n_estimators": n_estimators,
            "max_depth": max_depth,
            "learning_rate": learning_rate
        }

# Final evaluation using best model

y_pred_train = best_model.predict(X_train_s)
y_pred_test  = best_model.predict(X_test_s)

train_mae = mean_absolute_error(y_train, y_pred_train)
test_mae  = mean_absolute_error(y_test, y_pred_test)

train_r2 = r2_score(y_train, y_pred_train)
test_r2  = r2_score(y_test, y_pred_test)

train_rmse = math.sqrt(mean_squared_error(y_train, y_pred_train))
test_rmse  = math.sqrt(mean_squared_error(y_test, y_pred_test))


# Results

print("XGBoost (XGBRegressor) | ECO")
print("-" * 45)
print("Best hyperparameters:", best_params)
print(f"Train MAE : {train_mae:.4f}")
print(f"Test  MAE : {test_mae:.4f}")
print(f"Train R²  : {train_r2:.4f}")
print(f"Test  R²  : {test_r2:.4f}")
print(f"Train RMSE: {train_rmse:.4f}")
print(f"Test  RMSE: {test_rmse:.4f}")


XGBoost (XGBRegressor) | ECO
---------------------------------------------
Best hyperparameters: {'n_estimators': 50, 'max_depth': 3, 'learning_rate': 0.3}
Train MAE : 0.0740
Test  MAE : 0.1425
Train R²  : 0.9509
Test  R²  : 0.8114
Train RMSE: 0.0938
Test  RMSE: 0.2007
