In [1]:
# Path to google drive folder
# %cd /content/drive/MyDrive/Colab Notebooks/Disertasi-Ahmad-Rofiqul/002. Laporan-eksperimen
# %cd /content/drive/MyDrive/Colab Notebooks/Disertasi-Ahmad-Rofiqul/002.Experiment_SWI_16052024/

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor, DMatrix, cv
import matplotlib.pyplot as plt

# from sklearn.ensemble import IsolationForest, VotingClassifier
# from sklearn.neighbors import LocalOutlierFactor
# from sklearn.svm import OneClassSVM
# from sklearn.covariance import EllipticEnvelope
# from pyod.models.hbos import HBOS
# from sklearn.cluster import DBSCAN
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import precision_score, recall_score, f1_score, average_precision_score, roc_auc_score, roc_curve, auc
# from sklearn.model_selection import train_test_split
# from sklearn.base import BaseEstimator, OutlierMixin
# from scipy.spatial.distance import mahalanobis

In [8]:

# Load the dataset
# df = pd.read_csv('/mnt/data/09_06_0_N2O_Agriculture_TestTraining_IDO_CV.ipynb')
file_path = 'dataset/09_05_4_1_AgricultureOutliers_HardVotingBased_IDO.csv'
df= pd.read_csv(file_path)

X = df.drop('N2O', axis=1)
y = df['N2O']

test_sizes = [0.2, 0.25, 0.3, 0.35]
results = []


In [9]:

# Define evaluation function
def evaluate_model(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)
    return mae, mse, rmse, r2

# Function to perform no cross-validation
def no_cross_validation(X_train, y_train, X_test, y_test, params):
    model = XGBRegressor(**params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return evaluate_model(y_test, y_pred)

# Function to perform sklearn cross_val_score
def sklearn_cross_val_score_eval(X, y, params):
    model = XGBRegressor(**params)
    mae_scores = -cross_val_score(model, X, y, cv=5, scoring='neg_mean_absolute_error')
    mse_scores = -cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error')
    rmse_scores = np.sqrt(mse_scores)
    r2_scores = cross_val_score(model, X, y, cv=5, scoring='r2')
    return mae_scores.mean(), mse_scores.mean(), rmse_scores.mean(), r2_scores.mean()

# Function to perform xgb.cv
def xgb_cv_eval(X, y, params):
    dtrain = DMatrix(X, label=y)
    cv_results = cv(params, dtrain, num_boost_round=100, nfold=5, metrics=['mae', 'rmse'], early_stopping_rounds=10, seed=42)
    mae = cv_results['test-mae-mean'].iloc[-1]
    rmse = cv_results['test-rmse-mean'].iloc[-1]
    mse = rmse ** 2
    r2 = 1 - (mse / np.var(y))
    return mae, mse, rmse, r2


In [10]:

# Function to perform hyperparameter tuning and evaluation
def perform_gridsearch(X, y, test_size):
    results = {'Test Size': [], 'Method': [], 'MAE': [], 'MSE': [], 'RMSE': [], 'R2': []}
    
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
    
    # Grid Search
    param_grid = {
        'learning_rate': [0.01, 0.1, 0.3],
        'max_depth': [3, 5, 7],
        'min_child_weight': [1, 3, 5],
        'subsample': [0.7, 0.8, 0.9],
        'colsample_bytree': [0.7, 0.8, 0.9]
    }
    grid_search = GridSearchCV(
        estimator=XGBRegressor(n_estimators=100, random_state=42),
        param_grid=param_grid,
        scoring='neg_mean_squared_error',
        cv=3,
        n_jobs=-1,
        verbose=0
    )
    grid_search.fit(X_train, y_train)
    best_params = grid_search.best_params_
    
    # No Cross Validation
    mae, mse, rmse, r2 = no_cross_validation(X_train, y_train, X_test, y_test, best_params)
    results['Test Size'].append(test_size)
    results['Method'].append('No Cross Validation (Grid Search)')
    results['MAE'].append(mae)
    results['MSE'].append(mse)
    results['RMSE'].append(rmse)
    results['R2'].append(r2)

    # Sklearn cross_val_score
    mae, mse, rmse, r2 = sklearn_cross_val_score_eval(X, y, best_params)
    results['Test Size'].append(test_size)
    results['Method'].append('Sklearn cross_val_score (Grid Search)')
    results['MAE'].append(mae)
    results['MSE'].append(mse)
    results['RMSE'].append(rmse)
    results['R2'].append(r2)
    
    # xgb.cv
    xgb_params = best_params.copy()
    xgb_params['objective'] = 'reg:squarederror'
    mae, mse, rmse, r2 = xgb_cv_eval(X, y, xgb_params)
    results['Test Size'].append(test_size)
    results['Method'].append('xgb.cv (Grid Search)')
    results['MAE'].append(mae)
    results['MSE'].append(mse)
    results['RMSE'].append(rmse)
    results['R2'].append(r2)

    return results


In [11]:

# Function to perform hyperparameter tuning and evaluation
def perform_randomsearch(X, y, test_size):
    results = {'Test Size': [], 'Method': [], 'MAE': [], 'MSE': [], 'RMSE': [], 'R2': []}
    
    # Random Search
    param_distributions = {
        'learning_rate': [0.01, 0.1, 0.3, 0.5, 0.7, 1.0],
        'max_depth': [3, 5, 7, 9, 11],
        'min_child_weight': [1, 2, 3, 4, 5],
        'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
        'colsample_bytree': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    }
    random_search = RandomizedSearchCV(
        estimator=XGBRegressor(n_estimators=100, random_state=42),
        param_distributions=param_distributions,
        n_iter=50,
        scoring='neg_mean_squared_error',
        cv=3,
        n_jobs=-1,
        verbose=0,
        random_state=42
    )
    random_search.fit(X_train, y_train)
    best_params = random_search.best_params_
    
    # No Cross Validation
    mae, mse, rmse, r2 = no_cross_validation(X_train, y_train, X_test, y_test, best_params)
    results['Test Size'].append(test_size)
    results['Method'].append('No Cross Validation (Random Search)')
    results['MAE'].append(mae)
    results['MSE'].append(mse)
    results['RMSE'].append(rmse)
    results['R2'].append(r2)

    # Sklearn cross_val_score
    mae, mse, rmse, r2 = sklearn_cross_val_score_eval(X, y, best_params)
    results['Test Size'].append(test_size)
    results['Method'].append('Sklearn cross_val_score (Random Search)')
    results['MAE'].append(mae)
    results['MSE'].append(mse)
    results['RMSE'].append(rmse)
    results['R2'].append(r2)
    
    # xgb.cv
    xgb_params = best_params.copy()
    xgb_params['objective'] = 'reg:squarederror'
    mae, mse, rmse, r2 = xgb_cv_eval(X, y, xgb_params)
    results['Test Size'].append(test_size)
    results['Method'].append('xgb.cv (Random Search)')
    results['MAE'].append(mae)
    results['MSE'].append(mse)
    results['RMSE'].append(rmse)
    results['R2'].append(r2)
    
    return results


In [12]:

# Perform testing for each test size and optimization method
all_results = []

for test_size in test_sizes:
    results = perform_gridsearch(X, y, test_size)
    # results2 = perform_randomsearch(X, y, test_size)
    all_results.append(pd.DataFrame(results))

# Combine all results into a single DataFrame
final_results_df = pd.concat(all_results, ignore_index=True)


ValueError: 
All the 729 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
729 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Python312\Lib\site-packages\xgboost\core.py", line 730, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "c:\Python312\Lib\site-packages\xgboost\sklearn.py", line 1055, in fit
    train_dmatrix, evals = _wrap_evaluation_matrices(
                           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Python312\Lib\site-packages\xgboost\sklearn.py", line 521, in _wrap_evaluation_matrices
    train_dmatrix = create_dmatrix(
                    ^^^^^^^^^^^^^^^
  File "c:\Python312\Lib\site-packages\xgboost\sklearn.py", line 958, in _create_dmatrix
    return QuantileDMatrix(
           ^^^^^^^^^^^^^^^^
  File "c:\Python312\Lib\site-packages\xgboost\core.py", line 730, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "c:\Python312\Lib\site-packages\xgboost\core.py", line 1529, in __init__
    self._init(
  File "c:\Python312\Lib\site-packages\xgboost\core.py", line 1588, in _init
    it.reraise()
  File "c:\Python312\Lib\site-packages\xgboost\core.py", line 576, in reraise
    raise exc  # pylint: disable=raising-bad-type
    ^^^^^^^^^
  File "c:\Python312\Lib\site-packages\xgboost\core.py", line 557, in _handle_exception
    return fn()
           ^^^^
  File "c:\Python312\Lib\site-packages\xgboost\core.py", line 641, in <lambda>
    return self._handle_exception(lambda: self.next(input_data), 0)
                                          ^^^^^^^^^^^^^^^^^^^^^
  File "c:\Python312\Lib\site-packages\xgboost\data.py", line 1280, in next
    input_data(**self.kwargs)
  File "c:\Python312\Lib\site-packages\xgboost\core.py", line 730, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "c:\Python312\Lib\site-packages\xgboost\core.py", line 624, in input_data
    new, cat_codes, feature_names, feature_types = _proxy_transform(
                                                   ^^^^^^^^^^^^^^^^^
  File "c:\Python312\Lib\site-packages\xgboost\data.py", line 1315, in _proxy_transform
    arr, feature_names, feature_types = _transform_pandas_df(
                                        ^^^^^^^^^^^^^^^^^^^^^
  File "c:\Python312\Lib\site-packages\xgboost\data.py", line 490, in _transform_pandas_df
    _invalid_dataframe_dtype(data)
  File "c:\Python312\Lib\site-packages\xgboost\data.py", line 308, in _invalid_dataframe_dtype
    raise ValueError(msg)
ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, The experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:Date: object, Experiment: object, DataUse: object, Replication: object, Month: object, Vegetation: object, VegType: object


In [None]:

# Display the DataFrame
# import ace_tools as tools; tools.display_dataframe_to_user(name="Hyperparameter Tuning Results", dataframe=final_results_df)

# Visualization of performance metrics
metrics = ['MAE', 'MSE', 'RMSE', 'R2']
methods = final_results_df['Method'].unique()
colors = ['b', 'g', 'r', 'c', 'm']  # Adjust or expand this list as needed

for metric in metrics:
    plt.figure(figsize=(12, 8))
    for i, method in enumerate(methods):
        subset = final_results_df[final_results_df['Method'] == method]
        plt.bar(subset['Test Size'] + i * 0.02, subset[metric], width=0.02, label=method, color=colors[i % len(colors)])
        
        for x, y in zip(subset['Test Size'], subset[metric]):
            plt.text(x + i * 0.02, y, f'{y:.5f}', ha='center', va='bottom', fontsize=9)

    plt.xlabel('Test Size')
    plt.ylabel(metric)
    plt.title(f'Comparison of {metric} for Different Hyperparameter Tuning Methods')
    plt.xticks(test_sizes)
    plt.legend()
    plt.show()
