In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import xgboost as xgb


In [2]:
def evaluate_metrics(y_true, y_pred):
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_true, y_pred)
    mape = mean_absolute_percentage_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    return mse, rmse, mae, mape, r2


In [3]:

def perform_xgboost_prediction(file_path, test_sizes, nfolds):
    # Load the dataset
    data = pd.read_csv(file_path)

    # Define the target variable and features
    X = data.drop(columns=['N2O'])
    y = data['N2O']

    # Identify numerical and categorical columns
    numerical_cols = X.select_dtypes(include=['number']).columns
    categorical_cols = X.select_dtypes(include=['object']).columns

    # Define the preprocessing for numerical data (scaling)
    numeric_transformer = Pipeline(steps=[
        ('scaler', StandardScaler())
    ])

    # Define the preprocessing for categorical data (one-hot encoding)
    categorical_transformer = Pipeline(steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])

    # Combine preprocessing steps into a ColumnTransformer
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numerical_cols),
            ('cat', categorical_transformer, categorical_cols)
        ])

    # Define the model
    xgb_model = xgb.XGBRegressor(random_state=42, use_label_encoder=False, eval_metric='rmse')

    # Define the results dictionary
    results = []

    for test_size in test_sizes:
        # Split the data into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
        
        # Create a pipeline with preprocessing and the model
        pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('regressor', xgb_model)
        ])

        # Fit the pipeline on the training data
        pipeline.fit(X_train, y_train)

        # Predict on the test data
        y_pred = pipeline.predict(X_test)

        # Evaluate the model without cross-validation
        mse, rmse, mae, mape, r2 = evaluate_metrics(y_test, y_pred)
        results.append({
            'method': 'No CV',
            'test_size': test_size,
            'nfolds': None,
            'MSE': mse,
            'RMSE': rmse,
            'MAE': mae,
            'MAPE': mape,
            'R2': r2
        })

        for nfold in nfolds:
            # Use xgb.cv for cross-validation
            dtrain = xgb.DMatrix(data=preprocessor.fit_transform(X_train), label=y_train)
            cv_results = xgb.cv(dtrain=dtrain, params=xgb_model.get_params(), nfold=nfold, metrics='rmse', as_pandas=True, seed=42)
            
            # Get the predictions using cross_val_predict
            y_pred_cv = cross_val_predict(pipeline, X, y, cv=nfold)

            # Evaluate the model with cross-validation
            mse_cv, rmse_cv, mae_cv, mape_cv, r2_cv = evaluate_metrics(y, y_pred_cv)
            results.append({
                'method': 'CV',
                'test_size': test_size,
                'nfolds': nfold,
                'MSE': mse_cv,
                'RMSE': rmse_cv,
                'MAE': mae_cv,
                'MAPE': mape_cv,
                'R2': r2_cv
            })

    # Convert the results to a DataFrame for better visualization
    results_df = pd.DataFrame(results)

    # Save the results to a CSV file
    results_df.to_csv('hasil/00_02_xgboost_prediction_comparison_pipeline.csv', index=False)

    return results_df


In [4]:

# Parameters
file_path = 'dataset/agriculture_dataset.csv'
test_sizes = [0.2, 0.25, 0.3, 0.35]
nfolds = [3, 5, 7, 9, 12]

# Run the function and get the results
results_df = perform_xgboost_prediction(file_path, test_sizes, nfolds)

# Display the results
# import ace_tools as tools; tools.display_dataframe_to_user(name="XGBoost Prediction Comparison", dataframe=results_df)


Parameters: { "enable_categorical", "missing", "use_label_encoder" } are not used.

Parameters: { "enable_categorical", "missing", "use_label_encoder" } are not used.

Parameters: { "enable_categorical", "missing", "use_label_encoder" } are not used.

Parameters: { "enable_categorical", "missing", "use_label_encoder" } are not used.

Parameters: { "enable_categorical", "missing", "use_label_encoder" } are not used.

Parameters: { "enable_categorical", "missing", "use_label_encoder" } are not used.

Parameters: { "enable_categorical", "missing", "use_label_encoder" } are not used.

Parameters: { "enable_categorical", "missing", "use_label_encoder" } are not used.

Parameters: { "enable_categorical", "missing", "use_label_encoder" } are not used.

Parameters: { "enable_categorical", "missing", "use_label_encoder" } are not used.

Parameters: { "enable_categorical", "missing", "use_label_encoder" } are not used.

Parameters: { "enable_categorical", "missing", "use_label_encoder" } are not