Untuk mengimplementasikan fungsi perform_N2O_xgboost_prediction yang mencakup langkah-langkah preprocessing, deteksi outlier, dan prediksi menggunakan XGBoost, kita akan mengikuti struktur berikut:

Langkah-Langkah:
Imputasi Nilai Hilang: Mengganti nilai yang hilang dengan mean menggunakan SimpleImputer.
Deteksi Outlier: Menerapkan berbagai metode deteksi outlier:
Isolation Forest (IF)
DBSCAN
One-Class SVM (OCSVM)
Kombinasi dari metode di atas:
IF + DBSCAN
IF + OCSVM
IF + DBSCAN + OCSVM
Prediksi N2O: Menggunakan XGBoost dengan dan tanpa cross-validation melalui pipeline.
Evaluasi: Menggunakan berbagai metrik evaluasi (MSE, RMSE, MAE, MAPE, R2) untuk menilai kinerja model.
Parameter Test Size dan Nfolds: Mencoba berbagai kombinasi test size dan jumlah lipatan untuk cross-validation.

In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.cluster import DBSCAN
from xgboost import XGBRegressor, DMatrix, cv as xgb_cv
from scipy.sparse import csr_matrix


In [13]:
def evaluate_metrics(y_true, y_pred):
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_true, y_pred)
    mape = mean_absolute_percentage_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    return mse, rmse, mae, mape, r2


In [14]:
def detect_outliers(data, method, method_name):
    if method_name in ['IF', 'IF_DBSCAN', 'IF_OCSVM', 'IF_DBSCAN_OCSVM']:
        outliers_if = IsolationForest(contamination=0.1, random_state=42).fit_predict(data)
        data = data[outliers_if == 1]

    if method_name in ['DBSCAN', 'IF_DBSCAN', 'IF_DBSCAN_OCSVM']:
        outliers_dbscan = DBSCAN(eps=0.5, min_samples=5).fit_predict(data)
        data = data[outliers_dbscan != -1]

    if method_name in ['OCSVM', 'IF_OCSVM', 'IF_DBSCAN_OCSVM']:
        outliers_ocsvm = OneClassSVM(nu=0.1, kernel="rbf", gamma=0.1).fit_predict(data)
        data = data[outliers_ocsvm == 1]

    return data


In [15]:


def perform_N2O_xgboost_prediction(file_path, test_sizes, nfolds):
    # Load the dataset
    data = pd.read_csv(file_path)

    # Define the target variable and features
    X = data.drop(columns=['N2O'])
    y = data['N2O']

    # Identify numerical and categorical columns
    numerical_cols = X.select_dtypes(include=['number']).columns
    categorical_cols = X.select_dtypes(include=['object']).columns

    # Define the preprocessing for numerical data (imputation + scaling)
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),  # Impute missing values with mean
        ('scaler', StandardScaler())
    ])

    # Define the preprocessing for categorical data (imputation + one-hot encoding)
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute missing values with most frequent value
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])

    # Combine preprocessing steps into a ColumnTransformer
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numerical_cols),
            ('cat', categorical_transformer, categorical_cols)
        ])

    # Define the model
    xgb_model = XGBRegressor(random_state=42, use_label_encoder=False, eval_metric='rmse')

    # Define outlier detection methods
    outlier_methods = ['None', 'IF', 'DBSCAN', 'OCSVM', 'IF_DBSCAN', 'IF_OCSVM', 'IF_DBSCAN_OCSVM']

    # Define the results dictionary
    results = []

    for test_size in test_sizes:
        # Split the data into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
        
        # Apply preprocessing
        X_train_processed = preprocessor.fit_transform(X_train)
        X_test_processed = preprocessor.transform(X_test)

        for outlier_method in outlier_methods:
            if outlier_method != 'None':
                # Detect and remove outliers from the training set
                X_train_processed = detect_outliers(X_train_processed, None, outlier_method)
                y_train = y_train.iloc[:X_train_processed.shape[0]]  # Adjust y_train to match X_train size

            # Predict without cross-validation
            pipeline = Pipeline(steps=[
                ('preprocessor', preprocessor),
                ('regressor', xgb_model)
            ])

            pipeline.fit(X_train, y_train)
            y_pred = pipeline.predict(X_test)

            mse, rmse, mae, mape, r2 = evaluate_metrics(y_test, y_pred)
            results.append({
                'method': f'No CV ({outlier_method})',
                'test_size': test_size,
                'nfolds': None,
                'MSE': mse,
                'RMSE': rmse,
                'MAE': mae,
                'MAPE': mape,
                'R2': r2
            })

            for nfold in nfolds:
                # Use xgb.cv for cross-validation
                dtrain = DMatrix(data=X_train_processed, label=y_train)
                cv_results = xgb_cv(dtrain=dtrain, params=xgb_model.get_params(), nfold=nfold, metrics='rmse', as_pandas=True, seed=42)
                
                # Get the predictions using cross_val_predict
                y_pred_cv = cross_val_predict(pipeline, X, y, cv=nfold)

                mse_cv, rmse_cv, mae_cv, mape_cv, r2_cv = evaluate_metrics(y, y_pred_cv)
                results.append({
                    'method': f'CV ({outlier_method})',
                    'test_size': test_size,
                    'nfolds': nfold,
                    'MSE': mse_cv,
                    'RMSE': rmse_cv,
                    'MAE': mae_cv,
                    'MAPE': mape_cv,
                    'R2': r2_cv
                })

    # Convert the results to a DataFrame for better visualization
    results_df = pd.DataFrame(results)

    # Save the results to a CSV file
    results_df.to_csv('hasil/00_02_1_xgboost_outlier_detection_comparison_pipeline.csv', index=False)

    return results_df


In [17]:

# Parameters
file_path = 'dataset/agriculture_dataset.csv'
test_sizes = [0.2, 0.25, 0.3, 0.35]
# nfolds = [3, 5, 7, 9, 12]
nfolds = [5]

# Run the function and get the results
results_df = perform_N2O_xgboost_prediction(file_path, test_sizes, nfolds)

# Display the results
# import ace_tools as tools; tools.display_dataframe_to_user(name="XGBoost Outlier Detection Comparison", dataframe=results_df)


Parameters: { "enable_categorical", "missing", "use_label_encoder" } are not used.



XGBoostError: [13:43:21] C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0b3782d1791676daf-1\xgboost\xgboost-ci-windows\src\data\data.cc:501: Check failed: this->labels.Size() % this->num_row_ == 0 (1616 vs. 0) : Incorrect size for labels.