### For Final Review

#### XGBoost Model

###### CO2_Emission (µg/g/day)

In [None]:
# ==============================
# IBS Project - XGBoost Model
# ==============================

import pandas as pd

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

from xgboost import XGBRegressor

# ------------------------------
# Load dataset
# ------------------------------
# Forward slashes avoid the invalid "\S" escape sequence that the backslash
# form of this path contains, and they resolve on Windows, macOS and Linux.
df = pd.read_csv("Dataset/Soil_microbe_dataset.csv")

# Define target column
# NOTE(review): the target is chosen by position (second-to-last column), so
# the last column remains in X as a feature. Confirm it is not another
# response variable; if it is, it leaks target information into the model.
target_col = df.columns[-2]

X = df.drop(columns=[target_col])
y = df[target_col]

# Identify numeric and categorical columns
# "number" matches every numeric width (int32, float32, ...), not just the
# 64-bit dtypes, so a numeric feature is not left out of num_cols merely
# because of how it was parsed.
num_cols = X.select_dtypes(include="number").columns.tolist()
cat_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()

# ------------------------------
# Preprocessing pipelines
# ------------------------------
# Numeric features: fill missing values with the column median, then scale.
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical features: fill missing values with the most frequent category,
# then one-hot encode to a dense array, ignoring categories unseen at fit time.
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(sparse_output=False, handle_unknown="ignore")),
    ]
)

# Route each column group to its transformer; columns listed in neither
# group are dropped.
preprocessor = ColumnTransformer(
    remainder="drop",
    transformers=[
        ("num", numeric_transformer, num_cols),
        ("cat", categorical_transformer, cat_cols),
    ],
)

# ------------------------------
# Train-test split
# ------------------------------
RANDOM_SEED = 42

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=RANDOM_SEED,
    test_size=0.2,
)

# ------------------------------
# Default XGBoost Model
# ------------------------------
# Baseline: the shared preprocessing step followed by an XGBRegressor for
# which only the seed and the log verbosity are set.
xgb_default = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", XGBRegressor(verbosity=0, random_state=RANDOM_SEED)),
    ]
)

# Fit on the training rows, then predict the held-out rows.
y_pred_default = xgb_default.fit(X_train, y_train).predict(X_test)

# ------------------------------
# Tuned XGBoost Model
# ------------------------------
# Candidate values for the randomized search. The "model__" prefix addresses
# the pipeline step named "model".
param_dist = {
    "model__n_estimators": [100, 200, 300],
    "model__max_depth": [3, 5, 7, 10],
    "model__learning_rate": [0.01, 0.05, 0.1, 0.2],
    "model__subsample": [0.6, 0.8, 1.0],
    "model__colsample_bytree": [0.6, 0.8, 1.0],
}

# Same pipeline layout as the default model; the search varies the model step.
xgb_tuned = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", XGBRegressor(verbosity=0, random_state=RANDOM_SEED)),
    ]
)

# 10 sampled configurations, each scored by R² with 3-fold cross-validation.
search = RandomizedSearchCV(
    estimator=xgb_tuned,
    param_distributions=param_dist,
    n_iter=10,
    cv=3,
    scoring="r2",
    random_state=RANDOM_SEED,
    n_jobs=-1,
    verbose=1,
)

search.fit(X_train, y_train)

# Keep the best-scoring pipeline and use it to predict the held-out rows.
best_xgb = search.best_estimator_
y_pred_tuned = best_xgb.predict(X_test)

# ------------------------------
# Performance Evaluation
# ------------------------------
# One row per model, one column per metric, all computed on the test set.
# RMSE is taken as sqrt(MSE) explicitly: the `squared=False` shortcut of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so the explicit root works on both old and new versions.
results = pd.DataFrame({
    "R2": [
        r2_score(y_test, y_pred_default),
        r2_score(y_test, y_pred_tuned)
    ],
    "MAE": [
        mean_absolute_error(y_test, y_pred_default),
        mean_absolute_error(y_test, y_pred_tuned)
    ],
    "RMSE": [
        mean_squared_error(y_test, y_pred_default) ** 0.5,
        mean_squared_error(y_test, y_pred_tuned) ** 0.5
    ]
}, index=["XGBoost_default", "XGBoost_tuned"])

print("\nPerformance Comparison (Test Set):\n")
print(results)

# ------------------------------
# Extra Check: Train vs Test R²
# ------------------------------
# Compare each model's R² on the data it was fitted on with its R² on the
# held-out rows; a large gap between the two would point to overfitting.
print("\nTrain vs Test R²:")
for _label, _model, _test_pred in (
    ("Default XGB  -> Train:", xgb_default, y_pred_default),
    ("Tuned   XGB  -> Train:", best_xgb, y_pred_tuned),
):
    print(
        _label,
        r2_score(y_train, _model.predict(X_train)),
        " Test:",
        r2_score(y_test, _test_pred),
    )

  df = pd.read_csv("E:\S3\IBS\Meta-Genomic-Data-Analytics\Dataset\Soil_microbe_dataset.csv")  # change file name if needed


Fitting 3 folds for each of 10 candidates, totalling 30 fits





Performance Comparison (Test Set):

                       R2       MAE      RMSE
XGBoost_default  0.999582  0.082048  0.103227
XGBoost_tuned    0.999577  0.082516  0.103817

Train vs Test R²:
Default XGB  -> Train: 0.999648717680866  Test: 0.9995820652385292
Tuned   XGB  -> Train: 0.9996615364409726  Test: 0.9995772711631369
