In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import RFECV, SelectKBest, RFE, f_regression
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, r2_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor
from sklearn.pipeline import Pipeline
from sklearn.inspection import PartialDependenceDisplay
from xgboost import XGBRegressor
from xgboost import plot_importance
from xgboost import plot_tree
import joblib

In [None]:
# Label used in the printed grid-search report for this model.
model_name = 'xgboost_model'
# Base regressor used as the final pipeline step; its hyperparameters are
# searched through the `estimator__*` entries of the parameter grid.
model = XGBRegressor()

In [None]:
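# TODO: read data -- load the dataset into a DataFrame named `df`; it must contain the 'Hsig (m)' target column.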


In [None]:
# Separate the target column from the predictors.
# `drop` is used instead of `pop` so that `df` itself is left untouched: the
# cell can then be re-run without a KeyError and `df` still holds every column.
TARGET_COLUMN = 'Hsig (m)'
y = df[TARGET_COLUMN]
X = df.drop(columns=[TARGET_COLUMN])

In [None]:
# Candidate feature counts tried by the RFE and SelectKBest branches of the parameter grid.
n_features_to_select = [5, 13]

In [None]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

# Model pipeline and grid search parameters

In [None]:
# Default three-stage pipeline: scale -> select features -> regress.
# The step names ('scaler', 'feature_selection', 'estimator') are the keys the
# parameter grid uses to swap or tune each stage.
default_selector = RFECV(estimator=GradientBoostingRegressor(n_estimators=50), cv=5)
pipeline_steps = [
    ('scaler', StandardScaler()),
    ('feature_selection', default_selector),
    ('estimator', model),
]
pipeline = Pipeline(steps=pipeline_steps)

In [None]:
# Hyperparameter candidates for the final 'estimator' step; the
# `estimator__` prefix routes each value to that pipeline step.
model_params = dict(
    estimator__n_estimators=[50, 100],
    estimator__learning_rate=[0.03, 0.1],
    estimator__max_depth=[3, 5, 8],
)

In [None]:
# Search space: three sub-grids that differ only in the feature-selection
# strategy (none, recursive elimination, univariate F-test).

# Scaler choices shared by every sub-grid; 'passthrough' disables scaling.
scaler_options = [StandardScaler(), MinMaxScaler(), 'passthrough']

# Entries common to all sub-grids: scaler choice, the estimator itself and its
# hyperparameter candidates.
shared_grid = {
    'scaler': scaler_options,
    'estimator': [model],
    **model_params,
}

param_grid = [
    # No feature selection.
    {
        **shared_grid,
        'feature_selection': ['passthrough'],
    },
    # Recursive feature elimination driven by a decision tree. The tree is
    # seeded because scikit-learn permutes features randomly at each split,
    # so an unseeded tree can select different features from run to run.
    {
        **shared_grid,
        'feature_selection': [RFE(estimator=DecisionTreeRegressor(random_state=42))],
        'feature_selection__n_features_to_select': n_features_to_select,
    },
    # Univariate selection by F-statistic.
    {
        **shared_grid,
        'feature_selection': [SelectKBest(f_regression)],
        'feature_selection__k': n_features_to_select,
    },
]

# Searching best model and feature importance

In [None]:
# Exhaustive search over `param_grid`, scored by negative mean absolute error
# with 3-fold cross-validation on the training split.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',
    cv=3,
    n_jobs=5,
    verbose=3,
)
grid_search.fit(X_train, y_train)

In [None]:
# Report the winning parameter combination and its cross-validated score.
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print(f'Best parameters for {model_name} for Hsig (m): {best_params}')
print(f'Best score for {model_name} for Hsig (m): {best_score}')

In [None]:
# List the input columns that survive the feature-selection step of the best pipeline.
best_pipe = grid_search.best_estimator_
sel = best_pipe.named_steps['feature_selection']
# A 'passthrough' step keeps every column; a fitted selector exposes a boolean mask.
selected_names = (
    X_train.columns
    if sel == 'passthrough'
    else X_train.columns[sel.get_support()]
)
selected_names.tolist()

In [None]:
# Predict on the held-out test set with the refitted best pipeline, then
# compute MSE, MAPE and R2 against the true values.
y_pred_test = grid_search.predict(X_test)
mse_test, mape_test, r2_test = (
    metric(y_test, y_pred_test)
    for metric in (mean_squared_error, mean_absolute_percentage_error, r2_score)
)

In [None]:
# Print raw feature importances when the final estimator provides them.
fitted_importances = getattr(grid_search.best_estimator_['estimator'], 'feature_importances_', None)
if fitted_importances is not None:
    print("Feature importances:")
    print(fitted_importances)

In [None]:
# Print model coefficients when the final estimator provides them.
fitted_coefficients = getattr(grid_search.best_estimator_['estimator'], 'coef_', None)
if fitted_coefficients is not None:
    print("Coefficients:")
    print(fitted_coefficients)

In [None]:
# Build a table of feature importances indexed by the names of the features
# that actually reach the final estimator.
final_estimator = grid_search.best_estimator_['estimator']
if hasattr(final_estimator, 'feature_importances_'):
    feature_importances = final_estimator.feature_importances_
    # Pass the original column names explicitly: when every preprocessing step
    # is 'passthrough' the sliced pipeline has nothing to derive names from
    # and would otherwise return None, leaving the table with a numeric index.
    feature_names = grid_search.best_estimator_[:-1].get_feature_names_out(X_train.columns)
    feature_importances_df = pd.DataFrame(feature_importances, index=feature_names, columns=['importance'])
else:
    # Always define the name so later cells that display it do not raise NameError.
    feature_importances_df = pd.DataFrame(columns=['importance'])


In [None]:
# Display the feature-importance table (one row per feature fed to the estimator).
feature_importances_df

In [None]:
# Individual conditional expectation (ICE) curves for the best fitted pipeline.
best_pipe = grid_search.best_estimator_

# Features are picked by positional index into X_train's columns: the first
# six, capped at the number of available columns so narrower frames still work.
features_to_plot = list(range(min(6, X_train.shape[1])))

# Create the figure explicitly and hand its axes to scikit-learn. Without `ax`
# the display is drawn on a figure scikit-learn creates itself, so the
# requested figsize is ignored and an empty figure is left behind.
fig, ax = plt.subplots(figsize=(10, 5))
PartialDependenceDisplay.from_estimator(
    best_pipe,
    X_train,                # raw training frame (with original column names)
    features=features_to_plot,
    kind="individual",      # ICE curves; "average" gives classic partial dependence, "both" overlays the two
    grid_resolution=50,
    ax=ax,                  # a single axes is used as the bounding box for the grid of sub-plots
)
fig.suptitle("XGBoost Partial Dependence")

In [None]:
# Build a coefficient table (intercept first) when the final estimator is a
# linear-style model exposing `coef_`.
final_estimator = grid_search.best_estimator_['estimator']
if hasattr(final_estimator, 'coef_'):
    intercept = final_estimator.intercept_
    coefficients = final_estimator.coef_

    # Pass the original column names so they survive all-'passthrough' preprocessing.
    feature_names = grid_search.best_estimator_[:-1].get_feature_names_out(X_train.columns)

    # Save coefficients and intercept together
    coefficients_df = pd.DataFrame({'feature': feature_names, 'coefficient': coefficients})
    # Read `fit_intercept` from the fitted estimator rather than from
    # `best_params_`: the key only exists there when it was part of the search
    # grid, and looking it up otherwise raises KeyError. Estimators without
    # the attribute are treated as having an intercept.
    if getattr(final_estimator, 'fit_intercept', True):
        coefficients_df.loc[-1] = ['intercept', intercept]  # Add intercept as a row
        coefficients_df.index = coefficients_df.index + 1  # Shift index
    coefficients_df = coefficients_df.sort_index()     # Sort so intercept is first


In [None]:
# Report the held-out test metrics, including the Pearson correlation between
# measured and predicted values.
test_correlation = np.corrcoef(y_test, y_pred_test)[0, 1]
print("Test Metrics")
print(f"Mean Squared Error: {mse_test}")
print(f"Mean Absolute Percentage Error: {mape_test}")
print(f"R2 Score: {r2_test}")
print(f"Correlation coefficient: {test_correlation:.2f}")

# Plot predictions against true values; plot errors

In [None]:
# Measured vs. estimated target on the test set, on fixed 0-2 axes.
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred_test)
ax.set_xlabel('Measured Hsig (m)')
ax.set_ylabel('Estimated Hsig (m)')
ax.set_ylim([0, 2])
ax.set_xlim([0, 2])
ax.grid()

In [None]:
# Extract the fitted XGBRegressor from the best pipeline
xgb_model = grid_search.best_estimator_.named_steps['estimator']

# Plot the first tree (0-indexed).
# NOTE: `plot_tree` is imported from both sklearn.tree and xgboost; the xgboost
# import comes last, so that is the function called here.
# The axes are created up front and passed in: without `ax`, xgboost draws on a
# figure of its own, ignoring the requested size and leaving an empty figure.
fig, ax = plt.subplots(figsize=(30, 25))
plot_tree(xgb_model, num_trees=0, rankdir='LR', ax=ax)


In [None]:
# Bar chart of the 20 most important features of the fitted XGBoost model.
# The axes are created up front and passed in: without `ax`, xgboost draws on a
# figure of its own, ignoring the requested size and leaving an empty figure.
fig, ax = plt.subplots(figsize=(10, 6))
plot_importance(xgb_model, max_num_features=20, ax=ax)
ax.set_title("XGBoost Feature Importance")