In [1]:
import os
import pandas as pd
from xgboost import XGBRegressor, DMatrix, cv
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error
import joblib


In [2]:
# Project-relative locations, resolved against the notebook's working directory:
# interim CSV splits are read from data_path, the fitted model is written to model_path.
data_path = '../data/interim'
model_path = '../model'

In [5]:
# Load the four pipe-delimited split files from data_path, in the same order as
# the names on the left-hand side (features/targets for train, then for test).
train_x, train_y, test_x, test_y = (
    pd.read_csv(os.path.join(data_path, csv_name), sep='|')
    for csv_name in ("train_x.csv", "train_y.csv", "test_x.csv", "test_y.csv")
)

In [6]:
# Wrap both splits in XGBoost's native DMatrix container (features + label).
# NOTE(review): neither matrix is referenced by the cells shown in this notebook;
# the grid search below is fitted on the pandas frames directly.
dm_train = DMatrix(train_x, label=train_y)
dm_test = DMatrix(test_x, label=test_y)


In [7]:
# Hyper-parameter candidates: 3 x 3 x 3 = 27 combinations.
param_grid = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 4, 5],
)

# Exhaustive search over param_grid with 3-fold cross-validation, ranking
# candidates by negative mean absolute error (higher is better), run serially.
grid = GridSearchCV(
    XGBRegressor(device="cpu"),
    param_grid,
    scoring='neg_mean_absolute_error',
    cv=3,
    n_jobs=1,
    verbose=1,
)

In [8]:
def fit(model, *args, **kwargs):
    """Fit ``model`` with verbose logging switched on and hand it back.

    The model's ``verbose`` attribute is overwritten with 10 before fitting,
    so the object passed in is mutated in place.

    Args:
        model: Any object exposing a ``fit`` method and accepting a
            ``verbose`` attribute.
        *args: Positional arguments forwarded unchanged to ``model.fit``.
        **kwargs: Keyword arguments forwarded unchanged to ``model.fit``.

    Returns:
        The same ``model`` instance that was passed in, after fitting.
    """
    # Must happen before fitting so the setting is in effect during the fit.
    setattr(model, "verbose", 10)
    model.fit(*args, **kwargs)
    return model

In [9]:
# Run the grid search on the training split. fit() sets verbose=10 on the search
# object before fitting and returns that same object, so grid_search is grid.
grid_search = fit(grid, train_x, train_y)

Fitting 3 folds for each of 27 candidates, totalling 81 fits
[CV 1/3; 1/27] START learning_rate=0.01, max_depth=3, n_estimators=100..........
[CV 1/3; 1/27] END learning_rate=0.01, max_depth=3, n_estimators=100;, score=-0.926 total time=   1.9s
[CV 2/3; 1/27] START learning_rate=0.01, max_depth=3, n_estimators=100..........
[CV 2/3; 1/27] END learning_rate=0.01, max_depth=3, n_estimators=100;, score=-0.894 total time=   1.8s
[CV 3/3; 1/27] START learning_rate=0.01, max_depth=3, n_estimators=100..........
[CV 3/3; 1/27] END learning_rate=0.01, max_depth=3, n_estimators=100;, score=-0.880 total time=   1.8s
[CV 1/3; 2/27] START learning_rate=0.01, max_depth=3, n_estimators=200..........
[CV 1/3; 2/27] END learning_rate=0.01, max_depth=3, n_estimators=200;, score=-0.905 total time=   3.0s
[CV 2/3; 2/27] START learning_rate=0.01, max_depth=3, n_estimators=200..........
[CV 2/3; 2/27] END learning_rate=0.01, max_depth=3, n_estimators=200;, score=-0.872 total time=   3.1s
[CV 3/3; 2/27] STAR

In [10]:
# Estimator for the best-scoring parameter combination found by the search
# (available because the GridSearchCV above keeps the default refit behaviour).
best_xgb = grid_search.best_estimator_

In [11]:
# Score the tuned model on the held-out test split.
predictions = best_xgb.predict(test_x)

# Calculate evaluation metrics
mse = mean_squared_error(test_y, predictions)
# RMSE is derived from the MSE rather than via `squared=False`: that keyword was
# deprecated in scikit-learn 1.4 and removed in 1.6, where it raises TypeError.
# For a single target column the two are identical (the recorded output agrees:
# 0.97188...² ≈ 0.94456...). NOTE(review): assumes test_y holds one target
# column; with several targets the old call averaged per-column RMSEs instead.
rmse = mse ** 0.5
mae = mean_absolute_error(test_y, predictions)

print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("Mean Absolute Error (MAE):", mae)

Mean Squared Error (MSE): 0.9445614701887707
Root Mean Squared Error (RMSE): 0.9718855231912711
Mean Absolute Error (MAE): 0.7758542171865702


In [None]:
# Persist the tuned model. The target directory is created first so the dump
# does not fail with FileNotFoundError when ../model is missing (e.g. on a
# fresh checkout); exist_ok=True makes re-running this cell harmless.
os.makedirs(model_path, exist_ok=True)
joblib.dump(best_xgb, os.path.join(model_path, 'best_xgb_model.pkl'))