In [2]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings("ignore")

In [3]:
# Fetch seaborn's example dataset named "diamonds" and preview its first
# five rows.
diamonds = sns.load_dataset(name="diamonds")
diamonds.head(n=5)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [4]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
diamonds.shape

(53940, 10)

In [5]:
from sklearn.model_selection import train_test_split

# Target: the price column, selected with a list so it stays a one-column
# DataFrame rather than a Series.
y = diamonds[['price']]
# Features: every column except the target.
X = diamonds.drop(columns='price')

In [6]:
# Split features and target into train and test partitions (default split
# proportions). The seed is fixed so that the partition -- and every metric
# computed from it further down -- is identical on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [7]:
import xgboost as xgb

# Wrap each split in an XGBoost DMatrix (features + price label).
# enable_categorical=True is passed to both -- presumably so the
# cut/color/clarity columns are consumed as categoricals; confirm dtypes.
dtrain_reg = xgb.DMatrix(data=X_train, label=y_train, enable_categorical=True)
dtest_reg = xgb.DMatrix(data=X_test, label=y_test, enable_categorical=True)

# Cross Validation

In [8]:
# (DMatrix, display name) pairs whose metrics are reported during training.
# Note that the entry displayed as "validation" wraps dtest_reg, i.e. the
# held-out test split.
evals = [
    (dtrain_reg, "train"),
    (dtest_reg, "validation"),
]

In [9]:
# Squared-error regression using the histogram tree builder on a CUDA GPU.
# The former spelling `tree_method="gpu_hist"` is deprecated since
# XGBoost 2.0; the supported form is `tree_method="hist"` plus
# `device="cuda"`.
params = {
    "objective": "reg:squarederror",
    "tree_method": "hist",
    "device": "cuda",
}
n = 1000  # upper bound on boosting rounds; early stopping usually ends sooner

# 5-fold cross-validation on the training split only. Stops once the
# cross-validated test metric has not improved for 20 consecutive rounds.
results = xgb.cv(
    params,
    dtrain_reg,
    num_boost_round=n,
    nfold=5,
    early_stopping_rounds=20,
)

# One row per retained boosting round, so the row count is used as the
# selected number of rounds.
print(f'best_num_boost_round : {len(results)}')

best_num_boost_round : 53


In [10]:
# Preview the first five rounds of the cross-validation history
# (mean and std of train/test RMSE per round).
results.head(n=5)

Unnamed: 0,train-rmse-mean,train-rmse-std,test-rmse-mean,test-rmse-std
0,2859.095179,3.738838,2859.711025,19.249719
1,2081.608111,2.169328,2085.556203,16.71534
2,1542.789804,1.44816,1551.686252,14.815797
3,1177.386062,1.924371,1191.49739,13.893185
4,936.177799,1.213138,958.509243,11.464197


In [11]:
# Lowest mean test-fold RMSE reached over all retained CV rounds.
best_rmse = results.loc[:, 'test-rmse-mean'].min()
best_rmse

548.6714505957655

# Training

In [12]:
# Number of boosting rounds selected by cross-validation (one history row
# per retained round).
best_num_boost_round = len(results)

# Fit the final booster on the training split (dtrain_reg) for exactly the
# CV-selected number of rounds. Both entries of `evals` are reported every
# 10 rounds for monitoring only. No early stopping is requested here: the
# round count has already been chosen by CV, and xgb.train would drive early
# stopping from the last entry of `evals` -- the held-out test split -- which
# would let test data influence the model that is scored on it afterwards.
final_model = xgb.train(
    params,
    dtrain_reg,
    num_boost_round=best_num_boost_round,
    evals=evals,
    verbose_eval=10,
)

[0]	train-rmse:2859.16191	validation-rmse:2863.26854
[10]	train-rmse:541.71155	validation-rmse:595.17258
[20]	train-rmse:483.63884	validation-rmse:564.80710
[30]	train-rmse:457.50330	validation-rmse:561.33566
[40]	train-rmse:441.58532	validation-rmse:560.49224
[50]	train-rmse:428.43810	validation-rmse:558.54031
[52]	train-rmse:427.23083	validation-rmse:559.00866


# Testing

In [12]:
from sklearn.metrics import mean_squared_error

# Predict prices for the held-out test split with the trained booster.
preds = final_model.predict(data=dtest_reg)

In [None]:
# Root-mean-squared error on the held-out test split. The square root is
# taken explicitly instead of passing `squared=False`, which scikit-learn
# deprecated in 1.4 and removed in 1.6; this form works on every version.
rmse = float(np.sqrt(mean_squared_error(y_test, preds)))
print(f"RMSE of the base model: {rmse:.3f}")