## XGBoost

In [56]:
!pip install --user xgboost



## For devices with an nvidia GPU

In [None]:
# Use NVIDIA GPU
# conda install -c conda-forge py-xgboost-gpu

In [57]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import root_mean_squared_log_error

warnings.filterwarnings("ignore")

## Load the Data

In [58]:
diamonds = sns.load_dataset("diamonds")
diamonds.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [59]:
# Extract feature and target arrays
X, y = diamonds.drop('price', axis=1), diamonds[['price']]

## Data preprocessing

In [60]:
# Extract text features
cats = X.select_dtypes(exclude=np.number).columns.tolist()

# Convert to Pandas category
for col in cats:
   X[col] = X[col].astype('category')

## Split the Data

In [61]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [62]:
# Create regression matrices
dtrain_reg = xgb.DMatrix(X_train, y_train, enable_categorical=True)
dtest_reg = xgb.DMatrix(X_test, y_test, enable_categorical=True)

# Define hyperparameters


In [63]:
params = {"objective": "reg:squarederror", "tree_method": "gpu_hist"}

## Train the Regression Model

In [64]:
n = 100
model = xgb.train(
   params=params,
   dtrain=dtrain_reg,
   num_boost_round=n,
)

In [65]:
preds = model.predict(dtest_reg)

## Evaluate the model

In [66]:
rmse = root_mean_squared_log_error(y_test, preds)

print(f"RMSLE of the base model: {rmse:.3f}")

RMSLE of the base model: 0.094


In [67]:
params = {"objective": "reg:squarederror", "tree_method": "gpu_hist"}
n = 100

In [68]:
evals = [(dtrain_reg, "train"), (dtest_reg, "validation")]

model = xgb.train(
   params=params,
   dtrain=dtrain_reg,
   num_boost_round=n,
   evals=evals,
)

[0]	train-rmse:2874.29379	validation-rmse:2817.38773
[1]	train-rmse:2092.07711	validation-rmse:2054.73630
[2]	train-rmse:1549.52687	validation-rmse:1526.30592
[3]	train-rmse:1184.46798	validation-rmse:1174.90119
[4]	train-rmse:941.09127	validation-rmse:943.28272
[5]	train-rmse:784.58014	validation-rmse:796.09651
[6]	train-rmse:685.75110	validation-rmse:705.22245
[7]	train-rmse:624.67281	validation-rmse:653.32563
[8]	train-rmse:584.19599	validation-rmse:620.30404
[9]	train-rmse:558.77667	validation-rmse:599.24504
[10]	train-rmse:543.85303	validation-rmse:586.99790
[11]	train-rmse:531.92694	validation-rmse:578.68120
[12]	train-rmse:523.08456	validation-rmse:571.73527
[13]	train-rmse:515.67753	validation-rmse:567.19913
[14]	train-rmse:510.77594	validation-rmse:564.66402
[15]	train-rmse:506.68519	validation-rmse:563.21547
[16]	train-rmse:502.96796	validation-rmse:561.80880
[17]	train-rmse:498.90184	validation-rmse:560.36561
[18]	train-rmse:492.74859	validation-rmse:558.46274
[19]	train-rms

## Cross Validation

In [69]:
params = {"objective": "reg:squarederror", "tree_method": "gpu_hist"}
n = 1000

results = xgb.cv(
   params, dtrain_reg,
   num_boost_round=n,
   nfold=5,
   early_stopping_rounds=20
)

In [70]:
results.keys()

Index(['train-rmse-mean', 'train-rmse-std', 'test-rmse-mean', 'test-rmse-std'], dtype='object')

In [71]:
results['test-rmse-mean'].max()

2877.437273731796