In [1]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")


In [3]:
diamonds = pd.read_csv("seaborn_diamonds_dataset.csv")
diamonds.sample(5)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
41028,0.41,Ideal,E,VVS2,61.0,56.0,1187,4.79,4.82,2.93
42037,0.51,Ideal,I,VS2,60.5,58.0,1268,5.15,5.2,3.13
19910,1.21,Premium,G,VS1,62.7,58.0,8456,6.79,6.74,4.24
43507,0.43,Ideal,F,VVS1,62.0,57.0,1422,4.84,4.81,2.99
22448,1.72,Very Good,J,VS2,60.9,61.0,10477,7.77,7.79,4.74


In [4]:
diamonds.dtypes

carat      float64
cut         object
color       object
clarity     object
depth      float64
table      float64
price        int64
x          float64
y          float64
z          float64
dtype: object

In [8]:
categories=diamonds.select_dtypes(exclude=np.number).columns.tolist()
for col in categories:
    diamonds[col]=diamonds[col].astype('category')

diamonds.dtypes

carat       float64
cut        category
color      category
clarity    category
depth       float64
table       float64
price         int64
x           float64
y           float64
z           float64
dtype: object

In [9]:
X_clf=diamonds.drop('cut',axis=1)
y_clf=diamonds[['cut']]

print(X_clf.sample(5))
print(y_clf.sample(5))


       carat color clarity  depth  table  price     x     y     z
49225   0.58     E     VS1   61.8   55.0   2082  5.35  5.39  3.32
14808   1.21     I     VS2   62.3   58.0   5962  6.82  6.79  4.24
53517   0.71     F     VS2   62.4   59.0   2686  5.71  5.66  3.55
4417    1.01     E     SI2   63.9   58.0   3611  6.37  6.31  4.05
43030   0.50     G     VS2   62.8   56.0   1373  5.05  5.08  3.18
             cut
20658      Ideal
38658    Premium
43181  Very Good
17688    Premium
20996      Ideal


In [10]:
enc=OrdinalEncoder()
y_encoded_clf=enc.fit_transform(y_clf)
y_encoded_clf.dtype

dtype('float64')

In [20]:
X_train_clf, X_test_clf, y_train_clf, y_test_clf = train_test_split(X_clf, y_encoded_clf, test_size=0.25, random_state=1,stratify=y_encoded_clf)

In [21]:
print(X_train_clf.shape, X_test_clf.shape, y_train_clf.shape, y_test_clf.shape)

(40455, 9) (13485, 9) (40455, 1) (13485, 1)


In [22]:
dtrain_clf= xgb.DMatrix(X_train_clf, y_train_clf, enable_categorical = True)
dtest_clf = xgb.DMatrix(X_test_clf, y_test_clf, enable_categorical= True)

In [23]:
parameters={
    "objective":"multi:softprob",
    "tree_method":"hist",
    "num_class":5,
    "eval_metric":["mlogloss","auc","merror"]
}

n=100

In [24]:
model_clf=xgb.train(
    params=parameters,
    dtrain=dtrain_clf,
    num_boost_round=n
)

In [25]:
#missed


In [27]:
X_reg =diamonds.drop('price',axis=1)
y_reg=diamonds[['price']]

print(X_reg.sample(5))
print(y_reg.sample(5))

       carat        cut color clarity  depth  table     x     y     z
40045   0.54       Good     E     SI2   63.9   54.0  5.18  5.12  3.29
8635    1.07  Very Good     I     SI1   62.6   58.0  6.49  6.57  4.09
24329   2.00      Ideal     H     SI2   61.5   57.0  8.04  8.00  4.93
5891    0.90    Premium     I    VVS2   58.3   60.0  6.32  6.28  3.67
32511   0.40       Good     E     SI2   63.9   57.0  4.71  4.65  2.99
       price
36948    479
24034    633
47297    394
43607    507
48126   1939


In [28]:
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg, test_size=0.25, random_state=1)

In [49]:
dtrain_reg=xgb.DMatrix(X_train_reg, y_train_reg, enable_categorical=True)
dtest_reg = xgb.DMatrix(X_test_reg, y_test_reg, enable_categorical=True)

In [50]:
parameters={
    "objective": "reg:squarederror",
    "tree_method": "hist"}

n=100

In [62]:
model_reg=xgb.train(
    params = parameters,
    dtrain = dtrain_reg,
    num_boost_round=n)

predictions_reg=model_reg.predict(dtest_reg)
pd.DataFrame(predictions_reg)

Unnamed: 0,0
0,478.662018
1,5764.281738
2,2823.106689
3,527.639648
4,6267.396973
...,...
13480,2215.891846
13481,958.218811
13482,3661.878906
13483,3771.973389


In [63]:
rmse=mean_squared_error(y_test_reg,predictions_reg,squared=False)
print(f"RMSE of our model: {rmse:.3f}")

RMSE of our model: 552.861


In [64]:
evals= [(dtrain_reg, "train"),(dtest_reg,"validation")]

parameters={
    "objective": "reg:squarederror",
    "tree_method": "hist"}

n=100

In [65]:
model_reg=xgb.train(
    params = parameters,
    dtrain = dtrain_reg,
    num_boost_round=n,
    evals=evals,
    verbose_eval=10
)

[0]	train-rmse:2874.49146	validation-rmse:2817.90814
[10]	train-rmse:548.36512	validation-rmse:592.03160
[20]	train-rmse:491.09887	validation-rmse:558.53485
[30]	train-rmse:469.58201	validation-rmse:555.51015
[40]	train-rmse:454.32953	validation-rmse:554.45666
[50]	train-rmse:438.68033	validation-rmse:554.13365
[60]	train-rmse:425.38361	validation-rmse:551.57888
[70]	train-rmse:414.71115	validation-rmse:549.26109
[80]	train-rmse:405.41008	validation-rmse:549.03952
[90]	train-rmse:391.04269	validation-rmse:551.87206
[99]	train-rmse:383.48826	validation-rmse:552.86131


In [66]:
n=5000
model_reg=xgb.train(
    params = parameters,
    dtrain = dtrain_reg,
    num_boost_round=n,
    evals=evals,
    verbose_eval=250
)


[0]	train-rmse:2874.49146	validation-rmse:2817.90814
[250]	train-rmse:283.21559	validation-rmse:557.61263
[500]	train-rmse:201.44074	validation-rmse:564.77532
[750]	train-rmse:155.76096	validation-rmse:570.41258
[1000]	train-rmse:127.04692	validation-rmse:574.22590
[1250]	train-rmse:105.13407	validation-rmse:575.74609
[1500]	train-rmse:87.17633	validation-rmse:577.88702
[1750]	train-rmse:75.08646	validation-rmse:578.86698
[2000]	train-rmse:64.86890	validation-rmse:579.64879
[2250]	train-rmse:56.54684	validation-rmse:580.07445
[2500]	train-rmse:50.04183	validation-rmse:580.65263
[2750]	train-rmse:44.39520	validation-rmse:581.19185
[3000]	train-rmse:39.38436	validation-rmse:581.63495
[3250]	train-rmse:35.32653	validation-rmse:582.03607
[3500]	train-rmse:31.85327	validation-rmse:582.04309
[3750]	train-rmse:28.94276	validation-rmse:582.38781
[4000]	train-rmse:26.57816	validation-rmse:582.60757
[4250]	train-rmse:24.39706	validation-rmse:582.65767
[4500]	train-rmse:22.47317	validation-rmse:5

In [67]:
parameters={
    "objective": "reg:squarederror",
    "tree_method": "hist"}

n=100

In [68]:
#crossfold Validation
results_reg=xgb.cv(
    params = parameters,
    dtrain = dtrain_reg,
    num_boost_round=n,
    early_stopping_rounds=20,
    nfold=5
)

results_reg.head()

Unnamed: 0,train-rmse-mean,train-rmse-std,test-rmse-mean,test-rmse-std
0,2874.224552,9.424846,2876.318793,36.995997
1,2088.350837,7.595382,2093.063623,25.351925
2,1552.629638,4.97414,1560.552731,19.550836
3,1185.994963,4.133544,1198.669943,14.648669
4,943.402904,4.757288,962.349383,11.724038


In [69]:
results_reg['test-rmse-mean'].min()

550.2735543625861