In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## About the dataset:

Source: https://www.kaggle.com/datasets/joebeachcapital/diamonds/data

Features:
- carat is a measure of diamond weight. One carat is equivalent to 0.2 grams.
- clarity refers to how clear a diamond is. Diamonds often contain imperfections like cracks or mineral deposits. The fewer and less noticeable a diamond’s imperfections, the better its clarity. clarity contains 8 ordered levels, from “I1” (the worst) to “IF” (the best).
- color refers to the color of the diamond. Colorless diamonds are considered better than diamonds with a yellow tint. The dataset contains diamonds of 7 different colors, represented by different letters. “D” - “F” diamonds are considered colorless, while “G” - “J” diamonds have a very faint color.
- cut refers to how a rough diamond is shaped into a finished diamond. Better cuts create more symmetrical and luminous diamonds. cut has 5 ordered levels: “Fair,” “Good,” “Very Good,” “Premium,” “Ideal.”
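- x, y, and z are the diamond’s length, width, and depth, in millimeters. depth is the total depth percentage, z / mean(x, y), and table is the width of the top of the diamond relative to its widest point, also expressed as a percentage.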

## Goal: 
Based on the given features, predict the diamond prices. 
Selected model for the task: RandomForestRegressor

In [2]:
# Load the diamonds dataset from the CSV file in the local data/ directory.
df = pd.read_csv('data/diamonds.csv')

In [3]:
# Target column order: the nine feature columns first, the "price" target last.
diamond_columns = [
    "carat",
    "cut",
    "color",
    "clarity",
    "depth",
    "table",
    "x",
    "y",
    "z",
    "price",
]

In [4]:
# Keep only the columns listed in diamond_columns, in that order.
df = df[diamond_columns]

In [5]:
# Display the reordered frame.
df

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,0.23,Ideal,E,SI2,61.5,55.0,3.95,3.98,2.43,326
1,0.21,Premium,E,SI1,59.8,61.0,3.89,3.84,2.31,326
2,0.23,Good,E,VS1,56.9,65.0,4.05,4.07,2.31,327
3,0.29,Premium,I,VS2,62.4,58.0,4.20,4.23,2.63,334
4,0.31,Good,J,SI2,63.3,58.0,4.34,4.35,2.75,335
...,...,...,...,...,...,...,...,...,...,...
53935,0.72,Ideal,D,SI1,60.8,57.0,5.75,5.76,3.50,2757
53936,0.72,Good,D,SI1,63.1,55.0,5.69,5.75,3.61,2757
53937,0.70,Very Good,D,SI1,62.8,60.0,5.66,5.68,3.56,2757
53938,0.86,Premium,H,SI2,61.0,58.0,6.15,6.12,3.74,2757


In [6]:
# Summarize the columns: dtypes, non-null counts, and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    53940 non-null  float64
 1   cut      53940 non-null  object 
 2   color    53940 non-null  object 
 3   clarity  53940 non-null  object 
 4   depth    53940 non-null  float64
 5   table    53940 non-null  float64
 6   x        53940 non-null  float64
 7   y        53940 non-null  float64
 8   z        53940 non-null  float64
 9   price    53940 non-null  int64  
dtypes: float64(6), int64(1), object(3)
memory usage: 4.1+ MB


In [7]:
# Feature matrix 'X': every column except the "price" target.
X = df.drop(columns=["price"])

In [8]:
# Target vector 'y': the "price" column.
y = df["price"]

In [9]:
# Convert the non-numerical columns to numbers with one-hot encoding.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Text-valued columns that have to be encoded.
categorical_features = ["cut", "color", "clarity"]

# Encoder applied to the categorical columns.
one_hot = OneHotEncoder()

# Apply the encoder to the categorical columns only; remainder="passthrough"
# keeps every other column in the output unchanged.
transformer = ColumnTransformer(
    transformers=[("one_hot", one_hot, categorical_features)],
    remainder="passthrough",
)

# Fit the transformer on X and build the all-numeric feature matrix.
transformed_X = transformer.fit_transform(X)
transformed_X

array([[0.  , 0.  , 1.  , ..., 3.95, 3.98, 2.43],
       [0.  , 0.  , 0.  , ..., 3.89, 3.84, 2.31],
       [0.  , 1.  , 0.  , ..., 4.05, 4.07, 2.31],
       ...,
       [0.  , 0.  , 0.  , ..., 5.66, 5.68, 3.56],
       [0.  , 0.  , 0.  , ..., 6.15, 6.12, 3.74],
       [0.  , 0.  , 1.  , ..., 5.83, 5.87, 3.64]])

In [10]:
# Split the data into training and test sets:
from sklearn.model_selection import train_test_split

# Keep seeding NumPy's global RNG: any estimator created later without its own
# random_state still draws its randomness from it.
np.random.seed(42)

# Hold out 25% of the rows for testing. The seed is passed explicitly so the
# partition does not depend on the state of the global RNG when this line runs.
X_train, X_test, y_train, y_test = train_test_split(
    transformed_X, y, test_size=0.25, random_state=42
)

In [11]:
# Shapes of the train/test feature matrices followed by the train/test targets.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((40455, 26), (13485, 26), (40455,), (13485,))

In [12]:
# Selected model: RandomForestRegressor
from sklearn.ensemble import RandomForestRegressor

# Baseline forest with default hyperparameters. random_state is fixed so that
# re-running this cell on its own reproduces the same fit and score instead of
# depending on how far NumPy's global RNG has advanced.
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# For regressors, score() reports the R^2 coefficient on the given data.
model.score(X_test, y_test)

0.9808420256850187

In [13]:
# Predictions:
# Predict with the baseline model on X_test and preview the first ten values.
y_preds = model.predict(X_test)
y_preds[:10]

array([  571.9 ,  2335.44,  1216.83,  1239.48, 10024.29,  4166.22,
        1780.88,  1810.7 ,  2041.2 ,  5752.58])

In [17]:
# Side-by-side table of the true test prices and the baseline predictions.
prices = pd.DataFrame(
    {
        "actual values": y_test,
        "predictions": y_preds,
    }
)
prices

Unnamed: 0,actual values,predictions
1388,559,571.90
50052,2201,2335.44
41645,1238,1216.83
42377,1304,1239.48
17244,6901,10024.29
...,...,...
36825,957,1135.98
24803,13187,12576.26
20999,9215,9100.11
8471,4416,4344.19


In [18]:
# Regression model evaluation:

# Mean Absolute Error of the baseline predictions against the test targets.
from sklearn.metrics import mean_absolute_error

mean_absolute_error(y_true=y_test, y_pred=y_preds)

272.1638465137631

In [19]:
# Mean Squared Error of the baseline predictions against the test targets.
from sklearn.metrics import mean_squared_error

mean_squared_error(y_true=y_test, y_pred=y_preds)


300965.8729456287

In [20]:
%%time
# The model performs well, but for practice let's tune hyperparameters with RandomizedSearchCV. This most likely will cause the model to perform slightly worse.
from sklearn.model_selection import RandomizedSearchCV

param_dist = {
    "n_estimators": np.arange(10, 200, 10),
    "max_depth": [None, 5, 10, 15],
    "min_samples_split": np.arange(2, 20, 2),
    "min_samples_leaf": np.arange(1, 20, 2),
    "max_features": [0.5, 1.0, "sqrt"],
    "max_samples": [10000]
}

rs_model = RandomizedSearchCV(RandomForestRegressor(),
                             param_distributions=param_dist,
                             n_iter=20,
                             cv=5,
                             verbose=True)

rs_model.fit(X_train, y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
CPU times: total: 2min 49s
Wall time: 2min 50s


In [21]:
# And the best params are...
# (the hyperparameter combination selected by the randomized search)
rs_model.best_params_

{'n_estimators': 90,
 'min_samples_split': 8,
 'min_samples_leaf': 3,
 'max_samples': 10000,
 'max_features': 0.5,
 'max_depth': 15}

In [23]:
# Train the model again with the best parameters found by the search.
# The parameters are unpacked from rs_model.best_params_ rather than copied by
# hand, so this cell stays in sync with whatever the search selects on a re-run.
# random_state is fixed so the refit itself is reproducible.
ideal_model = RandomForestRegressor(**rs_model.best_params_, random_state=42)

ideal_model.fit(X_train, y_train)

In [25]:
# As predicted, the score decreased slightly.
# score() on a regressor reports R^2, here for the tuned model on the test set.
ideal_model.score(X_test, y_test)

0.9777180452403154

In [27]:
# Predict with the tuned model on X_test and preview the first ten values.
y_preds_ideal = ideal_model.predict(X_test)
y_preds_ideal[:10]

array([ 523.08284394, 2332.30507774, 1139.34047215, 1216.7082845 ,
       9648.69611512, 3978.24805764, 1752.11113361, 1810.25429449,
       2179.21405368, 5836.514451  ])

In [28]:
# Side-by-side table of the true test prices and the tuned model's predictions.
prices2 = pd.DataFrame(
    {
        "actual values": y_test,
        "predictions": y_preds_ideal,
    }
)
prices2

Unnamed: 0,actual values,predictions
1388,559,523.082844
50052,2201,2332.305078
41645,1238,1139.340472
42377,1304,1216.708285
17244,6901,9648.696115
...,...,...
36825,957,1264.316707
24803,13187,12280.434092
20999,9215,8739.301514
8471,4416,4276.668781


In [29]:
# Mean Absolute Error of the tuned model's predictions against the test targets.
mean_absolute_error(y_true=y_test, y_pred=y_preds_ideal)

308.2198114702241