In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

## Loading data 

In [2]:
df = pd.read_csv("/kaggle/input/dimond-price-dataset/diamonds.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,cut,color,clarity,carat_weight,cut_quality,lab,symmetry,polish,eye_clean,...,meas_depth,girdle_min,girdle_max,fluor_color,fluor_intensity,fancy_color_dominant_color,fancy_color_secondary_color,fancy_color_overtone,fancy_color_intensity,total_sales_price
0,0,Round,E,VVS2,0.09,Excellent,IGI,Very Good,Very Good,unknown,...,1.79,M,M,unknown,,unknown,unknown,unknown,unknown,200
1,1,Round,E,VVS2,0.09,Very Good,IGI,Very Good,Very Good,unknown,...,1.78,STK,STK,unknown,,unknown,unknown,unknown,unknown,200
2,2,Round,E,VVS2,0.09,Excellent,IGI,Very Good,Very Good,unknown,...,1.77,TN,M,unknown,,unknown,unknown,unknown,unknown,200
3,3,Round,E,VVS2,0.09,Excellent,IGI,Very Good,Very Good,unknown,...,1.78,M,STK,unknown,,unknown,unknown,unknown,unknown,200
4,4,Round,E,VVS2,0.09,Very Good,IGI,Very Good,Excellent,unknown,...,1.82,STK,STK,unknown,,unknown,unknown,unknown,unknown,200


In [3]:
df.shape

(219703, 26)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 219703 entries, 0 to 219702
Data columns (total 26 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Unnamed: 0                   219703 non-null  int64  
 1   cut                          219703 non-null  object 
 2   color                        219703 non-null  object 
 3   clarity                      219703 non-null  object 
 4   carat_weight                 219703 non-null  float64
 5   cut_quality                  219703 non-null  object 
 6   lab                          219703 non-null  object 
 7   symmetry                     219703 non-null  object 
 8   polish                       219703 non-null  object 
 9   eye_clean                    219703 non-null  object 
 10  culet_size                   219703 non-null  object 
 11  culet_condition              219703 non-null  object 
 12  depth_percent                219703 non-null  float64
 13 

In [5]:
df.isnull().sum()

Unnamed: 0                          0
cut                                 0
color                               0
clarity                             0
carat_weight                        0
cut_quality                         0
lab                                 0
symmetry                            0
polish                              0
eye_clean                           0
culet_size                          0
culet_condition                     0
depth_percent                       0
table_percent                       0
meas_length                         0
meas_width                          0
meas_depth                          0
girdle_min                          0
girdle_max                          0
fluor_color                         0
fluor_intensity                143491
fancy_color_dominant_color          0
fancy_color_secondary_color         0
fancy_color_overtone             1650
fancy_color_intensity               0
total_sales_price                   0
dtype: int64

## Cleaning Data 

In [6]:
df.drop(["Unnamed: 0"], axis=1, inplace=True)
# Drop this columns as more than 90% of data is unknown 
df.drop(['fancy_color_dominant_color', 'fancy_color_secondary_color', 
    'fancy_color_overtone', 'fancy_color_intensity','fluor_color','culet_condition',"eye_clean","fluor_intensity"] ,axis=1, inplace=True)

In [7]:
df.nunique()

cut                     11
color                   11
clarity                 11
carat_weight           878
cut_quality              6
lab                      3
symmetry                 5
polish                   5
culet_size               9
depth_percent          534
table_percent          235
meas_length           1415
meas_width            1155
meas_depth             852
girdle_min              10
girdle_max              10
total_sales_price    22202
dtype: int64

In [8]:
#  Remove "Impossible" Physics
# Diamonds cannot have 0 length, width, or depth. These are data entry errors.
df = df[(df['meas_length'] > 0) & 
        (df['meas_width'] > 0) & 
        (df['meas_depth'] > 0)]

# Remove Target Errors
# Price must be positive
df = df[df['total_sales_price'] > 0]

print(f"Shape after Cleaning Bad Outliers: {df.shape}")

Shape after Cleaning Bad Outliers: (217910, 17)


In [9]:
# Delete Duplicate Rows 
df[df.duplicated()]

Unnamed: 0,cut,color,clarity,carat_weight,cut_quality,lab,symmetry,polish,culet_size,depth_percent,table_percent,meas_length,meas_width,meas_depth,girdle_min,girdle_max,total_sales_price
111,Round,L,I2,0.31,Very Good,GIA,Very Good,Very Good,unknown,64.2,55.0,4.26,4.28,2.75,unknown,unknown,274
391,Round,M,SI1,0.33,Very Good,GIA,Excellent,Very Good,unknown,64.4,57.0,4.30,4.34,2.78,M,TK,354
786,Round,M,SI1,0.30,Very Good,GIA,Very Good,Very Good,unknown,64.4,57.0,4.15,4.21,2.69,unknown,unknown,400
787,Round,M,SI1,0.30,Very Good,GIA,Very Good,Very Good,unknown,63.5,57.0,4.17,4.24,2.67,unknown,unknown,400
893,Round,G,I2,0.30,Very Good,GIA,Very Good,Excellent,unknown,62.1,62.0,4.23,4.28,2.64,unknown,unknown,408
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
219494,Radiant,unknown,VS1,15.95,unknown,GIA,Good,Very Good,unknown,65.4,67.0,15.41,13.11,8.57,M,STK,378829
219506,Radiant,G,VS2,9.05,unknown,GIA,Very Good,Very Good,N,62.9,75.0,12.51,12.01,7.55,unknown,unknown,385128
219542,Emerald,K,VS2,15.14,unknown,GIA,Very Good,Excellent,N,69.1,68.0,18.22,11.26,7.78,unknown,unknown,420529
219551,Princess,G,VS2,9.05,unknown,GIA,Very Good,Very Good,N,62.9,75.0,12.51,12.01,7.55,unknown,unknown,431217


In [10]:
df = df.drop_duplicates()

In [11]:
df.nunique()

cut                     11
color                   11
clarity                 10
carat_weight           875
cut_quality              6
lab                      3
symmetry                 5
polish                   5
culet_size               9
depth_percent          534
table_percent          234
meas_length           1414
meas_width            1153
meas_depth             851
girdle_min              10
girdle_max              10
total_sales_price    22163
dtype: int64

In [12]:
df.shape

(214013, 17)

## Feature Engg

In [13]:
df.head()

Unnamed: 0,cut,color,clarity,carat_weight,cut_quality,lab,symmetry,polish,culet_size,depth_percent,table_percent,meas_length,meas_width,meas_depth,girdle_min,girdle_max,total_sales_price
0,Round,E,VVS2,0.09,Excellent,IGI,Very Good,Very Good,N,62.7,59.0,2.85,2.87,1.79,M,M,200
1,Round,E,VVS2,0.09,Very Good,IGI,Very Good,Very Good,N,61.9,59.0,2.84,2.89,1.78,STK,STK,200
2,Round,E,VVS2,0.09,Excellent,IGI,Very Good,Very Good,unknown,61.1,59.0,2.88,2.9,1.77,TN,M,200
3,Round,E,VVS2,0.09,Excellent,IGI,Very Good,Very Good,unknown,62.0,59.0,2.86,2.88,1.78,M,STK,200
4,Round,E,VVS2,0.09,Very Good,IGI,Very Good,Excellent,N,64.9,58.5,2.79,2.83,1.82,STK,STK,200


In [14]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler, OneHotEncoder # RobustScaler handles outliers better
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [15]:
#Dont involve total sales price as it is target feature 
NUMERICAL_FEATURES = [
    'carat_weight', 'depth_percent', 'table_percent', 
    'meas_length', 'meas_width', 'meas_depth'
]

CATEGORICAL_FEATURES = [
    'cut', 'color', 'clarity', 'cut_quality', 'lab',
    'symmetry', 'polish','culet_size', 
    'girdle_min', 'girdle_max',  
]

In [16]:
for feature in CATEGORICAL_FEATURES:
    # Get unique values and convert to a list for cleaner printing
    uniques = df[feature].unique().tolist()
    print(f"{feature} - {uniques}")

cut - ['Round', 'Pear', 'Oval', 'Marquise', 'Princess', 'Emerald', 'Heart', 'Radiant', 'Cushion Modified', 'Cushion', 'Asscher']
color - ['E', 'F', 'D', 'J', 'I', 'G', 'H', 'M', 'L', 'K', 'unknown']
clarity - ['VVS2', 'VVS1', 'I1', 'VS1', 'VS2', 'IF', 'SI2', 'I2', 'SI1', 'I3']
cut_quality - ['Excellent', 'Very Good', 'unknown', 'Good', 'Fair', 'Ideal']
lab - ['IGI', 'GIA', 'HRD']
symmetry - ['Very Good', 'Excellent', 'Good', 'Fair', 'Poor']
polish - ['Very Good', 'Excellent', 'Good', 'Fair', 'Poor']
culet_size - ['N', 'unknown', 'S', 'M', 'VS', 'L', 'EL', 'SL', 'VL']
girdle_min - ['M', 'STK', 'TN', 'TK', 'unknown', 'VTN', 'VTK', 'XTK', 'XTN', 'STN']
girdle_max - ['M', 'STK', 'TK', 'unknown', 'TN', 'VTK', 'XTK', 'XTN', 'VTN', 'STN']


In [17]:
NOMINAL_FEATURES = [
    'cut',        # SHAPE
    'lab'         # CERTIFICATION
]
ORDINAL_FEATURES = [
    'color',
    'clarity',
    'cut_quality',
    'symmetry',
    'polish',
    'culet_size',
    'girdle_min',
    'girdle_max'
]

In [18]:
#Correct Oridnal order 
ordinal_categories = [
    # color (worst → best)
    ['M','L','K','J','I','H','G','F','E','D','unknown'],

    # clarity (worst → best)
    ['I3','I2','I1','SI2','SI1','VS2','VS1','VVS2','VVS1','IF'],

    # cut_quality
    ['Fair','Good','Very Good','Excellent','Ideal','unknown'],

    # symmetry
    ['Poor','Fair','Good','Very Good','Excellent'],

    # polish
    ['Poor','Fair','Good','Very Good','Excellent'],

    # culet_size (largest → smallest)
    ['EL','VL','L','M','S','SL','VS','N','unknown'],

    # girdle_min (thick → thin)
    ['XTK','VTK','STK','TK','M','TN','STN','VTN','XTN','unknown'],

    # girdle_max (thick → thin)
    ['XTK','VTK','STK','TK','M','TN','STN','VTN','XTN','unknown']
]


###  Creating the Pipeline
The `Pipeline` bundles the preprocessing steps and the model together. 
- **Consistency:** Ensures the same transformations are applied to both Training and Testing data.
- **Safety:** Prevents "Data Leakage" during cross-validation.

In [19]:
#Build Pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import RobustScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer


In [20]:
#Numerical Pipeline 
# RobustScaler: Removes the median and scales data according to the Interquartile Range (IQR).
# Unlike StandardScaler, it is not influenced by a small number of very large/small outliers.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler())
])


In [21]:
ord_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder(categories=ordinal_categories))
])


In [22]:
nom_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(
        handle_unknown='ignore',
        sparse_output=False
    ))
])


###  Defining the ColumnTransformer
We use `ColumnTransformer` to apply different preprocessing steps to different columns:
- **Numerical Features:** Get scaled (e.g., RobustScaler).
- **Categorical Features:** Get encoded (e.g., OneHotEncoder or OrdinalEncoder).

In [23]:
#ColumnTransformer (CORE STEP) 
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, NUMERICAL_FEATURES),
        ('ord', ord_pipeline, ORDINAL_FEATURES),
        ('nom', nom_pipeline, NOMINAL_FEATURES)
    ]
)


In [24]:
TARGET = 'total_sales_price'

X = df.drop(TARGET, axis=1)
y = np.log1p(df[TARGET])   # log transform for better DL training

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [25]:
#Fit & Transform (Correct Way) only on training data
X_train_enc = preprocessor.fit_transform(X_train)
X_test_enc  = preprocessor.transform(X_test)

input_dim = X_train_enc.shape[1]


## Model Building and training 

In [26]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping

2025-12-24 19:59:18.489468: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1766606358.769647      17 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1766606358.851083      17 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1766606359.530782      17 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766606359.530836      17 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766606359.530840      17 computation_placer.cc:177] computation placer alr

In [27]:
model = Sequential([
    Dense(256, activation='relu', input_shape=(input_dim,)),
    BatchNormalization(),
    Dropout(0.3),

    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.25),

    Dense(64, activation='relu'),
    BatchNormalization(),
    Dropout(0.2),

    Dense(1)   # regression output
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
2025-12-24 19:59:35.785950: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


In [28]:
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss='mse',
    metrics=['mae']
)

In [29]:
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=8,
    restore_best_weights=True
)

history = model.fit(
    X_train_enc, y_train,
    validation_data=(X_test_enc, y_test),
    epochs=200,
    batch_size=256,
    callbacks=[early_stop],
    verbose=1
)


Epoch 1/200
[1m669/669[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 9ms/step - loss: 32.1494 - mae: 4.8617 - val_loss: 0.1103 - val_mae: 0.2019
Epoch 2/200
[1m669/669[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - loss: 1.4684 - mae: 0.8202 - val_loss: 0.0881 - val_mae: 0.1780
Epoch 3/200
[1m669/669[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 8ms/step - loss: 0.8645 - mae: 0.6406 - val_loss: 0.0708 - val_mae: 0.1627
Epoch 4/200
[1m669/669[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - loss: 0.5875 - mae: 0.5472 - val_loss: 0.0615 - val_mae: 0.1425
Epoch 5/200
[1m669/669[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - loss: 0.4533 - mae: 0.4968 - val_loss: 0.0586 - val_mae: 0.1389
Epoch 6/200
[1m669/669[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - loss: 0.4002 - mae: 0.4733 - val_loss: 0.0611 - val_mae: 0.1501
Epoch 7/200
[1m669/669[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 8ms

In [30]:
loss, mae = model.evaluate(X_test_enc, y_test)
print("Test MAE (log scale):", mae)

# Convert MAE back to price scale (approx)
print("Approx MAE in price:", np.expm1(mae))


[1m1338/1338[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 0.0405 - mae: 0.1082
Test MAE (log scale): 0.10726873576641083
Approx MAE in price: 0.11323337991021873


In [31]:
from sklearn.metrics import r2_score

# Predict on test set (log scale)
y_test_pred_log = model.predict(X_test_enc).ravel()

# Convert back to original price
y_test_true = np.expm1(y_test)
y_test_pred = np.expm1(y_test_pred_log)

# R2 score on real price
r2 = r2_score(y_test_true, y_test_pred)
print("R² Score:", r2)


[1m1338/1338[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step
R² Score: 0.7457232756558387


In [32]:
import joblib
model.save("diamond_price_model.keras")
joblib.dump(preprocessor, "dl_preprocessor.joblib", compress=9)

['dl_preprocessor.joblib']

In [33]:
import os

os.listdir("/kaggle/working")


['dl_preprocessor.joblib', '__notebook__.ipynb', 'diamond_price_model.keras']