In [6]:
import numpy as np, pandas as pd
# Load & basic cleaning
df = pd.read_csv("diamonds.csv")
df = df.drop_duplicates()

In [8]:
df['volume'] = df['x'] * df['y'] * df['z']
df['price_per_carat'] = df['price'] / df['carat']

# Dimension Ratio = (x + y) / (2 * z)
df['dimension_ratio'] = (df['x'] + df['y']) / (2 * df['z'])

# Carat Category: Light (<0.5), Medium (0.5-1.5), Heavy (>1.5)
conditions = [
    df['carat'] < 0.5,
    (df['carat'] >= 0.5) & (df['carat'] <= 1.5),
    df['carat'] > 1.5
]
choices = ['Light', 'Medium', 'Heavy']
# use default=None so np.select returns an object array (avoids mixing string and float dtypes)
df['carat_category'] = pd.Categorical(
    np.select(conditions, choices, default=None),
    categories=choices,
    ordered=True
)


In [9]:
# Check & remove invalid geometry rows and duplicates
invalid_geom = ((df['x'] <= 0) | (df['y'] <= 0) | (df['z'] <= 0)).sum()
print(f"Invalid geometry rows: {invalid_geom}")

df_clean = df[(df['x'] > 0) & (df['y'] > 0) & (df['z'] > 0 ) & (df['volume']>0)].copy()
print('After dropping invalid geometry:', df_clean.shape)
print('Duplicates:', df_clean.duplicated().sum())


Invalid geometry rows: 19
After dropping invalid geometry: (53775, 14)
Duplicates: 0


In [10]:
numeric_features = ['carat', 'depth', 'table', 'volume', 'price_per_carat','dimension_ratio']
cat_cols = ['cut', 'color', 'clarity', 'carat_category']

# Ordinal mappings
cut_order = ['Fair', 'Good', 'Very Good', 'Premium', 'Ideal']
color_order = list("JIHGFED")[::-1]  # -> ['D','E','F','G','H','I','J']
clarity_order = ['I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF']
carat_order=['Light','Medium','Heavy']

cut_mapping = {v: i for i, v in enumerate(cut_order)}
color_mapping = {v: i for i, v in enumerate(color_order)}
clarity_mapping = {v: i for i, v in enumerate(clarity_order)}
carat_mapping={v: i for i, v in enumerate(carat_order)}

# Apply mapping
df_clean['cut_encoded'] = df_clean['cut'].map(cut_mapping)
df_clean['color_encoded'] = df_clean['color'].map(color_mapping)
df_clean['clarity_encoded'] = df_clean['clarity'].map(clarity_mapping)
df_clean['carat_encoded']=df_clean['carat_category'].map(carat_mapping)

In [11]:
X = df_clean.drop(columns=['price','dimension_ratio','cut', 'color', 'clarity', 'carat_category'])
y = df_clean['price']

In [12]:
from sklearn.model_selection import train_test_split
# Train and compare: LinearRegression, RandomForest, GradientBoosting, SVR, and a PyTorch ANN
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.model_selection import cross_validate
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import pickle

X = df_clean.drop(columns=['price','dimension_ratio','cut', 'color', 'clarity', 'carat_category'])
y = df_clean['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
models = {
    'Linear': LinearRegression(),
    'RandomForest': RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1),
    'GradientBoosting': GradientBoostingRegressor(random_state=42),
    'SVR': SVR(C=1.0, kernel='rbf')
}

results = []
for name, model in models.items():
    print(f"Training/evaluating {name}...")
    ##pipe = Pipeline([ ('pre', preprocessor), ('model', model)])
    model.fit(X_train, y_train)  # Train the model
    y_pred = model.predict(X_test)

    # Save the best model (RandomForest here) using pickle

    with open(f"{name}model.pkl", "wb") as f:   
     pickle.dump(model, f)
    # Fit the pipeline (preprocessor handles imputation/encoding so we avoid NaN errors)
    #pipe.fit(X_train, y_train)
    #pred = pipe.predict(X_test)

    mae_test = mean_absolute_error(y_test, y_pred)
    #rmse_test = mean_squared_error(y_test, pred, squared=False)
    r2_test = r2_score(y_test, y_pred)
    results.append({'model': name, 'mae_test': mae_test,  'r2_test': r2_test})

    print(f"{name}: Test MAE={mae_test:.2f},  Test R2={r2_test:.3f}\n")


Training/evaluating Linear...
Linear: Test MAE=340.49,  Test R2=0.983

Training/evaluating RandomForest...
RandomForest: Test MAE=16.37,  Test R2=1.000

Training/evaluating GradientBoosting...
GradientBoosting: Test MAE=97.42,  Test R2=0.998

Training/evaluating SVR...
SVR: Test MAE=1104.52,  Test R2=0.704



In [7]:
display(X)

Unnamed: 0,carat,depth,table,x,y,z,volume,price_per_carat,cut_encoded,color_encoded,clarity_encoded,carat_encoded
0,0.23,61.5,55.0,3.95,3.98,2.43,38.202030,1417.391304,4,1,1,0
1,0.21,59.8,61.0,3.89,3.84,2.31,34.505856,1552.380952,3,1,2,0
2,0.23,56.9,65.0,4.05,4.07,2.31,38.076885,1421.739130,1,1,4,0
3,0.29,62.4,58.0,4.20,4.23,2.63,46.724580,1151.724138,3,5,3,0
4,0.31,63.3,58.0,4.34,4.35,2.75,51.917250,1080.645161,1,6,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
53935,0.72,60.8,57.0,5.75,5.76,3.50,115.920000,3829.166667,4,0,2,1
53936,0.72,63.1,55.0,5.69,5.75,3.61,118.110175,3829.166667,1,0,2,1
53937,0.70,62.8,60.0,5.66,5.68,3.56,114.449728,3938.571429,2,0,2,1
53938,0.86,61.0,58.0,6.15,6.12,3.74,140.766120,3205.813953,3,4,1,1


In [None]:
# ---- PyTorch ANN ----
import pickle

# Prepare data (use preprocessor from earlier; fit on train only)
#feature_adder = feat_transform
#X_all =df.drop(columns=['price'])
#preprocessor_for_ann = preprocessor.fit(X_all)

#X_train_pre = preprocessor_for_ann.transform((X_train))
#X_test_pre = preprocessor_for_ann.transform((X_test))

# Target: log1p
y_train_log = np.log1p(y_train.values).astype(np.float32)
y_test_log = np.log1p(y_test.values).astype(np.float32)

X_train_t = torch.tensor(X_train.values, dtype=torch.float32)
y_train_t = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1)
X_test_t = torch.tensor(X_test.values, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).view(-1, 1)

#X_train_t = torch.from_numpy(X_train_pre.astype(np.float32))
#X_test_t = torch.from_numpy(X_test_pre.astype(np.float32))
#y_train_t = torch.from_numpy(y_train_log.reshape(-1,1))

dataset = TensorDataset(X_train_t, y_train_t)
dloader = DataLoader(dataset, batch_size=256, shuffle=True, num_workers=0)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

input_dim = X_train.shape[1]
class ANNRegressor(nn.Module):
    def __init__(self, inp):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(inp, 128), nn.ReLU(), nn.BatchNorm1d(128), nn.Dropout(0.2),
            nn.Linear(128, 64), nn.ReLU(), nn.BatchNorm1d(64), nn.Dropout(0.2),
            nn.Linear(64, 1)
        )
    def forward(self, x):
        return self.net(x)

net = ANNRegressor(input_dim).to(device)
opt = optim.Adam(net.parameters(), lr=1e-3)
criterion = nn.MSELoss()

# Train
epochs = 60
net.train()
for ep in range(epochs):
    running = 0.0
    for xb, yb in dloader:
        xb, yb = xb.to(device), yb.to(device)
        opt.zero_grad()
        out = net(xb)
        loss = criterion(out, yb)
        loss.backward()
        opt.step()
        running += loss.item() * xb.size(0)
    epoch_loss = running / len(dloader.dataset)
    if (ep+1) % 10 == 0 or ep==0:
        print(f"Epoch {ep+1}/{epochs} loss: {epoch_loss:.6f}")

# Evaluate ANN on test set
net.eval()
with torch.no_grad():
    preds_log = net(X_test_t.to(device)).cpu().numpy().reshape(-1)
    preds = np.expm1(preds_log)

mae_ann = mean_absolute_error(y_test, preds)
#rmse_ann = mean_squared_error(y_test, preds, squared=False)
r2_ann = r2_score(y_test, preds)
print(f"ANN: Test MAE={mae_ann:.2f},  R2={r2_ann:.3f}")

results.append({'model':'ANN', 'mae_test':mae_ann,  'r2_test':r2_ann})

# Save ANN state and preprocessor

#torch.save({'state_dict': net.state_dict(), 'input_dim': input_dim}, 'ann_state.pth')
#pickle.dump(preprocessor_for_ann, open('ann_preprocessor.pkl', 'wb'))
#print('Saved ANN state to ann_state.pth and preprocessor to ann_preprocessor.pkl')


Using device: cpu
Epoch 1/60 loss: 48.201644
Epoch 10/60 loss: 0.532571
Epoch 20/60 loss: 0.281812
Epoch 30/60 loss: 0.238452
Epoch 40/60 loss: 0.220304
Epoch 50/60 loss: 0.196724
Epoch 60/60 loss: 0.173758
ANN: Test MAE=93.36,  R2=0.996


Unnamed: 0,model,mae_test,r2_test
0,Linear,373.879451,0.980171
1,RandomForest,16.070403,0.999742
2,GradientBoosting,99.607858,0.998235
3,SVR,1103.573405,0.704024
4,ANN,93.361323,0.995737


In [4]:
rf_model 

sklearn.ensemble._forest.RandomForestRegressor