In [37]:
import pandas as pd
import numpy as np

In [38]:
# Load the gemstone dataset (path is relative to the working directory)
df = pd.read_csv(filepath_or_buffer='data/gemstone.csv')

In [39]:
# Remove the 'id' column from df in place
df.drop(columns=['id'], inplace=True)

In [40]:
# Separate the data into the target (y) and the feature matrix (X)
y = df['price']
X = df.drop(columns=['price'])

In [41]:
# Segregate the feature columns by dtype: object columns are treated as
# categorical, every other column as numerical.
num_cols = X.select_dtypes(exclude='object').columns
cat_cols = X.select_dtypes(include='object').columns


In [42]:
# Display the categorical and numerical column groups
(cat_cols, num_cols)

(Index(['cut', 'color', 'clarity'], dtype='object'),
 Index(['carat', 'depth', 'table', 'x', 'y', 'z'], dtype='object'))

In [43]:
# import the necessary libraries
from sklearn.model_selection import train_test_split # for splitting the data
from sklearn.preprocessing import OrdinalEncoder, StandardScaler # for preprocessing the data
from sklearn.compose import ColumnTransformer # for preprocessing
from sklearn.pipeline import Pipeline # for building a pipeline
from sklearn.linear_model import LinearRegression # for regression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score # for regression
from sklearn.impute import SimpleImputer # for handling missing values

In [44]:
# Explicit level order for each categorical feature. The lists are handed to
# OrdinalEncoder below in the same order as the categorical columns
# (cut, color, clarity).
cut_cats = [
    'Fair', 'Good', 'Very Good', 'Premium', 'Ideal',
]
color_cats = [
    'D', 'E', 'F', 'G', 'H', 'I', 'J',
]
clarity_cats = [
    'I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF',
]

# Numerical pipeline: impute missing values with the median, then standardize.
num_pipe = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

# Categorical pipeline: impute missing values with the most frequent level,
# then ordinal-encode using the level orders defined above.
cat_pipe = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        (
            'ordinalencoder',
            OrdinalEncoder(categories=[cut_cats, color_cats, clarity_cats]),
        ),
    ]
)

In [45]:
# Combined preprocessor: each sub-pipeline is applied to its own column group
preprocess_pipe = ColumnTransformer(
    transformers=[
        ('numerical', num_pipe, num_cols),
        ('categorical', cat_pipe, cat_cols),
    ]
)

In [46]:
# Hold out 30% of the rows as a test set; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=30,
)

In [47]:
# Fit the preprocessing pipeline on the training data only, to avoid data
# leakage from the test split. The transformed array is shown as the cell
# output; it is not assigned to a variable here.
preprocess_pipe.fit_transform(X_train)

array([[-0.97543926, -0.84960654, -0.12153081, ...,  4.        ,
         5.        ,  5.        ],
       [ 0.2351953 ,  1.83363716, -0.12153081, ...,  1.        ,
         1.        ,  2.        ],
       [ 0.49461699,  0.81585507,  0.39980029, ...,  3.        ,
         3.        ,  4.        ],
       ...,
       [ 0.45138004,  1.55606023, -0.6428619 , ...,  1.        ,
         3.        ,  2.        ],
       [ 0.66756478, -1.77486298,  1.44246248, ...,  4.        ,
         3.        ,  4.        ],
       [ 0.25681377,  0.81585507, -0.12153081, ...,  4.        ,
         3.        ,  2.        ]])

In [48]:
# Replace X_train with its preprocessed version. The `data` argument is
# evaluated first, so the pipeline is fitted before its output feature
# names are requested.
X_train = pd.DataFrame(
    data=preprocess_pipe.fit_transform(X_train),
    columns=preprocess_pipe.get_feature_names_out(),
)


In [49]:
# Apply the already-fitted preprocessor to the test split (transform only,
# no re-fitting) and label the columns with the pipeline's feature names.
X_test = pd.DataFrame(
    data=preprocess_pipe.transform(X_test),
    columns=preprocess_pipe.get_feature_names_out(),
)


In [50]:
# Preview the first five preprocessed training rows
X_train.head(n=5)

Unnamed: 0,numerical__carat,numerical__depth,numerical__table,numerical__x,numerical__y,numerical__z,categorical__cut,categorical__color,categorical__clarity
0,-0.975439,-0.849607,-0.121531,-1.042757,-1.08097,-1.12315,4.0,5.0,5.0
1,0.235195,1.833637,-0.121531,0.318447,0.279859,0.485354,1.0,1.0,2.0
2,0.494617,0.815855,0.3998,0.570855,0.606458,0.673737,3.0,3.0,4.0
3,-1.018676,0.260701,0.921131,-1.214034,-1.24427,-1.195605,3.0,3.0,3.0
4,-0.953821,-0.664555,-0.642862,-1.069801,-1.044681,-1.094168,4.0,6.0,5.0


In [54]:
# model training
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


In [56]:
import numpy as np
def evaluate_model(y_true, y_pred):
    """Score regression predictions against the true targets.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted target values, aligned with ``y_true``.

    Returns:
        A ``(mae, rmse, r2)`` tuple: mean absolute error, root mean squared
        error (square root of the mean squared error), and the R2 score.
    """
    # RMSE is derived from the MSE rather than requested directly.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    return (
        mean_absolute_error(y_true, y_pred),
        rmse,
        r2_score(y_true, y_pred),
    )


In [57]:
# Train multiple models and compare them on the held-out test split
models = {
    'LinearRegression': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'ElasticNet': ElasticNet()
}
trained_model_list = []  # fitted estimators, in training order
model_list = []          # model names, parallel to r2_list
r2_list = []             # test-set R2 per model

for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # evaluate_model returns (MAE, RMSE, R2): its second value is the square
    # root of the MSE, so it is named and reported as RMSE, not MSE.
    mae, rmse, r2 = evaluate_model(y_test, y_pred)
    print(f'{model_name}: MAE: {mae}, RMSE: {rmse}, R2: {r2}')
    model_list.append(model_name)
    r2_list.append(r2)
    trained_model_list.append(model)



LinearRegression: MAE: 674.025511579685, MSE: 1013.9047094344, R2: 0.9368908248567511
Ridge: MAE: 674.0565132296288, MSE: 1013.9058997760818, R2: 0.9368906766741358
Lasso: MAE: 675.2036880701619, MSE: 1013.8723151049944, R2: 0.9368948574778251
ElasticNet: MAE: 1050.7468664314322, MSE: 1513.914035022976, R2: 0.8592978759337908
