## Model Training

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('./data/gemstone.csv')
df.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [3]:
df.drop(labels=['id'],axis=1)

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,0.70,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.50,2772
3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,1.70,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453
...,...,...,...,...,...,...,...,...,...,...
193568,0.31,Ideal,D,VVS2,61.1,56.0,4.35,4.39,2.67,1130
193569,0.70,Premium,G,VVS2,60.3,58.0,5.75,5.77,3.47,2874
193570,0.73,Very Good,F,SI1,63.1,57.0,5.72,5.75,3.62,3036
193571,0.34,Very Good,D,SI1,62.9,55.0,4.45,4.49,2.81,681


In [4]:
## Independent and dependent features

X = df.drop(labels=['price'],axis=1)
Y = df[['price']]

In [38]:
## define categorical and numerical
categorical_cols = list(X.select_dtypes(include='object').columns)
numerical_cols = list(X.select_dtypes(exclude='object').columns)
categorical_cols

['cut', 'color', 'clarity']

In [7]:
print(df['cut'].unique(),df['clarity'].unique(),df['color'].unique())

['Premium' 'Very Good' 'Ideal' 'Good' 'Fair'] ['VS2' 'SI2' 'VS1' 'SI1' 'IF' 'VVS2' 'VVS1' 'I1'] ['F' 'J' 'G' 'E' 'D' 'H' 'I']


In [8]:
cut_categories = ['Fair','Good','Very Good','Premium','Ideal']
color_categories = ['D','E','F','G','H','I','J']
clarity_categories = ['I1','SI2','SI1','VS2','VS1','VVS2','VVS1','IF']


In [10]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn. compose import ColumnTransformer


In [39]:
### Numerical pipeline
num_pipeline = Pipeline(
    steps=[
        ('imputer',SimpleImputer(strategy='median')),
        ('scaler',StandardScaler())
    ]
)
### Categorical pipeline
cat_pipeline = Pipeline(
    steps=[
        ('imputer',SimpleImputer(strategy='most_frequent')),
        ('ordinalencoder',OrdinalEncoder(categories=[cut_categories,color_categories,clarity_categories])),
        ('scaler',StandardScaler())
    ]
)

## combine
preprocessor = ColumnTransformer([
    ('num_pipeline',num_pipeline,numerical_cols),
    ('cat_pipeline',cat_pipeline,categorical_cols)
])

In [40]:
## Train test split

from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,Y,test_size=0.3,random_state=42)

In [41]:
X_train

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z
11504,11504,0.41,Ideal,E,VVS2,60.6,56.0,4.85,4.80,2.93
95284,95284,1.23,Very Good,H,VS1,59.9,59.0,6.91,7.01,4.19
184777,184777,1.70,Premium,H,VS2,62.0,58.0,7.61,7.66,4.74
5419,5419,0.33,Ideal,F,VVS1,61.2,56.0,4.47,4.44,2.73
45466,45466,0.33,Very Good,I,SI1,62.1,58.0,4.41,4.45,2.75
...,...,...,...,...,...,...,...,...,...,...
119879,119879,0.50,Very Good,E,SI1,60.2,61.0,5.11,5.15,3.09
103694,103694,1.91,Very Good,F,SI1,62.3,62.0,7.85,7.79,4.87
131932,131932,1.22,Premium,G,VS2,62.8,58.0,6.82,6.74,4.26
146867,146867,0.31,Very Good,G,VVS1,61.1,56.0,4.37,4.40,2.67


In [42]:

X_train = pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())
X_test = pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())


In [43]:
X_train.head()

Unnamed: 0,num_pipeline__id,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,-1.524302,-0.823144,-1.129988,-0.641897,-0.780451,-0.835103,-0.876024,0.8741,-0.936747,1.350746
1,-0.025886,0.945023,-1.777823,0.921902,1.073226,1.166389,0.946633,-1.137644,0.910853,0.684455
2,1.574708,1.958484,0.165682,0.400636,1.703116,1.755063,1.742237,-0.131772,0.910853,0.018164
3,-1.633133,-0.995648,-0.574701,-0.641897,-1.122391,-1.161138,-1.165334,0.8741,-0.32088,2.017037
4,-0.916887,-0.995648,0.25823,0.400636,-1.176382,-1.152082,-1.136403,-1.137644,1.52672,-0.648127


In [44]:
### Automate 
from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

import numpy as np
def evaluate_model(true,predicted):
    mae = mean_absolute_error(true,predicted)
    mse = mean_squared_error(true,predicted)
    rmse = np.sqrt(mse)
    r2_square = r2_score(true,predicted)
    return mae,rmse,r2_square

In [45]:
### Train multiple models

models = {
    'LinearRegression':LinearRegression(),
    'Ridge':Ridge(),
    'Lasso':Lasso(),
    'ElasticNet':ElasticNet()
}

model_list = []
r2_list = []

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    mae,rmse,r2_square = evaluate_model(y_test,y_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print('Model Training Performance')
    print('RMSE',rmse)
    print('MAE',mae)
    print('R2 Score',r2_square*100)

    r2_list.append(r2_square*100)

    print('='*50)
    print('\n')


LinearRegression
Model Training Performance
RMSE 1014.6337606682039
MAE 675.0760335549253
R2 Score 93.62901674122804


Ridge
Model Training Performance
RMSE 1014.6384101190893
MAE 675.1080421714707
R2 Score 93.62895835239303


Lasso
Model Training Performance
RMSE 1014.6602348152877
MAE 676.2416203426885
R2 Score 93.62868426944394


ElasticNet
Model Training Performance
RMSE 1533.3691922093567
MAE 1060.947963359368
R2 Score 85.44938623256382


