## Model Training

In [1]:
# import libraries
import pandas as pd

In [2]:
#read the dataset
df = pd.read_csv('./data/gemstone.csv')

In [3]:
df.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [4]:
# drop id columns
df.drop(columns='id', inplace=True)

In [5]:
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [6]:
#dependent and independent variable
X = df.drop(labels=['price'], axis=1)
y = df[['price']]

In [7]:
X

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55
1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05
2,0.70,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.50
3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71
4,1.70,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77
...,...,...,...,...,...,...,...,...,...
193568,0.31,Ideal,D,VVS2,61.1,56.0,4.35,4.39,2.67
193569,0.70,Premium,G,VVS2,60.3,58.0,5.75,5.77,3.47
193570,0.73,Very Good,F,SI1,63.1,57.0,5.72,5.75,3.62
193571,0.34,Very Good,D,SI1,62.9,55.0,4.45,4.49,2.81


In [8]:
y

Unnamed: 0,price
0,13619
1,13387
2,2772
3,666
4,14453
...,...
193568,1130
193569,2874
193570,3036
193571,681


In [9]:
categorical_cols = X.select_dtypes(include='object').columns
numerical_cols = X.select_dtypes(exclude='object').columns

In [10]:
categorical_cols

Index(['cut', 'color', 'clarity'], dtype='object')

In [11]:
numerical_cols

Index(['carat', 'depth', 'table', 'x', 'y', 'z'], dtype='object')

In [12]:
# import required libraries
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
#pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [13]:
cut_map = ['Ideal', 'Premium','Fair', 'Very Good', 'Good']
color_map = ['D', 'E', 'F', 'G', 'H', 'I', 'J']
clarity_map = ['IF', 'VVS1', 'VVS2', 'VS1', 'VS2', 'SI1', 'SI2', 'I1']

In [14]:
numerical_pipeline = Pipeline(
    steps=[
        ('impute', SimpleImputer(strategy='median')),
        ('scaling', StandardScaler())
    ]
)

categorical_pipeline = Pipeline(
    steps=[
        ('impute', SimpleImputer(strategy='most_frequent')),
        ('encode', OrdinalEncoder(categories=[cut_map, color_map, clarity_map])),
        ('scaling', StandardScaler())
    ]
)

In [15]:
preprocessor = ColumnTransformer([
    ('numerical_pipeline', numerical_pipeline, numerical_cols),
    ('categorical_pipeline', categorical_pipeline, categorical_cols)
])

In [16]:
columns = []
columns.extend(numerical_cols)
columns.extend(categorical_cols)
columns

['carat', 'depth', 'table', 'x', 'y', 'z', 'cut', 'color', 'clarity']

In [17]:
# Train test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [18]:
X_train = pd.DataFrame(preprocessor.fit_transform(X_train), columns=columns)

In [19]:
X_test = pd.DataFrame(preprocessor.transform(X_test), columns=columns)

In [20]:
X_train.head()

Unnamed: 0,carat,depth,table,x,y,z,cut,color,clarity
0,-0.823144,-1.129988,-0.641897,-0.780451,-0.835103,-0.876024,-0.822839,-0.936747,-1.350746
1,0.945023,-1.777823,0.921902,1.073226,1.166389,0.946633,1.418292,0.910853,-0.684455
2,1.958484,0.165682,0.400636,1.703116,1.755063,1.742237,-0.075795,0.910853,-0.018164
3,-0.995648,-0.574701,-0.641897,-1.122391,-1.161138,-1.165334,-0.822839,-0.32088,-2.017037
4,-0.995648,0.25823,0.400636,-1.176382,-1.152082,-1.136403,1.418292,1.52672,0.648127


In [21]:
X_test.head()

Unnamed: 0,carat,depth,table,x,y,z,cut,color,clarity
0,-0.629077,0.25823,-0.12063,-0.600482,-0.581521,-0.572248,-0.822839,-1.552614,0.648127
1,2.605374,-2.148014,-0.12063,2.126042,2.198832,1.959219,1.418292,0.294987,1.314417
2,-1.125026,-1.222536,0.921902,-1.374347,-1.414721,-1.46911,-0.075795,-0.936747,-2.017037
3,-1.017211,-0.574701,0.921902,-1.158385,-1.161138,-1.194265,-0.075795,1.52672,-2.017037
4,0.858771,0.628421,-0.641897,0.947248,0.985258,1.004495,-0.822839,0.910853,0.648127


In [33]:
# model Training
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet

In [34]:
regressor = LinearRegression()

In [35]:
regressor.fit(X_train, y_train)

LinearRegression()

In [36]:
pd.DataFrame(regressor.predict(X_test))

Unnamed: 0,0
0,1591.331240
1,15144.316790
2,1741.875835
3,684.922654
4,6482.019833
...,...
58067,2903.318945
58068,2028.296948
58069,1868.585396
58070,6287.877316


In [37]:
y_test

Unnamed: 0,price
14868,1355
165613,14691
96727,844
145593,707
118689,5797
...,...
39151,2974
32423,1875
17876,967
72938,5656


In [50]:
# model score
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
import numpy as np

In [51]:
# evaluate model
def evaluate_model(true, predicted):
    mse = mean_squared_error(true, predicted)
    mae = mean_absolute_error(true, predicted)
    rmse = np.sqrt(mse)
    r_square = r2_score(true, predicted)
    return (mse, mae, rmse, r_square)
    

In [54]:
models = {
    'Regression':LinearRegression(),
    'Ridge':Ridge(),
    'Lasso':Lasso(),
    'ElasticNet':ElasticNet()
}
model_list = []


for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train)
    
    model_list.append(list(models.keys())[i])
    # making prediction
    y_pred = model.predict(X_test)
    
    mse, mae, rmse, r_square = evaluate_model(y_test, y_pred)
    print(model_list[i])
    print("*"*20)
    print("Mean Squared Error:", mse)
    print("Mean Absolute Error:", mae)
    print("Root Mean Squared Error:", rmse)
    print("R2 Score: {:.2f} %".format(r_square*100))
    print("*"*20)
    
    

Regression
********************
Mean Squared Error: 1031233.5754625921
Mean Absolute Error: 675.3325144211295
Root Mean Squared Error: 1015.4967136641025
R2 Score: 93.62 %
********************
Ridge
********************
Mean Squared Error: 1031243.6212371534
Mean Absolute Error: 675.3645352372672
Root Mean Squared Error: 1015.5016598889208
R2 Score: 93.62 %
********************
Lasso
********************
Mean Squared Error: 1031284.7558338773
Mean Absolute Error: 676.5090996665215
Root Mean Squared Error: 1015.5219130249613
R2 Score: 93.62 %
********************
ElasticNet
********************
Mean Squared Error: 2353849.1051798766
Mean Absolute Error: 1061.4036059979874
Root Mean Squared Error: 1534.2258977021202
R2 Score: 85.43 %
********************


In [55]:
model_list

['Regression', 'Ridge', 'Lasso', 'ElasticNet']

## Model Training Done