## Model Training

In [73]:
import pandas as pd

In [74]:
df = pd.read_csv('./data/gemstone.csv')

In [75]:
df.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [76]:
df = df.drop(labels=['id'],axis=1)

In [77]:
X = df.drop('price', axis = 1)
y = df[['price']]

In [78]:
X, y

(        carat        cut color clarity  depth  table     x     y     z
 0        1.52    Premium     F     VS2   62.2   58.0  7.27  7.33  4.55
 1        2.03  Very Good     J     SI2   62.0   58.0  8.06  8.12  5.05
 2        0.70      Ideal     G     VS1   61.2   57.0  5.69  5.73  3.50
 3        0.32      Ideal     G     VS1   61.6   56.0  4.38  4.41  2.71
 4        1.70    Premium     G     VS2   62.6   59.0  7.65  7.61  4.77
 ...       ...        ...   ...     ...    ...    ...   ...   ...   ...
 193568   0.31      Ideal     D    VVS2   61.1   56.0  4.35  4.39  2.67
 193569   0.70    Premium     G    VVS2   60.3   58.0  5.75  5.77  3.47
 193570   0.73  Very Good     F     SI1   63.1   57.0  5.72  5.75  3.62
 193571   0.34  Very Good     D     SI1   62.9   55.0  4.45  4.49  2.81
 193572   0.71       Good     E     SI2   60.8   64.0  5.73  5.71  3.48
 
 [193573 rows x 9 columns],
         price
 0       13619
 1       13387
 2        2772
 3         666
 4       14453
 ...       ...
 

In [79]:
categorical_cols = X.select_dtypes(include='object').columns
numerical_cols = X.select_dtypes(exclude= 'object').columns

In [80]:
cut_categories = ['Fair', 'Good', 'Very Good', 'Premium', 'Ideal']
color_categorical = ['D', 'E', 'F', 'G', 'H', 'I', 'J']
clarity_categorical = ['I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF']

In [81]:
from sklearn.impute import SimpleImputer ## Handling Missing Value
from sklearn.preprocessing import StandardScaler ## Handling Feature Scaling
from sklearn.preprocessing import OrdinalEncoder ## Ordinal Encoding
#Pipline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [82]:
## Numerical Pipline
num_pipeline = Pipeline(
    steps = [
        ('impute', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

In [83]:
cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal_encoder', OrdinalEncoder(categories=[cut_categories, color_categorical, clarity_categorical])),
    ('scaler', StandardScaler())
])

In [84]:
preprocessor = ColumnTransformer(
    [
        ('num_pipeline', num_pipeline, numerical_cols),
        ('cat_pipeline', cat_pipeline, categorical_cols)
    ]
)

In [85]:
## Train test split
from sklearn.model_selection import train_test_split

In [86]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state = 30)

In [87]:
X_train = pd.DataFrame(preprocessor.fit_transform(X_train), columns = preprocessor.get_feature_names_out())
X_test = pd.DataFrame(preprocessor.transform(X_test), columns = preprocessor.get_feature_names_out())

In [88]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet

In [89]:
reg = LinearRegression()
reg.fit(X_train, y_train)

In [90]:
reg.coef_

array([[ 6433.66003594,  -132.75843566,   -70.42922179, -1720.30971463,
         -499.29302619,   -63.39317848,    72.44537247,  -460.41604642,
          650.76431652]])

In [91]:
reg.intercept_

array([3970.76628955])

In [92]:
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
def evaluate_model(true, pred): 
    mae = mean_absolute_error(true, pred)
    mse = mean_squared_error(true, pred)
    rmse = mse**0.5
    r2 = r2_score(true, pred)
    return mae, mse, rmse, r2

In [93]:
# Train multiple models

In [94]:
models = {
    'LinearRegression': LinearRegression(),
    'Lasso': Lasso(),
    'Ridge': Ridge(),
    'ElasticNet': ElasticNet()
}

In [95]:

model_list = list()
r2_list = list()

In [97]:
for i in range(len(list(models))): 
    model = list(models.values())[i]
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mae, mse, rmse, r2 = evaluate_model(y_test, y_pred)
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
  
    print('Model training preformance')
    print('RMSE', rmse)
    print('MAE', mae)
    print('r2 score', r2 * 100)
    r2_list.append(r2)
    print('*' * 35)
    print('\n')

LinearRegression
Model training preformance
RMSE 1013.9047094344004
MAE 674.0255115796832
r2 score 93.68908248567512
***********************************


Lasso
Model training preformance
RMSE 1013.8784226767013
MAE 675.0716923362162
r2 score 93.68940971841704
***********************************


Ridge
Model training preformance
RMSE 1013.9059272771613
MAE 674.0555800798156
r2 score 93.68906732505941
***********************************


ElasticNet
Model training preformance
RMSE 1533.416245606405
MAE 1060.7368759154729
r2 score 85.56494831165182
***********************************




In [98]:
model_list

['LinearRegression', 'LinearRegression', 'Lasso', 'Ridge', 'ElasticNet']