In [121]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer ## Handling Missing Values
from sklearn.preprocessing import StandardScaler ## Fearter Scaling
from sklearn.preprocessing import OrdinalEncoder ## For Ordinal Encoding Of Categories Like Cut Clarity Color
    
################# Pipelines ###########################

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [122]:
df = pd.read_csv('./data/gemstone.csv')

In [123]:
df.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [124]:
cut_cat = list(df['cut'].unique())
color_cat = list(df['color'].unique())
clarity_cat = list(df['clarity'].unique())

In [125]:
cut_map={"Fair":1,"Good":2,"Very Good":3,"Premium":4,"Ideal":5}
clarity_map = {"I1":1,"SI2":2 ,"SI1":3 ,"VS2":4 , "VS1":5 , "VVS2":6 , "VVS1":7 ,"IF":8}
color_map = {"D":1 ,"E":2 ,"F":3 , "G":4 ,"H":5 , "I":6, "J":7}

In [126]:
num_cols = [x for x in df.columns if df[x].dtype != 'O']
cat_cols = [x for x in df.columns if df[x].dtype == 'O']

In [127]:
x = df.drop(labels=['price'], axis=1)

In [128]:
y = df['price']

<H2>Numerical Pipeline</H2>

In [129]:
num_pipline=Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scalinng', StandardScaler())
    ]
)

<H2>Categorical Pipeline</H2>

In [130]:
## After ordinal encoding need to perform scaling 
## One hot encoding ke case main nahi karne ka

cat_pipeline=Pipeline(
    steps=[ 
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('ordinal_encoder', OrdinalEncoder(categories=[cut_cat, color_cat, clarity_cat])),
        ('scaling', StandardScaler())
    ]
)

In [131]:
## Removing Output feature Price from num_cols

num_cols = num_cols[:-1]

In [148]:
cat_cols

['cut', 'color', 'clarity']

In [132]:
preprocessor = ColumnTransformer([
    ('num_pipline', num_pipline, num_cols),
    ('cat_pipeline', cat_pipeline, cat_cols)
])

<H2>Train Test Split</H2>

In [133]:
from sklearn.model_selection import train_test_split

In [134]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30, random_state=69)

In [147]:
x_train

Unnamed: 0,num_pipline__id,num_pipline__carat,num_pipline__depth,num_pipline__table,num_pipline__x,num_pipline__y,num_pipline__z,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,0.406417,-1.018388,-1.497101,-0.641368,-1.142175,-1.135757,-1.239762,0.653353,0.058733,1.042275
1,0.561313,-0.823816,0.535229,-0.641368,-0.853601,-0.872537,-0.819274,0.653353,-0.473781,2.164576
2,0.524983,0.884097,1.736151,0.401599,0.832757,0.806624,1.007670,1.688597,-0.473781,-0.080027
3,-0.283390,-0.629244,-1.035208,0.401599,-0.528954,-0.573011,-0.601781,-1.417135,0.058733,0.481124
4,-0.369365,0.019331,1.459015,1.444566,0.183464,0.125883,0.282692,1.688597,-0.473781,-0.080027
...,...,...,...,...,...,...,...,...,...,...
135496,0.825180,-0.629244,0.073336,-1.162851,-0.519936,-0.573011,-0.529283,0.653353,0.591248,-1.202328
135497,1.463786,-1.018388,-0.388557,0.401599,-1.187265,-1.172063,-1.196263,-1.417135,0.058733,0.481124
135498,-1.284509,-1.061627,0.258093,-0.641368,-1.223337,-1.253752,-1.196263,0.653353,1.123763,1.603425
135499,0.789262,1.554290,0.812365,-0.641368,1.418924,1.387524,1.486156,-1.417135,1.123763,-1.202328


In [136]:
preprocessor.fit_transform(x_train)

array([[ 0.40641742, -1.01838824, -1.4971007 , ...,  0.65335255,
         0.05873337,  1.04227468],
       [ 0.56131268, -0.82381593,  0.5352289 , ...,  0.65335255,
        -0.47378123,  2.16457588],
       [ 0.52498254,  0.88409653,  1.73615094, ...,  1.68859652,
        -0.47378123, -0.08002653],
       ...,
       [-1.28450915, -1.06162653,  0.25809305, ...,  0.65335255,
         1.12376258,  1.60342528],
       [ 0.78926196,  1.55429002,  0.81236476, ..., -1.4171354 ,
         1.12376258, -1.20232773],
       [ 0.45225067,  0.49495192, -0.38855728, ...,  0.65335255,
        -1.53881044, -1.20232773]])

In [137]:
preprocessor.transform(x_test)

array([[ 0.1532339 , -0.78057764,  0.62760752, ..., -0.38189143,
         0.05873337, -1.20232773],
       [ 0.27947668,  0.47333277,  0.44285028, ...,  0.65335255,
        -1.53881044, -0.64117713],
       [ 1.63115487,  0.64628593, -1.4971007 , ..., -0.38189143,
         0.59124798, -0.08002653],
       ...,
       [ 0.92708385,  2.65686642, -0.01904281, ...,  0.65335255,
         1.12376258, -0.64117713],
       [-0.7846028 , -0.88867337,  0.81236476, ...,  0.65335255,
         0.05873337, -0.08002653],
       [-0.58174458,  0.23552217, -0.01904281, ..., -0.38189143,
         1.12376258, -0.64117713]])

In [138]:
x_train = pd.DataFrame(preprocessor.fit_transform(x_train), columns=preprocessor.get_feature_names_out())
x_test = pd.DataFrame(preprocessor.transform(x_test), columns=preprocessor.get_feature_names_out())

In [139]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [140]:
regression = LinearRegression()
regression.fit(x_train, y_train)

In [141]:
regression.coef_

array([-1.17220255e-01,  6.56257694e+03, -2.36817395e+02, -1.76124517e+02,
       -2.12788630e+03, -5.52572863e+02, -3.99374442e+01, -7.92901072e+00,
       -1.82485072e+02,  1.24325856e+02])

In [142]:
regression.intercept_

3970.0783536652825

In [143]:
def eavl_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_squred = r2_score(true, predicted)
    return mae, mse, r2_squred

In [144]:
models = {
    'LinearRegression' : LinearRegression(),
    'Lasso' : Lasso(),
    'Ridge' : Ridge(),
    'ElasticNet' : ElasticNet()
}

model_names_list = list(models.keys())
model_list = list(models.values())
r2_list = []

In [145]:
model_list

[LinearRegression(), Lasso(), Ridge(), ElasticNet()]

In [146]:
for model in model_list:
    
    model.fit(x_train, y_train)

    y_pred = model.predict(x_test)

    mae, mse, r2_sqaure = eavl_model(true=y_test, predicted=y_pred)

    print("Model Tarinning Performance")
    print("Mean Absolute error : {}\nMeas Squared Error : {}\nR Squared Error : {}\n".format(mae, mse, r2_sqaure * 100))
    print("-"*40)

Model Tarinning Performance
Mean Absolute error : 733.6694790190835
Meas Squared Error : 1478252.8639065959
R Squared Error : 90.93694789108203

----------------------------------------
Model Tarinning Performance
Mean Absolute error : 735.0122880626295
Meas Squared Error : 1477809.879714949
R Squared Error : 90.93966379233976

----------------------------------------
Model Tarinning Performance
Mean Absolute error : 733.6992802473715
Meas Squared Error : 1478249.9047677894
R Squared Error : 90.93696603332944

----------------------------------------
Model Tarinning Performance
Mean Absolute error : 1138.5598111699048
Meas Squared Error : 2694739.8149668644
R Squared Error : 83.47876201742721

----------------------------------------
