In [27]:
import pandas as pd
import numpy as np

In [28]:
# dataset imported from  kaggle
df = pd.read_csv('D:\Study\Data Science\MLOPS\Gemstone_Price_Prediction\experiment\df_clean.csv')
df.head(5)

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,0.3,Ideal,E,SI1,62.1,58.0,4.27,4.29,2.66,499
1,0.33,Premium,G,IF,60.8,58.0,4.42,4.46,2.7,984
2,0.9,Very Good,E,VVS2,62.2,60.0,6.04,6.12,3.78,6289
3,0.42,Ideal,F,VS1,61.6,56.0,4.82,4.8,2.96,1082
4,0.31,Ideal,F,VVS1,60.4,59.0,4.35,4.43,2.65,779


In [29]:
# checking the details of gemstone dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26879 entries, 0 to 26878
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    26879 non-null  float64
 1   cut      26879 non-null  object 
 2   color    26879 non-null  object 
 3   clarity  26879 non-null  object 
 4   depth    26182 non-null  float64
 5   table    26879 non-null  float64
 6   x        26879 non-null  float64
 7   y        26879 non-null  float64
 8   z        26879 non-null  float64
 9   price    26879 non-null  int64  
dtypes: float64(6), int64(1), object(3)
memory usage: 2.1+ MB


Description of each Columns in the dataset

Carat: 	 Carat weight of the cubic zirconia.

Cut: 	 Describe the cut quality of the cubic zirconia. Quality is increasing order Fair, Good, Very Good, Premium, Ideal.

Color :  Colour of the cubic zirconia.With D being the best and J the worst.

Clarity: Cubic zirconia Clarity refers to the absence of the Inclusions and Blemishes. (In order from Best to Worst, FL = flawless, I3= level 3 inclusions) FL, IF, VVS1, VVS2, VS1, VS2, SI1, SI2, I1, I2, I3

Depth: The Height of a cubic zirconia, measured from the Culet to the table, divided by its average 
Girdle Diameter.

Table: 	 The Width of the cubic zirconia's Table expressed as a Percentage of its Average Diameter.

X:	 Length of the cubic zirconia in mm.

Y:	 Width of the cubic zirconia in mm.

Z:	 Height of the cubic zirconia in mm.

Price:	 the Price of the cubic zirconia.


In [30]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [31]:
X=df.drop(labels=["price"],axis=1)
y=df["price"]

In [32]:
X.shape

(26879, 9)

In [33]:
y.shape

(26879,)

In [34]:
cat_columns = X.columns[X.dtypes=='object']
cat_columns

Index(['cut', 'color', 'clarity'], dtype='object')

In [35]:
num_columns = X.columns[X.dtypes!='object']
num_columns

Index(['carat', 'depth', 'table', 'x', 'y', 'z'], dtype='object')

In [36]:

cut_categories = ['Fair', 'Good', 'Very Good','Premium','Ideal']
color_categories = ['D', 'E', 'F', 'G', 'H', 'I', 'J']
clarity_categories = ['I1','SI2','SI1','VS2','VS1','VVS2','VVS1','IF']

In [37]:
num_pipeline=Pipeline(
    steps=[
        ("imputer",SimpleImputer()),
        ("scaler",StandardScaler())
    ]
)

In [38]:
cat_pipeline=Pipeline(
    steps=[
        ("imputer",SimpleImputer(strategy="most_frequent")),
        ("ordinalencoder",OrdinalEncoder(categories=[cut_categories,color_categories,clarity_categories]))
    ]
)

In [39]:
preprocessor=ColumnTransformer(
    [
        ("num_pipeline",num_pipeline,num_columns),
        ("cat_pipeline",cat_pipeline,cat_columns)
    ]
)

In [40]:

from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.30, random_state=42)

In [41]:

X_train.shape

(18815, 9)

In [42]:

y_train.shape

(18815,)

In [43]:

preprocessor.fit_transform(X_train)

array([[ 8.01829892e-01,  3.26967973e-01, -1.54206281e+00, ...,
         4.00000000e+00,  3.00000000e+00,  5.00000000e+00],
       [ 4.24374532e-01, -2.62214468e+00,  2.02121092e+00, ...,
         2.00000000e+00,  2.00000000e+00,  4.00000000e+00],
       [ 2.35646851e-01,  5.42756704e-01,  1.57580170e+00, ...,
         2.00000000e+00,  1.00000000e+00,  2.00000000e+00],
       ...,
       [-8.54779745e-01,  5.11090384e-15, -6.51244379e-01, ...,
         4.00000000e+00,  3.00000000e+00,  6.00000000e+00],
       [-8.96719229e-01, -1.04609488e-01, -6.51244379e-01, ...,
         2.00000000e+00,  1.00000000e+00,  1.00000000e+00],
       [ 4.87283758e-01,  3.92496654e-02, -1.54206281e+00, ...,
         4.00000000e+00,  0.00000000e+00,  1.00000000e+00]])

In [44]:

preprocessor.transform(X_test)

array([[ 0.71795092,  0.47082713,  0.23957405, ...,  3.        ,
         0.        ,  1.        ],
       [ 1.64061958,  0.11117924, -1.09665359, ...,  4.        ,
         5.        ,  6.        ],
       [-0.91768897,  0.39889755, -0.20583516, ...,  4.        ,
         4.        ,  5.        ],
       ...,
       [-0.20471774, -0.60811653,  0.68498327, ...,  2.        ,
         1.        ,  2.        ],
       [-1.12738639,  0.18310882, -1.09665359, ...,  4.        ,
         3.        ,  2.        ],
       [-1.04350742,  0.2550384 , -1.09665359, ...,  4.        ,
         2.        ,  3.        ]])

In [45]:
preprocessor.get_feature_names_out()

array(['num_pipeline__carat', 'num_pipeline__depth',
       'num_pipeline__table', 'num_pipeline__x', 'num_pipeline__y',
       'num_pipeline__z', 'cat_pipeline__cut', 'cat_pipeline__color',
       'cat_pipeline__clarity'], dtype=object)

In [46]:

X_train=pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())
X_test=pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())

In [47]:
X_train

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,0.801830,3.269680e-01,-1.542063,0.933511,0.918652,0.991728,4.0,3.0,5.0
1,0.424375,-2.622145e+00,2.021211,0.747105,0.749656,0.420313,2.0,2.0,4.0
2,0.235647,5.427567e-01,1.575802,0.329911,0.386316,0.434599,2.0,1.0,2.0
3,-0.980598,6.866159e-01,-1.096654,-1.125830,-1.117745,-1.079650,4.0,3.0,5.0
4,2.961713,3.269680e-01,0.239574,2.291611,2.143870,2.320267,3.0,4.0,0.0
...,...,...,...,...,...,...,...,...,...
18810,0.717951,7.585454e-01,0.239574,0.844746,0.766556,0.920301,3.0,3.0,1.0
18811,0.445344,-8.958348e-01,0.239574,0.729352,0.622910,0.577452,3.0,3.0,1.0
18812,-0.854780,5.110904e-15,-0.651244,-0.948301,-0.872701,-0.908226,4.0,3.0,6.0
18813,-0.896719,-1.046095e-01,-0.651244,-0.992683,-0.914950,-0.979653,2.0,1.0,1.0


In [48]:

!pip install xgboost




[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: C:\Users\Ravi0dubey\AppData\Local\Programs\Python\Python311\python.exe -m pip install --upgrade pip


In [49]:

from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [50]:
models={

    'LinearRegression':LinearRegression(),
    'Lasso':Lasso(),
    'Ridge':Ridge(),
    'Elasticnet':ElasticNet(),
    'Randomforest':RandomForestRegressor(),
    'xgboost':XGBRegressor()

}

In [51]:
trained_model_list=[]
model_list=[]
r2_list=[]

In [52]:
def evaluate_model(true,pred):
    r2=r2_score(true,pred)
    mae=mean_absolute_error(true,pred)
    mse=mean_squared_error(true,pred)

    return mae, mse,r2

In [53]:
for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train,y_train)
    #make_prediction
    y_pred=model.predict(X_test)
    #this is for the validaiton
    MAE,MSE,R2=evaluate_model(y_test,y_pred)


    print("model training performance",model)
    print("MSE:", MSE)
    print("MAE:",MAE)
    print("R2 SCORE:",R2)

    r2_list.append(R2)

    print("="*40)
    print("\n")

model training performance LinearRegression()
MSE: 1523253.2680424429
MAE: 813.7298486167483
R2 SCORE: 0.9084989604088661


model training performance Lasso()
MSE: 1523365.6744047205
MAE: 815.15920584531
R2 SCORE: 0.9084922082165545


model training performance Ridge()
MSE: 1523441.680458947
MAE: 813.9934972095208
R2 SCORE: 0.9084876425720075


model training performance ElasticNet()
MSE: 2756776.0346063934
MAE: 1081.991201591939
R2 SCORE: 0.8344018828788883


model training performance RandomForestRegressor()
MSE: 322106.91096052824
MAE: 286.5161148313492
R2 SCORE: 0.9806512036896835


model training performance XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=Non

In [54]:
r2_list

[0.9084989604088661,
 0.9084922082165545,
 0.9084876425720075,
 0.8344018828788883,
 0.9806512036896835,
 0.9816304445266724]

In [55]:
max(r2_list)

0.9816304445266724

XGBOOST is giving 98% accuracy so we will be using it as our model