In [50]:
import pandas as pd
import numpy as np

In [51]:
df = pd.read_csv('data/gemstone.csv')

In [52]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193573 entries, 0 to 193572
Data columns (total 11 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   id       193573 non-null  int64  
 1   carat    193573 non-null  float64
 2   cut      193573 non-null  object 
 3   color    193573 non-null  object 
 4   clarity  193573 non-null  object 
 5   depth    193573 non-null  float64
 6   table    193573 non-null  float64
 7   x        193573 non-null  float64
 8   y        193573 non-null  float64
 9   z        193573 non-null  float64
 10  price    193573 non-null  int64  
dtypes: float64(6), int64(2), object(3)
memory usage: 16.2+ MB


In [53]:
df.describe()

Unnamed: 0,id,carat,depth,table,x,y,z,price
count,193573.0,193573.0,193573.0,193573.0,193573.0,193573.0,193573.0,193573.0
mean,96786.0,0.790688,61.820574,57.227675,5.715312,5.720094,3.534246,3969.155414
std,55879.856166,0.462688,1.081704,1.918844,1.109422,1.102333,0.688922,4034.374138
min,0.0,0.2,52.1,49.0,0.0,0.0,0.0,326.0
25%,48393.0,0.4,61.3,56.0,4.7,4.71,2.9,951.0
50%,96786.0,0.7,61.9,57.0,5.7,5.72,3.53,2401.0
75%,145179.0,1.03,62.4,58.0,6.51,6.51,4.03,5408.0
max,193572.0,3.5,71.6,79.0,9.65,10.01,31.3,18818.0


In [54]:
df = df.drop('id',axis='columns')

In [55]:
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [56]:
## Independent and dependent features
X =df.drop('price',axis='columns')
y=df[['price']]

In [57]:
## Segregating numerical and categorical variable
categorical_cols = X.select_dtypes(include='object').columns
numerical_cols = X.select_dtypes(exclude='object').columns

In [58]:
# Define the custom ranking for each ordinal variable
cut_categories = ['Fair','Good','Very Good','Premium','Ideal']
color_categories = ['D','E','F','G','H','I','J']
clarity_categories = ['I1','SI2','SI1','VS2','VS1','VVS2','VVS1','IF']

In [59]:
# Importing relevant Libraries
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error
from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.preprocessing import StandardScaler,OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [60]:
## Numerical Pipeline
num_pipeline = Pipeline(
    steps=[
        ('imputer',SimpleImputer(strategy='median')),
        ('scaler',StandardScaler())
    ]
)
## categorical Pipeline
cat_pipeline = Pipeline(
    steps=[
        ('imputer',SimpleImputer(strategy='most_frequent')),
        ('ordinalencoder',OrdinalEncoder(categories=[cut_categories,color_categories,clarity_categories])),
        ('scaler',StandardScaler())
    ]
)
preprocessor = ColumnTransformer([
    ('num_pipeline',num_pipeline,numerical_cols),
    ('cat_pipeline',cat_pipeline,categorical_cols)
])

In [61]:
#Train Test Split Data
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test= train_test_split(X,y,random_state=42,test_size=0.30)

In [62]:
X_train = pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())
X_test = pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())

In [63]:
X_train.head()

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,-0.823144,-1.129988,-0.641897,-0.780451,-0.835103,-0.876024,0.8741,-0.936747,1.350746
1,0.945023,-1.777823,0.921902,1.073226,1.166389,0.946633,-1.137644,0.910853,0.684455
2,1.958484,0.165682,0.400636,1.703116,1.755063,1.742237,-0.131772,0.910853,0.018164
3,-0.995648,-0.574701,-0.641897,-1.122391,-1.161138,-1.165334,0.8741,-0.32088,2.017037
4,-0.995648,0.25823,0.400636,-1.176382,-1.152082,-1.136403,-1.137644,1.52672,-0.648127


In [69]:
def evaluate_model(true:np.array,predicted:np.array)->tuple:
    mae = mean_absolute_error(true,predicted)
    mse = mean_squared_error(true,predicted)
    rmse = np.sqrt(mean_squared_error(true,predicted))
    r2 = r2_score(true,predicted)
    return mae,mse,rmse,r2

In [71]:
## Train Multiple Models
models = {
    'linear_regression':LinearRegression(),
    'lasso':Lasso(),
    'ridge_regression':Ridge(),
    'elasticnet_regression':ElasticNet()
}
model_list=[]
r2_list=[]
for x in range(len(list(models))):
    #chosing and fitting the model
    model = list(models.values())[x]
    model.fit(X_train,y_train)
    # Making predictions
    predicted = model.predict(X_test)
    mae,mse,rmse,r2 = evaluate_model(y_test,predicted)
    model_list.append(list(models)[x])
    print(list(models)[x])
    print('Model Training performance')
    print(f'Ther Mean absolute error of the model is {mae}')
    print(f'Ther Mean squared error of the model is {mse}')
    print(f'Ther rmse of the model is {rmse}')
    print(f'The r2 Score is {r2*100}')
    print('='*35)
    print('\n')
    r2_list.append(r2)

linear_regression
Model Training performance
Ther Mean absolute error of the model is 675.0758270067445
Ther Mean squared error of the model is 1029473.3531156889
Ther rmse of the model is 1014.6296630375483
The r2 Score is 93.62906819996046


lasso
Model Training performance
Ther Mean absolute error of the model is 676.2421173665505
Ther Mean squared error of the model is 1029533.150650549
Ther rmse of the model is 1014.6591302750638
The r2 Score is 93.62869814082755


ridge_regression
Model Training performance
Ther Mean absolute error of the model is 675.1077629781348
Ther Mean squared error of the model is 1029482.8101268972
Ther rmse of the model is 1014.634323353442
The r2 Score is 93.6290096749163


elasticnet_regression
Model Training performance
Ther Mean absolute error of the model is 1060.9432977143008
Ther Mean squared error of the model is 2351174.871397875
Ther rmse of the model is 1533.3541245902313
The r2 Score is 85.44967219374031




In [44]:
# list(models)

['linear_regression', 'lasso', 'ridge_regression', 'elasticnet_regression']