In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


In [2]:
df=pd.read_csv("Data\Gemstone.csv")
df

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,1,0.30,Ideal,E,SI1,62.1,58.0,4.27,4.29,2.66,499
1,2,0.33,Premium,G,IF,60.8,58.0,4.42,4.46,2.70,984
2,3,0.90,Very Good,E,VVS2,62.2,60.0,6.04,6.12,3.78,6289
3,4,0.42,Ideal,F,VS1,61.6,56.0,4.82,4.80,2.96,1082
4,5,0.31,Ideal,F,VVS1,60.4,59.0,4.35,4.43,2.65,779
...,...,...,...,...,...,...,...,...,...,...,...
26962,26963,1.11,Premium,G,SI1,62.3,58.0,6.61,6.52,4.09,5408
26963,26964,0.33,Ideal,H,IF,61.9,55.0,4.44,4.42,2.74,1114
26964,26965,0.51,Premium,E,VS2,61.7,58.0,5.12,5.15,3.17,1656
26965,26966,0.27,Very Good,F,VVS2,61.8,56.0,4.19,4.20,2.60,682


In [3]:
df=df.drop("Unnamed: 0",axis=1)

In [41]:
X=df.iloc[:,:-1]

In [42]:
y=df.iloc[:,-1]

In [44]:
categorical=X.columns[X.dtypes=='object']
numerical=X.columns[X.dtypes!='object']

In [30]:
categorical

Index(['cut', 'color', 'clarity'], dtype='object')

In [45]:
X


Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,0.30,Ideal,E,SI1,62.1,58.0,4.27,4.29,2.66
1,0.33,Premium,G,IF,60.8,58.0,4.42,4.46,2.70
2,0.90,Very Good,E,VVS2,62.2,60.0,6.04,6.12,3.78
3,0.42,Ideal,F,VS1,61.6,56.0,4.82,4.80,2.96
4,0.31,Ideal,F,VVS1,60.4,59.0,4.35,4.43,2.65
...,...,...,...,...,...,...,...,...,...
26962,1.11,Premium,G,SI1,62.3,58.0,6.61,6.52,4.09
26963,0.33,Ideal,H,IF,61.9,55.0,4.44,4.42,2.74
26964,0.51,Premium,E,VS2,61.7,58.0,5.12,5.15,3.17
26965,0.27,Very Good,F,VVS2,61.8,56.0,4.19,4.20,2.60


In [8]:
cut_categories = ['Fair', 'Good', 'Very Good','Premium','Ideal']
color_categories = ['D', 'E', 'F', 'G', 'H', 'I', 'J']
clarity_categories = ['I1','SI2','SI1','VS2','VS1','VVS2','VVS1','IF']

In [13]:
from sklearn.impute import SimpleImputer ## HAndling Missing Values
from sklearn.preprocessing import StandardScaler

In [14]:
from sklearn.preprocessing import OrdinalEncoder 

In [15]:
from sklearn.pipeline import Pipeline

In [46]:
num_pipeline=Pipeline(
    steps=[
    ('Imputer',SimpleImputer(strategy='median')),
    ('Scalar',StandardScaler())
    ]
)

cat_pipeline=Pipeline(
    steps=[
    ('Imputer',SimpleImputer(strategy='most_frequent')),
    ('OrdinalEncoder',OrdinalEncoder(categories=[cut_categories,color_categories,clarity_categories])),
    ('Scalar',StandardScaler())

    ]
)

In [47]:
from sklearn.compose import ColumnTransformer

In [48]:
preprocessor=ColumnTransformer(
    [('num_pipeline',num_pipeline,numerical),
    ('cat_pipeline',cat_pipeline,categorical)]
)

In [49]:
preprocessor

In [50]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=0.30,random_state=34)

In [51]:
x_train=pd.DataFrame(preprocessor.fit_transform(x_train),columns=preprocessor.get_feature_names_out())

In [52]:
x_test=pd.DataFrame(preprocessor.transform(x_test),columns=preprocessor.get_feature_names_out())

In [53]:
from sklearn.linear_model import LinearRegression,Ridge,Lasso,ElasticNet

In [61]:
model={
    "LinearRegression":LinearRegression(),
    "Ridge":Ridge(),
    "Lasso":Lasso(),
    "ElasticNet":ElasticNet()
}

In [60]:
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error
def evaluate(y_pred,y_test):
    r2Score=r2_score(y_test,y_pred)
    mse=mean_squared_error(y_test,y_pred)
    mae=mean_absolute_error(y_test,y_pred)

    return (r2Score*100,mse,mae)

In [64]:
model_score=dict()

In [85]:
for i in model.keys():
    print("="*30)
    print(i)
    model_temp=model[i].fit(x_train,y_train)
    y_pred=model_temp.predict(x_test)
    r2,mse,mae=evaluate(y_test,y_pred)
    model_score[i]=r2
    print(f"Model : {i}, r2_score : {r2}, mean squre error : {mse} , mean absolute error : {mae}")
    print("="*30)
    value=(list(model_score.values()))
    best_r2=sorted(value)[-1]
    for i in model_score:
        if(model_score[i]==best_r2):
            print(f"Best model is {i} and r2_score is {model_score[i]}")
            break

LinearRegression
Model : LinearRegression, r2_score : 89.68456343043485, mean squre error : 1505373.4925510043 , mean absolute error : 818.5804124928552
Best model is LinearRegression and r2_score is 89.68456343043485
Ridge
Model : Ridge, r2_score : 89.68104989949485, mean squre error : 1505616.8972281725 , mean absolute error : 818.797871696822
Best model is LinearRegression and r2_score is 89.68456343043485
Lasso
Model : Lasso, r2_score : 89.68302992233187, mean squre error : 1503840.6168850537 , mean absolute error : 819.7031975329884
Best model is LinearRegression and r2_score is 89.68456343043485
ElasticNet
Model : ElasticNet, r2_score : 73.64900347554385, mean squre error : 2683102.886053359 , mean absolute error : 1081.462611683036
Best model is LinearRegression and r2_score is 89.68456343043485


Best model is LinearRegression and r2_score is 89.68456343043485


In [59]:
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error
def evaluate(y_pred,y_test):
    r2Score=r2_score(y_test,y_pred)
    mse=mean_squared_error(y_test,y_pred)
    mae=mean_absolute_error(y_test,y_pred)

    return (r2Score*100,mse,mae)
