In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV

In [2]:
# Load the raw dataset; the path is relative to the notebook's working directory.
df= pd.read_csv('cubic_zirconia.csv')

In [3]:
# Preview the first 10 rows of the raw data.
df.head(10)

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,1,0.3,Ideal,E,SI1,62.1,58.0,4.27,4.29,2.66,499
1,2,0.33,Premium,G,IF,60.8,58.0,4.42,4.46,2.7,984
2,3,0.9,Very Good,E,VVS2,62.2,60.0,6.04,6.12,3.78,6289
3,4,0.42,Ideal,F,VS1,61.6,56.0,4.82,4.8,2.96,1082
4,5,0.31,Ideal,F,VVS1,60.4,59.0,4.35,4.43,2.65,779
5,6,1.02,Ideal,D,VS2,61.5,56.0,6.46,6.49,3.99,9502
6,7,1.01,Good,H,SI1,63.7,60.0,6.35,6.3,4.03,4836
7,8,0.5,Premium,E,SI1,61.5,62.0,5.09,5.06,3.12,1415
8,9,1.21,Good,H,SI1,63.8,64.0,6.72,6.63,4.26,5407
9,10,0.35,Ideal,F,VS2,60.5,57.0,4.52,4.6,2.76,706


In [4]:
# Show column dtypes and non-null counts to spot missing data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26967 entries, 0 to 26966
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  26967 non-null  int64  
 1   carat       26967 non-null  float64
 2   cut         26967 non-null  object 
 3   color       26967 non-null  object 
 4   clarity     26967 non-null  object 
 5   depth       26270 non-null  float64
 6   table       26967 non-null  float64
 7   x           26967 non-null  float64
 8   y           26967 non-null  float64
 9   z           26967 non-null  float64
 10  price       26967 non-null  int64  
dtypes: float64(6), int64(2), object(3)
memory usage: 2.3+ MB


In [5]:
# Summary statistics for the numeric columns.
df.describe()

Unnamed: 0.1,Unnamed: 0,carat,depth,table,x,y,z,price
count,26967.0,26967.0,26270.0,26967.0,26967.0,26967.0,26967.0,26967.0
mean,13484.0,0.798375,61.745147,57.45608,5.729854,5.733569,3.538057,3939.518115
std,7784.846691,0.477745,1.41286,2.232068,1.128516,1.166058,0.720624,4024.864666
min,1.0,0.2,50.8,49.0,0.0,0.0,0.0,326.0
25%,6742.5,0.4,61.0,56.0,4.71,4.71,2.9,945.0
50%,13484.0,0.7,61.8,57.0,5.69,5.71,3.52,2375.0
75%,20225.5,1.05,62.5,59.0,6.55,6.54,4.04,5360.0
max,26967.0,4.5,73.6,79.0,10.23,58.9,31.8,18818.0


In [6]:
# Count rows that are exact duplicates of an earlier row.
df.duplicated().sum()

0

In [7]:
# Count missing values per column.
df.isnull().sum()

Unnamed: 0      0
carat           0
cut             0
color           0
clarity         0
depth         697
table           0
x               0
y               0
z               0
price           0
dtype: int64

In [8]:
# Give the three dimension columns self-describing names in a single rename call.
df = df.rename(
    columns={
        'x': 'length_x',
        'y': 'width_y',
        'z': 'height_z',
    }
)

In [9]:
# Discard the 'Unnamed: 0' column that came in with the CSV.
df = df.drop(columns='Unnamed: 0')

In [10]:
# Confirm the rename and the column drop took effect.
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,length_x,width_y,height_z,price
0,0.3,Ideal,E,SI1,62.1,58.0,4.27,4.29,2.66,499
1,0.33,Premium,G,IF,60.8,58.0,4.42,4.46,2.7,984
2,0.9,Very Good,E,VVS2,62.2,60.0,6.04,6.12,3.78,6289
3,0.42,Ideal,F,VS1,61.6,56.0,4.82,4.8,2.96,1082
4,0.31,Ideal,F,VVS1,60.4,59.0,4.35,4.43,2.65,779


In [11]:
# Summary of every column (numeric and categorical), transposed so each column is a row.
df.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
carat,26967,,,,0.798375,0.477745,0.2,0.4,0.7,1.05,4.5
cut,26967,5.0,Ideal,10816.0,,,,,,,
color,26967,7.0,G,5661.0,,,,,,,
clarity,26967,8.0,SI1,6571.0,,,,,,,
depth,26270,,,,61.7451,1.41286,50.8,61.0,61.8,62.5,73.6
table,26967,,,,57.4561,2.23207,49.0,56.0,57.0,59.0,79.0
length_x,26967,,,,5.72985,1.12852,0.0,4.71,5.69,6.55,10.23
width_y,26967,,,,5.73357,1.16606,0.0,4.71,5.71,6.54,58.9
height_z,26967,,,,3.53806,0.720624,0.0,2.9,3.52,4.04,31.8
price,26967,,,,3939.52,4024.86,326.0,945.0,2375.0,5360.0,18818.0


In [12]:
# Select the numeric columns (features plus the `price` target) into their own frame.
numeric_names = [
    'carat',
    'depth',
    'table',
    'length_x',
    'width_y',
    'height_z',
    'price',
]
numcol = df[numeric_names]
numcol

Unnamed: 0,carat,depth,table,length_x,width_y,height_z,price
0,0.30,62.1,58.0,4.27,4.29,2.66,499
1,0.33,60.8,58.0,4.42,4.46,2.70,984
2,0.90,62.2,60.0,6.04,6.12,3.78,6289
3,0.42,61.6,56.0,4.82,4.80,2.96,1082
4,0.31,60.4,59.0,4.35,4.43,2.65,779
...,...,...,...,...,...,...,...
26962,1.11,62.3,58.0,6.61,6.52,4.09,5408
26963,0.33,61.9,55.0,4.44,4.42,2.74,1114
26964,0.51,61.7,58.0,5.12,5.15,3.17,1656
26965,0.27,61.8,56.0,4.19,4.20,2.60,682


In [13]:
# Drop every row that has at least one missing value.
df= df.dropna()

In [14]:
def remove_outlier(col, factor=1.5):
    """Return the IQR-based lower and upper bounds for ``col``.

    Despite its name, this function does not remove or modify anything; it
    only computes the fences that callers use to cap extreme values.

    Parameters
    ----------
    col : array-like of numbers
        Values to analyse. NaN entries are ignored when computing quartiles.
    factor : float, default 1.5
        Multiple of the inter-quartile range added beyond each quartile.

    Returns
    -------
    (lower_range, upper_range) : tuple of float
        ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``.
    """
    # Percentiles do not need pre-sorted input, so no sort is performed.
    # nanpercentile keeps the bounds finite when the column still has NaNs.
    Q1, Q3 = np.nanpercentile(col, [25, 75])
    IQR = Q3 - Q1
    lower_range = Q1 - (factor * IQR)
    upper_range = Q3 + (factor * IQR)
    return lower_range, upper_range

In [15]:

# Replace zero-valued dimensions with the column median (zeros are presumably
# missing measurements). The median is taken over the whole column, zeros included.
for dim in ('length_x', 'width_y', 'height_z'):
    values = df[dim]
    df[dim] = np.where(values == 0, values.median(), values)

In [16]:
# Cap every numeric column at its IQR fences: values above the upper bound
# become the upper bound, values below the lower bound become the lower bound.
# Iterating over `numcol` yields its column names.
# NOTE(review): `numcol` includes the target `price`, and the bounds are computed
# on the full data before the train/test split -- confirm both are intended.
for name in numcol:
    low, high = remove_outlier(df[name])
    current = df[name]
    df[name] = np.where(current > high, high, np.where(current < low, low, current))

In [17]:
# Preview the data after zero replacement and outlier capping.
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,length_x,width_y,height_z,price
0,0.3,Ideal,E,SI1,62.1,58.0,4.27,4.29,2.66,499.0
1,0.33,Premium,G,IF,60.8,58.0,4.42,4.46,2.7,984.0
2,0.9,Very Good,E,VVS2,62.2,60.0,6.04,6.12,3.78,6289.0
3,0.42,Ideal,F,VS1,61.6,56.0,4.82,4.8,2.96,1082.0
4,0.31,Ideal,F,VVS1,60.4,59.0,4.35,4.43,2.65,779.0


In [18]:
# NOTE(review): this import sits mid-notebook; the other imports are in the first cell.
from sklearn.preprocessing import OrdinalEncoder
# Default encoder; `enc` is re-created with explicit category orders before it is first used.
enc= OrdinalEncoder()

In [19]:
# Explicit category orders for the ordinal encoder: a value's position in its
# list is the integer code it is mapped to.
Cut_ord = [
    'Ideal',
    'Premium',
    'Very Good',
    'Good',
    'Fair',
]
Color_ord = [
    'D',
    'E',
    'F',
    'G',
    'H',
    'I',
    'J',
]
Clarity_ord = [
    'IF',
    'VVS1',
    'VVS2',
    'VS1',
    'VS2',
    'SI1',
    'SI2',
    'I1',
]


In [20]:
# Encode the three ordered categorical features in one pass.
# `categories` takes one ordered list per input column, in column order, so a
# single encoder replaces the three copy-pasted ones. The earlier throw-away
# `fit_transform` call, whose result was discarded, is dropped.
# NOTE: this cell overwrites the string columns with codes, so it cannot be
# re-run without reloading `df`.
ordinal_cols = ['cut', 'color', 'clarity']
enc = OrdinalEncoder(categories=[Cut_ord, Color_ord, Clarity_ord])
df[ordinal_cols] = enc.fit_transform(df[ordinal_cols])

In [21]:
# Check that cut, color and clarity now hold numeric codes.
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,length_x,width_y,height_z,price
0,0.3,0.0,1.0,5.0,62.1,58.0,4.27,4.29,2.66,499.0
1,0.33,1.0,3.0,0.0,60.8,58.0,4.42,4.46,2.7,984.0
2,0.9,2.0,1.0,2.0,62.2,60.0,6.04,6.12,3.78,6289.0
3,0.42,0.0,2.0,3.0,61.6,56.0,4.82,4.8,2.96,1082.0
4,0.31,0.0,2.0,1.0,60.4,59.0,4.35,4.43,2.65,779.0


In [22]:
# Summary of the fully numeric, cleaned frame, transposed so each column is a row.
df.describe(include='all').T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
carat,26270.0,0.793229,0.461918,0.2,0.4,0.7,1.05,2.025
cut,26270.0,1.090103,1.11318,0.0,0.0,1.0,2.0,4.0
color,26270.0,2.601979,1.704706,0.0,1.0,3.0,4.0,6.0
clarity,26270.0,3.944766,1.647732,0.0,3.0,4.0,5.0,7.0
depth,26270.0,61.744151,1.259991,58.75,61.0,61.8,62.5,64.75
table,26270.0,57.435603,2.156251,51.5,56.0,57.0,59.0,63.5
length_x,26270.0,5.729645,1.125256,3.73,4.71,5.69,6.55,9.31
width_y,26270.0,5.731569,1.117231,3.71,4.72,5.7,6.54,9.27
height_z,26270.0,3.537614,0.695305,1.19,2.9,3.52,4.04,5.75
price,26270.0,3736.799124,3468.803481,326.0,945.0,2375.0,5361.0,11985.0


In [23]:
# Separate the target (`price`) from the predictor columns.
y = df['price']
x = df.drop(columns='price')

In [24]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, train_labels, test_labels = train_test_split(
    x,
    y,
    test_size=0.30,
    random_state=1,
)

In [25]:
# Standardise the features. The scaler learns its mean and variance from the
# training split only and is then applied to both splits.
ss = StandardScaler()
ss.fit(x_train)
x_train_scaled = ss.transform(x_train)
x_test_scaled = ss.transform(x_test)

In [26]:
# Build the candidate regressors to be compared on R-squared and RMSE.
# `hidden_layer_sizes` is written as a one-element tuple: `(700)` is just the
# integer 700 in parentheses, whereas `(700,)` states "one hidden layer of
# 700 units" explicitly.
ann = MLPRegressor(hidden_layer_sizes=(700,), random_state=1, max_iter=1000)
rf = RandomForestRegressor(random_state=1)
dtr = tree.DecisionTreeRegressor(random_state=1)
reg_mod = LinearRegression()

# Evaluation order; the row labels of the results table must follow it.
Models = [reg_mod, dtr, rf, ann]

# Per-model metric accumulators, filled in the same order as `Models`.
rmse_train = []
rmse_test = []
scores_train = []
scores_test = []


In [27]:
# Fit every model and record R-squared (`score`) and RMSE on both splits.
# The accumulators are reset here so that re-running this cell does not append
# a second set of results and break the length of the results table.
rmse_train, rmse_test, scores_train, scores_test = [], [], [], []

for i in Models:
    # Only the neural network is trained on standardised features; the other
    # models use the unscaled frames. `is` checks object identity directly.
    if i is ann:
        fit_x, eval_x = x_train_scaled, x_test_scaled
    else:
        fit_x, eval_x = x_train, x_test

    i.fit(fit_x, train_labels)
    scores_train.append(i.score(fit_x, train_labels))
    scores_test.append(i.score(eval_x, test_labels))
    rmse_train.append(np.sqrt(mean_squared_error(train_labels, i.predict(fit_x))))
    rmse_test.append(np.sqrt(mean_squared_error(test_labels, i.predict(eval_x))))
        
        



In [28]:
# Collect the four metric lists into one comparison table, one row per model.
# The row labels are hard-coded and must follow the order of `Models`.
# NOTE(review): "Decission" is misspelled; kept as-is to match the recorded output.
model_names = ['Linear Regression', 'Decission Tree', 'Random Forest', 'ANN']
metric_columns = {
    'RMSE_Train': rmse_train,
    'RMSE_Test': rmse_test,
    'Score_Train': scores_train,
    'Score_Test': scores_test,
}
met_val = pd.DataFrame(metric_columns, index=model_names)

In [29]:
# Print the model comparison table as plain text.
print (met_val)

                   RMSE_Train   RMSE_Test  Score_Train  Score_Test
Linear Regression  908.157875  917.772123     0.931624    0.929583
Decission Tree       2.891423  530.474632     0.999999    0.976474
Random Forest      145.056554  396.745249     0.998256    0.986841
ANN                397.582617  420.404934     0.986895    0.985224


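##### Grid Search was not carried out because the test R² of every regressor is within about 0.02 of its training R². Note, however, that the Decision Tree's RMSE rises from ≈ 3 on the training set to ≈ 530 on the test set, which is a sign of overfitting that hyperparameter tuning could reduce.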

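Random Forest shows the best test results (lowest test RMSE ≈ 397, highest test R² ≈ 0.987).
Linear Regression has the smallest gap between its train and test RMSE (≈ 908 vs. ≈ 918), as found in the comparison above, although its absolute error is the largest of the four models.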