In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [11]:
# Load the raw diamonds dataset into a DataFrame.
data = pd.read_csv('diamonds.csv')

In [12]:
# Preview the first five rows to sanity-check the columns and their values.
data.head(n=5)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [13]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
# NOTE(review): the rendered output shows a minimum of 0.0 for x, y and z and
# maxima of 58.9 (y) and 31.8 (z) -- these look like invalid or outlier
# dimension entries; confirm and consider cleaning them before modelling.
data.describe()

Unnamed: 0,carat,depth,table,price,x,y,z
count,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0
mean,0.79794,61.749405,57.457184,3932.799722,5.731157,5.734526,3.538734
std,0.474011,1.432621,2.234491,3989.439738,1.121761,1.142135,0.705699
min,0.2,43.0,43.0,326.0,0.0,0.0,0.0
25%,0.4,61.0,56.0,950.0,4.71,4.72,2.91
50%,0.7,61.8,57.0,2401.0,5.7,5.71,3.53
75%,1.04,62.5,59.0,5324.25,6.54,6.54,4.04
max,5.01,79.0,95.0,18823.0,10.74,58.9,31.8


## One-hot encoding: convert the categorical variables to numeric so the model can use them

In [14]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode every object-dtype (categorical) column. drop='first' omits the
# first category of each feature, so that level acts as the implicit baseline
# and the dummy columns are not perfectly collinear with the model intercept.
ohe = OneHotEncoder(drop='first')

data_object = data.select_dtypes('object')

# fit_transform learns the categories and encodes them in one pass; the result
# is sparse, so densify it before wrapping it in a DataFrame.
codes = ohe.fit_transform(data_object).toarray()
feature_names = ohe.get_feature_names_out(data_object.columns)

# Reuse data's index for the encoded frame: pd.concat(axis=1) aligns rows on
# index labels, so a default 0..n-1 index would misalign rows (and introduce
# NaNs) if `data` were ever filtered or re-indexed before this cell runs.
codes_frame = pd.DataFrame(codes, columns=feature_names, index=data.index).astype(int)

# Numeric columns first, followed by the encoded categorical columns.
data_ohe = pd.concat([data.select_dtypes(exclude='object'), codes_frame], axis=1)

data_ohe.head()

Unnamed: 0,carat,depth,table,price,x,y,z,cut_Good,cut_Ideal,cut_Premium,...,color_H,color_I,color_J,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
0,0.23,61.5,55.0,326,3.95,3.98,2.43,0,1,0,...,0,0,0,0,0,1,0,0,0,0
1,0.21,59.8,61.0,326,3.89,3.84,2.31,0,0,1,...,0,0,0,0,1,0,0,0,0,0
2,0.23,56.9,65.0,327,4.05,4.07,2.31,1,0,0,...,0,0,0,0,0,0,1,0,0,0
3,0.29,62.4,58.0,334,4.2,4.23,2.63,0,0,1,...,0,1,0,0,0,0,0,1,0,0
4,0.31,63.3,58.0,335,4.34,4.35,2.75,1,0,0,...,0,0,1,0,0,1,0,0,0,0


## Creating Model

In [15]:
# scikit-learn pieces used by this cell and by the model-fitting cell below.
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Target is the price column; the features are every other column.
y = data_ohe['price'].to_numpy()
x = data_ohe.drop(columns=['price']).to_numpy()

# Create train and test datasets: hold out 20% of the rows, with a fixed seed
# so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=100
)

# Scaling: learn the mean/std from the training split only, then apply the same
# transform to both splits so no test-set statistics leak into the model.
sc = StandardScaler()
sc.fit(x_train)
x_train2 = sc.transform(x_train)
x_test2 = sc.transform(x_test)

In [16]:
from sklearn import metrics

# Fit an ordinary least-squares model on the standardised training features
# (fit returns the estimator itself, so construction and fitting can be chained).
method = LinearRegression().fit(x_train2, y_train)

# Predicted prices for the held-out (standardised) test features.
predict = method.predict(x_test2)


## Evaluation Metrics

In [17]:
# Coefficients: report the fitted intercept, then one coefficient per feature.
# The model was fit on standardised features, so each coefficient is the price
# change per one standard deviation of that feature.
print('\nIntercept: {:.2f}'.format(float(method.intercept_)))

coeff_table = pd.DataFrame(
    data=method.coef_.T,
    index=data_ohe.columns.drop('price'),  # feature names in training order
    columns=['Coefficients'],
)
print(coeff_table)


Intercept: 3945.97
               Coefficients
carat           5423.170294
depth            -91.829051
table            -58.169284
x              -1190.321358
y                  5.334533
z                -50.839380
cut_Good         180.516501
cut_Ideal        434.958779
cut_Premium      355.008044
cut_Very Good    322.900758
color_E          -77.688697
color_F          -94.109194
color_G         -189.291421
color_H         -350.376438
color_I         -439.321748
color_J         -529.482578
clarity_IF       938.247424
clarity_SI1     1561.600196
clarity_SI2     1003.923103
clarity_VS1     1641.652097
clarity_VS2     1774.775673
clarity_VVS1    1246.722571
clarity_VVS2    1437.893300


In [18]:
# Score the held-out predictions once and reuse the values below.
r2 = metrics.r2_score(y_test, predict)
mse = metrics.mean_squared_error(y_test, predict)

# Adjusted R2 = 1 - (1 - R2) * (n - 1) / (n - p - 1).
# n must be the number of samples R2 was computed on (the test split), not the
# size of the full dataset; p is the number of predictors.
n_samples = len(y_test)
n_features = x_test.shape[1]
adjusted_r_squared = 1 - (1 - r2) * (n_samples - 1) / (n_samples - n_features - 1)

print('\nR2: {:.2f}'.format(r2))
print('Adj_R2: {:0.2f}'.format(adjusted_r_squared))
print('Mean Absolute Error: {:.2f}'.format(metrics.mean_absolute_error(y_test, predict)))
print('Mean Squared Error: {:.2f}'.format(mse))
print('Root Mean Squared Error: {:.2f}'.format(np.sqrt(mse)))


R2: 0.92
Adj_R2: 0.92
Mean Absolute Error: 732.62
Mean Squared Error: 1284446.80
Root Mean Squared Error: 1133.33
