In [None]:
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn import metrics

In [None]:
# Load the diamonds dataset (path is relative to the notebook's working
# directory) and preview the first five rows.
data = pd.read_csv('diamonds.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [None]:
# (rows, columns) of the frame as loaded, before any cleaning step has run.
data.shape

(53940, 11)

In [None]:
# Per-column dtype and non-null count; separates numeric columns from
# object (string) columns.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  53940 non-null  int64  
 1   carat       53940 non-null  float64
 2   cut         53940 non-null  object 
 3   color       53940 non-null  object 
 4   clarity     53940 non-null  object 
 5   depth       53940 non-null  float64
 6   table       53940 non-null  float64
 7   price       53940 non-null  int64  
 8   x           53940 non-null  float64
 9   y           53940 non-null  float64
 10  z           53940 non-null  float64
dtypes: float64(6), int64(2), object(3)
memory usage: 4.5+ MB


In [None]:
# Number of missing (NaN) values in each column.
data.isna().sum()

cut        0
color      0
clarity    0
depth      0
table      0
x          0
y          0
z          0
dtype: int64

In [None]:
# "Unnamed: 0" is an integer column carried over from the CSV (it looks like
# a saved row index -- TODO confirm) and is removed here.
# errors="ignore" makes the cell safe to re-run: without it, a second
# execution raises KeyError because the column has already been dropped.
data = data.drop(columns=["Unnamed: 0"], errors="ignore")

# Summary statistics of the remaining numeric columns.
data.describe()

Unnamed: 0,carat,depth,table,price,x,y,z
count,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0
mean,0.79794,61.749405,57.457184,3932.799722,5.731157,5.734526,3.538734
std,0.474011,1.432621,2.234491,3989.439738,1.121761,1.142135,0.705699
min,0.2,43.0,43.0,326.0,0.0,0.0,0.0
25%,0.4,61.0,56.0,950.0,4.71,4.72,2.91
50%,0.7,61.8,57.0,2401.0,5.7,5.71,3.53
75%,1.04,62.5,59.0,5324.25,6.54,6.54,4.04
max,5.01,79.0,95.0,18823.0,10.74,58.9,31.8


In [None]:
# Summary statistics with "carat" left out of the view.
# The result is intentionally NOT assigned back to `data`: later cells
# (label_data.head(), the final data.describe()) still show a carat column,
# so dropping it permanently here would make a top-to-bottom run disagree
# with the rest of the notebook.
data.drop(columns=["carat"]).describe()

Unnamed: 0,depth,table,price,x,y,z
count,53907.0,53907.0,53907.0,53907.0,53907.0,53907.0
mean,61.749741,57.455948,3930.58447,5.731463,5.733292,3.539441
std,1.420119,2.226153,3987.202815,1.119384,1.111252,0.691434
min,50.8,43.0,326.0,3.73,3.68,2.06
25%,61.0,56.0,949.0,4.71,4.72,2.91
50%,61.8,57.0,2401.0,5.7,5.71,3.53
75%,62.5,59.0,5322.0,6.54,6.54,4.04
max,73.6,79.0,18823.0,10.74,10.54,6.98


In [None]:
# Summary statistics with "price" left out of the view.
# The result is intentionally NOT assigned back to `data`: later cells
# (label_data.head(), the final data.describe()) still show a price column,
# so removing it permanently here would break a top-to-bottom run.
# NOTE(review): price is presumably the regression target -- confirm.
data.drop(columns=["price"]).describe()

Unnamed: 0,depth,table,x,y,z
count,53907.0,53907.0,53907.0,53907.0,53907.0
mean,61.749741,57.455948,5.731463,5.733292,3.539441
std,1.420119,2.226153,1.119384,1.111252,0.691434
min,50.8,43.0,3.73,3.68,2.06
25%,61.0,56.0,4.71,4.72,2.91
50%,61.8,57.0,5.7,5.71,3.53
75%,62.5,59.0,6.54,6.54,4.04
max,73.6,79.0,10.74,10.54,6.98


In [None]:
# Discard every row in which any of the x, y or z measurements is recorded
# as exactly 0; rows are kept only when all three are non-zero.
has_zero_dim = data[["x", "y", "z"]].eq(0).any(axis=1)
data = data[~has_zero_dim]
data.shape

(53920, 10)

In [None]:
# Dropping the outliers: keep only rows whose measurements fall strictly
# inside the limits below (every bound is exclusive).
in_range = (
    data["depth"].gt(45) & data["depth"].lt(75)
    & data["table"].gt(40) & data["table"].lt(80)
    & data["x"].lt(30)
    & data["y"].lt(30)
    & data["z"].gt(2) & data["z"].lt(30)
)
data = data[in_range]
data.shape

(53907, 10)

In [None]:
# Collect the names of the categorical columns, i.e. those stored with the
# "object" dtype. `s` is the boolean per-column mask; both names are kept
# because later cells may read them.
s = data.dtypes.eq("object")
object_cols = s[s].index.tolist()
print("Categorical variables:", object_cols, sep="\n")

Categorical variables:
['cut', 'color', 'clarity']


In [None]:
# Print the distinct values found in each object-dtype column, one line per
# column, in order of first appearance.
x = data.select_dtypes(include='object').columns
for i, column_values in data[x].items():
    print(i, ': ', column_values.unique())

cut :  ['Ideal' 'Premium' 'Good' 'Very Good' 'Fair']
color :  ['E' 'I' 'J' 'H' 'F' 'G' 'D']
clarity :  ['SI2' 'SI1' 'VS1' 'VS2' 'VVS2' 'VVS1' 'I1' 'IF']


In [None]:
# Work on a copy so `data` keeps its original string categories.
label_data = data.copy()

# Encode each categorical column as integers. A separate encoder is fitted
# per column and stored in `label_encoders`, so the codes of every column can
# later be mapped back (classes_ / inverse_transform). Refitting one shared
# encoder would only retain the mapping of the last column processed.
# NOTE: LabelEncoder numbers the classes in sorted label order, which is not
# a quality ranking (e.g. "Good" -> 1, "Ideal" -> 2, "Premium" -> 3 for cut).
label_encoder = LabelEncoder()  # stays bound even if object_cols is empty
label_encoders = {}
for col in object_cols:
    label_encoder = LabelEncoder()
    label_data[col] = label_encoder.fit_transform(label_data[col])
    label_encoders[col] = label_encoder
label_data.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,2,1,3,61.5,55.0,326,3.95,3.98,2.43
1,0.21,3,1,2,59.8,61.0,326,3.89,3.84,2.31
2,0.23,1,1,4,56.9,65.0,327,4.05,4.07,2.31
3,0.29,3,5,5,62.4,58.0,334,4.2,4.23,2.63
4,0.31,1,6,3,63.3,58.0,335,4.34,4.35,2.75


In [None]:
# Summary statistics of the numeric columns after the cleaning steps above.
data.describe()

Unnamed: 0,carat,depth,table,price,x,y,z
count,53907.0,53907.0,53907.0,53907.0,53907.0,53907.0,53907.0
mean,0.797628,61.749741,57.455948,3930.58447,5.731463,5.733292,3.539441
std,0.473765,1.420119,2.226153,3987.202815,1.119384,1.111252,0.691434
min,0.2,50.8,43.0,326.0,3.73,3.68,2.06
25%,0.4,61.0,56.0,949.0,4.71,4.72,2.91
50%,0.7,61.8,57.0,2401.0,5.7,5.71,3.53
75%,1.04,62.5,59.0,5322.0,6.54,6.54,4.04
max,5.01,73.6,79.0,18823.0,10.74,10.54,6.98
