## Import Necessary Libraries

In [1]:
import pandas as pd
import numpy as np

#from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split#, GridSearchCV
#from sklearn.neighbors import KNeighborsClassifier
#from sklearn.metrics import confusion_matrix, accuracy_score 


## Read in Data

In [2]:
# load the diamonds dataset from the CSV file located relative to the working directory
data = pd.read_csv('diamonds.csv')

## Clean Data

In [3]:
# preview the first rows to see which columns were read in and what they hold
data.head()

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


### Drop and Rename Columns

In [4]:
# drop the leftover index column that was read in from the CSV
# reassigning (rather than inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: a second run would otherwise raise KeyError because the
# column has already been removed
data = data.drop(columns=['Unnamed: 0'], errors='ignore')

In [7]:
# give the measurement columns self-describing labels that carry their units
descriptive_names = {
    'depth': 'depth(%)',
    'table': 'table(%)',
    'x': 'length(mm)',
    'y': 'width(mm)',
    'z': 'depth(mm)',
}
data.rename(mapper=descriptive_names, axis='columns', inplace=True)

### Describe Data

In [8]:
# list each column's dtype and non-null count to spot missing values and
# columns that still hold strings
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   carat       53940 non-null  float64
 1   cut         53940 non-null  object 
 2   color       53940 non-null  object 
 3   clarity     53940 non-null  object 
 4   depth(%)    53940 non-null  float64
 5   table(%)    53940 non-null  float64
 6   price       53940 non-null  int64  
 7   length(mm)  53940 non-null  float64
 8   width(mm)   53940 non-null  float64
 9   depth(mm)   53940 non-null  float64
dtypes: float64(6), int64(1), object(3)
memory usage: 4.1+ MB


In [22]:
# summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
# NOTE(review): the saved output reports a count of 53920 per column, which matches
# the frame after the zero-dimension rows are dropped further down, not the 53940
# rows reported by data.info(); re-run from a fresh kernel so this summary reflects
# the data as it stands at this point in the notebook
data.describe()

Unnamed: 0,carat,depth(%),table(%),price,length(mm),width(mm),depth(mm)
count,53920.0,53920.0,53920.0,53920.0,53920.0,53920.0,53920.0
mean,0.797698,61.749514,57.456834,3930.993231,5.731627,5.734887,3.540046
std,0.473795,1.432331,2.234064,3987.280446,1.119423,1.140126,0.70253
min,0.2,43.0,43.0,326.0,3.73,3.68,1.07
25%,0.4,61.0,56.0,949.0,4.71,4.72,2.91
50%,0.7,61.8,57.0,2401.0,5.7,5.71,3.53
75%,1.04,62.5,59.0,5323.25,6.54,6.54,4.04
max,5.01,79.0,95.0,18823.0,10.74,58.9,31.8


### Remove 0 Values for Length, Width, and Depth

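Based on a description of the data before cleaning, the minimum value in each of the 'length(mm)', 'width(mm)', and 'depth(mm)' columns is 0. A diamond cannot have a length, width, or depth of 0, which calls the veracity of these records into question. As such, every row containing a 0 in any of these three columns is dropped from the data. (Note: the summary table displayed above was generated after these rows were removed, which is why it shows 53920 rows and non-zero minimums.)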

In [11]:
# blank out physically impossible zero lengths by masking them with NaN
length_mm = data['length(mm)']
data['length(mm)'] = length_mm.mask(length_mm == 0)

In [16]:
# blank out physically impossible zero widths by masking them with NaN
width_mm = data['width(mm)']
data['width(mm)'] = width_mm.mask(width_mm == 0)

In [18]:
# blank out physically impossible zero depths by masking them with NaN
depth_mm = data['depth(mm)']
data['depth(mm)'] = depth_mm.mask(depth_mm == 0)

In [20]:
# remove every row that holds at least one NaN (i.e. the former zero measurements)
data.dropna(axis='index', how='any', inplace=True)

In [21]:
# (rows, columns) remaining after the rows with missing values were dropped
data.shape

(53920, 10)

### Convert Factor Variables into Numerical Representations

There are three columns that need to be converted into numerical representations before any data can be fed into the model: 'cut', 'color', and 'clarity'.


#### Diamond Cut

In [23]:
# frequency of each distinct 'cut' label, most common first
data['cut'].value_counts()

Ideal        21548
Premium      13780
Very Good    12081
Good          4902
Fair          1609
Name: cut, dtype: int64

Given that the 'cut' column is an ordered factor variable, it is important to convert the strings to numeric values, rather than using dummy variables, in order to maintain the information contained within that ordering. 

In [25]:
# ordinal encoding for 'cut': grades are listed in the order they should be ranked,
# so the first entry receives code 1 and the last receives code 5
cut_grades = ['Ideal', 'Premium', 'Very Good', 'Good', 'Fair']
cut_dict = {grade: rank for rank, grade in enumerate(cut_grades, start=1)}

In [26]:
# encode the 'cut' labels as ordinal codes using cut_dict
# Series.map turns every value missing from the dictionary into NaN, so an
# unexpected label -- or re-running this cell once the column already holds codes --
# would silently wipe the column; validate the result before overwriting it
cut_codes = data['cut'].map(cut_dict)
if cut_codes.isna().any():
    raise ValueError("'cut' contains values not present in cut_dict "
                     "(unexpected label, or this cell was already run)")
data['cut'] = cut_codes

In [27]:
# verify the encoding: each code's count should equal the count of the label it replaced
data['cut'].value_counts()

1    21548
2    13780
3    12081
4     4902
5     1609
Name: cut, dtype: int64

#### Diamond Color

In [28]:
# frequency of each distinct 'color' grade, most common first
data['color'].value_counts()

G    11284
E     9797
F     9538
H     8298
D     6774
I     5421
J     2808
Name: color, dtype: int64

Research on diamond value as it relates to color confirms that this is also an ordered factor variable, e.g., a diamond with color D is considered to be the most expensive, followed by E, F, etc. As such, the strings are converted to numeric values in order to preserve information contained within this ordering.

In [29]:
# ordinal encoding for 'color': grade letters are listed in ranking order,
# so 'D' receives code 1 and 'J' receives code 7
color_grades = ['D', 'E', 'F', 'G', 'H', 'I', 'J']
color_dict = {grade: rank for rank, grade in enumerate(color_grades, start=1)}

In [30]:
# encode the 'color' grades as ordinal codes using color_dict
# Series.map turns every value missing from the dictionary into NaN, so an
# unexpected grade -- or re-running this cell once the column already holds codes --
# would silently wipe the column; validate the result before overwriting it
color_codes = data['color'].map(color_dict)
if color_codes.isna().any():
    raise ValueError("'color' contains values not present in color_dict "
                     "(unexpected label, or this cell was already run)")
data['color'] = color_codes

In [31]:
# verify the encoding: each code's count should equal the count of the grade it replaced
data['color'].value_counts()

4    11284
2     9797
3     9538
5     8298
1     6774
6     5421
7     2808
Name: color, dtype: int64

#### Diamond Clarity

In [32]:
# frequency of each distinct 'clarity' grade, most common first
data['clarity'].value_counts()

SI1     13063
VS2     12254
SI2      9185
VS1      8170
VVS2     5066
VVS1     3654
IF       1790
I1        738
Name: clarity, dtype: int64

Research on diamond value as it relates to clarity confirms that this is also an ordered factor variable, e.g., a diamond with clarity IF (Internally Flawless) is considered to be rarer (and thus more valuable) than one with clarity I1 (Included). As such, the strings are converted to numeric values in order to preserve the information contained within this ordering.

In [33]:
# ordinal encoding for 'clarity': grades are listed in ranking order,
# so 'IF' receives code 1 and 'I1' receives code 8
clarity_grades = ['IF', 'VVS1', 'VVS2', 'VS1', 'VS2', 'SI1', 'SI2', 'I1']
clarity_dict = {grade: rank for rank, grade in enumerate(clarity_grades, start=1)}

In [34]:
# encode the 'clarity' grades as ordinal codes using clarity_dict
# Series.map turns every value missing from the dictionary into NaN, so an
# unexpected grade -- or re-running this cell once the column already holds codes --
# would silently wipe the column; validate the result before overwriting it
clarity_codes = data['clarity'].map(clarity_dict)
if clarity_codes.isna().any():
    raise ValueError("'clarity' contains values not present in clarity_dict "
                     "(unexpected label, or this cell was already run)")
data['clarity'] = clarity_codes

In [35]:
# verify the encoding: each code's count should equal the count of the grade it replaced
data['clarity'].value_counts()

6    13063
5    12254
7     9185
4     8170
3     5066
2     3654
1     1790
8      738
Name: clarity, dtype: int64

In [36]:
# preview the cleaned frame now that 'cut', 'color', and 'clarity' hold numeric codes
data.head()

Unnamed: 0,carat,cut,color,clarity,depth(%),table(%),price,length(mm),width(mm),depth(mm)
0,0.23,1,2,7,61.5,55.0,326,3.95,3.98,2.43
1,0.21,2,2,6,59.8,61.0,326,3.89,3.84,2.31
2,0.23,4,2,4,56.9,65.0,327,4.05,4.07,2.31
3,0.29,2,6,5,62.4,58.0,334,4.2,4.23,2.63
4,0.31,4,7,7,63.3,58.0,335,4.34,4.35,2.75
