# DSCI 503 – Project 03
### Matt Snyder

In [1]:
import numpy as np               
import pandas as pd              
import matplotlib.pyplot as plt

## Part 1: Loading the Dataset; Preliminary Analysis
Look at the shape of the dataset, and distribution of values in the columns

In [2]:
# Read the tab-delimited diamonds file into a DataFrame, then preview its first ten rows.
diamonds = pd.read_csv(
    filepath_or_buffer='diamonds.txt',
    delimiter='\t',
)
diamonds.head(n=10)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
6,0.24,Very Good,J,VVS2,62.8,57.0,336,3.94,3.96,2.48
7,0.24,Very Good,I,VVS1,62.3,57.0,336,3.95,3.98,2.47
8,0.26,Very Good,H,SI1,61.9,55.0,337,4.07,4.11,2.53
9,0.22,Fair,E,VS2,65.1,61.0,337,3.87,3.78,2.49
10,0.23,Very Good,H,VS1,59.4,61.0,338,4.0,4.05,2.39


### Size of Data
Determine the size of the dataset

In [4]:
# Dataset dimensions as a (number of rows, number of columns) tuple.
(len(diamonds.index), len(diamonds.columns))

(53940, 10)

### Distribution of Data
Inspect the distribution of the columns in diamonds

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
diamonds.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,carat,depth,table,price,x,y,z
count,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0
mean,0.79794,61.749405,57.457184,3932.799722,5.731157,5.734526,3.538734
std,0.474011,1.432621,2.234491,3989.439738,1.121761,1.142135,0.705699
min,0.2,43.0,43.0,326.0,0.0,0.0,0.0
25%,0.4,61.0,56.0,950.0,4.71,4.72,2.91
50%,0.7,61.8,57.0,2401.0,5.7,5.71,3.53
75%,1.04,62.5,59.0,5324.25,6.54,6.54,4.04
max,5.01,79.0,95.0,18823.0,10.74,58.9,31.8


## Part 2: Filtering and Sorting
Use filtering and sorting to analyse the high and low edges of the data

### Continue Analysis: Most Expensive
View information about the 5 most expensive diamonds in the dataset

In [6]:
# Five most expensive diamonds:
#   1. keep only the price, carat, cut, color, and clarity columns,
#   2. order the rows from highest to lowest price,
#   3. show the top five rows.
(
    diamonds[['price', 'carat', 'cut', 'color', 'clarity']]
    .sort_values('price', ascending=False)
    .head()
)

Unnamed: 0,price,carat,cut,color,clarity
27750,18823,2.29,Premium,I,VS2
27749,18818,2.0,Very Good,G,SI1
27748,18806,1.51,Ideal,G,IF
27747,18804,2.07,Ideal,G,SI2
27746,18803,2.0,Very Good,H,SI1


### Continue Analysis: Least Expensive
View information about the 5 least expensive diamonds in the dataset

In [7]:
# Five least expensive diamonds:
#   1. keep only the price, carat, cut, color, and clarity columns,
#   2. order the rows from lowest to highest price,
#   3. show the top five rows.
(
    diamonds[['price', 'carat', 'cut', 'color', 'clarity']]
    .sort_values('price', ascending=True)
    .head()
)

Unnamed: 0,price,carat,cut,color,clarity
1,326,0.23,Ideal,E,SI2
2,326,0.21,Premium,E,SI1
3,327,0.23,Good,E,VS1
4,334,0.29,Premium,I,VS2
5,335,0.31,Good,J,SI2


### Continue Analysis: Largest with Ideal Cut
View information about the 5 largest diamonds in the dataset with an ideal cut.

In [8]:
# Five largest diamonds with an ideal cut:
#   1. build a boolean mask that is True only where cut is 'Ideal',
#   2. keep the masked rows and the price, carat, cut, color, and clarity columns,
#   3. order the rows from highest to lowest carat,
#   4. show the top five rows.
(
    diamonds
    .loc[diamonds['cut'].eq('Ideal'), ['price', 'carat', 'cut', 'color', 'clarity']]
    .sort_values('carat', ascending=False)
    .head()
)

Unnamed: 0,price,carat,cut,color,clarity
24329,12587,3.5,Ideal,H,I1
24298,12545,3.22,Ideal,I,I1
26468,16037,3.01,Ideal,J,SI2
26745,16538,3.01,Ideal,J,I1
24785,13156,2.75,Ideal,D,I1


### Continue Analysis: Largest with Fair Cut
View information about the 5 largest diamonds in the dataset with a fair cut.

In [9]:
# Five largest diamonds with a fair cut:
#   1. build a boolean mask that is True only where cut is 'Fair',
#   2. keep the masked rows and the price, carat, cut, color, and clarity columns,
#   3. order the rows from highest to lowest carat,
#   4. show the top five rows.
(
    diamonds
    .loc[diamonds['cut'].eq('Fair'), ['price', 'carat', 'cut', 'color', 'clarity']]
    .sort_values('carat', ascending=False)
    .head()
)

Unnamed: 0,price,carat,cut,color,clarity
27416,18018,5.01,Fair,J,I1
27631,18531,4.5,Fair,J,I1
27131,17329,4.13,Fair,H,I1
23645,11668,3.65,Fair,H,I1
26432,15964,3.4,Fair,D,I1


## Part 3: Working with Categorical Variables
Configure the DataFrame with the correct ordering of the categorical columns, and create color palettes for those columns.

In [15]:
# Level names for each categorical variable, always listed from worst to best.

# cut: quality of the cut of the diamond.
cut_levels = [
    'Fair',
    'Good',
    'Very Good',
    'Premium',
    'Ideal',
]

# color: level of tint in the diamond; colorless diamonds are generally preferred,
# so the scale runs from J (worst) up to D (best).
color_levels = [
    'J', 'I', 'H', 'G', 'F', 'E', 'D',
]

# clarity: level of internal defects in the diamond.
clarity_levels = [
    'I1',
    'SI2', 'SI1',
    'VS2', 'VS1',
    'VVS2', 'VVS1',
    'IF',
]

### Set Categorical Ordering in DataFrame
Use ordered lists of category strings to communicate to Pandas the correct order for the levels of the three categorical variables.

In [22]:
# Convert the clarity, cut, and color columns to ordered categoricals, using one
# pd.Categorical() call per column.
#
# ordered=True tells pandas that each *_levels list is a ranking (worst to best).
# Without it the categories are stored in list order but the dtype is unordered,
# so comparisons such as `diamonds['cut'] > 'Good'` and min()/max() raise a TypeError.
# Any value not present in the corresponding levels list becomes NaN.
#
# Item assignment (diamonds['cut'] = ...) is used rather than attribute assignment
# because it always targets the column, regardless of the column's name.
diamonds['clarity'] = pd.Categorical(diamonds['clarity'], categories=clarity_levels, ordered=True)
diamonds['cut'] = pd.Categorical(diamonds['cut'], categories=cut_levels, ordered=True)
diamonds['color'] = pd.Categorical(diamonds['color'], categories=color_levels, ordered=True)

### Create Lists of Colors
Create lists of named colors to serve as palettes to be used for visualizations

In [25]:
# Named-color palettes, one per categorical variable. Each palette holds one color
# per level of its variable: 8 for clarity, 7 for color, and 5 for cut.
clarity_pal = [
    'gray', 'rosybrown', 'peachpuff', 'brown',
    'sienna', 'tomato', 'red', 'darkred',
]
color_pal = [
    'cornsilk', 'gold', 'yellow', 'palegreen',
    'limegreen', 'lightseagreen', 'aqua',
]
cut_pal = [
    'thistle', 'hotpink', 'magenta', 'blue', 'royalblue',
]

## Part 4: Displaying Counts for Categorical Variables