# DSCI 503 - Project 03

### Seif Kungulio

In [2]:
# Import necessary packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Part 1: Loading the Dataset; Preliminary Analysis

In this section, I will load the data into a DataFrame, and will explore the structure of the data set.

In [4]:
# Load the dataset
diamonds = pd.read_csv('diamonds.txt', delimiter='\t')

# Display the first 10 rows of the dataset
print(diamonds.head(10))

    carat        cut color clarity  depth  table  price     x     y     z
1    0.23      Ideal     E     SI2   61.5   55.0    326  3.95  3.98  2.43
2    0.21    Premium     E     SI1   59.8   61.0    326  3.89  3.84  2.31
3    0.23       Good     E     VS1   56.9   65.0    327  4.05  4.07  2.31
4    0.29    Premium     I     VS2   62.4   58.0    334  4.20  4.23  2.63
5    0.31       Good     J     SI2   63.3   58.0    335  4.34  4.35  2.75
6    0.24  Very Good     J    VVS2   62.8   57.0    336  3.94  3.96  2.48
7    0.24  Very Good     I    VVS1   62.3   57.0    336  3.95  3.98  2.47
8    0.26  Very Good     H     SI1   61.9   55.0    337  4.07  4.11  2.53
9    0.22       Fair     E     VS2   65.1   61.0    337  3.87  3.78  2.49
10   0.23  Very Good     H     VS1   59.4   61.0    338  4.00  4.05  2.39


Next, we will determine the size of the dataset.

In [6]:
# Print the shape of the diamonds DataFrame
print(diamonds.shape)

(53940, 10)


We will now inspect the distribution of the columns in the diamonds dataset by displaying descriptive statistics.

In [8]:
# Display descriptive statistics for each column
print(diamonds.describe())

              carat         depth         table         price             x  \
count  53940.000000  53940.000000  53940.000000  53940.000000  53940.000000   
mean       0.797940     61.749405     57.457184   3932.799722      5.731157   
std        0.474011      1.432621      2.234491   3989.439738      1.121761   
min        0.200000     43.000000     43.000000    326.000000      0.000000   
25%        0.400000     61.000000     56.000000    950.000000      4.710000   
50%        0.700000     61.800000     57.000000   2401.000000      5.700000   
75%        1.040000     62.500000     59.000000   5324.250000      6.540000   
max        5.010000     79.000000     95.000000  18823.000000     10.740000   

                  y             z  
count  53940.000000  53940.000000  
mean       5.734526      3.538734  
std        1.142135      0.705699  
min        0.000000      0.000000  
25%        4.720000      2.910000  
50%        5.710000      3.530000  
75%        6.540000      4.040000  


## Part 2: Filtering and Sorting

In this section, I will use filtering and sorting techniques to display information for diamonds satisfying certain criteria. I will start by viewing information about the 5 most expensive diamonds in the dataset.

In [10]:
# Viewing the 5 most expensive diamonds
diamonds[['price', 'carat', 'cut', 'color', 'clarity']].sort_values(by='price', ascending=False).head(5)

Unnamed: 0,price,carat,cut,color,clarity
27750,18823,2.29,Premium,I,VS2
27749,18818,2.0,Very Good,G,SI1
27748,18806,1.51,Ideal,G,IF
27747,18804,2.07,Ideal,G,SI2
27746,18803,2.0,Very Good,H,SI1


Next, I will view information about the 5 least expensive diamonds in the dataset.

In [12]:
# Viewing the 5 least expensive diamonds
diamonds[['price', 'carat', 'cut', 'color', 'clarity']].sort_values(by='price', ascending=True).head(5)

Unnamed: 0,price,carat,cut,color,clarity
1,326,0.23,Ideal,E,SI2
2,326,0.21,Premium,E,SI1
3,327,0.23,Good,E,VS1
4,334,0.29,Premium,I,VS2
5,335,0.31,Good,J,SI2


Now, I will view information about the 5 largest diamonds in the dataset with an ideal cut.

In [14]:
# Viewing the 5 largest diamonds with an ideal cut
diamonds[['price', 'carat', 'cut', 'color', 'clarity']][diamonds['cut'] == 'Ideal'].sort_values(by='carat', ascending=False).head(5)

Unnamed: 0,price,carat,cut,color,clarity
24329,12587,3.5,Ideal,H,I1
24298,12545,3.22,Ideal,I,I1
26468,16037,3.01,Ideal,J,SI2
26745,16538,3.01,Ideal,J,I1
24785,13156,2.75,Ideal,D,I1


Finally, I will view information about the 5 largest diamonds in the dataset with a fair cut.

In [16]:
# Viewing the 5 largest diamonds with a fair cut
diamonds[['price', 'carat', 'cut', 'color', 'clarity']][diamonds['cut'] == 'Fair'].sort_values(by='carat', ascending=False).head(5)

Unnamed: 0,price,carat,cut,color,clarity
27416,18018,5.01,Fair,J,I1
27631,18531,4.5,Fair,J,I1
27131,17329,4.13,Fair,H,I1
23645,11668,3.65,Fair,H,I1
26432,15964,3.4,Fair,D,I1


## Part 3: Working with Categorical Variables

In this section, we will be creating lists to specify the order for each of the three categorical variables: 'cut', 'color', and 'clarity'. These lists will help us communicate the correct order for the levels of these variables to Pandas. 

In [18]:
# Create three lists
clarity_levels = pd.unique(diamonds['clarity']).tolist()
cut_levels = pd.unique(diamonds['cut']).tolist()
color_levels = pd.unique(diamonds['color']).tolist()

# Print the lists
print(clarity_levels)
print(cut_levels)
print(color_levels)

['SI2', 'SI1', 'VS1', 'VS2', 'VVS2', 'VVS1', 'I1', 'IF']
['Ideal', 'Premium', 'Good', 'Very Good', 'Fair']
['E', 'I', 'J', 'H', 'F', 'G', 'D']


In [19]:
# Use 'pd.Categorical()' to set the levels of the cut, color, and clarity columns.
diamonds['clarity'] = pd.Categorical(diamonds['clarity'], clarity_levels)
print(diamonds['clarity'])
diamonds['cut'] = pd.Categorical(diamonds['cut'], cut_levels)
print(diamonds['cut'])
diamonds['color'] = pd.Categorical(diamonds['color'], color_levels)
print(diamonds['color'])

1        SI2
2        SI1
3        VS1
4        VS2
5        SI2
        ... 
53936    SI1
53937    SI1
53938    SI1
53939    SI2
53940    SI2
Name: clarity, Length: 53940, dtype: category
Categories (8, object): ['SI2', 'SI1', 'VS1', 'VS2', 'VVS2', 'VVS1', 'I1', 'IF']
1            Ideal
2          Premium
3             Good
4          Premium
5             Good
           ...    
53936        Ideal
53937         Good
53938    Very Good
53939      Premium
53940        Ideal
Name: cut, Length: 53940, dtype: category
Categories (5, object): ['Ideal', 'Premium', 'Good', 'Very Good', 'Fair']
1        E
2        E
3        E
4        I
5        J
        ..
53936    D
53937    D
53938    D
53939    H
53940    D
Name: color, Length: 53940, dtype: category
Categories (7, object): ['E', 'I', 'J', 'H', 'F', 'G', 'D']
