# S1 W1 Data Prep & Viz

## Basic Data Handling

### Import packages & load data into a pandas dataframe

In [40]:
# Import packages
import pandas as pd
import numpy as np
import math
import seaborn as sns

In [29]:
# URL of the 'diamonds' dataset CSV, hosted in the author's GitHub account.
# The literal is split in two purely for readability; adjacent string literals
# are joined into the same single URL.
diamonds_url = (
    "https://raw.githubusercontent.com/sba23014/cct_msc_data_analytics/main/"
    "s1_data_preparation_%26_visualisation/week_1/diamonds.csv"
)

In [30]:
# Fetch the CSV file at diamonds_url and parse it into a pandas dataframe
diamonds_df = pd.read_csv(filepath_or_buffer=diamonds_url)

### Observing & describing data

In [31]:
# Observe the first 10 rows of the dataset to check that the columns loaded as expected
diamonds_df.head(10)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
5,0.24,Very Good,J,VVS2,62.8,57.0,336,3.94,3.96,2.48
6,0.24,Very Good,I,VVS1,62.3,57.0,336,3.95,3.98,2.47
7,0.26,Very Good,H,SI1,61.9,55.0,337,4.07,4.11,2.53
8,0.22,Fair,E,VS2,65.1,61.0,337,3.87,3.78,2.49
9,0.23,Very Good,H,VS1,59.4,61.0,338,4.0,4.05,2.39


In [5]:
# View the shape of the dataset as a (number of rows, number of columns) tuple
diamonds_df.shape

(53940, 10)

In [6]:
# Print a summary of the columns in the dataset: each column's name, non-null count
# and dtype, followed by the dataframe's memory usage
diamonds_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    53940 non-null  float64
 1   cut      53940 non-null  object 
 2   color    53940 non-null  object 
 3   clarity  53940 non-null  object 
 4   depth    53940 non-null  float64
 5   table    53940 non-null  float64
 6   price    53940 non-null  int64  
 7   x        53940 non-null  float64
 8   y        53940 non-null  float64
 9   z        53940 non-null  float64
dtypes: float64(6), int64(1), object(3)
memory usage: 4.1+ MB


In [32]:
# View summary statistics (count, mean, std, min, quartiles, max) for the numerical
# columns in the dataset
diamonds_df.describe()

Unnamed: 0,carat,depth,table,price,x,y,z
count,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0
mean,0.79794,61.749405,57.457184,3932.799722,5.731157,5.734526,3.538734
std,0.474011,1.432621,2.234491,3989.439738,1.121761,1.142135,0.705699
min,0.2,43.0,43.0,326.0,0.0,0.0,0.0
25%,0.4,61.0,56.0,950.0,4.71,4.72,2.91
50%,0.7,61.8,57.0,2401.0,5.7,5.71,3.53
75%,1.04,62.5,59.0,5324.25,6.54,6.54,4.04
max,5.01,79.0,95.0,18823.0,10.74,58.9,31.8


In [33]:
# Summarise the object (text) columns in the dataset: count, number of unique values,
# the most frequent value and how often it occurs
diamonds_df.describe(include=[object])

Unnamed: 0,cut,color,clarity
count,53940,53940,53940
unique,5,7,8
top,Ideal,G,SI1
freq,21551,11292,13065


### Selecting rows from dataframes

In [34]:
# Keep only the rows of diamonds_df whose 'cut' value equals 'Ideal'
diamonds_df[diamonds_df['cut'].eq('Ideal')]

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
11,0.23,Ideal,J,VS1,62.8,56.0,340,3.93,3.90,2.46
13,0.31,Ideal,J,SI2,62.2,54.0,344,4.35,4.37,2.71
16,0.30,Ideal,I,SI2,62.0,54.0,348,4.31,4.34,2.68
39,0.33,Ideal,I,SI2,61.8,55.0,403,4.49,4.51,2.78
...,...,...,...,...,...,...,...,...,...,...
53925,0.79,Ideal,I,SI1,61.6,56.0,2756,5.95,5.97,3.67
53926,0.71,Ideal,E,SI1,61.9,56.0,2756,5.71,5.73,3.54
53929,0.71,Ideal,G,VS1,61.4,56.0,2756,5.76,5.73,3.53
53935,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50


In [35]:
# Store the filtered ('Ideal'-cut) rows in a new dataframe. The explicit .copy() makes
# the new dataframe independent of diamonds_df, so modifying it later cannot trigger
# pandas' SettingWithCopyWarning or be mistaken for an edit to the original data.
# NOTE(review): despite the '_low' in its name, this dataframe holds the 'Ideal'-cut
# rows; the name is left unchanged in case other cells refer to it.
diamonds_low_df = diamonds_df.loc[diamonds_df['cut'] == 'Ideal'].copy()

# View the top 5 rows of this new dataframe
diamonds_low_df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
11,0.23,Ideal,J,VS1,62.8,56.0,340,3.93,3.9,2.46
13,0.31,Ideal,J,SI2,62.2,54.0,344,4.35,4.37,2.71
16,0.3,Ideal,I,SI2,62.0,54.0,348,4.31,4.34,2.68
39,0.33,Ideal,I,SI2,61.8,55.0,403,4.49,4.51,2.78


### Adding new columns to the dataframe

In [36]:
# Add a 'price_per_carat' column to diamonds_df: each row's 'price' divided by its 'carat'
diamonds_df['price_per_carat'] = diamonds_df['price'].div(diamonds_df['carat'])

# Show the first 5 rows to confirm the new column
diamonds_df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,price_per_carat
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43,1417.391304
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31,1552.380952
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31,1421.73913
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63,1151.724138
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75,1080.645161


In [37]:
# Add a 'price_per_carat_is_high' flag column to diamonds_df:
# 1 where 'price_per_carat' is greater than 3500, and 0 otherwise
diamonds_df['price_per_carat_is_high'] = diamonds_df['price_per_carat'].gt(3500).astype(int)

# Show the first 5 rows to confirm the new column
diamonds_df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,price_per_carat,price_per_carat_is_high
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43,1417.391304,0
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31,1552.380952,0
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31,1421.73913,0
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63,1151.724138,0
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75,1080.645161,0


In [16]:
# Validate the flag by listing the rows of diamonds_df where 'price_per_carat_is_high' is 1
diamonds_df.query('price_per_carat_is_high == 1')

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,price_per_carat,price_per_carat_is_high
90,0.70,Ideal,E,SI1,62.5,57.0,2757,5.70,5.72,3.57,3938.571429,1
92,0.70,Ideal,G,VS2,61.6,56.0,2757,5.70,5.67,3.50,3938.571429,1
93,0.71,Very Good,E,VS2,62.4,57.0,2759,5.68,5.73,3.56,3885.915493,1
94,0.78,Very Good,G,SI2,63.8,56.0,2759,5.81,5.85,3.72,3537.179487,1
95,0.70,Good,E,VS2,57.5,58.0,2759,5.85,5.90,3.38,3941.428571,1
...,...,...,...,...,...,...,...,...,...,...,...,...
53934,0.72,Premium,D,SI1,62.7,59.0,2757,5.69,5.73,3.58,3829.166667,1
53935,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50,3829.166667,1
53936,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61,3829.166667,1
53937,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56,3938.571429,1


### Function on multiple columns

In [26]:
# The function is_desired labels a single record x (for example a dataframe row, or any
# mapping with 'cut' and 'color' keys). It returns 'yes' if the cut is 'Ideal' and the
# color is 'D', otherwise it returns 'no'

def is_desired(x):
    """Return 'yes' if x['cut'] is 'Ideal' and x['color'] is 'D', else 'no'."""
    # 'and' short-circuits, so 'color' is only looked up when the cut already matches
    meets_both = x['cut'] == 'Ideal' and x['color'] == 'D'
    return 'yes' if meets_both else 'no'

In [38]:
# Another way of writing the same check, using explicit branches that are easier to read.
# Running this cell replaces the earlier definition of is_desired.

def is_desired(x):
    """Return 'yes' if x['cut'] is 'Ideal' and x['color'] is 'D', else 'no'."""
    # Guard clause: answer 'no' as soon as either condition fails
    if x['cut'] != 'Ideal' or x['color'] != 'D':
        return 'no'
    return 'yes'

In [39]:
# Apply is_desired to every row of diamonds_df (axis='columns' passes the function one
# row at a time) and store the resulting 'yes'/'no' labels in a new 'desired' column
diamonds_df['desired'] = diamonds_df.apply(is_desired, axis='columns')

# Show the first 5 rows to confirm the new column
diamonds_df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,price_per_carat,price_per_carat_is_high,desired
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43,1417.391304,0,no
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31,1552.380952,0,no
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31,1421.73913,0,no
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63,1151.724138,0,no
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75,1080.645161,0,no


### Deleting columns from dataframe

In [43]:
# Add a 'rounded_price' column to diamonds_df holding each 'price' value rounded up
# with math.ceil
diamonds_df['rounded_price'] = diamonds_df['price'].map(math.ceil)

# Show the first 5 rows to confirm the new column
diamonds_df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,price_per_carat,price_per_carat_is_high,desired,rounded_price
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43,1417.391304,0,no,326
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31,1552.380952,0,no,326
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31,1421.73913,0,no,327
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63,1151.724138,0,no,334
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75,1080.645161,0,no,335


In [46]:
# Rounds x up to the nearest multiple of 100 using math.ceil
def get_100_multiple_ceil(x):
    """Return x rounded up to the nearest multiple of 100 (e.g. 326 -> 400, 400 -> 400)."""
    return 100 * math.ceil(x / 100)

In [47]:
# Round every value in the 'price' column up to the nearest multiple of 100 with
# get_100_multiple_ceil and store the result in a new 'rounded_price_to_100multiple' column
diamonds_df['rounded_price_to_100multiple'] = diamonds_df['price'].map(get_100_multiple_ceil)

# Show the first 5 rows to confirm the new column
diamonds_df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,price_per_carat,price_per_carat_is_high,desired,rounded_price,rounded_price_to_100multiple
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43,1417.391304,0,no,326,400
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31,1552.380952,0,no,326,400
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31,1421.73913,0,no,327,400
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63,1151.724138,0,no,334,400
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75,1080.645161,0,no,335,400


In [52]:
# The same rounding could also be written in one line with a lambda function, as follows
diamonds_df['rounded_price_to_100multiple'] = diamonds_df['price'].apply(
    lambda price: 100 * math.ceil(price / 100)
)

In [48]:
# Remove the 'rounded_price' and 'rounded_price_to_100multiple' columns from diamonds_df
diamonds_df = diamonds_df.drop(['rounded_price', 'rounded_price_to_100multiple'], axis='columns')

# Show the first 5 rows to confirm the columns are gone
diamonds_df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,price_per_carat,price_per_carat_is_high,desired
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43,1417.391304,0,no
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31,1552.380952,0,no
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31,1421.73913,0,no
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63,1151.724138,0,no
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75,1080.645161,0,no


### Writing dataframe to a file

In [50]:
# Save diamonds_df in CSV format to a file named 'diamonds_modified_csv_index' in the
# current working directory, writing the row index as well
diamonds_df.to_csv('diamonds_modified_csv_index', index=True)

In [51]:
# Save diamonds_df in CSV format to a file named 'diamonds_modified_csv_no_index' in the
# current working directory, this time leaving out the row index
diamonds_df.to_csv(path_or_buf='diamonds_modified_csv_no_index', index=False)