In [None]:
# Import libraries required
!pip install joypy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joypy
from pandas.api.types import CategoricalDtype

import os
# List every file found under the Kaggle input directory, one full path per line
for dirname, _, filenames in os.walk('/kaggle/input'):
    found_paths = [os.path.join(dirname, fname) for fname in filenames]
    for found_path in found_paths:
        print(found_path)

In [None]:
# Load the diamonds CSV from the input folder and show its first five rows
csv_path = '../input/diamonds/diamonds.csv'
diamonds = pd.read_csv(csv_path)
diamonds.head(5)

In [None]:
# Check the dimensions of the data as (rows, columns)
n_rows, n_cols = diamonds.shape
(n_rows, n_cols)

In [None]:
# Drop the redundant index field and preview.
# errors='ignore' keeps this cell safe to re-run: once the column has been
# removed, dropping it again would otherwise raise a KeyError.
diamonds = diamonds.drop(columns=['Unnamed: 0'], errors='ignore')
diamonds.head()

In [None]:
# Compute summary statistics for the diamonds dataset and display them
summary_stats = diamonds.describe()
summary_stats

In [None]:
# Visualise the distribution of price, then report its skewness and kurtosis.
# histplot (density histogram + KDE curve) replaces sns.distplot, which seaborn has deprecated.
sns.histplot(diamonds['price'], kde=True, stat='density');
print("Skewness: %f" % diamonds['price'].skew())
print("Kurtosis: %f" % diamonds['price'].kurt())

In [None]:
# Visualise the distribution of carat, then report its skewness and kurtosis.
# histplot (density histogram + KDE curve) replaces sns.distplot, which seaborn has deprecated.
sns.histplot(diamonds['carat'], kde=True, stat='density');
print("Skewness: %f" % diamonds['carat'].skew())
print("Kurtosis: %f" % diamonds['carat'].kurt())

In [None]:
# Visualise the distribution of depth, which should have very little skewness,
# then report its skewness and kurtosis.
# (The previous comment said "table", but the column analysed here is depth.)
# histplot (density histogram + KDE curve) replaces sns.distplot, which seaborn has deprecated.
sns.histplot(diamonds['depth'], kde=True, stat='density');
print("Skewness: %f" % diamonds['depth'].skew())
print("Kurtosis: %f" % diamonds['depth'].kurt())

In [None]:
# Pairwise relationships between variables, coloured by cut (lower triangle only)
raw_marker_style = {"marker": "+", "linewidth": 1}
sns.pairplot(data=diamonds, hue="cut", corner=True, plot_kws=raw_marker_style);

In [None]:
# Remove outliers interfering with the visualisation.
# A row is kept only when every measurement lies strictly inside its allowed range.
within_bounds = (
    diamonds['depth'].gt(45) & diamonds['depth'].lt(75)
    & diamonds['table'].gt(40) & diamonds['table'].lt(80)
    & diamonds['x'].gt(0) & diamonds['x'].lt(30)
    & diamonds['y'].gt(0) & diamonds['y'].lt(30)
    & diamonds['z'].gt(2) & diamonds['z'].lt(30)
)
diamonds = diamonds[within_bounds]

In [None]:
# Check the dimensions again to see how many rows the outlier filter dropped
rows_kept, cols_kept = diamonds.shape
(rows_kept, cols_kept)

In [None]:
# Pairwise relationships again, now that the outliers have been removed
filtered_marker_style = {"marker": "+", "linewidth": 0.5}
sns.pairplot(data=diamonds, hue="cut", corner=True, plot_kws=filtered_marker_style);

In [None]:
# Visualise pairwise correlations between the numeric variables as a heatmap.
# Restrict to numeric columns first: non-numeric columns (e.g. cut, clarity)
# make DataFrame.corr() raise on pandas >= 2.0, where they are no longer dropped silently.
corr = diamonds.select_dtypes(include='number').corr()
# Mask the upper triangle (diagonal included) so each pair is drawn only once
mask = np.triu(np.ones_like(corr, dtype=bool))
cmap = sns.diverging_palette(230, 20, as_cmap=True)
sns.heatmap(corr, mask=mask, square=True, cmap=cmap, linewidths=1, cbar_kws={"shrink":.8});
plt.xticks(rotation=90);
plt.yticks(rotation=0);

In [None]:
# View spread of prices by cut type.
# Cut grades are listed from best to worst so the ordered categorical reflects
# cut quality: "Ideal" is the top grade and therefore precedes "Premium".
cat_size_order = CategoricalDtype(
    ["Ideal","Premium","Very Good","Good","Fair"],
    ordered=True)

# Cast cut to the ordered categorical defined above
diamonds['cut'] = diamonds['cut'].astype(cat_size_order)

# One price density per cut grade
fig, axes = joypy.joyplot(diamonds,
                          by='cut',
                          column='price',
                          linewidth =.5,
                          overlap=.5,
                          fade=True,
                          bins=80,
                          color='#F07E6E',
                          title="Distribution of prices by diamond cut")

In [None]:
# Ridge plot of price, one density per diamond colour
colour_ridge_style = dict(
    linewidth=.5,
    overlap=.5,
    fade=True,
    bins=80,
    color='#92B3C9',
)
fig, axes = joypy.joyplot(
    diamonds,
    by='color',
    column='price',
    title="Distribution of prices by diamond colour",
    **colour_ridge_style,
)

In [None]:
# Ridge plot of price, one density per clarity grade.
# Clarity grades in the order used for the ordered categorical.
clarity_levels = ["IF","VVS1","VVS2","VS1","VS2","SI1","SI2","I1"]
cat_size_order = CategoricalDtype(categories=clarity_levels, ordered=True)

# Cast clarity to the ordered categorical defined above
diamonds['clarity'] = diamonds['clarity'].astype(cat_size_order)

clarity_ridge_style = dict(
    linewidth=.5,
    overlap=.5,
    fade=True,
    bins=80,
    color='#01B8AA',
)
fig, axes = joypy.joyplot(
    diamonds,
    by='clarity',
    column='price',
    title="Distribution of prices by diamond clarity",
    **clarity_ridge_style,
)