In [None]:
from IPython.display import Image
# Display the diamond picture stored under ../input/diamond-picture as the cell output.
Image("../input/diamond-picture/diamond.jpg")

The aim of this notebook is to hone my visualization skills and to show that a few lines of code are enough to produce beautiful, interactive visualizations. After the EDA I will build a model that predicts the price of a diamond. I also hope this notebook is educational, so if you find it interesting and learn something new, don't forget to vote and leave feedback. Thanks!

## Libraries

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import plotly.express as px
import plotly.graph_objects as go

from scipy.stats import skew, kurtosis

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.pipeline import Pipeline
#from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Default (width, height) for matplotlib figures drawn in this notebook
plt.rcParams.update({"figure.figsize": (12, 5)})

## About the dataset

In [None]:
# Read the raw CSV and discard the unnamed index column that was saved with it
diamonds_df = pd.read_csv("/kaggle/input/diamonds/diamonds.csv").drop(columns=["Unnamed: 0"])

# Names of the numeric columns
num_feat = list(diamonds_df.select_dtypes(include='number').columns)
# Names of the categorical (object-dtype) columns
cat_feat = list(diamonds_df.select_dtypes("object").columns)

diamonds_df

In [None]:
# Column dtypes, non-null counts and memory usage
diamonds_df.info()

In [None]:
# Summary statistics of the numeric columns
diamonds_df.describe()

In [None]:
# Pairwise correlations between the numeric columns only. The categorical
# columns (cut, color, clarity) still hold strings at this point, and
# DataFrame.corr() raises on non-numeric data in pandas >= 2.0.
corr_map = diamonds_df.select_dtypes(include="number").corr()

# Boolean mask hiding the upper triangle and the diagonal, so every pair of
# features is shown exactly once.
mask = np.zeros_like(corr_map, dtype=bool)
mask[np.triu_indices_from(mask)] = True

plt.figure(figsize=(10, 8))
sns.heatmap(corr_map,
            mask=mask,
            annot=True,
            linewidths=1,
            linecolor='w',
            cbar=False);

In [None]:
def skew_test(df):
    """Return the skewness of every column of ``df`` as a tidy table.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are numeric.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``index`` (the original column name) and ``skewness``
        (column-wise skew, NaNs skipped), sorted from the most positive
        skew to the most negative, with a fresh 0..n-1 row index.
    """
    # The skew is computed once; the Series is named so that reset_index()
    # yields the "skewness" column directly.
    return (
        df.skew(axis=0, skipna=True)
        .rename("skewness")
        .sort_values(ascending=False)
        .reset_index()
    )

In [None]:
# Skewness table for the numeric features, highest skew first
skk = skew_test(diamonds_df[num_feat])

In [None]:
# skk is ordered by skewness, not by the original column order, so kurtosis
# must be aligned on the feature name (held in skk['index']) instead of being
# pasted in positionally.
kurtosis_by_feature = diamonds_df[num_feat].kurtosis()
skk['kurtosis'] = skk['index'].map(kurtosis_by_feature)
skk.style.background_gradient(cmap='Blues')

In [None]:
# Price against the x dimension, used to look for outlying x values
sns.scatterplot(x='price', y='x', data=diamonds_df)

In [None]:
# Rows whose x dimension is below 1 -- candidates for removal as outliers later on
diamonds_df.loc[diamonds_df['x'].lt(1)]

## Numeric features

In [None]:
# Numeric column names collected when the data was loaded
num_feat

In [None]:
# One histogram per numeric feature on a two-column grid. The number of rows
# is derived from the number of features, so the figure is neither mostly
# empty nor too small when the feature list changes.
n_cols = 2
n_rows = max(1, (len(num_feat) + n_cols - 1) // n_cols)  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(12, 3 * n_rows), squeeze=False)
flat_axes = axes.ravel()

for ax, col in zip(flat_axes, num_feat):
    sns.histplot(x=diamonds_df[col], ax=ax)
    ax.grid(True)

# Hide panels left over when the number of features does not fill the grid
for ax in flat_axes[len(num_feat):]:
    ax.set_visible(False)

# Lay out once, after every panel has been drawn
fig.tight_layout()
plt.show()

**Label "Price"**

In [None]:
# Box plot of the price distribution
fig = px.box(data_frame=diamonds_df, x="price")
fig.show()

In [None]:
# Price histogram coloured by cut, with a rug plot in the margin
fig = px.histogram(
    diamonds_df,
    x="price",
    color="cut",
    nbins=40,
    marginal="rug",
    color_discrete_sequence=px.colors.qualitative.G10,
)
fig.show()

**"carat" feature**

In [None]:
# Carat histogram coloured by cut, with a rug plot in the margin
fig = px.histogram(
    diamonds_df,
    x="carat",
    color="cut",
    nbins=40,
    marginal="rug",
    color_discrete_sequence=px.colors.qualitative.G10,
)
fig.show()

In [None]:
# Carat against price; marker size follows carat and marker colour follows the colour grade
fig = px.scatter(
    diamonds_df,
    x="price",
    y="carat",
    color="color",
    size="carat",
    opacity=0.6,
    color_discrete_sequence=px.colors.qualitative.Vivid,
)
fig.show()

In [None]:
# Price against carat, one panel per cut, coloured by colour grade
fig = px.scatter(
    diamonds_df,
    x="carat",
    y="price",
    color="color",
    facet_col="cut",
)
fig.show()

**"depth" feature**

In [None]:
# Depth against carat; marker size follows carat and marker colour follows the colour grade
fig = px.scatter(
    diamonds_df,
    x="carat",
    y="depth",
    color="color",
    size="carat",
    opacity=0.4,
)
fig.show()

In [None]:
# Distribution of the depth feature
fig = px.histogram(diamonds_df, x="depth", nbins=45)
fig.show()

**"table" feature** 

In [None]:
# Distribution of the table feature, with a rug plot in the margin
fig = px.histogram(
    diamonds_df,
    x="table",
    nbins=45,
    opacity=0.6,
    marginal="rug",
)
fig.show()

## Categorical features

In [None]:
# Categorical column names collected when the data was loaded
cat_feat

**"Cut" feature**

In [None]:
# Pie chart of price grouped by cut
fig = px.pie(
    diamonds_df,
    names="cut",
    values="price",
    color_discrete_sequence=px.colors.sequential.RdBu,
)
fig.show()

In [None]:
# Number of diamonds per cut (counted once, then split into labels and heights)
cut_counts = diamonds_df["cut"].value_counts()
x = cut_counts.index
y = cut_counts.values

fig = go.Figure(data=[go.Bar(x=x, y=y)])
# Bar styling: fill palette, outline colour/width and transparency
fig.update_traces(
    marker=dict(
        color=px.colors.sequential.Plasma,
        line=dict(color='rgb(8,48,107)', width=1.5),
    ),
    opacity=0.6,
)
fig.update_layout(title=dict(text='Diamond Cut Scale'))
fig.show()

In [None]:
# Depth distribution per cut; quartiles use plotly's "exclusive" method
fig = px.box(diamonds_df, x="cut", y="depth", color="cut").update_traces(
    quartilemethod="exclusive"
)
fig.show()

In [None]:
# Price distribution per cut, split by colour grade; quartiles use plotly's "exclusive" method
fig = px.box(diamonds_df, x="cut", y="price", color="color").update_traces(
    quartilemethod="exclusive"
)
fig.show()

**"Color feature"**

### Color Explained

('E','F','D') - Colorless (scale - Excellent) (4)

('I','J','H','G') - Near Colorless (scale - Very Good) (3)

('K','L','M') - Faint Yellow (scale- Good) (2)

('N','O','P','Q','R') - Very Light Yellow (scale - Fair) (1)

('S','T','U','V','W','X','Y','Z') - Light Yellow (scale - Poor) (0)

In [None]:

# Pie chart of price grouped by colour grade
fig = px.pie(
    diamonds_df,
    names="color",
    values="price",
    color_discrete_sequence=px.colors.sequential.RdBu,
)
fig.show()

In [None]:
# Number of diamonds per colour grade (counted once, then split into labels and heights)
color_counts = diamonds_df["color"].value_counts()
x = color_counts.index
y = color_counts.values

fig = go.Figure(data=[go.Bar(x=x, y=y)])
# Bar styling: fill palette, outline colour/width and transparency
fig.update_traces(
    marker=dict(
        color=px.colors.sequential.RdBu,
        line=dict(color='rgb(8,48,107)', width=1.5),
    ),
    opacity=0.6,
)
fig.update_layout(title=dict(text='Diamond Color'))
fig.show()

In [None]:
# Depth distribution per colour grade; quartiles use plotly's "exclusive" method
fig = px.box(diamonds_df, x="color", y="depth", color="color").update_traces(
    quartilemethod="exclusive"
)
fig.show()

**"Clarity feature"**

### Clarity explained

'IF' - Internally Flawless  (4)

'VVS2' - Very Very Slight Inclusions  (3)

'VVS1' - Very Very Slight Inclusions  (3)

'VS1' - Very Slight Inclusions  (2)

'VS2' - Very Slight Inclusions  (2)

'SI2' - Slight Inclusions  (1)

'SI1' - Slight Inclusions  (1)

'I1' - Imperfect   (0)

In [None]:
# Number of diamonds per clarity grade (counted once, then split into labels and sizes)
clarity_counts = diamonds_df['clarity'].value_counts()
labels = clarity_counts.index
values = clarity_counts.values

# A non-zero `hole` turns the pie into a donut chart
donut = go.Pie(
    labels=labels,
    values=values,
    hole=.5,
    marker=dict(colors=px.colors.sequential.Jet),
)
fig = go.Figure(data=[donut])
fig.show()

In [None]:
# Distinct clarity grades present in the data
diamonds_df['clarity'].unique()

In [None]:
# Clarity grades from best to worst. Within each pair the "1" grade ranks
# above the "2" grade (VVS1 > VVS2, SI1 > SI2).
order = ['IF', 'VVS1', 'VVS2', 'VS1', 'VS2', 'SI1', 'SI2', 'I1']

# Count once and arrange the bars by grade instead of by frequency, so the
# chart reads along the quality scale.
clarity_counts = diamonds_df["clarity"].value_counts().reindex(order)
x = clarity_counts.index
y = clarity_counts.values

fig = go.Figure(data=[go.Bar(x=x, y=y)])
# Customize aspect
fig.update_traces(marker_color='LightSkyBlue', marker_line_color='rgb(8,48,107)',
                  marker_line_width=1.5, opacity=0.6)
fig.update_layout(title_text='Diamond Clarity')
fig.show()

In [None]:
# Depth distribution per clarity grade; quartiles use plotly's "exclusive" method
fig = px.box(diamonds_df, x="clarity", y="depth", color="clarity").update_traces(
    quartilemethod="exclusive"
)
fig.show()

**"x, y, z" features**

In [None]:
#fig = px.scatter_3d(diamonds_df, x='x', y='y', z='z', color="cut")
#fig.show()

EDA summary:

1. There is skewness in the numerical and categorical features.
2. Very high correlation between columns "x", "y", "z", "carat" and "price".
3. There is a linear relation between the above features.
4. Categorical features are ordinal (instead of one-hot encoding them, we need to map them to ordered integer levels).
5. There are only two grade categories in the "color" column - Very Good (G-J) and Excellent (D-F).

## Feature engineering

In [None]:
# First rows before the categorical columns are encoded
diamonds_df.head()

In [None]:
# List the distinct labels of every categorical column
for col in cat_feat:
    print(f"{col} unique values: {diamonds_df[col].unique()}")

In [None]:
# Ordinal encoding for each categorical column, kept per column so that one
# feature's labels can never be confused with another's.
dict_map = {
    "cut": {"Fair": 0, "Good": 1, "Very Good": 2, "Premium": 3, "Ideal": 4},
    # I1 (imperfect) is the lowest grade and gets its own level 0 rather than
    # sharing level 1 with SI1/SI2.
    "clarity": {"I1": 0, "SI1": 1, "SI2": 1, "VS1": 2, "VS2": 2,
                "VVS1": 3, "VVS2": 3, "IF": 4},
    # Grades G-J map to 0 and grades D-F map to 1.
    "color": {"I": 0, "J": 0, "H": 0, "G": 0, "E": 1, "F": 1, "D": 1},
}

for col in cat_feat:
    # A column that is already numeric has been encoded by an earlier run of
    # this cell; mapping it again would turn every value into NaN.
    if pd.api.types.is_numeric_dtype(diamonds_df[col]):
        continue
    # Fail loudly on labels missing from the mapping instead of silently
    # producing NaN.
    unknown = set(diamonds_df[col].dropna().unique()) - set(dict_map[col])
    if unknown:
        raise ValueError(f"Unmapped values in '{col}': {sorted(unknown)}")
    diamonds_df[col] = diamonds_df[col].map(dict_map[col])

In [None]:
# First rows after the categorical columns were mapped to integers
diamonds_df.head()

In [None]:
# Outlier removal: keep every row except those whose z dimension is below 1
implausible_z = diamonds_df['z'] < 1
new_diamonds_df = diamonds_df.loc[~implausible_z].copy()

In [None]:
# Price against the z dimension after removing rows with z below 1
sns.scatterplot(x='price', y='z', data=new_diamonds_df)

In [None]:
# Price against the y dimension, used to look for outlying y values
sns.scatterplot(x='price', y='y', data=new_diamonds_df)

In [None]:
# Index labels of rows whose y dimension exceeds 30, then remove those rows
y_drop_idx = new_diamonds_df.index[new_diamonds_df['y'] > 30]
new_diamonds_df = new_diamonds_df.drop(index=y_drop_idx)

In [None]:
# Index labels of rows whose z dimension exceeds 30, then remove those rows
z_drop_idx = new_diamonds_df.index[new_diamonds_df['z'] > 30]
new_diamonds_df = new_diamonds_df.drop(index=z_drop_idx)

In [None]:
# First rows of the cleaned frame used for modelling
new_diamonds_df.head()

## Split our dataset into X/y

In [None]:
# Separate the target (price) from the predictors
y = new_diamonds_df['price']
X = new_diamonds_df.drop(columns="price")

# Hold out 20% of the rows for validation; the seed fixes the split
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=45,
)

# Standardise the features: the scaler is fitted on the training split only
# and then applied to both splits
scaler = StandardScaler()
scaler.fit(X_train)

scaled_X_train = scaler.transform(X_train)
scaled_X_val = scaler.transform(X_val)

In [None]:

# View the scaled training array with the original column names attached
pd.DataFrame(scaled_X_train, columns=X.columns)

In [None]:
from sklearn.linear_model import ElasticNetCV

In [None]:
# Candidate L1/L2 mixing ratios tried by the cross-validated elastic net
l1_ratio_grid = [.1, .5, .7, .9, .95, .99, 1]
elcv = ElasticNetCV(l1_ratio=l1_ratio_grid)

elcv.fit(scaled_X_train, y_train)

In [None]:
# l1_ratio value selected by cross-validation
elcv.l1_ratio_

In [None]:
# Number of predictor columns, as a 1-tuple
X.columns.shape

In [None]:
# Fitted elastic-net coefficient of each predictor (features were standardised before fitting)
pd.DataFrame(elcv.coef_, index=X.columns, columns=['coefficient'])

## Evaluate a model

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [None]:
# Predict validation-set prices with the fitted ElasticNetCV model
y_preds = elcv.predict(scaled_X_val)

In [None]:
# Validation-set error metrics for the current predictions
mae = mean_absolute_error(y_val, y_preds)
rmse = np.sqrt(mean_squared_error(y_val, y_preds))
r2 = r2_score(y_val, y_preds)

print(f"Base MAE: {mae}")
print(f"Base RMSE: {rmse}")
print(f"Base R-square: {r2}")

In [None]:
# MAE of the current y_preds as a percentage of the mean price in the cleaned dataset
100 * mean_absolute_error(y_val, y_preds) / new_diamonds_df['price'].mean()

In [None]:
from xgboost import XGBRegressor

In [None]:
# XGBoost regressor built with no arguments (library defaults), trained on
# the same standardised training features
xgb = XGBRegressor()
xgb.fit(scaled_X_train, y_train)

In [None]:
# Score the XGBoost model on the validation split. The labels name the model
# so this output cannot be mistaken for the "Base" (ElasticNet) metrics.
y_preds = xgb.predict(scaled_X_val)

print(f"XGB MAE: {mean_absolute_error(y_val, y_preds)}")
print(f"XGB RMSE: {np.sqrt(mean_squared_error(y_val, y_preds))}")
print(f"XGB R-square: {r2_score(y_val, y_preds)}")

In [None]:
# MAE of the current y_preds as a percentage of the mean price in the cleaned dataset
100 * mean_absolute_error(y_val, y_preds) / new_diamonds_df['price'].mean()

Ideas to explore:
1. Try other algorithms for regression
2. Hyperparameter tuning with GridSearchCV
3. Cross-validation.

In [None]:
# Where to go from here: