# Diamond Data Set

In [7]:
# importing libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation
import seaborn as sns
import plotly.express as px
import plotly.subplots as sp
from plotly.subplots import make_subplots
import plotly.graph_objects as go

In [17]:
# Read the diamonds CSV into the working DataFrame `df`.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where the file exists at exactly this location.
DATA_PATH = r'C:\Users\user\Downloads\diamonds.csv'

df = pd.read_csv(DATA_PATH)
df

In [18]:
# Remove the 'Unnamed: 0' column (presumably a row index written out when the
# CSV was exported -- confirm against the raw file).
# errors='ignore' keeps this cell safe to re-run: once the column is gone, a
# second execution is a no-op instead of raising KeyError.
df = df.drop(columns='Unnamed: 0', errors='ignore')


In [19]:
# Preview the first rows of the frame.
df.head()

In [20]:
# Print the column dtypes, non-null counts and memory usage summary.
df.info()

In [21]:
# Count missing values in each column.
df.isna().sum()

In [8]:
# Discard fully duplicated rows, keeping the first occurrence of each.
df = df.drop_duplicates()

In [22]:
# (rows, columns) of the frame at this point in the notebook.
df.shape

In [23]:
# Number of distinct values in each column.
df.nunique()

In [24]:
# Distinct values present in the 'cut' column.
df['cut'].unique()

In [25]:
# Distinct values present in the 'color' column.
df['color'].unique()

In [26]:
# Distinct values present in the 'clarity' column.
df['clarity'].unique()

In [27]:
# Summary statistics of the numeric columns, transposed so that each feature
# is a row and each statistic is a column.
df.describe().transpose()

In [28]:
# EDA Data Visualization

# Subplots of Bar Graphs: Cut, Color, and Clarity
#
# One bar chart per categorical feature, showing how many rows fall into each
# category.

# Frequency tables: one row per category, with explicit column names so the
# result does not depend on how value_counts().reset_index() labels them.
x = df['cut'].value_counts().reset_index()
x.columns = ['cut', 'count']

y = df['color'].value_counts().reset_index()
y.columns = ['color', 'count']

z = df['clarity'].value_counts().reset_index()
z.columns = ['clarity', 'count']

fig = make_subplots(rows=1, cols=3, subplot_titles=['Cut', 'Color', 'Clarity'])

# (frequency table, category column, trace name) for each panel, left to right.
panels = [(x, 'cut', 'Cut'), (y, 'color', 'Color'), (z, 'clarity', 'Clarity')]
for position, (counts, category, label) in enumerate(panels, start=1):
    fig.add_trace(go.Bar(x=counts[category], y=counts['count'], name=label),
                  row=1, col=position)

fig.update_layout(title_text='Subplots of Bar Graphs: Cut, Color, and Clarity',
                  showlegend=False)

# layout.xaxis / layout.yaxis address only the first subplot, so the titles are
# applied through update_xaxes / update_yaxes to reach all three panels.
fig.update_xaxes(title_text='Categories')
fig.update_yaxes(title_text='Count')

fig.show()

In [29]:
# Histogram Subplots of Numerical Features
#
# One histogram per numeric column, laid out on a grid that is `num_cols` wide.

numerical_columns = ['carat', 'depth', 'table', 'price', 'x', 'y', 'z']

num_cols = 3
# Ceiling division by the actual number of grid columns, so the grid has just
# enough rows for every feature and no empty trailing row.
num_rows = -(-len(numerical_columns) // num_cols)

fig = sp.make_subplots(rows=num_rows, cols=num_cols,
                       subplot_titles=numerical_columns,
                       vertical_spacing=0.15)
for i, column in enumerate(numerical_columns):
    # Fill the grid row by row; plotly subplot coordinates are 1-based.
    row = (i // num_cols) + 1
    col = (i % num_cols) + 1
    trace = go.Histogram(x=df[column], nbinsx=20, name=column)
    fig.add_trace(trace, row=row, col=col)

# Each panel already carries its feature name as a subplot title, so the
# legend is hidden rather than repeating the same names.
fig.update_layout(height=num_rows * 300, width=num_cols * 400,
                  title_text='Histogram Subplots of Numerical Features',
                  showlegend=False)
fig.update_yaxes(title_text='Frequency')
fig.update_xaxes(title_text='Value')

fig.show()

In [30]:
# Distribution of Price
#
# Histogram of the 'price' column using up to 30 bins.

histogram_fig = px.histogram(df, x='price', nbins=30,
                             title='Distribution of Price',
                             labels={'price': 'Price', 'count': 'Frequency'})

# The y axis holds a computed count rather than a DataFrame column, so the
# 'count' entry in `labels` may not be applied to it. Setting the axis title
# explicitly guarantees it reads 'Frequency'.
histogram_fig.update_layout(yaxis_title='Frequency')

histogram_fig.show()

In [31]:
# Price distribution per cut category, drawn as one box per 'cut' value.

box_fig = px.box(
    data_frame=df,
    x='cut',
    y='price',
    title='Box Plot of Price by Cut',
    labels=dict(cut='Cut Quality', price='Price'),
)
box_fig.show()

In [36]:
# Carat vs Price Colored by Cut
#
# Each diamond is one point; the marker colour encodes its 'cut' value.

scatter_fig = px.scatter(
    data_frame=df,
    x='carat',
    y='price',
    color='cut',
    title='Carat vs Price Colored by Cut',
    labels=dict(carat='Carat', price='Price', cut='Cut Quality'),
)
scatter_fig.show()

In [37]:
# 3D scatter with carat on x, cut on y and price on z; points are coloured by
# cut as well, so each cut category forms its own coloured slice.

scatter_3d_fig = px.scatter_3d(
    data_frame=df,
    x='carat',
    y='cut',
    z='price',
    color='cut',
    title='3D Scatter Plot of Carat, Cut, and Price',
    labels=dict(carat='Carat', cut='Cut Quality', price='Price'),
)

scatter_3d_fig.show()

In [38]:
# 3D Scatter Plot of Price
#
# Carat (x) against depth (y) against price (z); the marker colour follows
# price on a continuous colour scale.

fig = px.scatter_3d(
    data_frame=df,
    x='carat',
    y='depth',
    z='price',
    color='price',
    title='3D Scatter Plot of Price',
)
fig.show()


In [39]:
# Pairwise correlation between the numeric features (pandas' default method).

numerical_columns = ['carat', 'depth', 'table', 'price', 'x', 'y', 'z']
correlation_matrix = df.loc[:, numerical_columns].corr()
correlation_matrix

In [40]:
# Correlation heatmap of the numeric features, with the coefficient printed
# inside every cell.

# Select only numerical columns
numerical_columns = ['carat', 'depth', 'table', 'price', 'x', 'y', 'z']

# Create a subset of the dataset with numerical columns
numerical_data = df[numerical_columns]

# Calculate correlation matrix
corr_matrix = numerical_data.corr()

# Heatmap trace; the colour range is pinned to [-1, 1] so colours map to the
# full range a correlation coefficient can take.
heatmap_fig = go.Figure(data=go.Heatmap(
                   z=corr_matrix.values,
                   x=corr_matrix.columns,
                   y=corr_matrix.columns,
                   colorscale='Viridis',
                   colorbar=dict(title="Correlation"),
                   zmin=-1, zmax=1))

# One text annotation per cell. z[i][j] is drawn at (x[j], y[i]), so the value
# shown at (x=column, y=row) is corr_matrix.loc[row, column].
# Text is dark on the bright (positive) end of Viridis and light on the dark
# (negative) end to stay readable.
cell_annotations = [
    dict(
        x=col_name,
        y=row_name,
        text=f"{corr_matrix.loc[row_name, col_name]:.2f}",
        showarrow=False,
        font=dict(color='black' if corr_matrix.loc[row_name, col_name] > 0 else 'white'),
    )
    for row_name in corr_matrix.index
    for col_name in corr_matrix.columns
]

heatmap_fig.update_layout(title="Correlation Heatmap (Numerical Features)",
                          xaxis=dict(title="Features"),
                          yaxis=dict(title="Features"),
                          annotations=cell_annotations)
heatmap_fig.show()

In [41]:
# Animated plots
#
# Carat vs price, with one animation frame per 'cut' value.
# Axis ranges are fixed from the whole data set: otherwise the axes are fitted
# to the first frame only and points in later frames can fall outside the view.
# The 5% headroom keeps the largest points off the plot border.

scatter_fig = px.scatter(df, x='carat', y='price', animation_frame='cut', color='cut',
                         range_x=[0, df['carat'].max() * 1.05],
                         range_y=[0, df['price'].max() * 1.05],
                         title='Animated Scatter Plot: Carat vs Price Colored by Cut')
scatter_fig.show()


In [43]:
# Animated strip plot: price by cut, one animation frame per 'color' value,
# points coloured by 'clarity'.
# The price axis is fixed from the whole data set so every frame shares the
# same scale instead of inheriting the range fitted to the first frame.

strip_fig = px.strip(df, x='cut', y='price', animation_frame='color', color='clarity',
                     range_y=[0, df['price'].max() * 1.05],
                     title='Animated Strip Plot: Price by Cut and Color with Clarity',
                     labels={'cut': 'Cut Quality', 'price': 'Price', 'color': 'Color', 'clarity': 'Clarity'})

strip_fig.show()


In [42]:
# Animated 3D scatter of carat, depth and price, one animation frame per 'cut'.
# All three axis ranges are fixed from the whole data set so the scene does not
# keep the ranges fitted to the first frame while later frames are shown.
scatter_3d_fig = px.scatter_3d(df, x='carat', y='depth', z='price', animation_frame='cut', color='cut',
                               range_x=[0, df['carat'].max() * 1.05],
                               range_y=[df['depth'].min(), df['depth'].max()],
                               range_z=[0, df['price'].max() * 1.05],
                               title='Animated 3D Scatter Plot: Carat, Depth, and Price Colored by Cut')
scatter_3d_fig.show()