In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import OrdinalEncoder

In [None]:
# Apply seaborn's "whitegrid" style to every plot drawn below.
sns.set_theme(style="whitegrid")

In [None]:
# Location of the raw CSV inside the Kaggle input directory.
DIAMONDS_CSV_PATH = '/kaggle/input/diamonds-prices/Diamonds Prices2022.csv'

diamonds_data = pd.read_csv(DIAMONDS_CSV_PATH)

In [None]:
# Display pandas' summary statistics for the freshly loaded frame.
diamonds_data.describe()

In [None]:
# Print column names, dtypes and non-null counts for the freshly loaded frame.
diamonds_data.info()

In [None]:
# Preview the first rows of the freshly loaded frame.
diamonds_data.head()

In [None]:
# Remove the 'Unnamed: 0' column BEFORE de-duplicating.
# That column is presumably the row index exported with the CSV (pandas gives a
# header-less column this name); if it is still present its per-row values keep
# otherwise identical diamonds from ever comparing equal, so drop_duplicates()
# would remove nothing.
# errors='ignore' keeps this cell re-runnable once the column is already gone.
diamonds_data = (
    diamonds_data
    .drop(columns=['Unnamed: 0'], errors='ignore')
    .drop_duplicates()
)

In [None]:
# Count the missing values in each column.
diamonds_data.isna().sum()

In [None]:
# Category orderings handed to the OrdinalEncoder in the encoding cell below:
# a value's position in its list becomes its integer code.
# Presumably each list runs from the lowest to the highest grade -- confirm
# against the dataset's documentation.
cut_order = [
    'Fair',
    'Good',
    'Very Good',
    'Premium',
    'Ideal',
]
color_order = [
    'J',
    'I',
    'H',
    'G',
    'F',
    'E',
    'D',
]
clarity_order = [
    'I1',
    'SI2',
    'SI1',
    'VS2',
    'VS1',
    'VVS2',
    'VVS1',
    'IF',
]


In [None]:
# Replace the three graded text columns with ordinal codes, using the explicit
# category orderings defined above (one ordering per column, in the same order
# as `ordinal_columns`).
ordinal_columns = ['cut', 'color', 'clarity']
encoder = OrdinalEncoder(categories=[cut_order, color_order, clarity_order])

encoded_values = encoder.fit_transform(diamonds_data[ordinal_columns])
diamonds_data[ordinal_columns] = encoded_values

In [None]:
numerical_columns = ['carat', 'depth', 'table', 'price', 'x', 'y', 'z']

# Per-column quartiles and the 1.5 * IQR fences derived from them.
numeric_values = diamonds_data[numerical_columns]
Q1, Q3 = (numeric_values.quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# A row is discarded as soon as any one of its numeric values falls outside
# that column's fences.
below_lower_fence = numeric_values < lower_fence
above_upper_fence = numeric_values > upper_fence
is_outlier_row = (below_lower_fence | above_upper_fence).any(axis=1)
diamonds_data = diamonds_data[~is_outlier_row]


In [None]:
# Preview the first rows after ordinal encoding and the IQR filter.
diamonds_data.head()

In [None]:
# Second outlier pass: keep only rows whose continuous measurements all lie
# within 3 standard deviations of their column mean.
#
# The z-scores are computed on `numerical_columns` (defined in the IQR cell)
# rather than on every numeric dtype. After ordinal encoding, cut / color /
# clarity are numeric too, but they hold category ranks: applying a z-score
# cut-off to them can silently remove every diamond of a rare grade.
z_scores = np.abs(stats.zscore(diamonds_data[numerical_columns]))
diamonds_data = diamonds_data[(z_scores < 3).all(axis=1)]


In [None]:
# (rows, columns) remaining after the IQR and z-score filters.
diamonds_data.shape

In [None]:
numerical_columns = ['carat', 'depth', 'table', 'price', 'x', 'y', 'z']

# Build, tidy and render the figure in a single cell. With the notebook's
# inline backend a figure is drawn and closed when the cell that created it
# finishes, so removing the empty panels or calling tight_layout()/show() from
# a later cell has no effect on what was displayed.
fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(15, 12))
fig.suptitle('Univariate Analysis of Numerical Variables', fontsize=16)
flat_axes = axes.ravel()

# One histogram (with KDE overlay) per numeric column, filling the grid row by row.
for ax, col in zip(flat_axes, numerical_columns):
    sns.histplot(diamonds_data[col], kde=True, ax=ax, bins=30, color="skyblue")
    ax.set_title(f'{col} Distribution')

# The 3x3 grid has more slots than columns; delete the unused trailing axes.
for unused_ax in flat_axes[len(numerical_columns):]:
    fig.delaxes(unused_ax)

# Leave headroom at the top for the suptitle.
fig.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()

In [None]:
categorical_columns = ['cut', 'color', 'clarity']

# Layout and rendering stay in the same cell as the figure's creation: with the
# inline backend the figure is drawn and closed when this cell ends, so a
# tight_layout()/show() issued from a later cell would not affect it.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# One bar chart of value counts per (ordinal-encoded) categorical column.
for ax, col in zip(axes, categorical_columns):
    sns.countplot(data=diamonds_data, x=col, palette="viridis", ax=ax)
    ax.set_title(f'{col} Count Distribution')

fig.tight_layout()
plt.show()

In [None]:
# Annotated heatmap of the Spearman correlation between all columns of the
# cleaned frame.
plt.figure(figsize=(10, 8))
spearman_corr = diamonds_data.corr(method='spearman')
spearman_ax = sns.heatmap(
    spearman_corr,
    annot=True,
    cmap="viridis",
    fmt=".2f",
)
spearman_ax.set_title("Spearman Correlation Matrix of Numerical Variables")
plt.show()


In [None]:
# Annotated heatmap of the Pearson correlation between all columns of the
# cleaned frame.
plt.figure(figsize=(10, 8))
pearson_corr = diamonds_data.corr(method='pearson')
pearson_ax = sns.heatmap(
    pearson_corr,
    annot=True,
    cmap="coolwarm",
    fmt=".2f",
)
pearson_ax.set_title("Pearson Correlation Matrix of Numerical Variables")
plt.show()


In [None]:
# Scatter-matrix of the numeric columns, with KDE curves on the diagonal and
# semi-transparent points off the diagonal.
pair_grid = sns.pairplot(
    diamonds_data[numerical_columns],
    diag_kind='kde',
    plot_kws={'alpha': 0.3},
)
# y > 1 places the title just above the grid of panels.
pair_grid.figure.suptitle("Pair Plot of Numerical Variables", y=1.02)
plt.show()

In [None]:

# Price distribution for each level of the categorical columns, one box-plot
# panel per column.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

for ax, col in zip(axes, categorical_columns):
    sns.boxplot(data=diamonds_data, x=col, y='price', palette="muted", ax=ax)
    ax.set_title(f'Price vs {col}')

fig.tight_layout()
plt.show()


In [None]:
# Price against cut, color and clarity in turn, with each point coloured by
# its carat value.
fig, axes = plt.subplots(1, 3, figsize=(20, 6))

for ax, col in zip(axes, ('cut', 'color', 'clarity')):
    sns.scatterplot(
        data=diamonds_data,
        x=col,
        y='price',
        hue='carat',
        palette='cool',
        ax=ax,
    )
    # Titles read "Price vs Cut (Hue: Carat)", "Price vs Color ...", etc.
    ax.set_title(f"Price vs {col.capitalize()} (Hue: Carat)")

plt.tight_layout()
plt.show()

In [None]:
# Checking relationships between carat, depth, table, and price through a 3D
# scatter plot: the three measurements are the axes, price is the point colour.

# Import 3D plotting library (the name is not referenced below; kept from the original cell).
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401

fig = plt.figure(figsize=(12, 10))
ax = fig.add_subplot(111, projection='3d')

sc = ax.scatter(
    diamonds_data['carat'],
    diamonds_data['depth'],
    diamonds_data['table'],
    c=diamonds_data['price'],
    cmap='viridis',
    alpha=0.6,
)
plt.colorbar(sc, label='Price')

ax.set(xlabel='Carat', ylabel='Depth', zlabel='Table')
ax.set_title('3D Scatter Plot: Carat, Depth, Table vs Price')
plt.show()
