In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.base import clone
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Read the diamonds table from the Kaggle input directory and preview it.
csv_path = '/kaggle/input/diamonds/diamonds.csv'
data = pd.read_csv(csv_path)
data

In [None]:
# Remove the 'Unnamed: 0' column from the loaded frame.
# `errors='ignore'` keeps this cell idempotent: running it again after the
# column is already gone is a no-op instead of raising KeyError.
data = data.drop(columns='Unnamed: 0', errors='ignore')

## Categorical Data Visualization

In [None]:
# Show the distinct labels found in each categorical column.
for label, col in (("Cuts:", "cut"), ("Colors: ", "color"), ("Clarities: ", "clarity")):
    print(f"{label}{data[col].unique()}")

In [None]:
# Cut: number of diamonds per cut (left) and price distribution per cut (right).
cut_order = ['Fair', 'Good', 'Very Good', 'Premium', 'Ideal']

fig, ax = plt.subplots(1, 2, figsize=(18, 6))
ax_count, ax_price = ax
fig.suptitle('Diamonds by Cut', fontsize=20)

g1 = sns.countplot(x="cut", data=data, order=cut_order, ax=ax_count)
g2 = sns.boxplot(x="cut", y="price", data=data, order=cut_order, ax=ax_price)

# Both panels get the same cosmetic treatment: no x-label, rotated tick labels.
for panel in (g1, g2):
    panel.set(xlabel=None)
    panel.tick_params(labelrotation=45)

plt.show()

Most diamonds in the dataset have an Ideal cut, and Premium-cut diamonds are the most expensive.

In [None]:
# Color: number of diamonds per color (left) and price distribution per color (right).
# The axis order is passed through `order=` instead of sorting the whole frame
# once per panel; reverse-alphabetical order reproduces the axis that the
# descending sort used to give.
color_order = sorted(data['color'].dropna().unique(), reverse=True)

fig, ax = plt.subplots(1, 2, figsize=(18, 6))
fig.suptitle('Diamonds by Color', fontsize=20)
g1 = sns.countplot(ax=ax[0], x="color", data=data, order=color_order)
g1.set(xlabel=None)
g2 = sns.boxplot(ax=ax[1], x="color", y="price", data=data, order=color_order)
g2.set(xlabel=None)

plt.show()

Most diamonds have colour G, E, or F, in that order.
The most expensive colours are J and I, in that order; G and H have similar prices.

In [None]:
# Clarity: number of diamonds per clarity (left) and price distribution per clarity (right).
# The axis is ordered by grade, worst to best, like the cut plot, rather than
# alphabetically. The scale below is the one given in the diamonds dataset
# documentation -- NOTE(review): confirm it matches the labels printed above.
clarity_order = ['I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF']
# Labels missing from the scale are appended so no data is silently dropped.
clarity_order += sorted(set(data['clarity'].dropna()) - set(clarity_order))

fig, ax = plt.subplots(1, 2, figsize=(18, 6))
fig.suptitle('Diamonds by Clarity', fontsize=20)
# `order=` replaces the former per-panel sort of the full frame.
g1 = sns.countplot(ax=ax[0], x="clarity", data=data, order=clarity_order)
g1.set(xlabel=None)
g1.tick_params(labelrotation=45)
g2 = sns.boxplot(ax=ax[1], x="clarity", y="price", data=data, order=clarity_order)
g2.set(xlabel=None)
g2.tick_params(labelrotation=45)
plt.show()

Most diamonds have SI1 or VS2 clarity.
The most expensive diamonds have VS1 or VS2 clarity, with similar prices.

## Feature Engineering 1: apply get_dummies

In [None]:
# Variant 1: let pandas expand the non-numeric columns into indicator columns,
# working on a copy so `data` stays as loaded.
data1 = pd.get_dummies(data.copy())
data1

In [None]:
# Pearson correlation matrix of the one-hot encoded frame, shown as an annotated heatmap.
corr1 = data1.corr(method='pearson')
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(corr1, annot=True, fmt='.2f', cmap='coolwarm', ax=ax)
plt.show()

In [None]:
# Features and target for the one-hot encoded variant; both come from `data1`
# so the lineage of X1 and y1 is the same frame.
X1 = data1.drop(columns='price')
y1 = data1['price']

# Split before scaling: fitting the scaler on all rows would leak the test
# set's mean/variance into the training features.
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.2, random_state=0)

# Fit the scaler on the training rows only, then apply the same transform to
# the held-out rows.
scaler = StandardScaler()
X1_train = scaler.fit_transform(X1_train)
X1_test = scaler.transform(X1_test)

## Feature Engineering 2: apply LabelEncoder

In [None]:
# Variant 2: integer-encode the three categorical columns into a new frame,
# leaving `data` untouched. LabelEncoder numbers classes by sorted label, so
# the codes follow the alphabetical order of the labels.
encoder = LabelEncoder()
columns = ['cut', 'color', 'clarity']

# The single encoder is refit for each column in turn, so afterwards it only
# holds the mapping of the last column.
data2 = data.assign(**{col: encoder.fit_transform(data[col]) for col in columns})

data2

In [None]:
# Pearson correlation matrix of the label-encoded frame, shown as an annotated heatmap.
corr2 = data2.corr(method='pearson')
fig, ax = plt.subplots(figsize=(11, 8))
sns.heatmap(corr2, annot=True, fmt='.2f', cmap='coolwarm', ax=ax)
plt.show()

In [None]:
# Features and target for the label-encoded variant; both come from `data2`
# so the lineage of X2 and y2 is the same frame.
X2 = data2.drop(columns='price')
y2 = data2['price']

# Split before scaling: fitting the scaler on all rows would leak the test
# set's mean/variance into the training features.
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.2, random_state=0)

# Fit the scaler on the training rows only, then apply the same transform to
# the held-out rows.
scaler2 = StandardScaler()
X2_train = scaler2.fit_transform(X2_train)
X2_test = scaler2.transform(X2_test)

## Models Building

In [None]:
# Estimators compared in the following cells: plain least squares, L1- and
# L2-regularised linear models, a single decision tree and a random forest.
std_model = LinearRegression()
l1_model = Lasso(alpha=1)
l2_model = Ridge(alpha=1)
# The tree-based estimators draw random numbers while fitting; pinning
# `random_state` makes their scores reproducible across kernel restarts.
tre_model = DecisionTreeRegressor(random_state=0)
random_model = RandomForestRegressor(random_state=0)

In [None]:

# Fit an independent copy of each estimator on the one-hot encoded features.
# `fit` returns the estimator itself, so fitting the shared *_model objects
# directly would leave these names pointing at objects that any later refit
# silently overwrites. `clone` gives each result its own unfitted copy with
# the same hyper-parameters.
std_1 = clone(std_model).fit(X1_train, y1_train)
l1_1 = clone(l1_model).fit(X1_train, y1_train)
l2_1 = clone(l2_model).fit(X1_train, y1_train)
tree_1 = clone(tre_model).fit(X1_train, y1_train)
random_1 = clone(random_model).fit(X1_train, y1_train)

# `score` on a regressor reports R^2 on the held-out split.
print("Feature Enginnering 1: get_dummies")
print(f"---Without regularization: {std_1.score(X1_test, y1_test)}")
print(f"Lasso (L1) regularization: {l1_1.score(X1_test, y1_test)}")
print(f"Ridge (L2) regularization: {l2_1.score(X1_test, y1_test)}")
print(f"Decision Tree: {tree_1.score(X1_test, y1_test)}")
print(f"Random Florest: {random_1.score(X1_test, y1_test)}")

In [None]:
# Fit an independent copy of each estimator on the label-encoded features.
# `fit` returns the estimator itself, so fitting the shared *_model objects
# directly would make these names aliases of whatever was fitted on them
# before and overwrite that earlier fit. `clone` gives each result its own
# unfitted copy with the same hyper-parameters.
std_2 = clone(std_model).fit(X2_train, y2_train)
l1_2 = clone(l1_model).fit(X2_train, y2_train)
l2_2 = clone(l2_model).fit(X2_train, y2_train)
tree_2 = clone(tre_model).fit(X2_train, y2_train)
random_2 = clone(random_model).fit(X2_train, y2_train)

# `score` on a regressor reports R^2 on the held-out split.
print("Feature Enginnering 2: LabelEncoder")
print(f"---Without regularization: {std_2.score(X2_test, y2_test)}")
print(f"Lasso (L1) regularization: {l1_2.score(X2_test, y2_test)}")
print(f"Ridge (L2) regularization: {l2_2.score(X2_test, y2_test)}")
print(f"Decision Tree: {tree_2.score(X2_test, y2_test)}")
print(f"Random Florest: {random_2.score(X2_test, y2_test)}")

**Conclusion: The linear regression models were influenced by the technique used to convert categorical data into numeric values. The tree-based models (decision tree and random forest), on the other hand, were not significantly affected.**