# SHINE BRIGHT LIKE A DIAMOND

## ETL

In [None]:
import pandas as pd
# Read the diamonds CSV from the relative input directory into a DataFrame.
df_data_1 = pd.read_csv("../input/diamonds/diamonds.csv")
# Preview the first rows of the raw frame.
df_data_1.head()


In [None]:
# Work on a copy so later in-place edits do not touch the raw frame df_data_1.
diamond = df_data_1.copy()

In [None]:
# Preview the first rows of the working copy.
diamond.head()

In [None]:
# List the column labels of the working copy.
diamond.columns

In [None]:
# Remove the "Unnamed: 0" column from the working frame, modifying it in place.
diamond.drop(columns=["Unnamed: 0"], inplace=True)

#### Importing packages

In [None]:
import numpy as np
import seaborn as sns; sns.set_theme(style = "dark")
%matplotlib inline
import matplotlib.pyplot as plt

### Data Visualization

In [None]:
# Kernel-density estimate of price, drawn as one curve per cut category.
fig, ax = plt.subplots(figsize=(16, 12))
sns.kdeplot(data=diamond, x="price", hue="cut", ax=ax)
ax.set_xlabel("Price", fontsize=12)
ax.set_ylabel("Density", fontsize=12)
ax.set_title("The price of diamonds based on different cuts")
plt.show()

In [None]:
# Scatter of price against carat, with points coloured by cut.
fig, ax = plt.subplots(figsize=(20, 8))
sns.scatterplot(data=diamond, x="carat", y="price", hue="cut", palette="viridis", ax=ax)
ax.set_xlabel("Carat")
ax.set_ylabel("Price")
ax.set_title("Price vs Carat")
plt.show()

In [None]:
# Annotated heatmap of the pairwise correlations between the numeric columns.
# The frame still holds the string columns cut/color/clarity, and DataFrame.corr()
# raises on non-numeric data in pandas >= 2.0, so select numeric dtypes explicitly.
# On older pandas this yields the same matrix the implicit column dropping produced.
plt.figure(figsize=(16, 8))
cor = diamond.select_dtypes(include="number").corr()
sns.heatmap(cor, annot=True, cmap="viridis")
plt.show()

### Building predictive model

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFECV
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import KernelPCA

In [None]:
# Default-configured preprocessing transformers, instantiated once for later cells.
std = StandardScaler()
ord1 = OrdinalEncoder()
onehot = OneHotEncoder()

In [None]:
# Explicit level order for each encoded column; a value is encoded as its
# position in the corresponding list.
cut_levels = ["Fair", "Good", "Very Good", "Premium", "Ideal"]
color_levels = ["D", "E", "F", "G", "H", "I", "J"]
clarity_levels = ["I1", "SI2", "SI1", "VS2", "VS1", "VVS2", "VVS1", "IF"]
ord2 = OrdinalEncoder(categories=[cut_levels, color_levels, clarity_levels])

In [None]:
# Separate copy that receives the hand-made numeric encodings, leaving `diamond` unchanged.
diamond2 = diamond.copy()

In [None]:
# Integer-encode `cut` by its position in the ordered list of levels.
# A dict-based Series.map does this in one pass and avoids the string -> int
# downcasting inside Series.replace that pandas 2.2 deprecates. A value missing
# from the mapping becomes NaN, so astype(int) still fails loudly on it.
cut_codes = {"Fair": 0, "Good": 1, "Very Good": 2, "Premium": 3, "Ideal": 4}
diamond2["cut_num"] = diamond2["cut"].map(cut_codes).astype(int)

In [None]:
# Integer-encode `color` by its position in the ordered list of levels.
# A dict-based Series.map does this in one pass and avoids the string -> int
# downcasting inside Series.replace that pandas 2.2 deprecates. A value missing
# from the mapping becomes NaN, so astype(int) still fails loudly on it.
color_codes = {"D": 0, "E": 1, "F": 2, "G": 3, "H": 4, "I": 5, "J": 6}
diamond2["color_num"] = diamond2["color"].map(color_codes).astype(int)

In [None]:
# Integer-encode `clarity` by its position in the ordered list of levels.
# A dict-based Series.map does this in one pass and avoids the string -> int
# downcasting inside Series.replace that pandas 2.2 deprecates. A value missing
# from the mapping becomes NaN, so astype(int) still fails loudly on it.
clarity_codes = {
    "I1": 0,
    "SI2": 1,
    "SI1": 2,
    "VS2": 3,
    "VS1": 4,
    "VVS2": 5,
    "VVS1": 6,
    "IF": 7,
}
diamond2["clarity_num"] = diamond2["clarity"].map(clarity_codes).astype(int)

In [None]:
# Correlation heatmap that now includes the integer-encoded cut/color/clarity columns.
# diamond2 still carries the original string columns, and DataFrame.corr() raises on
# non-numeric data in pandas >= 2.0, so restrict the frame to numeric dtypes first.
plt.figure(figsize=(16, 12))
cor2 = diamond2.select_dtypes(include="number").corr()
sns.heatmap(cor2, annot=True, cmap="viridis")
plt.show()

In [None]:
# Column-wise preprocessing: ordered integer codes for the three categorical
# columns, standard scaling for carat and the x/y/z columns.
ordinal_step = ("ordinal", ord2, ["cut", "color", "clarity"])
scaling_step = ("std", std, ["carat", "x", "y", "z"])
column_pipeline = ColumnTransformer([ordinal_step, scaling_step])

In [None]:
# Feature matrix: every column except the target and the two excluded measurements.
X = diamond.drop(columns=["price", "depth", "table"])

In [None]:
# Regression target.
y = diamond["price"]

In [None]:
# Keep 80% of the rows for training and hold out the rest; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(X, y, train_size = 0.8, random_state = 42069)

In [None]:
from sklearn.linear_model import Ridge

# Building blocks for the model pipeline.
forest = RandomForestRegressor()
poly = PolynomialFeatures()
# Recursive feature elimination with 5-fold CV, using a ridge model to rank features.
rfe = RFECV(estimator=Ridge(alpha=1.0), cv=5)

In [None]:
# Preprocess the columns, select features with RFECV, then fit the random forest.
# NOTE: the "rid" step holds the RFECV selector. Step names are left as they are
# because the search grid reaches the forest through the "reg__" prefix.
pipeline_steps = [
    ("col", column_pipeline),
    ("rid", rfe),
    ("reg", forest),
]
full_pipeline = Pipeline(pipeline_steps)

In [None]:
# Exhaustive search over the forest's size and per-split feature count:
# 49 values x 5 values = 245 candidates, each evaluated with 5-fold CV.
param_grid = [
    {
        "reg__n_estimators": np.arange(1, 50),
        "reg__max_features": np.arange(1, 6),
    }
]
grid_search = GridSearchCV(estimator=full_pipeline, param_grid=param_grid, cv=5)

In [None]:
# Run the full hyper-parameter search on the training split only.
grid_search.fit(x_train, y_train)

In [None]:
# Keep the pipeline configured with the best-scoring parameter combination.
model = grid_search.best_estimator_

In [None]:
# Show the selected pipeline and its chosen hyper-parameters.
grid_search.best_estimator_

In [None]:
# NOTE(review): GridSearchCV refits best_estimator_ on the data passed to fit() when
# refit=True (its default), so this call appears to repeat that fit on the same
# training split — confirm whether it is still needed.
model.fit(x_train, y_train)

In [None]:
# Out-of-fold predictions for the training rows: with cv = 5, each row is predicted
# by a clone of the pipeline that was fitted on the other four folds.
y_train_pred = cross_val_predict(model, x_train, y_train, cv = 5)

In [None]:
# Training features side by side with the true price and its cross-validated prediction.
new_train = x_train.assign(**{"price": y_train, "predicted price": y_train_pred})

In [None]:
# Preview actual versus predicted prices on the training split.
new_train.head()

In [None]:
# Predict prices for the held-out test rows with the fitted pipeline.
y_pred = model.predict(x_test)

In [None]:
# Coefficient of determination (R^2) of the predictions on the test split.
r2_score(y_test, y_pred)

In [None]:
import math
# Square root of the test-set mean squared error, i.e. the RMSE.
# NOTE: despite its name, `mse` therefore holds the RMSE, not the MSE.
mse = math.sqrt(mean_squared_error(y_test, y_pred))
mse

In [None]:
# 5-fold cross-validated scores on the training split; no `scoring` is given,
# so the pipeline's own score method is used.
cross_val_score(model, x_train, y_train, cv = 5)

In [None]:
# Test features side by side with the true price and the model's prediction.
new_test = x_test.assign(**{"price": y_test, "predicted price": y_pred})
new_test.head()

In [None]:
# r2_score returns the coefficient of determination of a regression, not an
# accuracy, so the summary line reports it under its real name.
print(f"The predictive model has an R^2 score of {r2_score(y_test, y_pred):.2f}")