# Dataset Description

price price in US dollars (\$326--\$18,823)

carat weight of the diamond (0.2--5.01)

cut quality of the cut (Fair, Good, Very Good, Premium, Ideal)

color diamond colour, from J (worst) to D (best)

clarity a measurement of how clear the diamond is (I1 (worst), SI2, SI1, VS2, VS1, VVS2, VVS1, IF (best))

x length in mm (0--10.74)

y width in mm (0--58.9)

z depth in mm (0--31.8)

depth total depth percentage = z / mean(x, y) = 2 * z / (x + y) (43--79)

table width of top of diamond relative to widest point (43--95)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [None]:
# read csv file
diam = pd.read_csv("../input/diamond-dataset/diamonds.csv")
diam.head(10)

In [None]:
# displaying dataset information
diam.info()

In [None]:
# dataset description
diam.describe().T

In [None]:
# For every object-dtype (categorical) column, print its upper-cased name
# followed by the frequency of each distinct value and a blank separator line
for col_name in diam.select_dtypes(include="object"):
    print(col_name.upper())
    print(diam[col_name].value_counts())
    print()

## Object Data Description

cut quality of the cut (Fair, Good, Very Good, Premium, Ideal)

color diamond colour, from J (worst) to D (best)

clarity a measurement of how clear the diamond is (I1 (worst), SI2, SI1, VS2, VS1, VVS2, VVS1, IF (best))


### Referring to conversion chart

cut - scale of 1-5 (Fair (worst), Good, Very Good, Premium, Ideal (best))

color - scale of 1-7 (J-D)

clarity - scale of 1-8 (I1 (worst), SI2, SI1, VS2, VS1, VVS2, VVS1, IF (best))

In [None]:
# assigning a copy of dataset to 'data' object
data = diam.copy()

In [None]:
sorted(data["clarity"].unique().tolist())

In [None]:
# converting categorical values into numbers

data["cut"].replace(['Ideal', 'Premium', 'Good', 'Very Good', 'Fair'],[5,4,3,2,1], inplace=True)
data["color"].replace(['D', 'E', 'F', 'G', 'H', 'I', 'J'], [7,6,5,4,3,2,1], inplace=True)
data["clarity"].replace(['I1','SI2','SI1','VS2','VS1','VVS2','VVS1','IF'], [1,2,3,4,5,6,7,8], inplace=True)

In [None]:
# displaying first 10 rows of data using head() function
data.head(10)

## Data Visualization

In [None]:
# Visualizing data using heatmap
# Determining feature correlations by displaying a correlation matrix heatmap
plt.figure(figsize=(10,8))
sns.heatmap(data.corr(), annot=True, cmap="coolwarm")

In [None]:
# Pair-wise correlation plotting of feature labels
sns.pairplot(diam)

In [None]:
data.head()

In [None]:
data.loc[data.carat == data.carat.min()]

In [None]:
# Box plot of 'price' for each category of 'cut', drawn from the raw dataset
plt.figure(figsize=(10, 8))
sns.boxplot(data=diam, x="cut", y="price", dodge=True)

# Price Prediction Model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler

In [None]:
# Calling MinMaxScaler() function for feature scaling

scaler = MinMaxScaler()

In [None]:
X = data.drop("price", axis=1)

y = data["price"]

In [None]:
X.shape, y.shape

In [None]:
# Splitting data into training and testing data

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
# Fit the scaler on the training split only, so no statistics from the test
# split influence the scaling
scaler.fit(X_train)

In [None]:
# Replace the training features with their scaled version
# (scikit-learn transformers return a NumPy array by default)
X_train = scaler.transform(X_train)

In [None]:
# Scale the test features with the parameters learned from the training split
X_test = scaler.transform(X_test)

In [None]:
# Sanity check: number of training rows and feature columns
X_train.shape

In [None]:
# Sanity check: number of training targets (should match the rows above)
y_train.shape

In [None]:
# importing RandomForestRegressor() function from scikit-learn library

from sklearn.ensemble import RandomForestRegressor

In [None]:
# Instantiating RandomForestRegressor() model with its default settings

rf = RandomForestRegressor()

In [None]:
# Fitting the model to the scaled training data

rf.fit(X_train, y_train)

In [None]:
# Displaying the model score (R^2) on the TRAINING data.
# The original note reported a value above 0.997 here. Because this score is
# measured on the same rows the model was fitted on, it is optimistic and is
# not an estimate of performance on unseen data; the test-set R^2 is computed
# separately with r2_score(y_test, pred).

rf.score(X_train, y_train)

In [None]:
from sklearn.metrics import classification_report, mean_absolute_error, mean_squared_error, r2_score, accuracy_score

In [None]:
pred = rf.predict(X_test)

In [None]:
# Plotting data points between true values and predicted values for the fitted model

plt.scatter(y_test, pred, color="#00FF00")
plt.plot(pred, pred, color="black")

#### The above graph shows that the true values and predicted values are highly positively correlated with each other. Thus, the model has proven to be a good fit for the given data.

In [None]:
# R2 score of the predictions on the test split
test_r2 = r2_score(y_true=y_test, y_pred=pred)
print(test_r2)

## Testing on Data

In [None]:
# Calculating the prediction error for one row of `data`, chosen by its index
# label. Raises ValueError if the input is not an integer and KeyError if the
# label does not exist in `data`.

n = int(input("Enter index no : "))
print()

# Select the row as a one-row DataFrame. The price is then looked up by column
# name (not by a hard-coded position), and the features keep the column names
# and order the scaler was fitted with.
row = data.loc[[n]]
rp = row["price"].iloc[0]
print("Recorded Price : ", rp)
print()

# Scale the features with the fitted scaler, then use predict() to estimate
# the price for that row of data
new_data = scaler.transform(row.drop(columns="price"))
pp = rf.predict(new_data)[-1]
print("Predicted Price : ", np.round(pp, 3))
print()


print("$ Error : ", np.round(np.abs(pp - rp), 2))

## Saving model as 'filename.model' format

In [None]:
import pickle

In [None]:
filename = "diamond_model.model"

# Serialize the fitted forest. The context manager guarantees the file is
# flushed and closed as soon as the dump finishes.
# NOTE: only the model is saved. It was fitted on features transformed by
# `scaler`, so any input must go through that same fitted scaler before being
# passed to the reloaded model.
with open(filename, "wb") as model_file:
    pickle.dump(rf, model_file)

In [None]:
d = list(data.loc[33])
d.pop(-4)
d

In [None]:
model = pickle.load(open("diamond_model.model", 'rb'))

res = model.predict([[0.23, 2.0, 6.0, 5.0, 59.5, 58.0, 4.01, 4.06, 2.4]])

print(res)