In [None]:
%%HTML
<style type="text/css">

div.h1 {
    font-size: 32px; 
    margin-bottom:2px;
    background-color: steelblue; 
    color: white; 
    text-align: center;
}
div.h2 {
    background-color: steelblue; 
    color: white; 
    padding: 8px; 
    padding-right: 300px; 
    font-size: 24px; 
    max-width: 1500px; 
    margin-top: 50px;
    margin-bottom:4px;
    
}
div.h3 {
    color: steelblue; 
    font-size: 20px; 
    margin-top: 4px; 
    margin-bottom:8px;
}
div.h4 {
    font-size: 15px; 
    margin-top: 20px; 
    margin-bottom: 8px;
}

</style>


<div class="h1">Predicting diamonds price</div>



# Part 1: Exploratory data analysis and data visualization

<h2> Diamonds are among the most precious gemstones in the world. </h2>
<h2>Its price depends on several factors:</h2>
<ol>
<li> Carat: the weight of the diamond piece </li>
<li> Cut: the process of converting the mined rough diamonds into gems, whether the cut is fair, good, very good, premium or ideal </li>
<li> Color: graded on the GIA's universal D-to-Z Color Grading Scale, from D (colorless = best) to Z (light yellow = worst). In our dataset the grades range from D to J.</li>
<li> Clarity: the absence of inclusions and flaws. The ascending scale is (I1 (worst), SI2, SI1, VS2, VS1, VVS2, VVS1, IF (best)) </li>
<li> Depth percentage: calculated by dividing the diamond's physical depth measurement by its width</li>
<li> Table: the flat facet on its surface</li>
<li> Additional features: length (x), width (y) and depth (z), all in mm; z is the physical depth </li>
</ol>



<p>For example, the current price of a 1.0-carat diamond ranges from $2,500 to $18,000 according to Diamond Carat Weight
<a href='https://www.diamonds.pro/education/diamond-prices/'> Source </a> (accessed on 28-2-2021).</p>
<p> In our dataset, the price ranges from $326 to $18,823. </p>
<center> <img src='https://yourdiamondguru.com/wp-content/uploads/2018/09/GIA-Cut-Scale.png' >  
<br>
<a href='https://yourdiamondguru.com/grading/depth-and-table-values'> Source </a>
<br>
<center> <img src="https://www.millsjewelers.com/wp-content/uploads/2017/05/14_4CS_img1.jpg"></center>
<br>
<a href='https://memoryjewellery.com/diamond-guide/diamond-carat/'> Source </a>
</body>
</html>

## Importing libraries

In [None]:
# importing libraries
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error, r2_score

%matplotlib inline 
sns.set(rc={'figure.figsize': [10,10]}, font_scale=1.3)

## Getting the data

In [None]:
# Load the diamonds dataset from its CSV file into a DataFrame.
DATA_PATH = '../input/diamonds/diamonds.csv'
df = pd.read_csv(DATA_PATH)

# Preview the first five rows.
df.head(5)

## Dropping first column "Unnamed"

In [None]:
# Drop the redundant 'Unnamed: 0' column: the DataFrame already has its own row index.
# Re-assigning (instead of inplace=True) together with errors='ignore' keeps the
# cell idempotent, so running it a second time no longer raises a KeyError.
df = df.drop(columns='Unnamed: 0', errors='ignore')

# Show a short preview instead of dumping the whole frame.
df.head()

## checking for null values

In [None]:
# Data-quality check: df.info() prints each column's dtype and non-null count,
# so a count lower than the total number of entries reveals missing values.
df.info()

In [None]:
# Count the missing values per column explicitly (isna is the same check as isnull).
df.isna().sum()

## Exploring the categorical features

In [None]:
# Number of rows for each distinct value of the categorical 'cut' column.
df['cut'].value_counts()

## Recoding categorical features

In [None]:
# Encode the ordinal 'cut' grades as integers, from worst (Fair = 1) to best (Ideal = 5).
cut_dict = {'Fair': 1, 'Good': 2, 'Very Good': 3, 'Premium': 4, 'Ideal': 5}

# Guard the mapping so the cell is idempotent: calling .map() on an already
# encoded column would find no matching keys and turn every value into NaN.
if not pd.api.types.is_numeric_dtype(df['cut']):
    df['cut'] = df['cut'].map(cut_dict)

# Confirm that 'cut' is now numeric and still has no missing values.
df.info()

In [None]:
# Distinct color grades present in the data.
df['color'].unique()

In [None]:
# Distinct clarity grades present in the data.
df['clarity'].unique()

## Selecting entries where x, y or z features are zero

In [None]:
# Summary statistics; check the minimum of x, y and z for impossible zero values.
df.describe()

In [None]:
# Show every row in which at least one dimension (x, y or z) equals zero.
zero_dim_mask = df[['x', 'y', 'z']].eq(0).any(axis=1)
df.loc[zero_dim_mask]


### Features x, y and z contain zero values, and we should exclude these entries

In [None]:
# Number of rows with a zero in at least one of x, y, z (before cleaning).
len(df.loc[df[['x', 'y', 'z']].eq(0).any(axis=1)])

## Removing these entries

In [None]:
# Keep only the rows whose three dimensions are all non-zero. The explicit .copy()
# makes the result an independent DataFrame, so adding columns to it later does
# not trigger pandas' SettingWithCopyWarning.
df = df[(df[['x', 'y', 'z']] != 0).all(axis=1)].copy()

In [None]:
# Sanity check: after the filter no row should have a zero dimension (expected 0).
len(df.loc[df[['x', 'y', 'z']].eq(0).any(axis=1)])

## Feature engineering using domain knowledge

In [None]:
# Feature engineering: combine the three dimensions (in mm) into one 'size'
# feature, the product x * y * z.
# assign() returns a new DataFrame instead of writing into the existing one, so it
# never triggers pandas' SettingWithCopyWarning on a previously filtered frame.
df = df.assign(size=df['x'] * df['y'] * df['z'])

# Show a short preview instead of dumping the whole frame.
df.head()

In [None]:
# x, y and z are now summarised by 'size', so drop the raw dimension columns.
# The labels are passed as an explicit list: handing drop() a whole sub-DataFrame
# only worked because iterating a DataFrame happens to yield its column names.
df = df.drop(columns=['x', 'y', 'z'])

# Show a short preview instead of dumping the whole frame.
df.head()

## Visualizing distribution of each feature

In [None]:
# One histogram per numeric column, 20 bins each, on a 20x15 figure.
hist_options = {'bins': 20, 'figsize': (20, 15)}
df.hist(**hist_options)
plt.show()

In [None]:
# Pairwise correlation between the numeric attributes.
# 'color' and 'clarity' still hold strings; pandas >= 2.0 raises on such columns
# instead of silently skipping them, so restrict the frame to numeric columns first.
corr = df.select_dtypes(include='number').corr()
corr

In [None]:
# Annotated heatmap of the correlation matrix with square cells.
sns.heatmap(
    corr,
    annot=True,
    square=True,
    cmap="BuPu",
)

The table and the plot show the following:
<ol>
<li> The price is strongly correlated with the carat and the size</li>
<li> The price is poorly correlated with table</li>
<li> The price is inversely correlated with depth (i.e. the deeper the diamond, the cheaper it will be), as well as with the cut </li>

</ol>

# Part 2: Building a prediction model

In [None]:
# Predictors (carat, encoded cut, depth, table and the engineered size) and the target.
feature_cols = ['carat', 'cut', 'depth', 'table', 'size']
X = df[feature_cols]
y = df['price']

In [None]:
# Hold out 25% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
scaler = StandardScaler().fit(X_train)

X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
# Candidate regressors, keyed by the short name used in the evaluation printout.
# The tree-based models are randomised, so they get a fixed seed (the same value as
# the train/test split) to make their scores reproducible from run to run.
models = {
    "LR": LinearRegression(),
    "KNR": KNeighborsRegressor(),
    "DT": DecisionTreeRegressor(random_state=0),
    "RF": RandomForestRegressor(random_state=0),
}

In [None]:
# Fit every candidate and report its score on both splits plus the test-set RMSE.
separator = '-' * 30
for name, model in models.items():
    print(f'Using model: {name}')
    model.fit(X_train, y_train)

    train_score = model.score(X_train, y_train)
    print(f'Training Score: {train_score}')

    test_score = model.score(X_test, y_test)
    print(f'Test Score: {test_score}')

    test_mse = mean_squared_error(y_test, model.predict(X_test))
    print(f'RMSE: {np.sqrt(test_mse)}')
    print(separator)

The best model, i.e. the one with the lowest RMSE, is the random forest.

# Part 3: Saving the model for deployment

In [None]:
# Final model for deployment: a random forest with only 6 trees.
# NOTE(review): presumably kept small to limit the size of the saved file — confirm.
# A fixed seed makes the exported model reproducible across runs.
model = RandomForestRegressor(n_estimators=6, random_state=0)
model.fit(X_train, y_train)

In [None]:
import joblib

In [None]:
# Persist the trained forest with joblib. Despite the '.h5' extension the file is
# written by joblib (not HDF5), so it has to be read back with joblib.load().
joblib.dump(value=model, filename='model.h5')

In [None]:
# Persist the fitted scaler as well: new inputs must be scaled exactly like the
# training data. As with the model, the '.h5' file is a joblib file, not HDF5.
joblib.dump(value=scaler, filename='scaler.h5')

In [None]:
# Column order of the feature matrix; a hand-written input must follow this order.
X.columns

In [None]:
# Hand-made sample for a quick sanity check, listed in the feature order of X
# (carat, cut, depth, table, size).
sample = {'carat': 0.3, 'cut': 3, 'depth': 57, 'table': 57, 'size': 60}
inp = list(sample.values())

In [None]:
# Scale the sample with the scaler that was fitted on the training data.
# Wrapping it in a one-row DataFrame that carries the feature column names avoids
# scikit-learn's "X does not have valid feature names" warning, which a bare list
# triggers when the scaler was fitted on a DataFrame.
inp = scaler.transform(pd.DataFrame([inp], columns=X.columns))

In [None]:
# Predicted price for the scaled one-row sample.
model.predict(inp)