# Diamond Prediction

After the exploratory data analysis, a prediction will be made.
The folowing steps for this prediction:
1. **Preprocessing** - Handle outliers, Feature engineering.
2. **Predictions** - predicting with validation data and then with test data, metrics.
3. **Evaluation** - Baselines for comparison.

## Libraries & settings

In [47]:
#numpy
import numpy as np

from sklearn.preprocessing import OrdinalEncoder
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

#model explainability
import eli5
from eli5.sklearn import PermutationImportance

#pipeline tools
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from pipelinehelper import PipelineHelper
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder
#time related
from timeit import default_timer as timer
from datetime import timedelta

#timer for entire code
start = timer()

#warning hadle
import warnings
warnings.filterwarnings("always")
warnings.filterwarnings("ignore")

#plotly
import plotly.io as pio
pio.renderers.default = "plotly_mimetype+notebook_connected"

#settings
pd.options.display.float_format = '{:.3f}'.format

## Baseline 1: Basic approach

### Decisions:
* **Preprocessing:**
    1. **Outliers:** **continuous** - fill with median.               
    2. **Feature Engineering**: only categoric labels, OrdinalEncoder.
* **Model training** - using cross validation in validation data on a simple linear regression model.
* **Model testing** - train on whole train + validation set and use test data for results.
* **Model evaluating** - record for validation and test the following metrics:
    1. MSE - [Mean squared error](https://en.wikipedia.org/wiki/Mean_squared_error)
    2. R2 - [R Square](https://en.wikipedia.org/wiki/Coefficient_of_determination)
    3. MAE - [Mean absolut error](https://en.wikipedia.org/wiki/Mean_absolute_error)
    4. NRMSE - [Negative root mean squared error](https://en.wikipedia.org/wiki/Root-mean-square_deviation)
    

**Preprocessing**

In [48]:
# Preprocessing for continuous data

def Outlier_Detector(X,factor):
    X = pd.DataFrame(X).copy()
    for i in range(X.shape[1]):
        x = pd.Series(X.iloc[:,i]).copy()
        q1 = x.quantile(0.25)
        q3 = x.quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - (factor * iqr)
        upper_bound = q3 + (factor * iqr)
        X.iloc[((X.iloc[:,i] < lower_bound) | (X.iloc[:,i] > upper_bound)),i] = np.nan 
    return X

#creating outlier_remover object using FunctionTransformer with factor=1.5
Outlier = FunctionTransformer(Outlier_Detector,kw_args={'factor':1.5})

#contiuous_transformer = SimpleImputer(strategy='median')

contiuous_transformer = Pipeline(steps=[
('outlier', Outlier),
('imputer', SimpleImputer(strategy='median'))
])

# building categorical transformers (worst to best)
cut_enc = OrdinalEncoder(categories=[["Fair", "Good", "Very Good", "Premium","Ideal"]])
color_enc = OrdinalEncoder(categories=[['J', 'I', 'H', 'G', 'F', 'E','D']])
clarity_enc = OrdinalEncoder(categories=[["I1", "SI2", "SI1", "VS2", "VS1", "VVS2", "VVS1",'IF']])


# Create preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', contiuous_transformer, Continuous),
        ('cuts', cut_enc, ["cut"]),
        ('colors', color_enc, ["color"]),
        ('clarities', clarity_enc, ["clarity"])
    ])

**Model**

In [49]:
model = LinearRegression()

**Final Pipeline**

In [50]:
# Bundle preprocessing and modeling code in a pipeline
Baseline1 = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', model)
                             ])

# Preprocessing of training data, fit model 
Baseline1.fit(X_train2, y_train2)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('outlier',
                                                                   FunctionTransformer(func=<function Outlier_Detector at 0x000001F8505A15E0>,
                                                                                       kw_args={'factor': 1.5})),
                                                                  ('imputer',
                                                                   SimpleImputer(strategy='median'))]),
                                                  ['carat', 'depth', 'table',
                                                   'x', 'y', 'z']),
                                                 ('cuts',
                                                  OrdinalEncoder(categories=[['Fair',
                                                                              'Good',
                 

**Validation Prediction**

In [51]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score


# Preprocessing of validation data, get predictions
b1_val_preds = Baseline1.predict(X_val)

# Evaluate the model
b1_val_mae = mean_absolute_error(y_val, b1_val_preds)
b1_val_mse = mean_squared_error(y_val, b1_val_preds)
b1_val_r2 = r2_score(y_val, b1_val_preds)

print('MAE:', b1_val_mae)
print("MSE: ",b1_val_mse)
print("R2: ",b1_val_r2)

MAE: 1227.2738971261278
MSE:  2652792.3125459263
R2:  0.8367188169244258


**Cross Validation Prediction (NRMSE)**

In [52]:
from sklearn.model_selection import cross_val_score
CV = cross_val_score(Baseline1, X_train, y_train, cv=5, scoring = "neg_root_mean_squared_error")
print(f"validation negative root mean squared error on 5 fold cross validation: {CV}")
print(f"validation negative root mean squared error accuracy: {CV.mean()}")

validation negative root mean squared error on 5 fold cross validation: [-1674.34968697 -1594.12551928 -1562.22328898 -1589.48519538
 -1627.25022853]
validation negative root mean squared error accuracy: -1609.4867838266748


**Test Prediction**

In [53]:
Baseline1.fit(X_train, y_train)
b1_test_preds = Baseline1.predict(X_test)
b1_test_mae = mean_absolute_error(y_test, b1_test_preds)
b1_test_mse = mean_squared_error(y_test, b1_test_preds)
b1_test_r2 = r2_score(y_test, b1_test_preds)
print('MAE:', b1_test_mae)
print("MSE: ",b1_test_mse)
print("R2: ",b1_test_r2)

MAE: 1249.8900248224795
MSE:  2697964.8791565355
R2:  0.8331690897943889


**Test NRMSE**

In [54]:
print("NRMSE: ",-np.sqrt(b1_test_mse))

NRMSE:  -1642.5482882267222


**Model Explainability**

The chosen method is [Permutation](https://brilliant.org/wiki/permutations/#permutations-problem-solving) Importance.
Each feature is shuffled and predicted with the rest of the normally ordered columns, where the weights are mostly changed in positive numbers means they most effect the model. negative means the model performed better with this random ordered feature, which means it's that it is useless as a predictor.

In [55]:
x_tr = Baseline1.named_steps["preprocessor"].fit_transform(X_train)#preprocess inputs
perm = PermutationImportance(Baseline1.named_steps["model"]).fit(x_tr, y_train)#fit model
eli5.show_weights(perm, feature_names = X_train.columns.tolist())#show results

Weight,Feature
0.7135  ± 0.0052,depth
0.2190  ± 0.0014,table
0.1020  ± 0.0006,z
0.0993  ± 0.0017,clarity
0.0550  ± 0.0015,carat
0.0230  ± 0.0006,y
0.0033  ± 0.0003,x
0.0007  ± 0.0001,cut
0.0003  ± 0.0001,color


**insights:**
* depth is very important for this models prediction. 
* color has no effect on the models predictions.
* only table, z and clarity has a big effect on the model other featres less so.

In [56]:
Baseline1.named_steps["preprocessor"]

ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('outlier',
                                                  FunctionTransformer(func=<function Outlier_Detector at 0x000001F8505A15E0>,
                                                                      kw_args={'factor': 1.5})),
                                                 ('imputer',
                                                  SimpleImputer(strategy='median'))]),
                                 ['carat', 'depth', 'table', 'x', 'y', 'z']),
                                ('cuts',
                                 OrdinalEncoder(categories=[['Fair', 'Good',
                                                             'Very Good',
                                                             'Premium',
                                                             'Ideal']]),
                                 ['cut']),
                                ('colors',
                           

### Baseline 1 Summary
The model was validated with mse, mae and r2, cross validated 5 times for NRMSE, and tested 1 time for all the metrics:
* **Mae**: validation set and test set around 1,200 and validation performed slightly better. 
* **Mse**: validation set and test set around 2,650,000 and validation performed slightly better.
* **R2**: validation set and test set around 0.83 and validation performed slightly better.
* **NRMSE**: validation set and test set around 1,600 and validation performed slightly better.

The baseline is saved for comparison as a pandas dataframe: 

In [57]:
baseline1 = pd.DataFrame({"val_mae": b1_val_mae,"val_mse": b1_val_mse,"val_r2": b1_val_r2,"val_nrmse": CV.mean(),"test_mae": b1_test_mae,"test_mse": b1_test_mse,"test_r2": b1_test_r2, "test_nrmse": -np.sqrt(b1_test_mse)}, index=["Baseline1"])
baseline1

Unnamed: 0,val_mae,val_mse,val_r2,val_nrmse,test_mae,test_mse,test_r2,test_nrmse
Baseline1,1227.274,2652792.313,0.837,-1609.487,1249.89,2697964.879,0.833,-1642.548


## Baseline 2: Preprocess Parameter tuning

### Decisions:
* **Preprocessing:**
    1. **Outliers:**  **continuous** - choose what is considered an outlier, fill with median/mean and scaling.               
    2. **Feature Engineering**: only categoric labels, OrdinalEncoder.
* **Model training** - using cross validation in validation data on a simple linear regression model.
* **Model testing** - validate with grid search and validation set then train on whole train + validation set and use test data for results.
* **Model evaluating** - record for validation and test the following metrics:
    1. MSE - [Mean squared error](https://en.wikipedia.org/wiki/Mean_squared_error)
    2. R2 - [R Square](https://en.wikipedia.org/wiki/Coefficient_of_determination)
    3. MAE - [Mean absolut error](https://en.wikipedia.org/wiki/Mean_absolute_error)
    4. NRMSE - [Negative root mean squared error](https://en.wikipedia.org/wiki/Root-mean-square_deviation)
    

**Preprocessing**

In [58]:
# Preprocessing for continuous data
Outlier2 = FunctionTransformer(Outlier_Detector,kw_args={'factor':np.nan})

#contiuous_transformer = SimpleImputer(strategy='median')

contiuous_transformer = Pipeline(steps=[
('outlier', Outlier2),
('imputer', SimpleImputer()),
('scaler', StandardScaler())    
])

# Create preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', contiuous_transformer, Continuous),
        ('cuts', cut_enc, ["cut"]),
        ('colors', color_enc, ["color"]),
        ('clarities', clarity_enc, ["clarity"])
    ])

In [59]:
# Bundle preprocessing and modeling code in a pipeline
Baseline2 = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', model)
                             ])

# Preprocessing of training data, fit model 
Baseline2.fit(X_train2, y_train2)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('outlier',
                                                                   FunctionTransformer(func=<function Outlier_Detector at 0x000001F8505A15E0>,
                                                                                       kw_args={'factor': nan})),
                                                                  ('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['carat', 'depth', 'table',
                                                   'x', 'y', 'z']),
                                                 ('cuts',
                                           

In [60]:
Baseline2.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'preprocessor', 'model', 'preprocessor__n_jobs', 'preprocessor__remainder', 'preprocessor__sparse_threshold', 'preprocessor__transformer_weights', 'preprocessor__transformers', 'preprocessor__verbose', 'preprocessor__num', 'preprocessor__cuts', 'preprocessor__colors', 'preprocessor__clarities', 'preprocessor__num__memory', 'preprocessor__num__steps', 'preprocessor__num__verbose', 'preprocessor__num__outlier', 'preprocessor__num__imputer', 'preprocessor__num__scaler', 'preprocessor__num__outlier__accept_sparse', 'preprocessor__num__outlier__check_inverse', 'preprocessor__num__outlier__func', 'preprocessor__num__outlier__inv_kw_args', 'preprocessor__num__outlier__inverse_func', 'preprocessor__num__outlier__kw_args', 'preprocessor__num__outlier__validate', 'preprocessor__num__imputer__add_indicator', 'preprocessor__num__imputer__copy', 'preprocessor__num__imputer__fill_value', 'preprocessor__num__imputer__missing_values', 'preprocessor__num__impute

In [61]:
hyperparameters = {'preprocessor__num__outlier__kw_args':[{'factor':0},{'factor':0.5},{'factor':1},{'factor':1.5},{'factor':2},{'factor':2.5},{'factor':3}],
              'preprocessor__num__imputer__strategy':['mean','median']}
#grid search
b2_test_clf = GridSearchCV(Baseline2, hyperparameters,cv = 5, scoring = "neg_root_mean_squared_error", n_jobs = -1, verbose = 2) 

In [62]:
%%time
# Fit and tune model
b2_test_clf.fit(X_train, y_train)

Fitting 5 folds for each of 14 candidates, totalling 70 fits
Wall time: 3.65 s


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num',
                                                                         Pipeline(steps=[('outlier',
                                                                                          FunctionTransformer(func=<function Outlier_Detector at 0x000001F8505A15E0>,
                                                                                                              kw_args={'factor': nan})),
                                                                                         ('imputer',
                                                                                          SimpleImputer()),
                                                                                         ('scaler',
                                                                                          StandardScaler())]),
                     

In [63]:
b2_test_clf.best_params_

{'preprocessor__num__imputer__strategy': 'mean',
 'preprocessor__num__outlier__kw_args': {'factor': 3}}

In [70]:
b2_val_clf = b2_test_clf.best_estimator_
b2_val_clf.fit(X_train2, y_train2)
b2_val_preds = b2_val_clf.predict(X_val)

# Evaluate the model
b2_val_mae = mean_absolute_error(y_val, b2_val_preds)
b2_val_mse = mean_squared_error(y_val, b2_val_preds)
b2_val_r2 = r2_score(y_val, b2_val_preds)

print('MAE:', b2_val_mae)
print("MSE: ",b2_val_mse)
print("R2: ",b2_val_r2)
print("NRMSE: ",-np.sqrt(b2_val_mse))

MAE: 829.0934236938818
MSE:  1413280.7606081744
R2:  0.9130116015797015
NRMSE:  -1188.8148554792601


In [69]:
b2_Te_clf = b2_test_clf.best_estimator_
b2_Te_clf.fit(X_train, y_train)
b2_test_preds = b2_Te_clf.predict(X_test)

# Evaluate the model
b2_test_mae = mean_absolute_error(y_test, b2_test_preds)
b2_test_mse = mean_squared_error(y_test, b2_test_preds)
b2_test_r2 = r2_score(y_test, b2_test_preds)

print('MAE:', b2_test_mae)
print("MSE: ",b2_test_mse)
print("R2: ",b2_test_r2)
print("NRMSE: ",-np.sqrt(b2_test_mse))

MAE: 827.987068391731
MSE:  1451969.502775263
R2:  0.9102162538844779
NRMSE:  -1204.9769718858793


In [71]:
baseline2 = pd.DataFrame({"val_mae": b2_val_mae,"val_mse": b2_val_mse,"val_r2": b2_val_r2,"val_nrmse": -np.sqrt(b2_val_mse),"test_mae": b2_test_mae,"test_mse": b2_test_mse,"test_r2": b2_test_r2, "test_nrmse": -np.sqrt(b2_test_mse)}, index=["Baseline2"])
baseline2

Unnamed: 0,val_mae,val_mse,val_r2,val_nrmse,test_mae,test_mse,test_r2,test_nrmse
Baseline2,829.093,1413280.761,0.913,-1188.815,827.987,1451969.503,0.91,-1204.977


In [75]:
x_tr = b2_test_clf.best_estimator_.named_steps["preprocessor"].fit_transform(X_train)#preprocess inputs
perm = PermutationImportance(b2_test_clf.best_estimator_.named_steps["model"]).fit(x_tr, y_train)#fit model
eli5.show_weights(perm, feature_names = X_train.columns.tolist())#show results

Weight,Feature
2.5679  ± 0.0150,carat
0.2312  ± 0.0023,clarity
0.1729  ± 0.0023,depth
0.0851  ± 0.0010,z
0.0357  ± 0.0011,y
0.0062  ± 0.0002,table
0.0026  ± 0.0002,x
0.0002  ± 0.0000,color
0.0000  ± 0.0000,cut


In [72]:
Baselines = pd.concat([baseline1,baseline2])
Baselines

Unnamed: 0,val_mae,val_mse,val_r2,val_nrmse,test_mae,test_mse,test_r2,test_nrmse
Baseline1,1227.274,2652792.313,0.837,-1609.487,1249.89,2697964.879,0.833,-1642.548
Baseline2,829.093,1413280.761,0.913,-1188.815,827.987,1451969.503,0.91,-1204.977
