# Diamonds
## Price Prediction (Regression)
Data source: https://www.kaggle.com/shivam2503/diamonds

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

%matplotlib inline

### Data Exploration

In [2]:
input_data_raw = pd.read_csv('diamonds_input.csv')
input_data_raw.drop(['Unnamed: 0'], inplace=True, axis=1)
input_data_raw.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [3]:
input_data_raw.describe()

Unnamed: 0,carat,depth,table,price,x,y,z
count,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0
mean,0.79794,61.749405,57.457184,3932.799722,5.731157,5.734526,3.538734
std,0.474011,1.432621,2.234491,3989.439738,1.121761,1.142135,0.705699
min,0.2,43.0,43.0,326.0,0.0,0.0,0.0
25%,0.4,61.0,56.0,950.0,4.71,4.72,2.91
50%,0.7,61.8,57.0,2401.0,5.7,5.71,3.53
75%,1.04,62.5,59.0,5324.25,6.54,6.54,4.04
max,5.01,79.0,95.0,18823.0,10.74,58.9,31.8


In [4]:
numerical_cols = ['carat', 'depth', 'table', 'x', 'y', 'z']
categorical_cols = ['cut', 'color', 'clarity']
features = numerical_cols + categorical_cols
predict_col = ['price']

X = input_data_raw[features]
y = input_data_raw[predict_col]

In [5]:
input_data_raw[categorical_cols].nunique()

cut        5
color      7
clarity    8
dtype: int64

Each categorical column has 8 or fewer unique values, so One Hot Encoding is viable.

### Preprocessing Pipeline

In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

# Preprocessing for numerical data
numerical_transformer = StandardScaler()

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)

### Prediction Models

In [8]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import LinearSVR
from sklearn.ensemble import RandomForestRegressor

In [9]:
lr_1 = LinearRegression(n_jobs=2)
lr_2 = LinearRegression(normalize=True, n_jobs=2)

sgd_1 = SGDRegressor(loss='squared_loss')
sgd_2 = SGDRegressor(loss='huber', max_iter=2000)
sgd_3 = SGDRegressor(loss='epsilon_insensitive', max_iter=2000)

dtr_1 = DecisionTreeRegressor(criterion='mse')
dtr_2 = DecisionTreeRegressor(criterion='mae')
dtr_3 = DecisionTreeRegressor(criterion='poisson')

svr_1 = LinearSVR()

rf_1 = RandomForestRegressor(n_estimators = 100, n_jobs=2)
rf_2 = RandomForestRegressor(n_estimators = 150, n_jobs=2)
rf_3 = RandomForestRegressor(n_estimators = 200, n_jobs=2)

models = [
    lr_1,
    lr_2,
    sgd_1,
    sgd_2,
    sgd_3,
    dtr_1,
    dtr_2,
    dtr_3,
    svr_1,
    rf_1,
    rf_2,
    rf_3
]

### Execute + Measure Accuracy

In [10]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import r2_score

results = pd.DataFrame(columns=['Model', 'MAE', 'MAPE', 'R2'])

for model in models:
    # Bundle preprocessing and modeling code in a pipeline
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                          ('model', model)
                         ])

    # Preprocessing of training data, fit model 
    pipeline.fit(X_train, y_train.values.ravel())

    # Preprocessing of validation data, get predictions
    preds = pipeline.predict(X_valid)

    results.loc[len(results)] = [
        str(model),
        round(mean_absolute_error(y_valid, preds),4),
        round(mean_absolute_percentage_error(y_valid, preds),4),
        round(r2_score(y_valid, preds),4)
    ]

results



Unnamed: 0,Model,MAE,MAPE,R2
0,LinearRegression(n_jobs=2),737.4355,0.3941,0.9212
1,"LinearRegression(n_jobs=2, normalize=True)",737.4355,0.3941,0.9212
2,SGDRegressor(),744.0997,0.3963,0.9212
3,"SGDRegressor(loss='huber', max_iter=2000)",2615.4966,0.458,-0.1841
4,"SGDRegressor(loss='epsilon_insensitive', max_i...",1057.2517,0.3208,0.7607
5,DecisionTreeRegressor(),352.2187,0.0838,0.9659
6,DecisionTreeRegressor(criterion='mae'),365.1518,0.0864,0.9618
7,DecisionTreeRegressor(criterion='poisson'),612.296,0.1561,0.9031
8,LinearSVR(),784.2893,0.2473,0.8592
9,RandomForestRegressor(n_jobs=2),266.6855,0.0641,0.9812


### Best option

In [12]:
results[:][11:12]

Unnamed: 0,Model,MAE,MAPE,R2
11,"RandomForestRegressor(n_estimators=200, n_jobs=2)",265.8645,0.064,0.9813
