In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import sklearn as sk

# import useful methods for pre-processing and evaluation
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import QuantileTransformer
from sklearn.compose import TransformedTargetRegressor
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import r_regression
from sklearn.model_selection import cross_validate

# import regression models to be used
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.svm import SVR
from sklearn.svm import LinearSVR
from sklearn.neural_network import MLPRegressor

# import metrics to be used
from sklearn.metrics import mean_absolute_error as MAE
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import r2_score as R2
from sklearn.metrics import make_scorer

# Seed shared by every regressor below that accepts a random_state
RANDOM_STATE = 309

# (name, estimator) pairs; each pair is swapped in as the final pipeline step
# in turn so that all models share the same preprocessing.
regressors = [
    ("linear", LinearRegression()),
    ("k_neighbors", KNeighborsRegressor()),
    ("ridge", Ridge(random_state = RANDOM_STATE)),
    ("decision_tree", DecisionTreeRegressor(random_state = RANDOM_STATE)),
    ("random_forest", RandomForestRegressor(random_state = RANDOM_STATE)),
    ("gradient_boosting", GradientBoostingRegressor(random_state = RANDOM_STATE)),
    ("stochastic_gd", SGDRegressor(random_state = RANDOM_STATE)),
    ("support_vector", SVR()),
    ("linear_svr", LinearSVR(random_state = RANDOM_STATE)),
    ("multi_layer_perceptron", MLPRegressor(random_state = RANDOM_STATE)),
]

# Scoring callables used to score each regressor.
#
# RMSE is computed as sqrt(MSE) rather than through the `squared` keyword of
# mean_squared_error, which newer scikit-learn releases deprecate and remove.
#
# NOTE: make_scorer is left at its default greater_is_better=True, so the
# reported values are the raw (positive) errors. That is what the result tables
# need, but these scorers should not be handed to a model-selection routine
# that maximises the score.
metrics = {
    "mse": make_scorer(MSE),
    "rmse": make_scorer(lambda y_true, y_pred: np.sqrt(MSE(y_true, y_pred))),
    # relative squared error, expressed as 1 - R^2
    "rse": make_scorer(lambda y_true, y_pred: 1 - R2(y_true, y_pred)),
    "mae": make_scorer(MAE),
}

In [2]:
# Loading data

# Read the CSV and discard the instance-id column in one step
df = pd.read_csv('data/diamonds.csv').drop(columns = ["Unnamed: 0"])

# Every column except the last is a feature; the last column is the target
X, y = df.iloc[:, :-1], df.iloc[:, -1]

In [3]:
# Building the model

# Numeric feature names. "carat" gets its own entry in the preprocessor below;
# the remaining names are transformed together.
num_feats = ["carat", "depth", "table", "x", "y", "z"]

# Quantile transform onto a normal distribution. The same unfitted instance is
# used for the numeric features and for the target.
numerical_transformer = QuantileTransformer(
    output_distribution = 'normal',
    random_state = 309,
)

# Ordinal encoders; each category list is ordered from best to worst
cut_transformer = OrdinalEncoder(
    categories = [["Ideal", "Premium", "Very Good", "Good", "Fair"]]
)
color_transformer = OrdinalEncoder(
    categories = [["D", "E", "F", "G", "H", "I", "J"]]
)
clarity_transformer = OrdinalEncoder(
    categories = [["IF", "VVS1", "VVS2", "VS1", "VS2", "SI1", "SI2", "I1"]]
)

# Preprocessor that encodes all independent variables
preprocessor = ColumnTransformer(
    transformers = [
        ("carat", numerical_transformer, ["carat"]),
        ("cut", cut_transformer, ["cut"]),
        ("color", color_transformer, ["color"]),
        ("clarity", clarity_transformer, ["clarity"]),
        ("num_trans", numerical_transformer, num_feats[1:]),
    ]
)

# Preprocessing followed by a placeholder final step; the regressor is
# swapped into that slot later.
pipeline = Pipeline(
    steps = [
        ('process_X', preprocessor),
        ('model', None),
    ]
)

# Wrap in a TransformedTargetRegressor so the target variable 'price' is
# transformed as well
model = TransformedTargetRegressor(
    regressor = pipeline,
    transformer = numerical_transformer,
)

# Show visual representation of the model
model

In [4]:
# Cross validation on the training set

# Hold out 30% of the data; only the training portion is cross-validated here
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.3, random_state = 309
)

cv_results = []
for name, estimator in regressors:
    # Put the current regressor into the final slot of the pipeline
    model.regressor.steps[1] = (name, estimator)

    # Cross-validate on the training data with cross_validate's default
    # splitting (5-fold in current scikit-learn) and keep the per-fold scores
    cv_results.append(
        cross_validate(model, X_train, y_train, scoring = metrics, n_jobs = -1)
    )

In [5]:
# Testing model

# A single shuffle split with the same test_size and seed as the
# train_test_split call used for cross-validation: fit on 70%, score on 30%.
tt_split = ShuffleSplit(n_splits = 1, test_size = 0.3, random_state = 309)

test_results = []
for name, estimator in regressors:
    # Put the current regressor into the final slot of the pipeline
    model.regressor.steps[1] = (name, estimator)

    # Fit on the training portion and score on the held-out portion
    test_results.append(
        cross_validate(model, X, y, scoring = metrics, cv = tt_split, n_jobs = -1)
    )

In [6]:
# Collecting results

def average_scores(d):
    """Replace every value of ``d`` with its ``np.mean``, in place.

    Parameters
    ----------
    d : dict
        Mapping whose values are numbers or array-likes of numbers.

    Returns
    -------
    None
        ``d`` itself is modified; keys and their order are unchanged.
    """
    # Snapshot the keys first so the dict is not iterated while being written
    for key in list(d):
        d[key] = np.mean(d[key])
        
# Averaging scores across cv folds (each dict is updated in place)
for d in test_results:
    average_scores(d)
for d in cv_results:
    average_scores(d)

# Take the names straight from the (name, estimator) tuples. Building a 2-D
# NumPy array from the whole list would make NumPy try to coerce the estimator
# objects as well, which is fragile.
reg_names = np.array([name for name, _ in regressors], dtype = object)

# Column order follows the keys produced by cross_validate
eval_mets = list(test_results[0].keys())

# One row per regressor, one column per timing/score entry
test_results_df = pd.DataFrame(test_results, index = reg_names, columns = eval_mets)
cv_results_df = pd.DataFrame(cv_results, index = reg_names, columns = eval_mets)

In [7]:
# Cross-validation results, ordered by ascending mean squared error
cv_sorted = cv_results_df.sort_values("test_mse")
cv_sorted

Unnamed: 0,fit_time,score_time,test_mse,test_rmse,test_rse,test_mae
random_forest,13.771016,0.2935,329579.133193,573.270462,0.020874,276.028926
gradient_boosting,3.511454,0.03437,393273.419428,626.590674,0.024909,321.808902
support_vector,47.417407,20.755635,506085.062685,710.902328,0.032058,380.995798
decision_tree,0.315595,0.034373,576015.807602,758.035745,0.036475,369.121679
multi_layer_perceptron,8.856643,0.035922,653314.658838,800.35243,0.041334,432.824598
k_neighbors,0.221627,0.509489,664202.839751,814.623536,0.042074,437.762571
stochastic_gd,0.186614,0.045886,889325.243279,942.812874,0.056332,561.952581
ridge,0.126592,0.033598,909869.72554,953.728012,0.057636,568.823659
linear,0.167552,0.032954,910459.858727,954.038207,0.057673,568.9949
linear_svr,5.263007,0.034369,974053.214768,986.665703,0.061701,554.725839


In [8]:
# Displaying results of test

# test_results_df

In [9]:
# Test results ordered by ascending mean squared error
sorted_results = test_results_df.sort_values("test_mse")

# Rounded to two decimals for interpretation
rounded_results = sorted_results.round(2)
rounded_results

Unnamed: 0,fit_time,score_time,test_mse,test_rmse,test_rse,test_mae
random_forest,13.43,0.48,323376.8,568.66,0.02,267.57
gradient_boosting,3.5,0.06,404892.08,636.31,0.02,324.19
multi_layer_perceptron,8.32,0.03,408876.5,639.43,0.03,342.51
support_vector,43.66,30.28,474266.97,688.67,0.03,374.01
decision_tree,0.33,0.05,585564.39,765.22,0.04,368.44
k_neighbors,0.22,0.66,644830.2,803.01,0.04,423.56
stochastic_gd,0.14,0.06,959889.79,979.74,0.06,565.74
ridge,0.13,0.05,990344.86,995.16,0.06,576.68
linear,0.13,0.05,990855.81,995.42,0.06,576.81
linear_svr,2.82,0.03,1035373.39,1017.53,0.06,555.82
