In [1]:
import pickle

import numpy as np
import pandas as pd
import tensorflow as tf
from prettytable import PrettyTable
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from xgboost import XGBRegressor

from utils import (
    calc_performance_metrics,
    process_data,
    transform_test_data,
    transform_train_valid_data,
)


Data Preprocessing

In [2]:
# Load the gzip-compressed training set from the local data/ directory.
train = pd.read_csv("data/train.csv.gz", compression="gzip")


Drop invalid rows where price is 0 or a negative number

In [3]:
# Keep every row except those whose price is zero or negative, then renumber
# the index from 0 so it is contiguous again.
has_invalid_price = train["price"] <= 0
train = train.loc[~has_invalid_price].reset_index(drop=True)


In [4]:
# Run the project's shared preprocessing helper (imported from utils) over the
# training frame; the same helper is applied to the test set further down.
train = process_data(train)


Our end metric is the root mean squared logarithmic error (RMSLE). We therefore train on log(1 + price): minimizing the root mean squared error (RMSE) on this transformed target is equivalent to minimizing RMSLE on the original prices.

In [5]:
# Features: every column except the two identifiers and the target itself.
x = train.drop(columns=["id", "seller_id", "price"])
# Target: log(1 + price), so plain RMSE on y corresponds to RMSLE on price.
y = np.log1p(train["price"])


Split into 90% training, 10% validation set

In [6]:
# Hold out 10% of the rows for validation; the fixed random_state keeps the
# split identical from run to run.
X_train, X_valid, y_train, y_valid = train_test_split(
    x, y, test_size=0.1, random_state=42
)


Transform the categorical data

In [7]:
# Encode the features with the project helper. It returns the transformed
# train/validation matrices plus the encoder objects, which are pickled below
# and reused when transforming the test set.
# NOTE(review): presumably the encoders are fitted on X_train only -- confirm
# in utils.transform_train_valid_data.
X_train, X_valid, label_binarizers, count_vectorizers = transform_train_valid_data(
    X_train, X_valid
)


In [8]:
# Persist the encoder objects returned by transform_train_valid_data.
# The context managers guarantee each file is flushed and closed as soon as the
# dump finishes, instead of leaving the handle open until garbage collection.
with open("models/label_binarizers", "wb") as fh:
    pickle.dump(label_binarizers, fh)
with open("models/count_vectorizers", "wb") as fh:
    pickle.dump(count_vectorizers, fh)


Gradient Boosting with XGBoost

Initial model

In [9]:
# Baseline: XGBoost regressor with the library's default hyperparameters,
# fitted on the log-transformed target.
init_model = XGBRegressor()
init_model.fit(X_train, y_train)


In [10]:
# The model is trained on log1p(price), so model.predict() returns values on
# that log scale as well. Metrics are reported on the original price scale:
# the targets are mapped back here with expm1 (exp(y) - 1), and
# calc_performance_metrics is expected to apply the same inverse transform to
# the model's predictions (see utils).

init_metrics = calc_performance_metrics(
    init_model, X_train, X_valid, np.expm1(y_train), np.expm1(y_valid)
)


Hyperparameter tuning. Tune max_depth and subsample to prevent overfitting.

In [11]:
# Search space: tree depth and row-subsampling ratio (3 x 3 = 9 combinations).
param_grid = {
    "max_depth": [10, 15, 20],
    "subsample": [0.7, 0.8, 0.9],
}

# The callable eval_metric makes XGBoost log MSE on every eval_set entry after
# each boosting round, alongside its default rmse.
xgb_model = XGBRegressor(eval_metric=mean_squared_error)

# I first tried GridSearchCV but the training was taking a very long time, so
# to save time I decided to use RandomizedSearchCV.
# random_state pins which 5 of the 9 combinations are sampled; without it the
# candidates (and therefore the saved "tuned" model) change on every run.
gs = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_grid,
    cv=3,
    scoring="neg_mean_squared_error",
    n_iter=5,
    n_jobs=-1,
    verbose=0,
    random_state=42,
)
# eval_set is forwarded to every underlying fit call and only drives the
# per-round log: no early stopping is configured here, so the validation rows
# do not influence training or candidate selection.
model = gs.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_valid, y_valid)])
# Persist the estimator that the search refitted with the best parameters.
model.best_estimator_.save_model("models/model.json")


[0]	validation_0-rmse:1.77585	validation_0-mean_squared_error:3.15363	validation_1-rmse:1.78820	validation_1-mean_squared_error:3.19767
[0]	validation_0-rmse:1.77487	validation_0-mean_squared_error:3.15018	validation_1-rmse:1.78725	validation_1-mean_squared_error:3.19428
[0]	validation_0-rmse:1.77529	validation_0-mean_squared_error:3.15164	validation_1-rmse:1.78671	validation_1-mean_squared_error:3.19233
[0]	validation_0-rmse:1.77460	validation_0-mean_squared_error:3.14922	validation_1-rmse:1.78605	validation_1-mean_squared_error:3.18998
[1]	validation_0-rmse:1.30719	validation_0-mean_squared_error:1.70874	validation_1-rmse:1.32047	validation_1-mean_squared_error:1.74363[1]	validation_0-rmse:1.30925	validation_0-mean_squared_error:1.71414	validation_1-rmse:1.32276	validation_1-mean_squared_error:1.74970

[1]	validation_0-rmse:1.30786	validation_0-mean_squared_error:1.71049	validation_1-rmse:1.32098	validation_1-mean_squared_error:1.74498
[1]	validation_0-rmse:1.30565	validation_0-mean_



Prepare the test set

In [12]:
# Load the test set and start the submission frame from its ids.
test = pd.read_csv("data/test.csv.gz", compression="gzip")
preds = pd.DataFrame(columns=["id", "price"])
preds["id"] = test["id"]

# NOTE(review): here the identifier columns are dropped *before* process_data,
# whereas the training frame went through process_data with them still present
# -- confirm that process_data does not depend on "id" / "seller_id".
test = process_data(test.drop(["id", "seller_id"], axis=1))

# Apply the encoder objects obtained from the training data to the test features.
X_test = transform_test_data(test, label_binarizers, count_vectorizers)


In [13]:
# Reload the tuned model from disk so the submission is produced from exactly
# the artefact that was saved.
saved_model = XGBRegressor()
saved_model.load_model("models/model.json")

# Predictions are on the log1p scale; expm1 maps them back to prices and
# astype("int") then truncates the fractional part.
log_price_pred = saved_model.predict(X_test)
price_pred = np.expm1(log_price_pred)
preds["price"] = price_pred.astype("int")
preds.to_csv("submission.csv", index=False)

# Score the tuned model on the original price scale.
price_train = np.expm1(y_train)
price_valid = np.expm1(y_valid)
tuned_metrics = calc_performance_metrics(
    saved_model, X_train, X_valid, price_train, price_valid
)


Multilayer Perceptron

Fully connected neural network with 4 hidden layers

In [14]:
def build_model():
    """Build and compile the fully connected regression network.

    The network takes a sparse float32 input whose width is read from the
    notebook-level ``X_train`` (``X_train.shape[1]``), passes it through four
    ReLU hidden layers of 256, 64, 64 and 32 units, and ends in a single
    linear output unit.

    Returns:
        A compiled ``tf.keras.Sequential`` model using mean squared error as
        the loss, Adam(0.001) as the optimizer, and the metrics ``rmse`` and
        ``msle``.
    """
    hidden_widths = (256, 64, 64, 32)

    stack = [
        tf.keras.layers.Input(shape=(X_train.shape[1],), dtype="float32", sparse=True)
    ]
    for width in hidden_widths:
        stack.append(tf.keras.layers.Dense(width, activation="relu"))
    # Single unit without activation: the regression output.
    stack.append(tf.keras.layers.Dense(1))

    network = tf.keras.Sequential(stack)

    tracked_metrics = [
        tf.keras.metrics.RootMeanSquaredError(name="rmse"),
        tf.keras.metrics.MeanSquaredLogarithmicError(name="msle"),
    ]
    network.compile(
        loss="mean_squared_error",
        optimizer=tf.keras.optimizers.Adam(0.001),
        metrics=tracked_metrics,
    )

    return network


def train_model(x_train, y_train, x_val, y_val):
    """Train a fresh network from ``build_model`` and return its fit history.

    Args:
        x_train: Training features, sliceable by ``tf.data``.
        y_train: Training targets aligned with ``x_train``.
        x_val: Validation features.
        y_val: Validation targets aligned with ``x_val``.

    Returns:
        The ``History`` object returned by ``model.fit``. The model itself is
        not returned; the best epoch (lowest ``val_loss``) is written to
        ``models/tf_model`` by the checkpoint callback.
    """

    def batched(features, targets):
        # Pair features with targets and serve them in mini-batches of 32.
        return tf.data.Dataset.from_tensor_slices((features, targets)).batch(32)

    training_batches = batched(x_train, y_train)
    validation_batches = batched(x_val, y_val)

    # Checkpoint first, early stopping second -- same order as passed to fit.
    callbacks = [
        tf.keras.callbacks.ModelCheckpoint(
            "models/tf_model",
            monitor="val_loss",
            verbose=1,
            save_best_only=True,
        ),
        # Stop once val_loss has not improved for 10 consecutive epochs.
        tf.keras.callbacks.EarlyStopping(monitor="val_loss", verbose=1, patience=10),
    ]

    network = build_model()
    return network.fit(
        training_batches,
        validation_data=validation_batches,
        epochs=25,
        verbose=1,
        callbacks=callbacks,
    )


In [15]:
def convert_sparse_matrix_to_sparse_tensor(X):
    """Convert a sparse matrix into a ``tf.SparseTensor`` for the neural network.

    Args:
        X: A sparse matrix exposing ``tocoo()`` (e.g. a SciPy sparse matrix).

    Returns:
        A ``tf.SparseTensor`` with the same shape and non-zero values as ``X``,
        passed through ``tf.sparse.reorder`` so its indices are in canonical
        row-major order.
    """
    coo = X.tocoo()
    # One (row, col) pair per stored value, shape (nnz, 2). Built with
    # column_stack instead of np.mat, which was removed in NumPy 2.0 and
    # returned the discouraged np.matrix type. SparseTensor indices are int64,
    # so cast explicitly rather than relying on implicit conversion.
    indices = np.column_stack((coo.row, coo.col)).astype(np.int64)
    return tf.sparse.reorder(tf.SparseTensor(indices, coo.data, coo.shape))


In [16]:
# Convert both feature matrices to sparse tensors in one pass (train first,
# then validation).
X_train_tensor, X_valid_tensor = map(
    convert_sparse_matrix_to_sparse_tensor, (X_train, X_valid)
)


2022-05-20 14:50:27.445317: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Train the neural network

In [17]:
# Fit the network on the log-scale targets; the returned History holds the
# per-epoch loss and metric values shown in the next cell.
train_history = train_model(X_train_tensor, y_train, X_valid_tensor, y_valid)


Epoch 1/25




Epoch 1: val_loss improved from inf to 0.26465, saving model to models/tf_model




INFO:tensorflow:Assets written to: models/tf_model/assets


INFO:tensorflow:Assets written to: models/tf_model/assets


Epoch 2/25
Epoch 2: val_loss did not improve from 0.26465
Epoch 3/25
Epoch 3: val_loss did not improve from 0.26465
Epoch 4/25
Epoch 4: val_loss did not improve from 0.26465
Epoch 5/25
Epoch 5: val_loss did not improve from 0.26465
Epoch 6/25
Epoch 6: val_loss did not improve from 0.26465
Epoch 7/25
Epoch 7: val_loss did not improve from 0.26465
Epoch 8/25
Epoch 8: val_loss did not improve from 0.26465
Epoch 9/25
Epoch 9: val_loss did not improve from 0.26465
Epoch 10/25
Epoch 10: val_loss did not improve from 0.26465
Epoch 11/25
Epoch 11: val_loss did not improve from 0.26465
Epoch 11: early stopping


Training results

In [18]:
# Tabulate the fit history: one row per epoch, one column per tracked
# training / validation quantity.
training_results = pd.DataFrame(train_history.history)
training_results


Unnamed: 0,loss,rmse,msle,val_loss,val_rmse,val_msle
0,0.371659,0.609639,0.031134,0.264649,0.514441,0.018608
1,0.211381,0.459762,0.015133,0.270553,0.520148,0.019129
2,0.151791,0.389604,0.010957,0.281323,0.530399,0.019885
3,0.126666,0.355901,0.00888,0.323326,0.568618,0.022673
4,0.103205,0.321255,0.007191,0.284097,0.533008,0.020186
5,0.087994,0.296638,0.006024,0.297922,0.545822,0.020993
6,0.078249,0.27973,0.005334,0.296766,0.544762,0.021447
7,0.066464,0.257806,0.004536,0.306497,0.553621,0.022154
8,0.058201,0.24125,0.003958,0.289552,0.538101,0.02094
9,0.054556,0.233571,0.003687,0.287074,0.535792,0.020724


In [19]:
# Reload the checkpoint written by the ModelCheckpoint callback in train_model
# (save_best_only=True, so this is the epoch with the lowest val_loss), then
# score it against targets mapped back to the original price scale.
trained_model = tf.keras.models.load_model("models/tf_model")
tf_metrics = calc_performance_metrics(
    trained_model, X_train_tensor, X_valid_tensor, np.expm1(y_train), np.expm1(y_valid)
)




In [20]:
# Metric columns, in display order; the same strings are the keys of the dicts
# returned by calc_performance_metrics.
metric_columns = [
    "Training RMSLE",
    "Validation RMSLE",
    "Training MAE",
    "Validation MAE",
]

table = PrettyTable()
table.field_names = ["Model", *metric_columns]

# One row per model, every metric formatted to four decimal places.
for label, scores in (
    ("XGBoost (Default hyperparameters)", init_metrics),
    ("XGBoost (Hyperparameter tuning)", tuned_metrics),
    ("TensorFlow", tf_metrics),
):
    table.add_row([label, *(f"{scores[column]:.4f}" for column in metric_columns)])


Summary table

In [21]:
# Bare expression: render the comparison table as this cell's output.
table


Model,Training RMSLE,Validation RMSLE,Training MAE,Validation MAE
XGBoost (Default hyperparameters),0.4753,0.5136,8.611,9.8018
XGBoost (Hyperparameter tuning),0.4187,0.4995,7.4214,9.4769
TensorFlow,0.4534,0.5144,8.4607,9.916
