# Used Car Pricing Algorithm

## Environment

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Notebook display settings: show up to 100 rows and 100 columns,
# and render floats with two decimal places.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
pd.set_option('display.float_format', '{:.2f}'.format)

In [3]:
def print_shape(df):
    """Print the row and column counts of ``df`` with thousands separators."""
    n_rows = df.shape[0]
    n_cols = df.shape[1]
    print(f"Rows: {n_rows:,} \nColumns: {n_cols:,}")

In [4]:
def print_rows(df):
    """Print the number of rows in ``df`` with thousands separators."""
    n_rows = df.shape[0]
    print(f"Rows: {n_rows:,}")

In [5]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

def assess_results(y_test, y_pred):
    """Print fit metrics and over/under-prediction statistics for a regression.

    Residuals are ``y_test - y_pred``. A positive residual (actual above
    predicted) is reported as "underpayment" and a negative one as
    "overpayment". Exact matches count toward neither group, so each
    percentage describes the same observations as the mean printed next to it.

    Args:
        y_test: Actual target values (array-like, e.g. a pandas Series).
        y_pred: Predicted values, same length and order as ``y_test``.

    Returns:
        None. All results are written to stdout.

    Note:
        MAPE divides by ``y_test`` and is not finite if any actual value is 0.
    """
    # Plain arrays keep the arithmetic positional, so a pandas index on either
    # input cannot silently re-align the rows.
    actual = np.asarray(y_test, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)

    # Compute the residuals once and reuse them for every statistic below.
    residuals = actual - predicted
    under = residuals[residuals > 0]
    over = residuals[residuals < 0]
    n_obs = len(actual)

    # Mean of an empty group is reported as nan without a NumPy warning.
    under_mean = np.mean(under) if under.size else float("nan")
    over_mean = np.mean(over) if over.size else float("nan")

    print("R2: ", round(r2_score(actual, predicted), 3))
    print("MAE: ", round(mean_absolute_error(actual, predicted), 2))
    print("MAPE: ", round(np.mean(np.abs(residuals / actual)) * 100, 1), "%")
    print("RMSE: ", round(np.sqrt(mean_squared_error(actual, predicted)), 2))
    print("Percent Underpayed: ", round(under.size / n_obs * 100, 1), "%")
    print("Mean of Underpayment: ", round(under_mean, 2))
    print("Percent Overpayed: ", round(over.size / n_obs * 100, 1), "%")
    print("Mean of Overpayment: ", round(over_mean, 2))
    print("Mean Error: ", round(np.mean(residuals), 2))
    

In [6]:
# Read the pickled train / validate / test frames from the working directory.
working_dir = "/data/p_dsi/capstone_projects/shea/working/"
train, validate, test = (
    pd.read_pickle(working_dir + filename)
    for filename in ("train.pkl", "validate.pkl", "test.pkl")
)

## Model

### Column Inventory

#### HVF

In [54]:
# hvf columns: every column whose name starts with "hvf_", minus the four
# columns removed below (the recorded output shows only the
# hvf_opt_comp_* / hvf_all_comp_* columns remain).
hvf_cols = [name for name in train.columns if name.startswith("hvf_")]
for non_component in ("hvf_opt_missing", "hvf_std_missing", "hvf_optional", "hvf_standard"):
    # list.remove raises ValueError if an expected column is absent
    hvf_cols.remove(non_component)
hvf_cols

['hvf_opt_comp_0',
 'hvf_opt_comp_1',
 'hvf_opt_comp_2',
 'hvf_opt_comp_3',
 'hvf_opt_comp_4',
 'hvf_opt_comp_5',
 'hvf_opt_comp_6',
 'hvf_opt_comp_7',
 'hvf_opt_comp_8',
 'hvf_all_comp_0',
 'hvf_all_comp_1',
 'hvf_all_comp_2',
 'hvf_all_comp_3',
 'hvf_all_comp_4',
 'hvf_all_comp_5',
 'hvf_all_comp_6',
 'hvf_all_comp_7',
 'hvf_all_comp_8']

#### Numeric

In [55]:
# numeric columns: all numeric-dtype columns in alphabetical order,
# excluding the target ("mvr_price") and the hvf component columns.
# NOTE(review): the recorded output still lists 'mvr_price_bc'; it looks like
# a transform of the target, so confirm it is never used as a feature.
num_cols = sorted(train.select_dtypes(include=["number"]).columns.tolist())
for dropped in ["mvr_price"] + hvf_cols:
    num_cols.remove(dropped)
num_cols

['age_months',
 'consumer_sentiment_index',
 'cylinders',
 'doors',
 'engine_size',
 'gas_price_index',
 'hvf_opt_missing',
 'hvf_std_missing',
 'is_certified',
 'make_model_encode',
 'make_model_trim_encode',
 'median_home_value',
 'median_income',
 'monthly_mileage',
 'mpg',
 'mvr_mileage',
 'mvr_model_year',
 'mvr_price_bc',
 'new_car_price_index',
 'population_density',
 'used_car_price_index']

In [56]:
# categorical columns: whatever remains of train's columns once the numeric
# columns, the hvf component columns, a handful of other columns and the
# target have been taken out.
other_cols = ["hvf_optional","hvf_standard","mvr_purchase_date","mvr_purchase_yearmonth","zip"]

cat_cols = list(train.columns)
for non_categorical in num_cols + hvf_cols + other_cols + ["mvr_price"]:
    # list.remove raises ValueError if a column is not present
    cat_cols.remove(non_categorical)

# alphabetical order for a stable, readable listing
cat_cols.sort()
cat_cols

['base_exterior_color',
 'base_interior_color',
 'body_subtype',
 'body_type',
 'drivetrain',
 'engine_block',
 'fuel_type',
 'make',
 'model',
 'mvr_purchase_month',
 'mvr_state',
 'transmission',
 'trim',
 'vehicle_type']

In [57]:
# target
# Name of the column the models predict; the modelling cells index the
# train / validate / test frames with it (e.g. train[target]).
target = "mvr_price"

### Feature Prep

In [7]:
# Peek at five randomly sampled rows of the numeric columns, transposed so
# that each column is shown as a row.
train[num_cols].sample(5).T

Unnamed: 0,5641706,746779,2789002,4146870,2645290
age_months,31.0,79.0,27.0,51.0,42.0
consumer_sentiment_index,96.8,96.8,96.2,58.2,67.4
cylinders,6.0,6.0,6.0,8.0,4.0
doors,5.0,5.0,4.0,4.0,4.0
engine_size,3.6,3.6,3.6,5.6,2.5
gas_price_index,2.59775,2.59775,2.83575,3.975,3.3948
hvf_opt_missing,1.0,1.0,1.0,1.0,1.0
hvf_std_missing,0.0,0.0,1.0,0.0,0.0
is_certified,0.0,0.0,0.0,0.0,0.0
make_model_encode,26526.765625,27208.826172,16777.140625,31075.59375,16777.140625


### CatBoost

In [58]:
from catboost import CatBoostRegressor

In [59]:
# Display the full column index of the training frame.
train.columns

Index(['mvr_price', 'make', 'model', 'trim', 'mvr_model_year', 'mvr_mileage',
       'age_months', 'vehicle_type', 'body_type', 'body_subtype', 'drivetrain',
       'fuel_type', 'engine_block', 'engine_size', 'transmission', 'doors',
       'cylinders', 'base_exterior_color', 'base_interior_color',
       'is_certified', 'zip', 'mvr_state', 'hvf_standard', 'hvf_optional',
       'mvr_purchase_date', 'make_model_encode', 'make_model_trim_encode',
       'mvr_purchase_month', 'monthly_mileage', 'mpg', 'population_density',
       'median_income', 'median_home_value', 'mvr_purchase_yearmonth',
       'used_car_price_index', 'new_car_price_index', 'gas_price_index',
       'consumer_sentiment_index', 'hvf_opt_missing', 'hvf_std_missing',
       'hvf_opt_comp_0', 'hvf_opt_comp_1', 'hvf_opt_comp_2', 'hvf_opt_comp_3',
       'hvf_opt_comp_4', 'hvf_opt_comp_5', 'hvf_opt_comp_6', 'hvf_opt_comp_7',
       'hvf_opt_comp_8', 'hvf_all_comp_0', 'hvf_all_comp_1', 'hvf_all_comp_2',
       'hvf_all_com

In [61]:
# Numeric features fed to the CatBoost models. The modelling cells index the
# train / validate / test frames with selected_num_cols + selected_cat_cols,
# so the order here determines feature positions in the model input.
# Only the hvf_opt_comp_* components are listed (no hvf_all_comp_*), and the
# two commented-out price indexes are excluded.
selected_num_cols = ['age_months',
                     'mvr_mileage',
                     'mvr_model_year',
                     'doors',
                     'cylinders',
                     'engine_size',
                     'mpg',
                     'make_model_trim_encode',
                     'hvf_opt_comp_0',
                     'hvf_opt_comp_1',
                     'hvf_opt_comp_2',
                     'hvf_opt_comp_3',
                     'hvf_opt_comp_4',
                     'hvf_opt_comp_5',
                     'hvf_opt_comp_6',
                     'hvf_opt_comp_7',
                     'hvf_opt_comp_8',
                     'hvf_std_missing',
                     'hvf_opt_missing',
                    # 'gas_price_index',
                    # 'new_car_price_index',
                     'used_car_price_index',
                     'consumer_sentiment_index',
                     'median_income',
                     'is_certified'
                    ]

# Categorical features; this list is also passed to CatBoostRegressor as
# cat_features. 'make', 'model', 'trim' and 'body_subtype' are not included.
selected_cat_cols = ['base_exterior_color',
                     'base_interior_color',
                     'body_type',
                     'drivetrain',
                     'engine_block',
                     'fuel_type',
                     'mvr_purchase_month',
                     'mvr_state',
                     'transmission',
                     'vehicle_type'
                    ]

In [32]:
# Position of the "golden feature" (make_model_trim_encode) in the feature
# matrix passed to fit(). It is computed in this cell so that a top-to-bottom
# run does not hit a NameError: the only other definition of gf_index sits in
# the hyperparameter-tuning section further down the notebook.
gf_index = str((selected_num_cols + selected_cat_cols).index('make_model_trim_encode'))

# init model
# The golden feature gets a finer quantization (1024 borders) than the rest.
# NOTE(review): CatBoost documents devices="0:3" as devices 0 and 3, and
# "0-3" as the range 0..3 - confirm which one is intended.
cb_model = CatBoostRegressor(
    cat_features=selected_cat_cols,
    verbose=False,
    task_type="GPU",
    devices="0:3",
    per_float_feature_quantization=gf_index + ':border_count=1024',
)

# fit on the training split, using the validation split as the eval set
cb_model.fit(
    train[selected_num_cols + selected_cat_cols],
    train[target],
    eval_set=(validate[selected_num_cols + selected_cat_cols], validate[target]),
)

<catboost.core.CatBoostRegressor at 0x2b9360642c10>

In [33]:
# predict on the validation split with the baseline CatBoost model,
# using the same feature columns (and order) the model was fitted on
cb_preds = cb_model.predict(validate[selected_num_cols + selected_cat_cols])

In [34]:
# results: print the validation metrics for the baseline model
assess_results(validate[target], cb_preds)

R2:  0.878
MAE:  3121.41
MAPE:  17.8 %
RMSE:  4514.6
Percent Underpayed:  58.2 %
Mean of Underpayment:  3611.44
Percent Overpayed:  41.8 %
Mean of Overpayment:  -2438.54
Mean Error:  1083.83


In [None]:
# show the first 15 rows of the prettified feature-importance table
# (CatBoost documents the prettified output as sorted by importance -
# confirm for the installed version)
importance = cb_model.get_feature_importance(prettified=True)
importance.head(15)

### CatBoost Hyperparameter Tuning
https://catboost.ai/en/docs/concepts/parameter-tuning

https://catboost.ai/en/docs/references/training-parameters/common

In [43]:
# golden feature
# A highly predictive feature that is given more split candidates.
# Stored as the string form of its position in the feature list, because it
# is concatenated into the per_float_feature_quantization spec.
gf_index = str((selected_num_cols + selected_cat_cols).index('make_model_trim_encode'))

In [65]:
# Best hyperparameters reported by the grid search.
# NOTE(review): grid_search_result is not defined in any visible cell of this
# notebook - the cell that produced it appears to be missing; restore it (or
# load the saved result) before running this cell.
hpt_params = grid_search_result['params']
hpt_params

{'bagging_temperature': 1,
 'depth': 10,
 'iterations': 2000,
 'learning_rate': 0.4}

In [66]:
# Train on the full training set with the tuned hyperparameters.
# - iterations is fixed at 10000 rather than hpt_params["iterations"];
#   early_stopping_rounds in fit() decides when training actually stops.
# - border_count=254 applies to all float features, while the golden feature
#   (index gf_index) is quantized with 1024 borders.
# - verbose was previously passed twice (False and True), which is a
#   SyntaxError; the later value (True) is kept - set it to False to silence
#   the training log.
# NOTE(review): CatBoost documents devices="0:3" as devices 0 and 3, and
# "0-3" as the range 0..3 - confirm which one is intended.
hpt_model = CatBoostRegressor(
    cat_features=selected_cat_cols,
    task_type="GPU",
    devices="0:3",
    border_count=254,
    per_float_feature_quantization=gf_index + ':border_count=1024',
    bagging_temperature=hpt_params["bagging_temperature"],
    depth=hpt_params["depth"],
    iterations=10000,
    learning_rate=hpt_params["learning_rate"],
    verbose=True,
)

# fit, monitoring the validation split for early stopping
hpt_model.fit(
    train[selected_num_cols + selected_cat_cols],
    train[target],
    eval_set=(validate[selected_num_cols + selected_cat_cols], validate[target]),
    early_stopping_rounds=100,
)

<catboost.core.CatBoostRegressor at 0x2b93623de0a0>

In [136]:
# predict on the validation split with the tuned model,
# using the same feature columns (and order) the model was fitted on
hpt_preds = hpt_model.predict(validate[selected_num_cols + selected_cat_cols])

# results: print the validation metrics for the tuned model
assess_results(validate[target], hpt_preds)

R2:  0.894
MAE:  2837.78
MAPE:  16.8 %
RMSE:  4194.96
Percent Underpayed:  55.6 %
Mean of Underpayment:  3263.89
Percent Overpayed:  44.4 %
Mean of Overpayment:  -2304.39
Mean Error:  791.06


In [None]:
# Print the tuned model's raw feature_importances_ attribute (values only,
# without feature names).
print('Feature importances:', hpt_model.feature_importances_)

## Tilted Loss
Allows targeting of specific quantiles

https://brendanhasz.github.io/2018/12/15/quantile-regression.html#quantile-regression

In [75]:
# Quantile (tilted) loss at alpha=0.5, i.e. the model targets the median.
# No task_type/devices are passed here, unlike the models above, and no
# eval_set is supplied to fit().
cb_model_50 = CatBoostRegressor(
    loss_function='Quantile:alpha=0.5',
    verbose=False,
    cat_features=selected_cat_cols,
)
cb_model_50.fit(
    train[selected_num_cols + selected_cat_cols],
    train[target],
)

<catboost.core.CatBoostRegressor at 0x2b9360e9bfd0>

## Model Checkpoint

In [None]:
# pickle is imported in this cell because no other visible cell imports it,
# and checkpoint cells are often run on their own.
import pickle

# Save the tuned model (hpt_model) to the working directory.
working_dir = "/data/p_dsi/capstone_projects/shea/working/"
with open(working_dir + 'hi_iter_model.pickle', 'wb') as file:
    pickle.dump(hpt_model, file)

In [None]:
# pickle is imported in this cell because no other visible cell imports it,
# and checkpoint cells are often run on their own.
import pickle

# Save the alpha=0.5 quantile model (cb_model_50) to the working directory.
working_dir = "/data/p_dsi/capstone_projects/shea/working/"
with open(working_dir + 'hi_iter_model_50.pickle', 'wb') as file:
    pickle.dump(cb_model_50, file)

In [9]:
# pickle is imported in this cell because no other visible cell imports it,
# and checkpoint cells are often run on their own.
import pickle

# Load a previously saved model from the working directory.
# SECURITY: pickle.load can execute arbitrary code contained in the file;
# only load pickles that were written by this project.
# NOTE(review): 'model.pickle' is not one of the files written by the cells
# above ('hi_iter_model.pickle', 'hi_iter_model_50.pickle'), and `model` is
# not referenced by the later cells, which use hpt_model - confirm which
# checkpoint is intended.
working_dir = "/data/p_dsi/capstone_projects/shea/working/"
with open(working_dir + 'model.pickle', 'rb') as f:
    model = pickle.load(f)

## Final Test Results

In [185]:
# predict on the held-out test split with the tuned model,
# using the same feature columns (and order) the model was fitted on
hpt_preds_test = hpt_model.predict(test[selected_num_cols + selected_cat_cols])

# results: print the final test-set metrics
assess_results(test[target], hpt_preds_test)

R2:  0.881
MAE:  3074.26
MAPE:  18.4 %
RMSE:  4450.25
Percent Underpayed:  59.3 %
Mean of Underpayment:  3543.41
Percent Overpayed:  40.7 %
Mean of Overpayment:  -2391.63
Mean Error:  1125.92


In [None]:
# Display the test frame.
test