## Appraisal model v.1
appraisal model using xgboost

### Import Libraries and Read preprocessed data

In [94]:
import pandas as pd
import numpy as np
import xgboost as xg
from sklearn.model_selection import train_test_split 
from sklearn.metrics import mean_squared_error as MSE 
from sklearn.metrics import mean_absolute_error as MAE 
from sklearn.metrics import mean_absolute_percentage_error as MAPE
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
import statistics

In [95]:
df = pd.read_csv('../AppraisalDataPreprocessing/preprocessed_data.csv')
df.head(5)

Unnamed: 0,num__car_year,num__mile,nom__model_3,nom__model_BT-50 PRO,nom__model_CX-3,nom__model_CX-30,nom__model_CX-5,nom__model_CX-8,nom__sub_model_1.5,nom__sub_model_1.6,...,nom__color_gray,nom__color_green,nom__color_other,nom__color_red,nom__color_silver,nom__color_sky,nom__color_white,Id,cost,car_model
0,-0.680002,0.851441,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,245000,Mazda 2 1.5 Sports Maxx Sports Hatchback
1,-1.670146,-0.039278,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,269000,Mazda 3 2.0 Maxx Sports Hatchback
2,1.300286,-0.722674,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,2,390000,Mazda 2 1.3 S Leather Sedan
3,1.300286,-0.031599,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,3,650000,Mazda CX-30 2.0 SP SUV
4,-0.349954,1.043406,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,4,357900,Mazda 2 1.3 High Connect Sedan


### Appraise cost using xgboost

get mean of each car model for evaluate

In [96]:
df['car_model_mean'] = df['cost'].groupby(df['car_model']).transform(np.mean)

seperate target column from dataframe

In [97]:
X = df.drop(columns=['cost', 'car_model', 'Id'])
y = df['cost']

splitting data  
train : test = 80 : 20  
stratify by car model

In [98]:
car_model = df['car_model']
df.drop(columns=['car_model'], inplace=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=car_model, random_state=1234)

In [99]:
X_train.drop(columns=['car_model_mean'], inplace=True)
car_model_mean = X_test['car_model_mean']
X_test.drop(columns=['car_model_mean'], inplace=True)

In [100]:
# one2car = pd.read_csv('../AppraisalDataPreprocessing/data_one2car.csv')
# one2car.rename(columns={'Unnamed: 0':'Id'}, inplace=True)
# test = X_test[['Id']].merge(one2car, on='Id')
# test.to_csv('test_dataset.csv')

In [101]:
ttb_id = pd.read_csv('test_dataset_ttb_id.csv')
ttb_bluebook = pd.read_excel('ttb_bluebook.xlsx')
ttb_bluebook.rename(columns={'Unnamed: 0':'ttb_bluebook_id'}, inplace=True)
ttb_id = ttb_id.merge(ttb_bluebook[['avg_used_price', 'ttb_bluebook_id']], how='left', on='ttb_bluebook_id')
ttb_id['avg_used_price'].fillna(value=ttb_id['avg_used_price'].mean(), inplace=True)
estimated_price = ttb_id['avg_used_price']
estimated_price

0       620000.0
1       480000.0
2       525000.0
3      1170000.0
4       235000.0
         ...    
430     430000.0
431     660000.0
432     450000.0
433     235000.0
434     970000.0
Name: avg_used_price, Length: 435, dtype: float64

Use xgboost model to appraise the value

In [102]:
xgb_r = xg.XGBRegressor(objective='reg:squarederror', n_estimators=200, eta=0.05, seed=1234, tree_method="hist", device="cuda", n_jobs=-1, eval_metric="mape")

params = {
    'max_depth': [10, 30, 100],
    'gamma': [0.5, 1, 2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
}

gs = GridSearchCV(
    estimator = xgb_r,
    param_grid = params,
    verbose=10,
    cv=3,
    n_jobs=-1
)

gs.fit(X_train.to_numpy(), y_train.to_numpy())

Fitting 3 folds for each of 81 candidates, totalling 243 fits


In [103]:
model = gs.best_estimator_
gs.best_estimator_

In [104]:
pred = model.predict(X_test)

### Evaluate model

In [105]:
result_cols = ['model', 'mae', 'mse', 'rmse', 'mape', 'r2']
result_df = pd.DataFrame(columns=result_cols)

standard deviation of car price

In [106]:
sd = statistics.stdev(y_test)
print(sd)

185489.10530089893


#### Mean price of each car model

In [107]:
mae = MAE(y_test, car_model_mean)
mse = MSE(y_test, car_model_mean, squared=True)
rmse = MSE(y_test, car_model_mean, squared=False)
mape = MAPE(y_test, car_model_mean)
r2 = r2_score(y_test, car_model_mean)

result = {'model':'mean price', 'mae':mae, 'mse':mse, 'rmse':rmse, 'mape':mape, 'r2':r2}
result_df = pd.concat([result_df, pd.DataFrame([result])], ignore_index=True)

#### Estimated price of each car from ttb bluebook

VAT 7% has been included to ttb bluebook estimated price

In [108]:
estimated_price = estimated_price/1.07

In [109]:
mae = MAE(y_test, estimated_price)
mse = MSE(y_test, estimated_price, squared=True)
rmse = MSE(y_test, estimated_price, squared=False)
mape = MAPE(y_test, estimated_price)
r2 = r2_score(y_test, estimated_price)

result = {'model':'ttb bluebook', 'mae':mae, 'mse':mse, 'rmse':rmse, 'mape':mape, 'r2':r2}
result_df = pd.concat([result_df, pd.DataFrame([result])], ignore_index=True)

#### XGBoost

In [110]:
mae = MAE(y_test, pred)
mse = MSE(y_test, pred, squared=True)
rmse = MSE(y_test, pred, squared=False)
mape = MAPE(y_test, pred)
r2 = r2_score(y_test, pred)

result = {'model':'xgboost', 'mae':mae, 'mse':mse, 'rmse':rmse, 'mape':mape, 'r2':r2}
result_df = pd.concat([result_df, pd.DataFrame([result])], ignore_index=True)

In [111]:
result_df

Unnamed: 0,model,mae,mse,rmse,mape,r2
0,mean price,54430.452623,6043130000.0,77737.574557,0.122708,0.823955
1,ttb bluebook,64602.434698,8458107000.0,91967.969289,0.151228,0.753603
2,xgboost,38734.157148,3374354000.0,58089.192258,0.090079,0.9017


In [112]:
import pickle

model_file = open('./models2/xgboost.model', 'wb')

# Save Decision tree model
pickle.dump(model, model_file)

# Close file
model_file.close()

### Other model

#### Decision tree

In [113]:
from sklearn.tree import DecisionTreeRegressor

clf = DecisionTreeRegressor(criterion = 'squared_error', random_state=42)

params = {
    'max_depth': [5, 10, None],
    'min_samples_split': [2, 10, 20],
}

gs = GridSearchCV(
    estimator = clf,
    param_grid = params,
    n_jobs=-1,
    verbose=10
)
gs.fit(X_train, y_train)

model = gs.best_estimator_
gs.best_estimator_

Fitting 5 folds for each of 9 candidates, totalling 45 fits


In [114]:
pred = model.predict(X_test)

In [115]:
mae = MAE(y_test, pred)
mse = MSE(y_test, pred, squared=True)
rmse = MSE(y_test, pred, squared=False)
mape = MAPE(y_test, pred)
r2 = r2_score(y_test, pred)

result = {'model':'decision tree', 'mae':mae, 'mse':mse, 'rmse':rmse, 'mape':mape, 'r2':r2}
result_df = pd.concat([result_df, pd.DataFrame([result])], ignore_index=True)

In [116]:
import pickle

model_file = open('./models2/decision_tree.model', 'wb')

# Save Decision tree model
pickle.dump(model, model_file)

# Close file
model_file.close()

#### Random Forest

In [117]:
from sklearn.ensemble import RandomForestRegressor

clf = RandomForestRegressor(criterion = 'squared_error', random_state=42)

params = {
    'max_depth': [5, 10]+[None],
    'min_samples_split': [2, 10, 50],
    'n_estimators': [100, 200, 300],
    'max_features': [10, 30, 50, 100, 152],
}

gs = GridSearchCV(
    estimator = clf,
    param_grid = params,
    verbose=10
)

gs.fit(X_train, y_train)

model = gs.best_estimator_
gs.best_estimator_

Fitting 5 folds for each of 135 candidates, totalling 675 fits
[CV 1/5; 1/135] START max_depth=5, max_features=10, min_samples_split=2, n_estimators=100
[CV 1/5; 1/135] END max_depth=5, max_features=10, min_samples_split=2, n_estimators=100;, score=0.703 total time=   0.0s
[CV 2/5; 1/135] START max_depth=5, max_features=10, min_samples_split=2, n_estimators=100
[CV 2/5; 1/135] END max_depth=5, max_features=10, min_samples_split=2, n_estimators=100;, score=0.754 total time=   0.0s
[CV 3/5; 1/135] START max_depth=5, max_features=10, min_samples_split=2, n_estimators=100
[CV 3/5; 1/135] END max_depth=5, max_features=10, min_samples_split=2, n_estimators=100;, score=0.812 total time=   0.0s
[CV 4/5; 1/135] START max_depth=5, max_features=10, min_samples_split=2, n_estimators=100
[CV 4/5; 1/135] END max_depth=5, max_features=10, min_samples_split=2, n_estimators=100;, score=0.733 total time=   0.0s
[CV 5/5; 1/135] START max_depth=5, max_features=10, min_samples_split=2, n_estimators=100
[CV

In [118]:
pred = model.predict(X_test)

In [119]:
mae = MAE(y_test, pred)
mse = MSE(y_test, pred, squared=True)
rmse = MSE(y_test, pred, squared=False)
mape = MAPE(y_test, pred)
r2 = r2_score(y_test, pred)

result = {'model':'random forest', 'mae':mae, 'mse':mse, 'rmse':rmse, 'mape':mape, 'r2':r2}
result_df = pd.concat([result_df, pd.DataFrame([result])], ignore_index=True)

In [120]:
import pickle

model_file = open('./models2/random_forest.model', 'wb')

# Save Decision tree model
pickle.dump(model, model_file)

# Close file
model_file.close()

#### XGBoostRFRegressor

In [121]:
xgb_r = xg.XGBRFRegressor(objective='reg:squarederror', n_estimators=200, eta=0.05, seed=1234, tree_method="hist", device="cuda", n_jobs=-1, eval_metric="mape")

params = {
    'max_depth': [10, 30],
    'gamma': [0.5, 1],
    'subsample': [0.6, 1.0],
    'colsample_bytree': [0.6, 1.0],
}

gs = GridSearchCV(
    estimator = xgb_r,
    param_grid = params,
    verbose=10,
    cv=3,
)

gs.fit(X_train.to_numpy(), y_train.to_numpy())

model = gs.best_estimator_
gs.best_estimator_

Fitting 3 folds for each of 16 candidates, totalling 48 fits
[CV 1/3; 1/16] START colsample_bytree=0.6, gamma=0.5, max_depth=10, subsample=0.6
[CV 1/3; 1/16] END colsample_bytree=0.6, gamma=0.5, max_depth=10, subsample=0.6;, score=0.834 total time=   3.3s
[CV 2/3; 1/16] START colsample_bytree=0.6, gamma=0.5, max_depth=10, subsample=0.6
[CV 2/3; 1/16] END colsample_bytree=0.6, gamma=0.5, max_depth=10, subsample=0.6;, score=0.891 total time=   3.1s
[CV 3/3; 1/16] START colsample_bytree=0.6, gamma=0.5, max_depth=10, subsample=0.6
[CV 3/3; 1/16] END colsample_bytree=0.6, gamma=0.5, max_depth=10, subsample=0.6;, score=0.864 total time=   3.0s
[CV 1/3; 2/16] START colsample_bytree=0.6, gamma=0.5, max_depth=10, subsample=1.0
[CV 1/3; 2/16] END colsample_bytree=0.6, gamma=0.5, max_depth=10, subsample=1.0;, score=0.837 total time=   3.6s
[CV 2/3; 2/16] START colsample_bytree=0.6, gamma=0.5, max_depth=10, subsample=1.0
[CV 2/3; 2/16] END colsample_bytree=0.6, gamma=0.5, max_depth=10, subsample=1

In [122]:
pred = model.predict(X_test)

In [123]:
mae = MAE(y_test, pred)
mse = MSE(y_test, pred, squared=True)
rmse = MSE(y_test, pred, squared=False)
mape = MAPE(y_test, pred)
r2 = r2_score(y_test, pred)

result = {'model':'xgboostrf', 'mae':mae, 'mse':mse, 'rmse':rmse, 'mape':mape, 'r2':r2}
result_df = pd.concat([result_df, pd.DataFrame([result])], ignore_index=True)

In [124]:
import pickle

model_file = open('./models2/xgboostrf.model', 'wb')

# Save Decision tree model
pickle.dump(model, model_file)

# Close file
model_file.close()

#### CatBoost

In [125]:
from catboost import CatBoostRegressor

model = CatBoostRegressor(loss_function='RMSE')

model.fit(X_train, y_train, verbose=100)

Learning rate set to 0.044675
0:	learn: 177342.9114125	total: 1.55ms	remaining: 1.55s
100:	learn: 63382.5800185	total: 138ms	remaining: 1.23s
200:	learn: 55737.1979091	total: 353ms	remaining: 1.4s
300:	learn: 51252.9188368	total: 476ms	remaining: 1.11s
400:	learn: 48395.7062794	total: 598ms	remaining: 894ms
500:	learn: 46311.9946129	total: 718ms	remaining: 715ms
600:	learn: 44488.3189374	total: 841ms	remaining: 558ms
700:	learn: 42823.0699979	total: 966ms	remaining: 412ms
800:	learn: 41465.2327938	total: 1.08s	remaining: 270ms
900:	learn: 40240.4048107	total: 1.21s	remaining: 133ms
999:	learn: 39174.2679338	total: 1.33s	remaining: 0us


<catboost.core.CatBoostRegressor at 0x17c148bf8e0>

In [126]:
pred = model.predict(X_test)

In [127]:
mae = MAE(y_test, pred)
mse = MSE(y_test, pred, squared=True)
rmse = MSE(y_test, pred, squared=False)
mape = MAPE(y_test, pred)
r2 = r2_score(y_test, pred)

result = {'model':'catboost', 'mae':mae, 'mse':mse, 'rmse':rmse, 'mape':mape, 'r2':r2}
result_df = pd.concat([result_df, pd.DataFrame([result])], ignore_index=True)

In [128]:
import pickle

model_file = open('./models2/catboost.model', 'wb')

# Save Decision tree model
pickle.dump(model, model_file)

# Close file
model_file.close()

#### LightGBM

In [129]:
from lightgbm import LGBMRegressor
 
model = LGBMRegressor(metric='mape')
 
model.fit(X_train, y_train)
 
pred = model.predict(X_test)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000041 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 354
[LightGBM] [Info] Number of data points in the train set: 1737, number of used features: 45
[LightGBM] [Info] Start training from score 451456.712723


In [130]:
pred = model.predict(X_test)

In [131]:
mae = MAE(y_test, pred)
mse = MSE(y_test, pred, squared=True)
rmse = MSE(y_test, pred, squared=False)
mape = MAPE(y_test, pred)
r2 = r2_score(y_test, pred)

result = {'model':'lightgbm', 'mae':mae, 'mse':mse, 'rmse':rmse, 'mape':mape, 'r2':r2}
result_df = pd.concat([result_df, pd.DataFrame([result])], ignore_index=True)

In [132]:
import pickle

model_file = open('./models2/lightgbm.model', 'wb')

# Save Decision tree model
pickle.dump(model, model_file)

# Close file
model_file.close()

#### SVR

In [133]:
from sklearn.svm import SVR

svr = SVR()

params = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],  # Kernel type
    'C': [0.1, 1, 10, 100],  # Regularization parameter
    'gamma': ['scale', 'auto', 1, 0.1, 0.01, 0.001],  # Kernel coefficient
    'epsilon': [0.1, 0.01, 0.001, 0.0001]  # Epsilon
}


gs = GridSearchCV(
    estimator = svr,
    param_grid = params,
    verbose=10,
    cv=5,
)

gs.fit(X_train.to_numpy(), y_train.to_numpy())

model = gs.best_estimator_
gs.best_estimator_

Fitting 5 folds for each of 384 candidates, totalling 1920 fits
[CV 1/5; 1/384] START C=0.1, epsilon=0.1, gamma=scale, kernel=linear............


[CV 1/5; 1/384] END C=0.1, epsilon=0.1, gamma=scale, kernel=linear;, score=-0.036 total time=   0.0s
[CV 2/5; 1/384] START C=0.1, epsilon=0.1, gamma=scale, kernel=linear............
[CV 2/5; 1/384] END C=0.1, epsilon=0.1, gamma=scale, kernel=linear;, score=-0.055 total time=   0.0s
[CV 3/5; 1/384] START C=0.1, epsilon=0.1, gamma=scale, kernel=linear............
[CV 3/5; 1/384] END C=0.1, epsilon=0.1, gamma=scale, kernel=linear;, score=-0.033 total time=   0.0s
[CV 4/5; 1/384] START C=0.1, epsilon=0.1, gamma=scale, kernel=linear............
[CV 4/5; 1/384] END C=0.1, epsilon=0.1, gamma=scale, kernel=linear;, score=-0.076 total time=   0.0s
[CV 5/5; 1/384] START C=0.1, epsilon=0.1, gamma=scale, kernel=linear............
[CV 5/5; 1/384] END C=0.1, epsilon=0.1, gamma=scale, kernel=linear;, score=-0.013 total time=   0.0s
[CV 1/5; 2/384] START C=0.1, epsilon=0.1, gamma=scale, kernel=poly..............
[CV 1/5; 2/384] END C=0.1, epsilon=0.1, gamma=scale, kernel=poly;, score=-0.037 total time

In [134]:
pred = model.predict(X_test)
pred



array([444248.02025973, 371655.09612736, 454865.23570549, 490976.21323647,
       311245.1514056 , 330446.31574733, 256424.26105571, 477465.61054466,
       287454.4255503 , 417555.63677437, 378303.42657471, 471949.51225544,
       412472.40437933, 260679.20552798, 261458.54756943, 501793.80627072,
       310391.43699732, 357002.68311793, 406230.59740141, 252595.5723217 ,
       420038.74050072, 518070.08056682, 505936.98953254, 433767.93253614,
       446692.30115828, 391392.25630452, 396814.19915034, 275074.3163912 ,
       419869.1515963 , 450856.88569159, 293325.34921914, 297182.23157866,
       499359.71825868, 307172.91589327, 416693.56589228, 465274.34225879,
       417057.83086966, 453629.39363569, 362739.26864036, 463387.39098216,
       242882.98898824, 342579.56472633, 441637.15721451, 520209.32349782,
       382067.94167777, 430293.41209272, 399368.27483786, 407743.30693884,
       451209.05837579, 375139.48242726, 553491.96410575, 375292.58823622,
       492508.17080584, 4

In [135]:
mae = MAE(y_test, pred)
mse = MSE(y_test, pred, squared=True)
rmse = MSE(y_test, pred, squared=False)
mape = MAPE(y_test, pred)
r2 = r2_score(y_test, pred)

result = {'model':'svr', 'mae':mae, 'mse':mse, 'rmse':rmse, 'mape':mape, 'r2':r2}
result_df = pd.concat([result_df, pd.DataFrame([result])], ignore_index=True)

In [136]:
import pickle

model_file = open('./models2/svr.model', 'wb')

# Save Decision tree model
pickle.dump(model, model_file)

# Close file
model_file.close()

#### Linear Regression

In [137]:
from sklearn.linear_model import LinearRegression

lr_model = LinearRegression()

params = {
    'fit_intercept': [True, False]
}

gs = GridSearchCV(
    estimator = lr_model,
    param_grid = params,
    verbose=10,
    cv=5,
)

gs.fit(X_train.to_numpy(), y_train.to_numpy())

model = gs.best_estimator_
gs.best_estimator_

Fitting 5 folds for each of 2 candidates, totalling 10 fits
[CV 1/5; 1/2] START fit_intercept=True..........................................
[CV 1/5; 1/2] END ...........fit_intercept=True;, score=0.761 total time=   0.0s
[CV 2/5; 1/2] START fit_intercept=True..........................................
[CV 2/5; 1/2] END ...........fit_intercept=True;, score=0.857 total time=   0.0s
[CV 3/5; 1/2] START fit_intercept=True..........................................
[CV 3/5; 1/2] END ...........fit_intercept=True;, score=0.892 total time=   0.0s
[CV 4/5; 1/2] START fit_intercept=True..........................................
[CV 4/5; 1/2] END ...........fit_intercept=True;, score=0.850 total time=   0.0s
[CV 5/5; 1/2] START fit_intercept=True..........................................
[CV 5/5; 1/2] END ...........fit_intercept=True;, score=0.853 total time=   0.0s
[CV 1/5; 2/2] START fit_intercept=False.........................................
[CV 1/5; 2/2] END ..........fit_intercept=False;,

In [138]:
pred = model.predict(X_test)
pred



array([ 528149.41500476,  387454.41500476,  465741.35005067,
        697461.41500476,  261752.63405227,  295734.55705352,
        179615.81099628,  565294.41500476,  205019.66557957,
        486574.41500476,  374666.35744597,  562590.41500476,
        389774.41500476,  171731.16964451,  299300.3630741 ,
       1024702.41500476,  270622.41500476,  328512.84168008,
        385274.45223432,  127107.7404545 ,  414478.18387609,
        711797.41500476,  695918.41500476,  355810.75214861,
        587982.41500476,  490597.41500476,  415049.5014772 ,
        216233.92897248,  463877.41500476,  423709.3812995 ,
        231682.78556185,  199525.41500476,  512917.41500476,
        259478.41500476,  419244.49247537,  651061.41500476,
        445327.89185177,  467211.96030745,  267546.10467522,
        415294.81388906,  143806.41500476,  379077.41500476,
        476398.60631012,  746438.41500476,  404665.40798551,
        466852.0187447 ,  391148.91740753,  387926.76144206,
        462968.26557508,

In [139]:
mae = MAE(y_test, pred)
mse = MSE(y_test, pred, squared=True)
rmse = MSE(y_test, pred, squared=False)
mape = MAPE(y_test, pred)
r2 = r2_score(y_test, pred)

result = {'model':'linear regression', 'mae':mae, 'mse':mse, 'rmse':rmse, 'mape':mape, 'r2':r2}
result_df = pd.concat([result_df, pd.DataFrame([result])], ignore_index=True)

In [140]:
import pickle

model_file = open('./models2/lr.model', 'wb')

# Save Decision tree model
pickle.dump(model, model_file)

# Close file
model_file.close()

### Result

In [141]:
result_df

Unnamed: 0,model,mae,mse,rmse,mape,r2
0,mean price,54430.452623,6043130000.0,77737.574557,0.122708,0.823955
1,ttb bluebook,64602.434698,8458107000.0,91967.969289,0.151228,0.753603
2,xgboost,38734.157148,3374354000.0,58089.192258,0.090079,0.9017
3,decision tree,43495.016829,4137417000.0,64322.759754,0.099694,0.879471
4,random forest,37976.916409,3000376000.0,54775.691535,0.087624,0.912595
5,xgboostrf,39594.447486,3361655000.0,57979.7827,0.090518,0.90207
6,catboost,39186.35979,3070393000.0,55411.12791,0.091633,0.910555
7,lightgbm,40849.105815,3350000000.0,57879.18071,0.094091,0.90241
8,svr,83508.043931,19033200000.0,137960.851476,0.168301,0.445535
9,linear regression,47755.085675,4233982000.0,65069.056151,0.111952,0.876658


In [142]:
result_df.to_excel('result_v2.xlsx')