## Appraisal model v.1
Used-car appraisal model using XGBoost, compared against baseline price estimates and other tree-based regressors.

### Import Libraries and Read preprocessed data

In [1]:
import pandas as pd
import numpy as np
import xgboost as xg
from sklearn.model_selection import train_test_split 
from sklearn.metrics import mean_squared_error as MSE 
from sklearn.metrics import mean_absolute_error as MAE 
from sklearn.metrics import mean_absolute_percentage_error as MAPE
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
import statistics

In [2]:
# Load the preprocessed appraisal data and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='../AppraisalDataPreprocessing/preprocessed_data.csv')
df.head()

Unnamed: 0,num__car_year,num__model_year_start,num__model_year_end,num__mile,nom__model_3,nom__model_BT-50 PRO,nom__model_CX-3,nom__model_CX-30,nom__model_CX-5,nom__model_CX-8,...,nom__color_gray,nom__color_green,nom__color_other,nom__color_red,nom__color_silver,nom__color_sky,nom__color_white,Id,cost,car_model
0,-0.680002,-2.01442,-1.817434,0.851441,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,245000,Mazda 2 1.5 Sports Maxx Sports Hatchback
1,-1.670146,-1.276841,-1.817434,-0.039278,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,269000,Mazda 3 2.0 Maxx Sports Hatchback
2,1.300286,0.198318,0.917991,-0.722674,0,0,0,0,0,0,...,1,0,0,0,0,0,0,2,390000,Mazda 2 1.3 S Leather Sedan
3,1.300286,2.042266,0.917991,-0.031599,0,0,0,1,0,0,...,0,0,0,1,0,0,0,3,650000,Mazda CX-30 2.0 SP SUV
4,-0.349954,0.198318,0.917991,1.043406,0,0,0,0,0,0,...,0,0,0,1,0,0,0,4,357900,Mazda 2 1.3 High Connect Sedan


### Appraise cost using xgboost

Compute the mean cost of each car model; it is used later as a baseline estimate when evaluating the models.

In [3]:
# Baseline predictor: the mean cost of all rows that share the same car_model,
# broadcast back onto every row.
# The string alias 'mean' is used instead of the np.mean callable because
# pandas (2.1+) emits a FutureWarning when a NumPy reduction is passed to
# groupby transform/agg and will change how such callables are dispatched.
# NOTE(review): the mean is taken over the full dataset before the train/test
# split, so each test row contributes to its own baseline value. Consider
# computing it from training rows only.
df['car_model_mean'] = df.groupby('car_model')['cost'].transform('mean')

Separate the target column from the dataframe.

In [4]:
# Target: the listed cost of each car.
y = df['cost']
# Features: everything except the target, the free-text model name and the row Id.
# 'car_model_mean' is still part of X at this point; it is removed after the split.
X = df.drop(['cost', 'car_model', 'Id'], axis=1)

splitting data  
train : test = 80 : 20  
stratify by car model

In [5]:
# Hold out 20% of the rows for testing, stratified by car model so every model
# is represented in both splits in similar proportion.
# The label column is only read here, not dropped from `df` in place: an
# in-place drop makes this cell (and the feature/target cell before it) raise a
# KeyError when re-run, and nothing later needs `df` without 'car_model'.
car_model = df['car_model']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=car_model, random_state=1234
)

In [6]:
# Keep the per-model mean price of the test rows as a baseline prediction, then
# remove that column from the feature matrices so the models never see it.
# The baseline is looked up in `df` by the test rows' index labels and the drops
# are non-mutating with errors='ignore', so this cell can be re-run safely and
# does not modify the frames returned by train_test_split in place.
car_model_mean = df.loc[X_test.index, 'car_model_mean']
X_train = X_train.drop(columns=['car_model_mean'], errors='ignore')
X_test = X_test.drop(columns=['car_model_mean'], errors='ignore')

In [7]:
# Disabled one-off export, kept for reference: joins the test rows' Ids with the
# one2car data and writes them to test_dataset.csv.
# NOTE(review): 'Id' is dropped when X is built, so X_test[['Id']] would need to
# be replaced (e.g. by df.loc[X_test.index, ['Id']]) before this can run again - confirm.
# one2car = pd.read_csv('../AppraisalDataPreprocessing/data_one2car.csv')
# one2car.rename(columns={'Unnamed: 0':'Id'}, inplace=True)
# test = X_test[['Id']].merge(one2car, on='Id')
# test.to_csv('test_dataset.csv')

In [8]:
# Attach the TTB bluebook average used price to every test-set car.
ttb_id = pd.read_csv('test_dataset_ttb_id.csv')
ttb_bluebook = pd.read_excel('ttb_bluebook.xlsx').rename(columns={'Unnamed: 0': 'ttb_bluebook_id'})
ttb_id = ttb_id.merge(
    ttb_bluebook[['avg_used_price', 'ttb_bluebook_id']],
    how='left',
    on='ttb_bluebook_id',
)
# Rows with a missing avg_used_price after the left merge get the mean of the
# available prices. The result is assigned back to the column: calling
# fillna(inplace=True) on a column selection is deprecated chained assignment in
# pandas 2.x and does not update the frame under copy-on-write.
ttb_id['avg_used_price'] = ttb_id['avg_used_price'].fillna(ttb_id['avg_used_price'].mean())
# NOTE(review): this series is later compared with y_test by position, so the
# rows of test_dataset_ttb_id.csv must be in the same order as X_test - confirm.
estimated_price = ttb_id['avg_used_price']
estimated_price

0       620000.0
1       480000.0
2       525000.0
3      1170000.0
4       235000.0
         ...    
430     430000.0
431     660000.0
432     450000.0
433     235000.0
434     970000.0
Name: avg_used_price, Length: 435, dtype: float64

Use xgboost model to appraise the value

In [9]:
# Base XGBoost regressor trained on the GPU; the grid below tunes tree depth,
# split penalty and row/column subsampling.
xgb_r = xg.XGBRegressor(objective='reg:squarederror', n_estimators=200, eta=0.05, seed=1234, tree_method="hist", device="cuda", n_jobs=-1, eval_metric="mape")

params = {
    'max_depth': [10, 30, 100],
    'gamma': [0.5, 1, 2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
}

# The candidates are fitted one at a time: the estimator already uses the GPU
# and all CPU threads (n_jobs=-1), so also parallelising the search with
# n_jobs=-1 would make many worker processes contend for the same device and
# threads. No `scoring` is given, so candidates are ranked by the estimator's
# default score. verbose=1 keeps the log to the summary line now that the fits
# run in the notebook process.
gs = GridSearchCV(
    estimator = xgb_r,
    param_grid = params,
    verbose=1,
    cv=3,
    n_jobs=1
)

gs.fit(X_train.to_numpy(), y_train.to_numpy())

Fitting 3 folds for each of 81 candidates, totalling 243 fits


In [10]:
# Keep the estimator refitted with the best parameter combination and display it.
model = gs.best_estimator_
model

In [11]:
# Predict test-set prices. The model was fitted on NumPy arrays, so the test
# features are passed in the same form.
# NOTE(review): the recorded output of this cell shows XGBoost's device-mismatch
# hint (the booster is on CUDA, the input is host data). Setting the model's
# device before predicting is the remedy that message suggests - confirm
# whether that is wanted here.
pred = model.predict(X_test.to_numpy())

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




### Evaluate model

In [12]:
# One row per evaluated estimator will be appended to this (initially empty) table.
result_cols = [
    'model',
    'mae',
    'mse',
    'rmse',
    'mape',
    'r2',
]
result_df = pd.DataFrame(columns=result_cols)

standard deviation of car price

In [13]:
# Sample standard deviation of the test-set prices, printed as a scale
# reference for the error metrics reported below.
sd = statistics.stdev(y_test)
print(sd)

185489.10530089893


#### Mean price of each car model

In [14]:
# Score the "mean price of the same car model" baseline on the test set.
mae = MAE(y_test, car_model_mean)
mse = MSE(y_test, car_model_mean)
# RMSE is derived from the MSE: the `squared` argument of mean_squared_error is
# deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mse)
mape = MAPE(y_test, car_model_mean)
r2 = r2_score(y_test, car_model_mean)

result = {'model':'mean price', 'mae':mae, 'mse':mse, 'rmse':rmse, 'mape':mape, 'r2':r2}
result_df = pd.concat([result_df, pd.DataFrame([result])], ignore_index=True)

#### Estimated price of each car from ttb bluebook

The TTB bluebook estimated price includes 7% VAT, so it is divided by 1.07 to obtain the pre-VAT price.

In [15]:
# Remove the 7% VAT from the bluebook price. The value is derived from the
# source column rather than from `estimated_price` itself, so re-running this
# cell does not divide by 1.07 a second time.
estimated_price = ttb_id['avg_used_price'] / 1.07

In [16]:
# Score the TTB bluebook estimate on the test set.
mae = MAE(y_test, estimated_price)
mse = MSE(y_test, estimated_price)
# RMSE is derived from the MSE: the `squared` argument of mean_squared_error is
# deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mse)
mape = MAPE(y_test, estimated_price)
r2 = r2_score(y_test, estimated_price)

result = {'model':'ttb bluebook', 'mae':mae, 'mse':mse, 'rmse':rmse, 'mape':mape, 'r2':r2}
result_df = pd.concat([result_df, pd.DataFrame([result])], ignore_index=True)

#### XGBoost

In [17]:
# Score the tuned XGBoost predictions on the test set.
mae = MAE(y_test, pred)
mse = MSE(y_test, pred)
# RMSE is derived from the MSE: the `squared` argument of mean_squared_error is
# deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mse)
mape = MAPE(y_test, pred)
r2 = r2_score(y_test, pred)

result = {'model':'xgboost', 'mae':mae, 'mse':mse, 'rmse':rmse, 'mape':mape, 'r2':r2}
result_df = pd.concat([result_df, pd.DataFrame([result])], ignore_index=True)

In [18]:
# Interim comparison: the two baselines and the tuned XGBoost model.
result_df

Unnamed: 0,model,mae,mse,rmse,mape,r2
0,mean price,54430.452623,6043130000.0,77737.574557,0.122708,0.823955
1,ttb bluebook,64602.434698,8458107000.0,91967.969289,0.151228,0.753603
2,xgboost,37684.064835,3040342000.0,55139.295899,0.088001,0.91143


In [19]:
import pickle

# Persist the tuned XGBoost model. The context manager closes the file even if
# pickling raises. The ./models directory must already exist.
with open('./models/xgboost.model', 'wb') as model_file:
    pickle.dump(model, model_file)

### Other models

#### Decision tree

In [20]:
from sklearn.tree import DecisionTreeRegressor

# Tune a single regression tree over depth and minimum split size; the search
# runs in parallel and uses GridSearchCV's default cross-validation.
params = dict(
    max_depth=[5, 10, None],
    min_samples_split=[2, 10, 20],
)
clf = DecisionTreeRegressor(random_state=42, criterion='squared_error')
gs = GridSearchCV(clf, params, verbose=10, n_jobs=-1)
gs.fit(X_train, y_train)

# Keep the refitted best tree and display it.
model = gs.best_estimator_
model

Fitting 5 folds for each of 9 candidates, totalling 45 fits


In [21]:
# Predict test-set prices with the best decision tree found by the grid search.
pred = model.predict(X_test)

In [22]:
# Score the decision-tree predictions on the test set.
mae = MAE(y_test, pred)
mse = MSE(y_test, pred)
# RMSE is derived from the MSE: the `squared` argument of mean_squared_error is
# deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mse)
mape = MAPE(y_test, pred)
r2 = r2_score(y_test, pred)

result = {'model':'decision tree', 'mae':mae, 'mse':mse, 'rmse':rmse, 'mape':mape, 'r2':r2}
result_df = pd.concat([result_df, pd.DataFrame([result])], ignore_index=True)

In [23]:
import pickle

# Persist the tuned decision-tree model. The context manager closes the file
# even if pickling raises. The ./models directory must already exist.
with open('./models/decision_tree.model', 'wb') as model_file:
    pickle.dump(model, model_file)

#### Random Forest

In [24]:
from sklearn.ensemble import RandomForestRegressor

clf = RandomForestRegressor(criterion = 'squared_error', random_state=42)

# NOTE(review): the largest max_features value (152) presumably equals the
# number of feature columns - confirm if the feature set changes.
params = {
    'max_depth': [5, 10, None],
    'min_samples_split': [2, 10, 50],
    'n_estimators': [100, 200, 300],
    'max_features': [10, 30, 50, 100, 152],
}

# 135 candidates x 5 folds = 675 fits: run them in parallel, as the
# decision-tree search does, instead of sequentially. The forest's fixed
# random_state keeps the fitted models reproducible.
gs = GridSearchCV(
    estimator = clf,
    param_grid = params,
    n_jobs=-1,
    verbose=10
)

gs.fit(X_train, y_train)

model = gs.best_estimator_
gs.best_estimator_

Fitting 5 folds for each of 135 candidates, totalling 675 fits
[CV 1/5; 1/135] START max_depth=5, max_features=10, min_samples_split=2, n_estimators=100
[CV 1/5; 1/135] END max_depth=5, max_features=10, min_samples_split=2, n_estimators=100;, score=0.755 total time=   0.0s
[CV 2/5; 1/135] START max_depth=5, max_features=10, min_samples_split=2, n_estimators=100
[CV 2/5; 1/135] END max_depth=5, max_features=10, min_samples_split=2, n_estimators=100;, score=0.823 total time=   0.0s
[CV 3/5; 1/135] START max_depth=5, max_features=10, min_samples_split=2, n_estimators=100
[CV 3/5; 1/135] END max_depth=5, max_features=10, min_samples_split=2, n_estimators=100;, score=0.869 total time=   0.0s
[CV 4/5; 1/135] START max_depth=5, max_features=10, min_samples_split=2, n_estimators=100
[CV 4/5; 1/135] END max_depth=5, max_features=10, min_samples_split=2, n_estimators=100;, score=0.809 total time=   0.0s
[CV 5/5; 1/135] START max_depth=5, max_features=10, min_samples_split=2, n_estimators=100
[CV

In [25]:
# Predict test-set prices with the best random forest found by the grid search.
pred = model.predict(X_test)

In [26]:
# Score the random-forest predictions on the test set.
mae = MAE(y_test, pred)
mse = MSE(y_test, pred)
# RMSE is derived from the MSE: the `squared` argument of mean_squared_error is
# deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mse)
mape = MAPE(y_test, pred)
r2 = r2_score(y_test, pred)

result = {'model':'random forest', 'mae':mae, 'mse':mse, 'rmse':rmse, 'mape':mape, 'r2':r2}
result_df = pd.concat([result_df, pd.DataFrame([result])], ignore_index=True)

In [27]:
import pickle

# Persist the tuned random-forest model. The context manager closes the file
# even if pickling raises. The ./models directory must already exist.
with open('./models/random_forest.model', 'wb') as model_file:
    pickle.dump(model, model_file)

#### XGBRFRegressor (XGBoost random forest)

In [28]:
# XGBoost random-forest regressor trained on the GPU.
# No `eta` (learning-rate shrinkage) is passed: the XGBoost random-forest
# guidance is that the learning rate must be 1 when training a random forest.
# The earlier eta=0.05 was copied from the boosted model and contradicts that.
xgb_r = xg.XGBRFRegressor(objective='reg:squarederror', n_estimators=200, seed=1234, tree_method="hist", device="cuda", n_jobs=-1, eval_metric="mape")

params = {
    'max_depth': [10, 30],
    'gamma': [0.5, 1],
    'subsample': [0.6, 1.0],
    'colsample_bytree': [0.6, 1.0],
}

# Candidates are fitted one at a time; the estimator itself uses the GPU.
gs = GridSearchCV(
    estimator = xgb_r,
    param_grid = params,
    verbose=10,
    cv=3,
)

gs.fit(X_train.to_numpy(), y_train.to_numpy())

model = gs.best_estimator_
gs.best_estimator_

Fitting 3 folds for each of 16 candidates, totalling 48 fits
[CV 1/3; 1/16] START colsample_bytree=0.6, gamma=0.5, max_depth=10, subsample=0.6
[CV 1/3; 1/16] END colsample_bytree=0.6, gamma=0.5, max_depth=10, subsample=0.6;, score=0.850 total time=   3.5s
[CV 2/3; 1/16] START colsample_bytree=0.6, gamma=0.5, max_depth=10, subsample=0.6
[CV 2/3; 1/16] END colsample_bytree=0.6, gamma=0.5, max_depth=10, subsample=0.6;, score=0.905 total time=   3.4s
[CV 3/3; 1/16] START colsample_bytree=0.6, gamma=0.5, max_depth=10, subsample=0.6
[CV 3/3; 1/16] END colsample_bytree=0.6, gamma=0.5, max_depth=10, subsample=0.6;, score=0.883 total time=   3.5s
[CV 1/3; 2/16] START colsample_bytree=0.6, gamma=0.5, max_depth=10, subsample=1.0
[CV 1/3; 2/16] END colsample_bytree=0.6, gamma=0.5, max_depth=10, subsample=1.0;, score=0.840 total time=   4.0s
[CV 2/3; 2/16] START colsample_bytree=0.6, gamma=0.5, max_depth=10, subsample=1.0
[CV 2/3; 2/16] END colsample_bytree=0.6, gamma=0.5, max_depth=10, subsample=1

In [29]:
# Predict test-set prices. The model was fitted on NumPy arrays, so the test
# features are passed in the same form.
pred = model.predict(X_test.to_numpy())

In [30]:
# Score the XGBoost random-forest predictions on the test set.
mae = MAE(y_test, pred)
mse = MSE(y_test, pred)
# RMSE is derived from the MSE: the `squared` argument of mean_squared_error is
# deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mse)
mape = MAPE(y_test, pred)
r2 = r2_score(y_test, pred)

result = {'model':'xgboostrf', 'mae':mae, 'mse':mse, 'rmse':rmse, 'mape':mape, 'r2':r2}
result_df = pd.concat([result_df, pd.DataFrame([result])], ignore_index=True)

In [31]:
import pickle

# Persist the tuned XGBoost random-forest model. The context manager closes the
# file even if pickling raises. The ./models directory must already exist.
with open('./models/xgboostrf.model', 'wb') as model_file:
    pickle.dump(model, model_file)

#### CatBoost

In [32]:
from catboost import CatBoostRegressor

# Fit CatBoost with an RMSE loss and otherwise default settings, logging the
# training loss every 100 iterations; fit() returns the fitted model itself.
model = CatBoostRegressor(loss_function='RMSE').fit(X_train, y_train, verbose=100)
model

Learning rate set to 0.044675
0:	learn: 177382.2217472	total: 139ms	remaining: 2m 18s
100:	learn: 60645.3312550	total: 267ms	remaining: 2.38s
200:	learn: 53624.9311865	total: 393ms	remaining: 1.56s
300:	learn: 49702.3315234	total: 528ms	remaining: 1.23s
400:	learn: 47006.2050162	total: 663ms	remaining: 990ms
500:	learn: 44914.1262838	total: 803ms	remaining: 799ms
600:	learn: 43350.8832313	total: 938ms	remaining: 623ms
700:	learn: 41776.4249452	total: 1.08s	remaining: 459ms
800:	learn: 40604.9153319	total: 1.21s	remaining: 301ms
900:	learn: 39482.2379805	total: 1.35s	remaining: 148ms
999:	learn: 38626.8337890	total: 1.49s	remaining: 0us


<catboost.core.CatBoostRegressor at 0x278649a3e80>

In [33]:
# Predict test-set prices with the fitted CatBoost model.
pred = model.predict(X_test)

In [34]:
# Score the CatBoost predictions on the test set.
mae = MAE(y_test, pred)
mse = MSE(y_test, pred)
# RMSE is derived from the MSE: the `squared` argument of mean_squared_error is
# deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mse)
mape = MAPE(y_test, pred)
r2 = r2_score(y_test, pred)

result = {'model':'catboost', 'mae':mae, 'mse':mse, 'rmse':rmse, 'mape':mape, 'r2':r2}
result_df = pd.concat([result_df, pd.DataFrame([result])], ignore_index=True)

In [35]:
import pickle

# Persist the fitted CatBoost model. The context manager closes the file even
# if pickling raises. The ./models directory must already exist.
with open('./models/catboost.model', 'wb') as model_file:
    pickle.dump(model, model_file)

#### LightGBM

In [36]:
from lightgbm import LGBMRegressor

# Fit LightGBM with default hyper-parameters and MAPE as its reported metric.
# Test-set predictions are computed in the following cell, so the duplicate
# predict call was removed from here. The trailing semicolon suppresses the
# estimator repr that fit() would otherwise display.
model = LGBMRegressor(metric='mape')
model.fit(X_train, y_train);

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000197 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 376
[LightGBM] [Info] Number of data points in the train set: 1737, number of used features: 47
[LightGBM] [Info] Start training from score 451456.712723


In [37]:
# Predict test-set prices with the fitted LightGBM model.
pred = model.predict(X_test)

In [38]:
# Score the LightGBM predictions on the test set.
mae = MAE(y_test, pred)
mse = MSE(y_test, pred)
# RMSE is derived from the MSE: the `squared` argument of mean_squared_error is
# deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mse)
mape = MAPE(y_test, pred)
r2 = r2_score(y_test, pred)

result = {'model':'lightgbm', 'mae':mae, 'mse':mse, 'rmse':rmse, 'mape':mape, 'r2':r2}
result_df = pd.concat([result_df, pd.DataFrame([result])], ignore_index=True)

In [39]:
import pickle

# Persist the fitted LightGBM model. The context manager closes the file even
# if pickling raises. The ./models directory must already exist.
with open('./models/lightgbm.model', 'wb') as model_file:
    pickle.dump(model, model_file)

### Result

In [40]:
# Final comparison table: one row per baseline or model, scored on the same test set.
result_df

Unnamed: 0,model,mae,mse,rmse,mape,r2
0,mean price,54430.452623,6043130000.0,77737.574557,0.122708,0.823955
1,ttb bluebook,64602.434698,8458107000.0,91967.969289,0.151228,0.753603
2,xgboost,37684.064835,3040342000.0,55139.295899,0.088001,0.91143
3,decision tree,42038.232399,3620182000.0,60167.945508,0.095907,0.894539
4,random forest,36315.873255,2594775000.0,50938.931755,0.083991,0.92441
5,xgboostrf,36047.617708,2628843000.0,51272.240317,0.083856,0.923418
6,catboost,36567.529046,2586570000.0,50858.334752,0.085114,0.924649
7,lightgbm,37911.112692,2799271000.0,52908.140237,0.087894,0.918453


In [41]:
# Export the comparison table (including its row index) to an Excel workbook.
result_df.to_excel(excel_writer='result_v1.xlsx')