## Appraisal model v.1
appraisal model using xgboost

### Import Libraries and Read preprocessed data

In [10]:
import pandas as pd
import numpy as np
import xgboost as xg
from sklearn.model_selection import train_test_split 
from sklearn.metrics import mean_squared_error as MSE 
from sklearn.metrics import mean_absolute_error as MAE 
from sklearn.metrics import mean_absolute_percentage_error as MAPE
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
import statistics

In [11]:
df = pd.read_csv('../AppraisalDataPreprocessing/preprocessed_data.csv')
df.head(5)

Unnamed: 0,Id,car_year,model_year_start,model_year_end,mile,cost,model_3,model_BT-50 PRO,model_CX-3,model_CX-30,...,color_cream,color_gold,color_gray,color_green,color_other,color_red,color_silver,color_sky,color_white,car_model
0,0,-0.622813,-1.894351,-1.742805,0.831625,245000,0,0,0,0,...,0,0,0,0,0,0,0,0,1,Mazda 2 1.5 Sports Maxx Sports Hatchback
1,1,-1.572176,-1.189571,-1.742805,-0.053644,269000,1,0,0,0,...,0,0,0,0,0,0,0,0,0,Mazda 3 2.0 Maxx Sports Hatchback
2,2,1.275912,0.21999,0.922832,-0.732859,390000,0,0,0,0,...,0,0,1,0,0,0,0,0,0,Mazda 2 1.3 S Leather Sedan
3,3,1.275912,1.981942,0.922832,-0.046012,650000,0,0,0,1,...,0,0,0,0,0,1,0,0,0,Mazda CX-30 2.0 SP SUV
4,4,-0.306359,0.21999,0.922832,1.022416,357900,0,0,0,0,...,0,0,0,0,0,1,0,0,0,Mazda 2 1.3 High Connect Sedan


### Appraise cost using xgboost

get mean of each car model for evaluate

In [12]:
df['car_model_mean'] = df['cost'].groupby(df['car_model']).transform(np.mean)

seperate target column from dataframe

In [13]:
X = df.drop(columns=['cost', 'car_model', 'Id'])
y = df['cost']

splitting data  
train : test = 80 : 20  
stratify by car model

In [14]:
car_model = df['car_model']
df.drop(columns=['car_model'], inplace=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=car_model, random_state=1234)

In [15]:
X_train.drop(columns=['car_model_mean'], inplace=True)
car_model_mean = X_test['car_model_mean']
X_test.drop(columns=['car_model_mean'], inplace=True)

In [16]:
# one2car = pd.read_csv('../AppraisalDataPreprocessing/data_one2car.csv')
# one2car.rename(columns={'Unnamed: 0':'Id'}, inplace=True)
# test = X_test[['Id']].merge(one2car, on='Id')
# test.to_csv('test_dataset.csv')

In [17]:
ttb_id = pd.read_csv('test_dataset_ttb_id.csv')
ttb_bluebook = pd.read_excel('ttb_bluebook.xlsx')
ttb_bluebook.rename(columns={'Unnamed: 0':'ttb_bluebook_id'}, inplace=True)
ttb_id = ttb_id.merge(ttb_bluebook[['avg_used_price', 'ttb_bluebook_id']], how='left', on='ttb_bluebook_id')
ttb_id['avg_used_price'].fillna(value=ttb_id['avg_used_price'].mean(), inplace=True)
estimated_price = ttb_id['avg_used_price']
estimated_price

0       620000.0
1       480000.0
2       525000.0
3      1170000.0
4       235000.0
         ...    
430     430000.0
431     660000.0
432     450000.0
433     235000.0
434     970000.0
Name: avg_used_price, Length: 435, dtype: float64

Use xgboost model to appraise the value

In [18]:
xgb_r = xg.XGBRegressor(objective='reg:squarederror', n_estimators=200, eta=0.05, seed=1234, tree_method="hist", device="cuda", n_jobs=-1, eval_metric="mape")

params = {
    'max_depth': [10, 30, 100],
    'gamma': [0.5, 1, 2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
}

gs = GridSearchCV(
    estimator = xgb_r,
    param_grid = params,
    verbose=10,
    cv=3,
    n_jobs=-1
)

gs.fit(X_train.to_numpy(), y_train.to_numpy())

Fitting 3 folds for each of 81 candidates, totalling 243 fits


In [19]:
model = gs.best_estimator_
gs.best_estimator_

In [20]:
pred = model.predict(X_test)

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




### Evaluate model

In [21]:
result_cols = ['model', 'mae', 'mse', 'rmse', 'mape', 'r2']
result_df = pd.DataFrame(columns=result_cols)

standard deviation of car price

In [22]:
sd = statistics.stdev(y_test)
print(sd)

185489.10530089893


#### Mean price of each car model

In [23]:
mae = MAE(y_test, car_model_mean)
mse = MSE(y_test, car_model_mean, squared=True)
rmse = MSE(y_test, car_model_mean, squared=False)
mape = MAPE(y_test, car_model_mean)
r2 = r2_score(y_test, car_model_mean)

result = {'model':'mean price', 'mae':mae, 'mse':mse, 'rmse':rmse, 'mape':mape, 'r2':r2}
result_df = pd.concat([result_df, pd.DataFrame([result])], ignore_index=True)

#### Estimated price of each car from ttb bluebook

VAT 7% has been included to ttb bluebook estimated price

In [24]:
estimated_price = estimated_price/1.07

In [25]:
mae = MAE(y_test, estimated_price)
mse = MSE(y_test, estimated_price, squared=True)
rmse = MSE(y_test, estimated_price, squared=False)
mape = MAPE(y_test, estimated_price)
r2 = r2_score(y_test, estimated_price)

result = {'model':'ttb bluebook', 'mae':mae, 'mse':mse, 'rmse':rmse, 'mape':mape, 'r2':r2}
result_df = pd.concat([result_df, pd.DataFrame([result])], ignore_index=True)

#### XGBoost

In [26]:
mae = MAE(y_test, pred)
mse = MSE(y_test, pred, squared=True)
rmse = MSE(y_test, pred, squared=False)
mape = MAPE(y_test, pred)
r2 = r2_score(y_test, pred)

result = {'model':'xgboost', 'mae':mae, 'mse':mse, 'rmse':rmse, 'mape':mape, 'r2':r2}
result_df = pd.concat([result_df, pd.DataFrame([result])], ignore_index=True)

In [27]:
result_df

Unnamed: 0,model,mae,mse,rmse,mape,r2
0,mean price,54430.452623,6043130000.0,77737.574557,0.122708,0.823955
1,ttb bluebook,64602.434698,8458107000.0,91967.969289,0.151228,0.753603
2,xgboost,37398.53319,3098090000.0,55660.489503,0.08804,0.909748


In [28]:
import pickle

model_file = open('./models/xgboost.model', 'wb')

# Save Decision tree model
pickle.dump(model, model_file)

# Close file
model_file.close()

### Other model

#### Decision tree

In [29]:
from sklearn.tree import DecisionTreeRegressor

clf = DecisionTreeRegressor(criterion = 'squared_error', random_state=42)

params = {
    'max_depth': [5, 10, None],
    'min_samples_split': [2, 10, 20],
}

gs = GridSearchCV(
    estimator = clf,
    param_grid = params,
    n_jobs=-1,
    verbose=10
)
gs.fit(X_train, y_train)

model = gs.best_estimator_
gs.best_estimator_

Fitting 5 folds for each of 9 candidates, totalling 45 fits


In [30]:
pred = model.predict(X_test)

In [31]:
mae = MAE(y_test, pred)
mse = MSE(y_test, pred, squared=True)
rmse = MSE(y_test, pred, squared=False)
mape = MAPE(y_test, pred)
r2 = r2_score(y_test, pred)

result = {'model':'decision tree', 'mae':mae, 'mse':mse, 'rmse':rmse, 'mape':mape, 'r2':r2}
result_df = pd.concat([result_df, pd.DataFrame([result])], ignore_index=True)

In [32]:
import pickle

model_file = open('./models/decision_tree.model', 'wb')

# Save Decision tree model
pickle.dump(model, model_file)

# Close file
model_file.close()

#### Random Forest

In [33]:
from sklearn.ensemble import RandomForestRegressor

clf = RandomForestRegressor(criterion = 'squared_error', random_state=42)

params = {
    'max_depth': [5, 10]+[None],
    'min_samples_split': [2, 10, 50],
    'n_estimators': [100, 200, 300],
    'max_features': [10, 30, 50, 100, 152],
}

gs = GridSearchCV(
    estimator = clf,
    param_grid = params,
    verbose=10
)

gs.fit(X_train, y_train)

model = gs.best_estimator_
gs.best_estimator_

Fitting 5 folds for each of 135 candidates, totalling 675 fits
[CV 1/5; 1/135] START max_depth=5, max_features=10, min_samples_split=2, n_estimators=100
[CV 1/5; 1/135] END max_depth=5, max_features=10, min_samples_split=2, n_estimators=100;, score=0.730 total time=   0.1s
[CV 2/5; 1/135] START max_depth=5, max_features=10, min_samples_split=2, n_estimators=100
[CV 2/5; 1/135] END max_depth=5, max_features=10, min_samples_split=2, n_estimators=100;, score=0.786 total time=   0.1s
[CV 3/5; 1/135] START max_depth=5, max_features=10, min_samples_split=2, n_estimators=100
[CV 3/5; 1/135] END max_depth=5, max_features=10, min_samples_split=2, n_estimators=100;, score=0.838 total time=   0.1s
[CV 4/5; 1/135] START max_depth=5, max_features=10, min_samples_split=2, n_estimators=100
[CV 4/5; 1/135] END max_depth=5, max_features=10, min_samples_split=2, n_estimators=100;, score=0.765 total time=   0.1s
[CV 5/5; 1/135] START max_depth=5, max_features=10, min_samples_split=2, n_estimators=100
[CV

In [34]:
pred = model.predict(X_test)

In [35]:
mae = MAE(y_test, pred)
mse = MSE(y_test, pred, squared=True)
rmse = MSE(y_test, pred, squared=False)
mape = MAPE(y_test, pred)
r2 = r2_score(y_test, pred)

result = {'model':'random forest', 'mae':mae, 'mse':mse, 'rmse':rmse, 'mape':mape, 'r2':r2}
result_df = pd.concat([result_df, pd.DataFrame([result])], ignore_index=True)

In [36]:
import pickle

model_file = open('./models/random_forest.model', 'wb')

# Save Decision tree model
pickle.dump(model, model_file)

# Close file
model_file.close()

#### XGBoostRFRegressor

In [37]:
xgb_r = xg.XGBRFRegressor(objective='reg:squarederror', n_estimators=200, eta=0.05, seed=1234, tree_method="hist", device="cuda", n_jobs=-1, eval_metric="mape")

params = {
    'max_depth': [10, 30],
    'gamma': [0.5, 1],
    'subsample': [0.6, 1.0],
    'colsample_bytree': [0.6, 1.0],
}

gs = GridSearchCV(
    estimator = xgb_r,
    param_grid = params,
    verbose=10,
    cv=3,
)

gs.fit(X_train.to_numpy(), y_train.to_numpy())

model = gs.best_estimator_
gs.best_estimator_

Fitting 3 folds for each of 16 candidates, totalling 48 fits
[CV 1/3; 1/16] START colsample_bytree=0.6, gamma=0.5, max_depth=10, subsample=0.6
[CV 1/3; 1/16] END colsample_bytree=0.6, gamma=0.5, max_depth=10, subsample=0.6;, score=0.849 total time=   3.8s
[CV 2/3; 1/16] START colsample_bytree=0.6, gamma=0.5, max_depth=10, subsample=0.6
[CV 2/3; 1/16] END colsample_bytree=0.6, gamma=0.5, max_depth=10, subsample=0.6;, score=0.907 total time=   4.0s
[CV 3/3; 1/16] START colsample_bytree=0.6, gamma=0.5, max_depth=10, subsample=0.6
[CV 3/3; 1/16] END colsample_bytree=0.6, gamma=0.5, max_depth=10, subsample=0.6;, score=0.886 total time=   4.5s
[CV 1/3; 2/16] START colsample_bytree=0.6, gamma=0.5, max_depth=10, subsample=1.0
[CV 1/3; 2/16] END colsample_bytree=0.6, gamma=0.5, max_depth=10, subsample=1.0;, score=0.839 total time=   4.6s
[CV 2/3; 2/16] START colsample_bytree=0.6, gamma=0.5, max_depth=10, subsample=1.0
[CV 2/3; 2/16] END colsample_bytree=0.6, gamma=0.5, max_depth=10, subsample=1

In [38]:
pred = model.predict(X_test)

In [39]:
mae = MAE(y_test, pred)
mse = MSE(y_test, pred, squared=True)
rmse = MSE(y_test, pred, squared=False)
mape = MAPE(y_test, pred)
r2 = r2_score(y_test, pred)

result = {'model':'xgboostrf', 'mae':mae, 'mse':mse, 'rmse':rmse, 'mape':mape, 'r2':r2}
result_df = pd.concat([result_df, pd.DataFrame([result])], ignore_index=True)

In [40]:
import pickle

model_file = open('./models/xgboostrf.model', 'wb')

# Save Decision tree model
pickle.dump(model, model_file)

# Close file
model_file.close()

#### CatBoost

In [41]:
from catboost import CatBoostRegressor

model = CatBoostRegressor(loss_function='RMSE')

model.fit(X_train, y_train, verbose=100)

Learning rate set to 0.044675
0:	learn: 177049.7207230	total: 89.1ms	remaining: 1m 28s
100:	learn: 61203.8531052	total: 281ms	remaining: 2.5s
200:	learn: 54040.3067082	total: 435ms	remaining: 1.73s
300:	learn: 50109.3597586	total: 666ms	remaining: 1.55s
400:	learn: 47433.7237082	total: 847ms	remaining: 1.26s
500:	learn: 45261.4419951	total: 1s	remaining: 1s
600:	learn: 43414.2971107	total: 1.16s	remaining: 768ms
700:	learn: 41972.1360387	total: 1.35s	remaining: 576ms
800:	learn: 40670.9337503	total: 1.53s	remaining: 379ms
900:	learn: 39482.6674956	total: 1.7s	remaining: 187ms
999:	learn: 38365.8077322	total: 1.87s	remaining: 0us


<catboost.core.CatBoostRegressor at 0x14b9c8a2a10>

In [42]:
pred = model.predict(X_test)

In [43]:
mae = MAE(y_test, pred)
mse = MSE(y_test, pred, squared=True)
rmse = MSE(y_test, pred, squared=False)
mape = MAPE(y_test, pred)
r2 = r2_score(y_test, pred)

result = {'model':'catboost', 'mae':mae, 'mse':mse, 'rmse':rmse, 'mape':mape, 'r2':r2}
result_df = pd.concat([result_df, pd.DataFrame([result])], ignore_index=True)

In [44]:
import pickle

model_file = open('./models/catboost.model', 'wb')

# Save Decision tree model
pickle.dump(model, model_file)

# Close file
model_file.close()

#### LightGBM

In [45]:
from lightgbm import LGBMRegressor
 
model = LGBMRegressor(metric='mape')
 
model.fit(X_train, y_train)
 
pred = model.predict(X_test)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000565 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 376
[LightGBM] [Info] Number of data points in the train set: 1737, number of used features: 47
[LightGBM] [Info] Start training from score 451456.712723


In [46]:
pred = model.predict(X_test)

In [47]:
mae = MAE(y_test, pred)
mse = MSE(y_test, pred, squared=True)
rmse = MSE(y_test, pred, squared=False)
mape = MAPE(y_test, pred)
r2 = r2_score(y_test, pred)

result = {'model':'lightgbm', 'mae':mae, 'mse':mse, 'rmse':rmse, 'mape':mape, 'r2':r2}
result_df = pd.concat([result_df, pd.DataFrame([result])], ignore_index=True)

In [48]:
import pickle

model_file = open('./models/lightgbm.model', 'wb')

# Save Decision tree model
pickle.dump(model, model_file)

# Close file
model_file.close()

### Result

In [49]:
result_df

Unnamed: 0,model,mae,mse,rmse,mape,r2
0,mean price,54430.452623,6043130000.0,77737.574557,0.122708,0.823955
1,ttb bluebook,64602.434698,8458107000.0,91967.969289,0.151228,0.753603
2,xgboost,37398.53319,3098090000.0,55660.489503,0.08804,0.909748
3,decision tree,42068.542744,3623734000.0,60197.454967,0.095959,0.894435
4,random forest,36090.358191,2571369000.0,50708.663122,0.083361,0.925092
5,xgboostrf,36595.347414,2743594000.0,52379.328157,0.084798,0.920075
6,catboost,36393.969129,2563294000.0,50628.982163,0.084451,0.925327
7,lightgbm,38014.908338,2816030000.0,53066.280099,0.088405,0.917965


In [50]:
result_df.to_excel('result_v1.xlsx')