## Appraisal model v.1
appraisal model using xgboost

### Import Libraries and Read preprocessed data

In [154]:
import pandas as pd
import numpy as np
import xgboost as xg
from sklearn.model_selection import train_test_split 
from sklearn.metrics import mean_squared_error as MSE 
from sklearn.metrics import mean_absolute_error as MAE 
from sklearn.metrics import mean_absolute_percentage_error as MAPE
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
import statistics

In [155]:
df = pd.read_csv('../AppraisalDataPreprocessing/preprocessed_data.csv')
df.head(5)

Unnamed: 0,car_year,model_year_start,model_year_end,mile,cost,model_3,model_BT-50 PRO,model_CX-3,model_CX-30,model_CX-5,...,color_cream,color_gold,color_gray,color_green,color_other,color_red,color_silver,color_sky,color_white,car_model
0,-0.622813,-1.894351,-1.742805,0.831625,245000,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,Mazda 2 1.5 Sports Maxx Sports Hatchback
1,-1.572176,-1.189571,-1.742805,-0.053644,269000,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Mazda 3 2.0 Maxx Sports Hatchback
2,1.275912,0.21999,0.922832,-0.732859,390000,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,Mazda 2 1.3 S Leather Sedan
3,1.275912,1.981942,0.922832,-0.046012,650000,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,Mazda CX-30 2.0 SP SUV
4,-0.306359,0.21999,0.922832,1.022416,357900,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,Mazda 2 1.3 High Connect Sedan


### Appraise cost using xgboost

get mean of each car model for evaluate

In [156]:
df['car_model_mean'] = df['cost'].groupby(df['car_model']).transform(np.mean)

seperate target column from dataframe

In [157]:
X = df.drop(columns=['cost', 'car_model'])
y = df['cost']

splitting data  
train : test = 80 : 20  
stratify by car model

In [158]:
car_model = df['car_model']
df.drop(columns=['car_model'], inplace=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=car_model, random_state=1234)

In [159]:
X_train.drop(columns=['car_model_mean'], inplace=True)
car_model_mean = X_test['car_model_mean']
X_test.drop(columns=['car_model_mean'], inplace=True)

Use xgboost model to appraise the value

In [160]:
xgb_r = xg.XGBRegressor(objective='reg:squarederror', n_estimators=200, eta=0.05, seed=1234, tree_method="hist", device="cuda", n_jobs=-1, eval_metric="mape")

params = {
    'max_depth': [10, 30, 100],
    'gamma': [0.5, 1, 2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
}

gs = GridSearchCV(
    estimator = xgb_r,
    param_grid = params,
    verbose=10,
    cv=3,
    n_jobs=-1
)

gs.fit(X_train.to_numpy(), y_train.to_numpy())

Fitting 3 folds for each of 81 candidates, totalling 243 fits


In [161]:
model = gs.best_estimator_
gs.best_estimator_

In [162]:
pred = model.predict(X_test)

### Evaluate model

In [163]:
result_cols = ['model', 'mae', 'mse', 'rmse', 'mape', 'r2']
result_df = pd.DataFrame(columns=result_cols)

standard deviation of car price

In [164]:
sd = statistics.stdev(y_test)
print(sd)

185489.10530089893


#### Mean price of each car model

In [165]:
mae = MAE(y_test, car_model_mean)
mse = MSE(y_test, car_model_mean, squared=True)
rmse = MSE(y_test, car_model_mean, squared=False)
mape = MAPE(y_test, car_model_mean)
r2 = r2_score(y_test, car_model_mean)

result = {'model':'mean price', 'mae':mae, 'mse':mse, 'rmse':rmse, 'mape':mape, 'r2':r2}
result_df = pd.concat([result_df, pd.DataFrame([result])], ignore_index=True)

#### XGBoost

In [166]:
mae = MAE(y_test, pred)
mse = MSE(y_test, pred, squared=True)
rmse = MSE(y_test, pred, squared=False)
mape = MAPE(y_test, pred)
r2 = r2_score(y_test, pred)

result = {'model':'xgboost', 'mae':mae, 'mse':mse, 'rmse':rmse, 'mape':mape, 'r2':r2}
result_df = pd.concat([result_df, pd.DataFrame([result])], ignore_index=True)

### Other model

#### Decision tree

In [168]:
from sklearn.tree import DecisionTreeRegressor

clf = DecisionTreeRegressor(criterion = 'squared_error', random_state=42)

params = {
    'max_depth': [5, 10, None],
    'min_samples_split': [2, 10, 20],
}

gs = GridSearchCV(
    estimator = clf,
    param_grid = params,
    n_jobs=-1,
    verbose=10
)
gs.fit(X_train, y_train)

model = gs.best_estimator_
gs.best_estimator_

Fitting 5 folds for each of 9 candidates, totalling 45 fits


In [169]:
pred = model.predict(X_test)

In [170]:
mae = MAE(y_test, pred)
mse = MSE(y_test, pred, squared=True)
rmse = MSE(y_test, pred, squared=False)
mape = MAPE(y_test, pred)
r2 = r2_score(y_test, pred)

result = {'model':'decision tree', 'mae':mae, 'mse':mse, 'rmse':rmse, 'mape':mape, 'r2':r2}
result_df = pd.concat([result_df, pd.DataFrame([result])], ignore_index=True)

#### Random Forest

In [175]:
from sklearn.ensemble import RandomForestRegressor

clf = RandomForestRegressor(criterion = 'squared_error', random_state=42)

params = {
    'max_depth': [5, 10]+[None],
    'min_samples_split': [2, 10, 50],
    'n_estimators': [100, 200, 300],
    'max_features': [10, 30, 50, 100, 152],
}

gs = GridSearchCV(
    estimator = clf,
    param_grid = params,
    verbose=10
)

gs.fit(X_train, y_train)

model = gs.best_estimator_
gs.best_estimator_

In [None]:
pred = model.predict(X_test)

In [178]:
mae = MAE(y_test, pred)
mse = MSE(y_test, pred, squared=True)
rmse = MSE(y_test, pred, squared=False)
mape = MAPE(y_test, pred)
r2 = r2_score(y_test, pred)

result = {'model':'random forest', 'mae':mae, 'mse':mse, 'rmse':rmse, 'mape':mape, 'r2':r2}
result_df = pd.concat([result_df, pd.DataFrame([result])], ignore_index=True)

#### XGBoostRFRegressor

In [198]:
xgb_r = xg.XGBRFRegressor(objective='reg:squarederror', n_estimators=200, eta=0.05, seed=1234, tree_method="hist", device="cuda", n_jobs=-1, eval_metric="mape")

params = {
    'max_depth': [10, 30],
    'gamma': [0.5, 1],
    'subsample': [0.6, 1.0],
    'colsample_bytree': [0.6, 1.0],
}

gs = GridSearchCV(
    estimator = xgb_r,
    param_grid = params,
    verbose=10,
    cv=3,
)

gs.fit(X_train.to_numpy(), y_train.to_numpy())

model = gs.best_estimator_
gs.best_estimator_

Fitting 3 folds for each of 16 candidates, totalling 48 fits
[CV 1/3; 1/16] START colsample_bytree=0.6, gamma=0.5, max_depth=10, subsample=0.6
[CV 1/3; 1/16] END colsample_bytree=0.6, gamma=0.5, max_depth=10, subsample=0.6;, score=0.849 total time=   4.6s
[CV 2/3; 1/16] START colsample_bytree=0.6, gamma=0.5, max_depth=10, subsample=0.6
[CV 2/3; 1/16] END colsample_bytree=0.6, gamma=0.5, max_depth=10, subsample=0.6;, score=0.907 total time=   4.4s
[CV 3/3; 1/16] START colsample_bytree=0.6, gamma=0.5, max_depth=10, subsample=0.6
[CV 3/3; 1/16] END colsample_bytree=0.6, gamma=0.5, max_depth=10, subsample=0.6;, score=0.886 total time=   4.6s
[CV 1/3; 2/16] START colsample_bytree=0.6, gamma=0.5, max_depth=10, subsample=1.0
[CV 1/3; 2/16] END colsample_bytree=0.6, gamma=0.5, max_depth=10, subsample=1.0;, score=0.839 total time=   5.9s
[CV 2/3; 2/16] START colsample_bytree=0.6, gamma=0.5, max_depth=10, subsample=1.0
[CV 2/3; 2/16] END colsample_bytree=0.6, gamma=0.5, max_depth=10, subsample=1

In [199]:
pred = model.predict(X_test)

In [200]:
mae = MAE(y_test, pred)
mse = MSE(y_test, pred, squared=True)
rmse = MSE(y_test, pred, squared=False)
mape = MAPE(y_test, pred)
r2 = r2_score(y_test, pred)

result = {'model':'xgboostrf', 'mae':mae, 'mse':mse, 'rmse':rmse, 'mape':mape, 'r2':r2}
result_df = pd.concat([result_df, pd.DataFrame([result])], ignore_index=True)

#### CatBoost

In [205]:
from catboost import CatBoostRegressor

model = CatBoostRegressor(loss_function='RMSE')

model.fit(X_train, y_train, verbose=100)

Learning rate set to 0.044675
0:	learn: 177049.7207230	total: 151ms	remaining: 2m 30s
100:	learn: 61203.8531052	total: 314ms	remaining: 2.79s
200:	learn: 54040.3067082	total: 476ms	remaining: 1.89s
300:	learn: 50109.3597586	total: 626ms	remaining: 1.45s
400:	learn: 47433.7237082	total: 769ms	remaining: 1.15s
500:	learn: 45261.4419951	total: 921ms	remaining: 917ms
600:	learn: 43414.2971107	total: 1.06s	remaining: 707ms
700:	learn: 41972.1360387	total: 1.21s	remaining: 518ms
800:	learn: 40670.9337503	total: 1.35s	remaining: 335ms
900:	learn: 39482.6674956	total: 1.49s	remaining: 164ms
999:	learn: 38365.8077322	total: 1.63s	remaining: 0us


<catboost.core.CatBoostRegressor at 0x1a0e5cc7910>

In [206]:
pred = model.predict(X_test)

In [207]:
mae = MAE(y_test, pred)
mse = MSE(y_test, pred, squared=True)
rmse = MSE(y_test, pred, squared=False)
mape = MAPE(y_test, pred)
r2 = r2_score(y_test, pred)

result = {'model':'catboost', 'mae':mae, 'mse':mse, 'rmse':rmse, 'mape':mape, 'r2':r2}
result_df = pd.concat([result_df, pd.DataFrame([result])], ignore_index=True)

#### LightBGM

In [222]:
from lightgbm import LGBMRegressor
 
model = LGBMRegressor(metric='mape')
 
model.fit(X_train, y_train)
 
pred = model.predict(X_test)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000047 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 376
[LightGBM] [Info] Number of data points in the train set: 1737, number of used features: 47
[LightGBM] [Info] Start training from score 451456.712723


In [223]:
pred = model.predict(X_test)

In [224]:
mae = MAE(y_test, pred)
mse = MSE(y_test, pred, squared=True)
rmse = MSE(y_test, pred, squared=False)
mape = MAPE(y_test, pred)
r2 = r2_score(y_test, pred)

result = {'model':'lightbgm', 'mae':mae, 'mse':mse, 'rmse':rmse, 'mape':mape, 'r2':r2}
result_df = pd.concat([result_df, pd.DataFrame([result])], ignore_index=True)

#### AdaBoost

In [262]:
from sklearn.ensemble import AdaBoostRegressor

clf = DecisionTreeRegressor(max_depth=15, random_state=1234)
model = AdaBoostRegressor(estimator=clf, n_estimators=200)

model.fit(X_train, y_train)
 
pred = model.predict(X_test)

In [267]:
mae = MAE(y_test, pred)
mse = MSE(y_test, pred, squared=True)
rmse = MSE(y_test, pred, squared=False)
mape = MAPE(y_test, pred)
r2 = r2_score(y_test, pred)

result = {'model':'adaboost', 'mae':mae, 'mse':mse, 'rmse':rmse, 'mape':mape, 'r2':r2}
result_df = pd.concat([result_df, pd.DataFrame([result])], ignore_index=True)

In [268]:
result_df

Unnamed: 0,model,mae,mse,rmse,mape,r2
0,mean price,54430.452623,6043130000.0,77737.574557,0.122708,0.823955
1,xgboost,37398.53319,3098090000.0,55660.489503,0.08804,0.909748
2,decision tree,42068.542744,3623734000.0,60197.454967,0.095959,0.894435
3,random forest,36090.358191,2571369000.0,50708.663122,0.083361,0.925092
4,xgboostrf,36595.347414,2743594000.0,52379.328157,0.084798,0.920075
5,catboost,36393.969129,2563294000.0,50628.982163,0.084451,0.925327
6,lightbgm,38014.908338,2816030000.0,53066.280099,0.088405,0.917965
7,adaboost,48631.148586,4978804000.0,70560.63749,0.122315,0.85496
