# Project - User Car Prices - Predictor

## Team Background

- Project Grp 08

### Team Members

- Team member 1
    - Name: Richard Anton
    - Email: [rna63@drexel.edu](mailto:rna63@drexel.edu)
    

## Inference

This notebook demonstrates how to load the saved model and fitted preprocessor and run the regression model.
It loads the test data and checks for predicted results as a sanity check, but in real life we would use new data where we did not necessarily have the data.

In [1]:
preprocessor_path = 'model_preprocessor_f_2023_03_13_14_22_04.pkl'
model_path = "model_xgboost_2023_03_13_16_35_43.pkl"
dataset_path = "craigslist_full_cleaned_2023_03_12_10_45_22.csv"

In [2]:
from IPython.display import display, HTML

In [3]:
import joblib
from datetime import datetime

def save_model(model_name, model, use_ts=True):
    if use_ts:
        rn = datetime.now()
        dts = '_' + rn.strftime("%Y_%m_%d_%H_%M_%S")
    else:
        dts = ''

    dest_path = f"model_{model_name}{dts}.pkl"
    print(f"Saving model to {dest_path}")
    joblib.dump(model, dest_path)
    
    return dest_path

def load_model(model_path):
    print(f"Loading model from {model_path}")
    model = joblib.load(model_path)
    return model


In [4]:
preprocess = load_model(preprocessor_path)

Loading model from model_preprocessor_f_2023_03_13_14_22_04.pkl


In [5]:
model = load_model(model_path)

Loading model from model_xgboost_2023_03_13_16_35_43.pkl


In [6]:
# load dataset
import pandas as pd

target_col = 'price'

orig_df = pd.read_csv(dataset_path)
df = orig_df.copy()
# show a sample for sanity check
df.head()


# split into input data and output values
X_all = df.drop(columns=[target_col])
y_all = df[target_col]

print("X.shape", X_all.shape)
print("y.shape", y_all.shape)

X.shape (393908, 14)
y.shape (393908,)


In [7]:
from sklearn import preprocessing
import category_encoders as ce
from sklearn.preprocessing import StandardScaler

In [8]:
# Convert categorical columns to Pandas category type
import numpy as np

cats = X_all.select_dtypes(exclude=np.number).columns.tolist()
print("cats:")
display(cats)
for col in cats:
    X_all[col] = X_all[col].astype('category')

# show data so we can tell we are getting what's expected.
display(X_all.dtypes)
display(X_all.head())

cats:


['make',
 'model',
 'condition',
 'cylinders',
 'fuel',
 'title_status',
 'transmission',
 'drive',
 'size',
 'type',
 'paint_color',
 'state']

year             float64
make            category
model           category
condition       category
cylinders       category
fuel            category
odometer         float64
title_status    category
transmission    category
drive           category
size            category
type            category
paint_color     category
state           category
dtype: object

Unnamed: 0,year,make,model,condition,cylinders,fuel,odometer,title_status,transmission,drive,size,type,paint_color,state
0,2011.406247,ford,f-150,good,6 cylinders,gas,92647.432999,clean,automatic,4wd,full-size,sedan,white,az
1,2011.406247,ford,f-150,good,6 cylinders,gas,92647.432999,clean,automatic,4wd,full-size,sedan,white,ar
2,2011.406247,ford,f-150,good,6 cylinders,gas,92647.432999,clean,automatic,4wd,full-size,sedan,white,fl
3,2011.406247,ford,f-150,good,6 cylinders,gas,92647.432999,clean,automatic,4wd,full-size,sedan,white,ma
4,2011.406247,ford,f-150,good,6 cylinders,gas,92647.432999,clean,automatic,4wd,full-size,sedan,white,nc


In [9]:
# Train test split
from sklearn.model_selection import train_test_split

# It's important that the random_state matches the other notebook.
_, X_test, _, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=193)
# just keep test data

print("X_test.shape", X_test.shape)

X_test.shape (78782, 14)


In [10]:
from sklearn.compose import ColumnTransformer 
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [11]:
import xgboost as xgb

In [12]:
numeric_cols = ['year', 'odometer']
cat_cols = ['make', 'model', 'condition', 'cylinders', 'fuel', 'title_status',
       'transmission', 'drive', 'size', 'type', 'paint_color', 'state']


In [13]:
def xgboost_predict(X_test):
    X_pre_test =  preprocess.transform(X_test)
    
    # Convert to the format XGBoost lib expects.
    dtest_reg = xgb.DMatrix(X_pre_test)
    predict_test = model.predict(dtest_reg)
    
    return predict_test

In [14]:
predict_test = xgboost_predict(X_test)

In [15]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# NOTE: this is just a sanity check. Should not have test values for real life.
print('RMSE of test data: ',  mean_squared_error(y_test, predict_test)**(0.5))

r2 = r2_score(y_test, predict_test)
print('R2 Score of test data:', r2)

RMSE of test data:  4944.281393534352
R2 Score of test data: 0.8791911598987936


In [20]:
TEST_DATA = {
  "year": 2014,
  "make": "Ford",
  "model": "F150",
  "condition": "fair",
  "cylinders": "6 cylinders",
  "fuel": "gas",
  "odometer": 82000,
  "title_status": "clean",
  "transmission": "automatic",
  "drive": "4wd",
  "size": "full-size",
  "type": "pickup",
  "paint_color": "black",
  "state": "WI"
}


df = pd.DataFrame(data=TEST_DATA, index=[0])


In [21]:
results = xgboost_predict(df)

predicted = results[0]

display("${price:.2f}".format(price=predicted))

'$19679.92'