# Project - Used Car Prices - Predictor

## Team Background

- Project Grp 08

### Team Members

- Team member 1
    - Name: Richard Anton
    - Email: [rna63@drexel.edu](mailto:rna63@drexel.edu)
    

In [14]:
from IPython.display import display, HTML

In [1]:
import joblib
from datetime import datetime

def save_model(model_name, model, use_ts=True):
    """Serialize ``model`` with joblib and return the path it was written to.

    The destination is the relative path ``model_<model_name><suffix>.pkl``.

    Args:
        model_name: Label embedded in the file name.
        model: Object to persist; handed unchanged to ``joblib.dump``.
        use_ts: When truthy, ``<suffix>`` is ``_YYYY_MM_DD_HH_MM_SS`` taken from
            ``datetime.now()``; otherwise the suffix is empty.

    Returns:
        The path string that was passed to ``joblib.dump``.
    """
    # The leading underscore is part of the format so the no-timestamp case
    # yields a clean "model_<name>.pkl".
    suffix = datetime.now().strftime("_%Y_%m_%d_%H_%M_%S") if use_ts else ''

    dest_path = f"model_{model_name}{suffix}.pkl"
    print(f"Saving model to {dest_path}")
    joblib.dump(model, dest_path)

    return dest_path

def load_model(model_path):
    """Load and return an object previously written with ``joblib.dump``.

    Args:
        model_path: Path of the file to read; passed unchanged to ``joblib.load``.

    Returns:
        The deserialized object.

    Note:
        joblib files are pickle-based, so loading one can execute arbitrary
        code. Only load artifacts from a trusted source.
    """
    print(f"Loading model from {model_path}")
    return joblib.load(model_path)


In [39]:
# TODO: put in final model path
# NOTE(review): presumably the trained XGBoost booster saved by the training
# notebook — confirm the artifact name before the final run.

model_path = 'model_xgboost_s_2023_03_13_07_53_37.pkl'
# Bound to the notebook-level name `model`, which xgboost_predict() reads as a global.
model = load_model(model_path)

Loading model from model_xgboost_s_2023_03_13_07_53_37.pkl


In [27]:
# load dataset
import pandas as pd

# TODO: use full dataset.
dataset_path = "craigslist_sampled_cleaned_2023_03_05_19_07_36.csv"
target_col = 'price'

# `orig_df` keeps the frame exactly as read; all further work happens on the copy.
orig_df = pd.read_csv(dataset_path)
df = orig_df.copy()
# Show a sample for a sanity check. A bare `df.head()` is only rendered when it
# is the last expression of a cell, so it must be displayed explicitly here.
display(df.head())


# split into input data and output values
X_all = df.drop(columns=[target_col])
y_all = df[target_col]

print("X.shape", X_all.shape)
print("y.shape", y_all.shape)

X.shape (4034, 14)
y.shape (4034,)


In [28]:
from sklearn import preprocessing
import category_encoders as ce
from sklearn.preprocessing import StandardScaler

In [29]:
# Convert categorical columns to Pandas category type
import numpy as np

# Every non-numeric column is treated as categorical.
cats = list(X_all.select_dtypes(exclude=np.number).columns)
print("cats:")
display(cats)

# One astype call with a per-column mapping replaces the column-by-column loop.
X_all = X_all.astype({col: 'category' for col in cats})

display(X_all.dtypes)
display(X_all.head())

cats:


['make',
 'model',
 'condition',
 'cylinders',
 'fuel',
 'title_status',
 'transmission',
 'drive',
 'size',
 'type',
 'paint_color',
 'state']

year             float64
make            category
model           category
condition       category
cylinders       category
fuel            category
odometer         float64
title_status    category
transmission    category
drive           category
size            category
type            category
paint_color     category
state           category
dtype: object

Unnamed: 0,year,make,model,condition,cylinders,fuel,odometer,title_status,transmission,drive,size,type,paint_color,state
0,2016.0,chevrolet,silverado 1500 double,good,6 cylinders,gas,29499.0,clean,other,4wd,full-size,pickup,silver,al
1,2013.0,lincoln,mkz sedan 4d,good,6 cylinders,gas,61087.0,clean,other,fwd,full-size,sedan,red,al
2,2005.0,honda,accord ex-l,excellent,4 cylinders,gas,155000.0,clean,manual,4wd,full-size,sedan,white,al
3,2012.0,ford,f250 super duty,good,6 cylinders,gas,47380.0,clean,automatic,4wd,full-size,sedan,white,al
4,2021.0,ford,SPECIAL FINANCE PROGRAM 2020,fair,other,other,1400.0,clean,other,4wd,full-size,bus,green,al


In [54]:
# Train test split
from sklearn.model_selection import train_test_split

# It's important that the random_state matches the other notebook.
# (The same input rows and test_size are also needed for the split to be identical.)
# Only the held-out portion is kept; the training halves are discarded.
_, X_test, _, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=193)

# Report both shapes so a feature/target row mismatch is visible immediately.
print("X_test.shape", X_test.shape)
print("y_test.shape", y_test.shape)

X_test.shape (807, 14)
y_test.shape (807,)


In [31]:
from sklearn.compose import ColumnTransformer 
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [32]:
import xgboost as xgb

In [33]:
# Numeric feature columns.
numeric_cols = [
    'year',
    'odometer',
]

# Categorical (non-numeric) feature columns.
cat_cols = [
    'make',
    'model',
    'condition',
    'cylinders',
    'fuel',
    'title_status',
    'transmission',
    'drive',
    'size',
    'type',
    'paint_color',
    'state',
]


In [34]:
# NOTE(review): presumably the fitted preprocessing transformer saved by the
# training notebook — confirm it is the one that matches the loaded model.
preprocessor_path = 'model_preprocessor_s_2023_03_13_07_28_32.pkl'
# Bound to the notebook-level name `preprocess`, which xgboost_predict() calls
# `.transform` on.
preprocess = load_model(preprocessor_path)

Loading model from model_preprocessor_s_2023_03_13_07_28_32.pkl


In [48]:
def xgboost_predict(X_test, booster=None, preprocessor=None):
    """Run the preprocessing step and the XGBoost model on raw feature rows.

    Args:
        X_test: Feature rows in the form ``preprocessor.transform`` accepts.
        booster: Object whose ``predict`` takes an ``xgb.DMatrix``. Defaults to
            the notebook-level ``model``.
        preprocessor: Object with a ``transform`` method. Defaults to the
            notebook-level ``preprocess``.

    Returns:
        Whatever ``booster.predict`` returns for the transformed rows.
    """
    # Defaults are resolved at call time, so re-running the cells that load
    # `model` / `preprocess` takes effect without re-defining this function.
    if booster is None:
        booster = model
    if preprocessor is None:
        preprocessor = preprocess

    X_pre_test = preprocessor.transform(X_test)

    # Convert to the format XGBoost lib expects.
    dtest_reg = xgb.DMatrix(X_pre_test)
    return booster.predict(dtest_reg)

In [50]:
# Predictions for the held-out rows; compared against y_test in the metrics cell.
predict_test = xgboost_predict(X_test)

In [53]:
from sklearn.metrics import mean_squared_error, r2_score

# NOTE: this is just a sanity check. Should not have test values for real life.
# RMSE is the square root of the mean squared error.
mse = mean_squared_error(y_test, predict_test)
rmse = mse ** 0.5
print('RMSE of test data: ', rmse)

r2 = r2_score(y_test, predict_test)
print('R2 Score of test data:', r2)

RMSE of test data:  8068.864004696706
R2 Score of test data: 0.6542370459009877
