## What Drives the Price of a Used Car?

### Predictive Model Development

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
import plotly.express as px
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline, make_pipeline

### Data preparation
- Clean up missing and null values
- Remove redundant features
- Clean up Numeric Features for Modeling
- Convert Non-numeric Features to Numeric for modeling
- Perform Feature Engineering

#### Data Cleaning

In [2]:
# Load the listings and reduce them to the rows/columns used for modelling.

# Columns that are dropped up front:
#   VIN  - not useful as a feature; the `id` index already identifies each listing
#   size - only ~28% of the dataset has it populated (author's note)
#   the remaining object columns are simply not used by the models in this notebook
unused_columns = [
    'VIN', 'size',
    'region', 'transmission', 'state', 'paint_color', 'model', 'cylinders', 'drive',
]

# A row is kept only if every one of these main features is present
required_columns = ['year', 'odometer', 'manufacturer', 'type']

df = (
    pd.read_csv('data/vehicles.csv')
    .set_index('id')
    .drop(columns=unused_columns)
    .dropna(subset=required_columns)
    .loc[lambda listings: listings.year != 0]
    # Replace model year with age, measured so that a 2023 vehicle has age 0
    .assign(age=lambda listings: 2023 - listings['year'])
    .drop(columns='year')
    # Author's review found zero prices and outliers up to ~1e9; keep 100 < price < 160K
    .query('price > 100 and price < 160_000')
    # Keep odometer readings strictly between 100 and 160000
    .query('odometer > 100 and odometer < 160000')
)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 251922 entries, 7316814884 to 7301591129
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   price         251922 non-null  int64  
 1   manufacturer  251922 non-null  object 
 2   condition     169838 non-null  object 
 3   fuel          250495 non-null  object 
 4   odometer      251922 non-null  float64
 5   title_status  246425 non-null  object 
 6   type          251922 non-null  object 
 7   age           251922 non-null  float64
dtypes: float64(2), int64(1), object(5)
memory usage: 17.3+ MB


#### Feature Engineering

In [3]:
# Convert the remaining non-numeric columns of `df` to numeric form.

# Ordinal columns: column -> (value used to fill missing entries, levels in encoding order).
# OrdinalEncoder assigns 0 to the first listed level, 1 to the second, and so on.
ordinal_specs = {
    'condition':    ('good',    ['salvage', 'fair', 'good', 'excellent', 'like new', 'new']),
    'fuel':         ('gas',     ['hybrid', 'other', 'gas', 'electric', 'diesel']),
    'title_status': ('salvage', ['missing', 'parts only', 'salvage', 'rebuilt', 'clean', 'lien']),
}

for column, (fill_value, levels) in ordinal_specs.items():
    # Fill gaps first: the encoder is only given the explicit levels above
    df[[column]] = df[[column]].fillna(fill_value)
    encoder = OrdinalEncoder(categories=[levels])
    df[[column]] = encoder.fit_transform(df[[column]])

# Nominal columns: swap each for its dummy (one-hot) columns, prefixed with the column name.
# Processing 'manufacturer' before 'type' keeps the resulting column order unchanged.
for column in ['manufacturer', 'type']:
    dummies = pd.get_dummies(df[column], prefix=column)
    df = pd.concat([df, dummies], axis=1).drop(columns=column)

### Modeling

In [17]:
# Target is `price`; every other column of `df` is a predictor.
y = df['price']
X = df.drop(columns='price')

# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

#### Linear Regression Model

In [21]:
# Baseline: multiple linear regression on the encoded features (no polynomial terms).
reg_model = LinearRegression().fit(X_train, y_train)

# Predictions on both splits, reused for the RMSE figures
train_preds = reg_model.predict(X_train)
test_preds = reg_model.predict(X_test)

# Metrics. Note: the "Accuracy" lines report the regressor's .score() value
# (R^2 for scikit-learn regressors) as a percentage, not a classification accuracy.
score = reg_model.score(X_train, y_train)
score_test = reg_model.score(X_test, y_test)
rmse_train = np.sqrt(mean_squared_error(y_train, train_preds))
rmse_test = np.sqrt(mean_squared_error(y_test, test_preds))

print("multiple linear regression with degree = 1")
print("Train Accuracy: ", round(score*100, 2))
print("Train RMSE:     ", round(rmse_train, 2))
print("Test Accuracy: ", round(score_test*100, 2))
print("Test RMSE:     ", round(rmse_test, 2))

# Show the ten largest (most positive) coefficients, labelled by feature name
coef = pd.DataFrame({'Coefficients': reg_model.coef_}, index=X_train.columns)
coef_list = coef.sort_values('Coefficients', ascending=False)
coef_list.head(10)

multiple linear regression with degree = 1
Train Accuracy:  54.8
Train RMSE:      9753.05
Test Accuracy:  55.07
Test RMSE:      9701.95


Unnamed: 0,Coefficients
manufacturer_ferrari,67239.758978
manufacturer_morgan,26739.581991
manufacturer_aston-martin,24025.544643
manufacturer_tesla,10550.0972
manufacturer_porsche,10197.509023
type_truck,8674.632585
type_pickup,7002.33257
manufacturer_datsun,5549.429314
manufacturer_rover,4730.828774
fuel,3313.259988


#### Polynomial Regression Model

In [24]:
# Polynomial regression: expand the predictors to degree-2 terms, then fit a linear model.
# NOTE(review): PolynomialFeatures is used with its defaults here, so it also emits a
# constant column next to LinearRegression's own intercept, and many of the generated
# terms come from dummy columns. The recorded top coefficients (~350,000) exceed every
# price kept in the data (< 160,000), so treat individual coefficients with caution.
pipe = Pipeline(steps=[
    ('quad_features', PolynomialFeatures(degree=2)),
    ('model', LinearRegression()),
])
pipe.fit(X_train, y_train)

# Metrics ("Accuracy" is the pipeline's .score() value expressed as a percentage)
score_train = pipe.score(X_train, y_train)
score_test = pipe.score(X_test, y_test)
train_rmse = round(np.sqrt(mean_squared_error(y_train, pipe.predict(X_train))), 2)
test_rmse = round(np.sqrt(mean_squared_error(y_test, pipe.predict(X_test))), 2)

print("Polynomial regression with degree = 2")
print("Train Accuracy: ", round(score_train*100, 2))
print("Train RMSE: ", train_rmse)
print("Test Accuracy: ", round(score_test*100, 2))
print("Test RMSE: ", test_rmse)

# Ten largest coefficients, labelled with the generated polynomial term names
term_names = pipe.named_steps['quad_features'].get_feature_names_out()
coef = pd.DataFrame({'Coefficients': pipe.named_steps['model'].coef_}, index=term_names)
coef_list = coef.sort_values('Coefficients', ascending=False)
coef_list.head(10)

Polynomial regression with degree = 2
Train Accuracy:  67.51
Train RMSE:  8269.35
Test Accuracy:  67.21
Test RMSE:  8288.27


Unnamed: 0,Coefficients
condition type_bus,351851.552167
condition type_offroad,351504.200228
condition type_van,351391.906696
condition type_coupe,350908.533183
condition type_mini-van,350368.439029
condition type_hatchback,350141.240518
condition type_wagon,350015.8894
condition type_truck,350004.965336
condition type_SUV,349903.321328
condition type_sedan,349726.4075


#### Sequential Feature Selection
Polynomial features with multiple linear regression

In [26]:
# Sequential feature selection over scaled degree-2 polynomial terms.
# The selector keeps only 5 terms (a small number, chosen by the author to limit
# overfitting); the final linear regression is fitted on those terms alone.
sequential_pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('poly_features', PolynomialFeatures(degree=2, include_bias=False)),
    ('selector', SequentialFeatureSelector(LinearRegression(), n_features_to_select=5)),
    ('linreg', LinearRegression()),
])
sequential_pipe.fit(X_train, y_train)

# Metrics ("Accuracy" is the pipeline's .score() value expressed as a percentage)
score_train = sequential_pipe.score(X_train, y_train)
score_test = sequential_pipe.score(X_test, y_test)
train_rmse = round(np.sqrt(mean_squared_error(y_train, sequential_pipe.predict(X_train))), 2)
test_rmse = round(np.sqrt(mean_squared_error(y_test, sequential_pipe.predict(X_test))), 2)

print("Sequential Selection Polynomial regression")
print("Train Accuracy: ", round(score_train*100, 2))
print("Train RMSE: ", train_rmse)
print("Test Accuracy: ", round(score_test*100, 2))
print("Test RMSE: ", test_rmse)

Sequential Selection Polynomial regression
Train Accuracy:  49.22
Train RMSE:  10337.64
Test Accuracy:  49.54
Test RMSE:  10282.11


#### Grid Search over Ridge Regression

In [52]:
# Ridge regression with the regularisation strength chosen by grid search.
# NOTE(review): the recorded scores for this cell match the plain linear regression
# to two decimals, which suggests these alpha values barely change the fit -
# consider a wider grid.

# Candidate alpha values, addressed to the 'ridge' step of the pipeline
ridge_param_dict = {'ridge__alpha': [0.1, 1, 10, 100]}

# Standardise the predictors before the penalised fit
ridge_pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('ridge', Ridge()),
])

ridge_grid = GridSearchCV(estimator=ridge_pipe, param_grid=ridge_param_dict)
ridge_grid.fit(X_train, y_train)

# Metrics ("Accuracy" is the fitted search's .score() value expressed as a percentage)
score_train = ridge_grid.score(X_train, y_train)
score_test = ridge_grid.score(X_test, y_test)
train_rmse = round(np.sqrt(mean_squared_error(y_train, ridge_grid.predict(X_train))), 2)
test_rmse = round(np.sqrt(mean_squared_error(y_test, ridge_grid.predict(X_test))), 2)

print("Ridge regression")
print("Train Accuracy: ", round(score_train*100, 2))
print("Train RMSE: ", train_rmse)
print("Test Accuracy: ", round(score_test*100, 2))
print("Test RMSE: ", test_rmse)


Ridge regression
Train Accuracy:  54.8
Train RMSE:  9753.05
Test Accuracy:  55.07
Test RMSE:  9701.95
