
# Machine Learning Project Proposal #

## Exploring Used Car Auction Prices: *Predicting the Possible End Price*

### Simon Coessens, Rana Islek

### April 2024



### Necessary Libraries

In [None]:
!pip install numpy==1.23.5
!pip install pandas==1.5.2
import warnings
warnings.filterwarnings('ignore')

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from scipy.stats import skew
# Colab-only step: mount Google Drive at /content/drive so that files stored under
# '/content/drive/My Drive/...' can be read and written from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Notebook-wide display and plotting defaults.
pd.set_option('display.max_columns', None)  # no cap on the number of columns shown for a frame
%matplotlib inline
sns.set(rc={'figure.figsize':(20.7,8.27)})
sns.set_style("whitegrid")
# NOTE(review): the value returned by this call is neither stored nor displayed, so the line
# has no visible effect; if the intent was to change the active palette,
# `sns.set_palette("dark")` is presumably the call that was meant — confirm.
sns.color_palette("dark")
# Applied last, so any rcParams defined by this matplotlib style take precedence over
# the seaborn settings made above.
plt.style.use("fivethirtyeight")

### Import Data

In [None]:
# Load the pre-processed auction dataset from the mounted Drive folder.
#   index_col=0          - the first CSV column has no header (it looks like a saved index);
#                          reading it as the index keeps it from appearing as a stray
#                          'Unnamed: 0' data column.
#   on_bad_lines='skip'  - malformed rows are dropped silently instead of raising.
dfcar = pd.read_csv(
    '/content/drive/My Drive/ml-project/data/dfcar_processed.csv',
    index_col=0,
    on_bad_lines='skip',
)
dfcar.head(2)

Unnamed: 0,vin,year,brand,body_simple,model,trim,made_in,transmission,condition,color,...,mmr,is_sold_below_mmr,day_of_week,is_weekend,hour_only,month_year,sale_date,sale_year,latitude,longitude
0,wba3c1c51ek116351,2014,BMW,Sedan,3 Series,328I SULEV,DEU,Automatic,4.5,Gray,...,31900,1,Thu,0,4,Jan-15,2015-01-15,2015,37.271875,-119.270415
1,yv1612tb4f1310987,2015,VOLVO,Sedan,S60,T5,SWE,Automatic,4.1,White,...,27500,0,Thu,0,4,Jan-15,2015-01-29,2015,37.271875,-119.270415


In [None]:
# Columns removed before modelling; each label is listed exactly once.
columns_to_drop = [
    'year', 'vin', 'mmr', 'is_sold_below_mmr', 'day_of_week', 'is_weekend',
    'month_year', 'sale_date', 'latitude', 'longitude', 'hour_only', 'seller',
]
# This rebinds `dfcar`, so re-running the cell without reloading the data raises a
# KeyError because the columns are already gone.
dfcar = dfcar.drop(columns=columns_to_drop)
dfcar.head(2)

Unnamed: 0,brand,body_simple,model,trim,made_in,transmission,condition,color,interior,car_age,odometer,state,sellingprice,sale_year
0,BMW,Sedan,3 Series,328I SULEV,DEU,Automatic,4.5,Gray,Black,1,1331.0,CA,30000,2015
1,VOLVO,Sedan,S60,T5,SWE,Automatic,4.1,White,Black,0,14282.0,CA,27750,2015


In [None]:
# --- Decision tree baseline: preprocessing + DecisionTreeRegressor ---
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeRegressor
# mean_squared_error and r2_score are both called below; importing only
# mean_absolute_error makes this cell stop with a NameError before any metric is printed.
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.pipeline import Pipeline


# Feature groups handed to the ColumnTransformer. Columns listed in neither group are
# dropped by the transformer's default `remainder='drop'`.
categorical_features = ['brand', 'body_simple', 'model', 'trim', 'made_in', 'transmission', 'color', 'state', 'interior']
numerical_features = ['condition', 'odometer', 'car_age', 'sale_year']

# Standardise the numeric columns and one-hot encode the categorical ones.
# NOTE(review): with drop='first' together with handle_unknown='ignore', a category unseen
# during fit is encoded as all zeros, i.e. the same as the dropped first category — confirm
# that this is acceptable for the models below.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_features)
    ])

pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', DecisionTreeRegressor(random_state=0))])

# Target is the selling price; every remaining column is a candidate feature.
y = dfcar['sellingprice']
X = dfcar.drop(columns=['sellingprice'])

# 80/20 hold-out split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

# Hold-out metrics; RMSE is the square root of the MSE.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
r2 = r2_score(y_test, y_pred)

print("Model Performance:")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R-squared (R2): {r2}")

# Actual vs. predicted prices; the dashed red diagonal is the line of perfect prediction.
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred, alpha=0.3)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
plt.xlabel('Actual Selling Price')
plt.ylabel('Predicted Selling Price')
plt.title('Actual vs Predicted Selling Price')
plt.show()



NameError: name 'mean_squared_error' is not defined

In [None]:
# Scatter of predicted against actual hold-out prices; the dashed red diagonal spans the
# observed price range and marks where prediction equals the actual value.
fig, ax = plt.subplots(figsize=(10, 6))
price_range = [y_test.min(), y_test.max()]
ax.scatter(y_test, y_pred, alpha=0.3)
ax.plot(price_range, price_range, 'r--', lw=2)
ax.set_xlabel('Actual Selling Price')
ax.set_ylabel('Predicted Selling Price')
ax.set_title('Actual vs Predicted Selling Price')
plt.show()

# Row-by-row comparison; 'Difference' is actual minus predicted.
residuals = y_test - y_pred
pred_df = pd.DataFrame({
    'Actual Value': y_test,
    'Predicted Value': y_pred,
    'Difference': residuals,
})
pred_df


In [None]:
import os

from joblib import dump


# Persist the current `pipeline` object (preprocessing + regressor) to Drive so it can be
# reloaded later without retraining.
model_filename = '/content/drive/My Drive/ml-project/models/decision_tree_model.joblib'
# Create the target folder first: dump() cannot write into a directory that does not exist.
os.makedirs(os.path.dirname(model_filename), exist_ok=True)
dump(pipeline, model_filename)
print(f"Model saved to {model_filename}")

In [None]:
from joblib import load
# Import the metrics used below so this cell does not rely on names imported elsewhere
# in the notebook.
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Reload the saved decision-tree pipeline and re-score it on the hold-out set as a
# round-trip check of the saved file.
# NOTE: joblib.load unpickles the file, so only load model files from a trusted source.
loaded_pipeline = load('/content/drive/My Drive/ml-project/models/decision_tree_model.joblib')
print("Model loaded successfully")

predictions = loaded_pipeline.predict(X_test)

# Hold-out metrics; RMSE is the square root of the MSE.
mae = mean_absolute_error(y_test, predictions)
mse = mean_squared_error(y_test, predictions)
rmse = mse ** 0.5
r2 = r2_score(y_test, predictions)

print("Model Performance:")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R-squared (R2): {r2}")


### Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


# Random forest regressor chained after the existing `preprocessor` step.
forest = RandomForestRegressor(n_estimators=50, max_depth=10, n_jobs=-1, random_state=0)
rf_steps = [('preprocessor', preprocessor), ('regressor', forest)]
pipeline = Pipeline(steps=rf_steps)

# Selling price is the target; all other columns are candidate features.
y = dfcar['sellingprice']
X = dfcar.drop(columns=['sellingprice'])

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# Hold-out metrics; RMSE is derived from the MSE.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = mse ** 0.5

print(f"Model Performance:")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R-squared (R2): {r2}")

In [None]:
import os

from joblib import dump


# Persist the current `pipeline` object (preprocessing + regressor) to Drive so it can be
# reloaded later without retraining.
model_filename = '/content/drive/My Drive/ml-project/models/random_forest_model.joblib'
# Create the target folder first: dump() cannot write into a directory that does not exist.
os.makedirs(os.path.dirname(model_filename), exist_ok=True)
dump(pipeline, model_filename)
print(f"Model saved to {model_filename}")

In [None]:
from xgboost import XGBRegressor


# XGBoost regressor chained after the existing `preprocessor` step.
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', XGBRegressor(objective='reg:squarederror', n_estimators=100, learning_rate=0.1, random_state=0))])

# Selling price is the target; all other columns are candidate features.
y = dfcar['sellingprice']
X = dfcar.drop(columns=['sellingprice'])

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

pipeline.fit(X_train, y_train)

# Predict once; every metric below is derived from this single set of predictions, and the
# report uses the same format as the other model cells.
y_pred = pipeline.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
r2 = r2_score(y_test, y_pred)

print("Model Performance:")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R-squared (R2): {r2}")

In [None]:
import os

from joblib import dump


# Persist the current `pipeline` object (preprocessing + regressor) to Drive so it can be
# reloaded later without retraining.
model_filename = '/content/drive/My Drive/ml-project/models/xgboost_model.joblib'
# Create the target folder first: dump() cannot write into a directory that does not exist.
os.makedirs(os.path.dirname(model_filename), exist_ok=True)
dump(pipeline, model_filename)
print(f"Model saved to {model_filename}")