In [310]:
# imports
import os
import numpy as np
import pandas as pd
import kagglehub
from IPython.display import display
import sklearn
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

In [311]:
# read the dataframe
df = pd.read_csv("vehicles.csv")

In [None]:
# trim and preprocess
df = df[['price',
                   'year',
                   'manufacturer',
                   'model',
                   'condition',
                   'fuel',
                   'odometer',
                   'drive',
                   'type',
                   'paint_color',
                   'state']].dropna()

# trim low price and mileage
df = df[df['price'] >= 1000]
df = df[df['price'] <= 60000]
df = df[df["odometer"] >= 5000]
df = df[df["odometer"] <= 300000]

# replace year with age
current_year = 2025
df["age"] = current_year - df["year"]
df.drop('year', axis=1, inplace=True)

# split into model and trim, fill empties with unknown
split = df["model"].str.split()
df["model"] = split.str[:2].str.join(" ")
df["trim"] = split.str[2:].str.join(" ").replace("", "Unknown")

# Define features and target
features_X = df[['age',
                   'manufacturer',
                   'model',
                   'trim',
                   'condition',
                   'fuel',
                   'odometer',
                   'drive',
                   'type',
                   'paint_color',
                   'state']]
target_y = df['price']
print(features_X.columns)

Index(['age', 'manufacturer', 'model', 'condition', 'fuel', 'odometer',
       'drive', 'type', 'paint_color', 'state'],
      dtype='object')


In [313]:
#split into train and test
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(features_X, target_y, test_size=0.2)

In [314]:
# define categorical and numerical columns
cat_cols = X_train.select_dtypes(include="object").columns.tolist()
num_cols = X_train.select_dtypes(exclude="object").columns.tolist()

In [315]:
# one-hot encode the categorical columns
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols),
        ('numeric','passthrough', num_cols)
    ]
)

In [316]:
# # build pipeline with the linear regression model
# model = Pipeline(steps=[
#     ('preprocess', preprocessor),
#     ('regressor', LinearRegression())
# ])

In [317]:
# build pipeline with the random forest model
model = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('regressor', RandomForestRegressor(
        n_estimators=100,
        max_depth=20,
        random_state=42,
        n_jobs=-1))
])

In [318]:
# Try log values of y
y_train_log = np.log(y_train)
y_test_log = np.log(y_test)
# fit the model
model.fit(X_train, y_train_log)

In [319]:
# predict and report error
y_pred_log = model.predict(X_test)
y_pred = np.exp(y_pred_log)
mae = mean_absolute_error(y_test, y_pred)
print("MAE: ", mae)


MAE:  2563.4800806681305


In [320]:
# Compare error
comparison = pd.DataFrame()
comparison["true_price"] = y_test.values
comparison["predicted_price"] = y_pred
comparison["error"] = comparison["predicted_price"] - comparison["true_price"]
comparison["absolute % error"] = abs(comparison["error"] / comparison["true_price"] * 100)

print(comparison.describe())


         true_price  predicted_price         error  absolute % error
count  25687.000000     25687.000000  25687.000000      2.568700e+04
mean   16695.034843     16170.918667   -524.116176      2.296981e+01
std    11883.521874     10940.205976   4197.068803      5.107174e+01
min     1000.000000      1000.000000 -44010.199508      1.213064e-14
25%     6995.000000      7132.779420  -1775.064814      4.643552e+00
50%    13881.000000     12977.312444    -66.623878      1.255839e+01
75%    24590.000000     23551.881933   1251.498896      2.668256e+01
max    60000.000000     59487.402079  34458.616325      3.445862e+03


In [337]:
my_car = pd.DataFrame([{
    'age': 20.0,
                   'manufacturer': 'jeep',
                   'model': 'grand cherokee',
                   'condition': 'good',
                   'fuel': 'gas',
                   'odometer': 170000,
                   'drive': '4wd',
                   'type': 'SUV',
                   'paint_color': 'red',
                   'state': 'ct'
}])

norah_car = pd.DataFrame([{
    'age': 13.0,
                   'manufacturer': 'honda',
                   'model': 'crv',
                   'condition': 'fair',
                   'fuel': 'gas',
                   'odometer': 264000,
                   'drive': 'fwd',
                   'type': 'SUV',
                   'paint_color': 'white',
                   'state': 'tx'
}])

jodi = pd.DataFrame([{
    'age': 4.0,
                   'manufacturer': 'hyundai',
                   'model': 'palisade',
                   'condition': 'excellent',
                   'fuel': 'gas',
                   'odometer': 70000,
                   'drive': '4wd',
                   'type': 'SUV',
                   'paint_color': 'black',
                   'state': 'ct'
}])

my_car_value = np.exp(model.predict(jodi))

print(f"Estimated value of my car: ${my_car_value[0]:.2f}")

print(f"Conditions: {df["condition"].unique()}")
print(f"Drives: {df["drive"].unique()}")
print(f"States: {df["state"].unique()}")
print(f"Colors: {df["paint_color"].unique()}")
print(f"Types: {df["type"].unique()}")
print(f"Fuels: {df["fuel"].unique()}")

Estimated value of my car: $19550.21
Conditions: ['excellent' 'good' 'new' 'fair' 'like new' 'salvage']
Drives: ['rwd' '4wd' 'fwd']
States: ['al' 'ak' 'az' 'ar' 'ca' 'co' 'ct' 'dc' 'de' 'fl' 'ga' 'hi' 'id' 'il'
 'in' 'ia' 'ks' 'ky' 'la' 'me' 'md' 'ma' 'mi' 'mn' 'ms' 'mo' 'mt' 'nc'
 'ne' 'nv' 'nj' 'nm' 'ny' 'nh' 'nd' 'oh' 'ok' 'or' 'pa' 'ri' 'sc' 'sd'
 'tn' 'tx' 'ut' 'vt' 'va' 'wa' 'wv' 'wi' 'wy']
Colors: ['black' 'silver' 'grey' 'red' 'blue' 'white' 'brown' 'yellow' 'green'
 'custom' 'orange' 'purple']
Types: ['truck' 'pickup' 'other' 'coupe' 'SUV' 'hatchback' 'mini-van' 'sedan'
 'offroad' 'convertible' 'wagon' 'van' 'bus']
Fuels: ['gas' 'other' 'diesel' 'hybrid' 'electric']
