In [184]:
# imports
import os

import kagglehub
import pandas as pd
import sklearn
from IPython.display import display
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

In [156]:
# Fetch the latest version of the Kaggle dataset, then load its CSV into a dataframe
path = kagglehub.dataset_download("sandeep1080/used-car-sales")
csv_path = os.path.join(path, "used_car_sales.csv")
df = pd.read_csv(csv_path)

In [157]:
# Preview the freshly loaded frame through IPython's rich display
display(df)

Unnamed: 0,ID,Distributor Name,Location,Car Name,Manufacturer Name,Car Type,Color,Gearbox,Number of Seats,Number of Doors,...,Purchased Date,Car Sale Status,Sold Date,Purchased Price-$,Sold Price-$,Margin-%,Sales Agent Name,Sales Rating,Sales Commission-$,Feedback
0,O2KE17,Carmudi,California,Fortuner,Toyota,SUV,Gray,Automatic,8,5,...,2022-10-26,Un Sold,1970-01-01,8296,0,0,Pranav,1,0,Average
1,EPMPC8,Carousell,Philadelphia,Creta,Hyundai,Hatchback,Blue,Automatic,5,5,...,2017-08-25,Sold,2021-03-03,5659,4770,-16,Vihaan,5,0,Good
2,SQKXAP,Carsome,North Carolina,Scorpio,Mahindra,SUV,Gray,Automatic,5,5,...,2018-06-13,Un Sold,1970-01-01,8430,0,0,Aarush,4,0,Good
3,PWP2QK,Trivett,North Carolina,Plato,Prazo,Convertible,Gray,Automatic,2,2,...,2023-05-14,Sold,2024-04-02,6919,7942,15,Anushka,1,205,Average
4,FNDDKM,Zupps,Portland,Dzire,Maruti,Sedan,Red,Automatic,5,5,...,2022-08-24,Un Sold,1970-01-01,6864,0,0,Pavan,3,0,Poor
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,ZHLCSG,APE,Texas,Yodha,Tata,Truck,Blue,Manual,3,2,...,2023-12-29,Sold,2024-03-23,6102,5041,-17,Supriya,3,0,Excellent
9996,2BJE0Y,Carsome,Portland,Scorpio,Mahindra,SUV,Black,Automatic,5,5,...,2019-06-13,Un Sold,1970-01-01,8108,0,0,Aarush,4,0,Excellent
9997,4OVJ83,Trust,North Carolina,Seltos,Kia,Hatchback,Black,Automatic,5,5,...,2020-02-17,Un Sold,1970-01-01,5945,0,0,Pranav,4,0,Poor
9998,M2ECXT,Carsome,Detroit,Swift,Maruti,Sedan,Black,Automatic,5,4,...,2018-05-03,Un Sold,1970-01-01,6893,0,0,Swathi,2,0,Average


In [158]:
# define features and target
# Columns used as model inputs; everything else in df is left out of the model.
feature_cols = [
    'Location',
    'Manufacturer Name',
    'Car Type',
    'Color',
    'Gearbox',
    'Number of Seats',
    'Number of Doors',
    'Energy',
    'Manufactured Year',
    'Mileage-KM',
    'Engine Power-HP',
]
features_X = df[feature_cols]
# The regression target is the price column.
target_y = df['Price-$']

# Show each feature's dtype (object vs. integer drives the later column split).
print(features_X.dtypes)

Location             object
Manufacturer Name    object
Car Type             object
Color                object
Gearbox              object
Number of Seats       int64
Number of Doors       int64
Energy               object
Manufactured Year     int64
Mileage-KM            int64
Engine Power-HP       int64
dtype: object


In [159]:
# split into train and test (80% / 20%)
# A fixed random_state makes the shuffle deterministic, so the split -- and
# every metric computed from it -- is reproducible after Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    features_X,
    target_y,
    test_size=0.2,
    random_state=42,
)

In [160]:
# define categorical and numerical columns
# String-valued (object dtype) features are the ones that get one-hot encoded.
cat_cols = X_train.select_dtypes(include="object").columns.tolist()
# "number" matches every numeric dtype (any int or float width), not just int64,
# so a float-valued feature is not left out of both lists and then silently
# discarded by the ColumnTransformer, which drops unlisted columns by default.
num_cols = X_train.select_dtypes(include="number").columns.tolist()

In [161]:
# one-hot encode the categorical columns; numeric columns pass through unchanged
# handle_unknown='ignore' keeps prediction from failing on categories not seen during fit.
one_hot = OneHotEncoder(handle_unknown='ignore')
column_steps = [
    ('cat', one_hot, cat_cols),
    ('numeric', 'passthrough', num_cols),
]
preprocessor = ColumnTransformer(transformers=column_steps)

In [None]:
# build pipeline with the linear regression model
# NOTE: the random forest cell that follows rebinds `model`, so this pipeline
# is only the active one if that cell is skipped.
linear_steps = [
    ('preprocess', preprocessor),
    ('regressor', LinearRegression()),
]
model = Pipeline(steps=linear_steps)

In [185]:
# build pipeline with the random forest model
# 200 trees, fixed seed for repeatable fits, all CPU cores (n_jobs=-1).
forest = RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1)
model = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('regressor', forest),
])

In [186]:
# fit the model: trains the pipeline currently bound to `model`
# (preprocessing step followed by the regressor) on the training split
model.fit(X_train, y_train)

In [187]:
# predict on the held-out split and report the mean absolute error
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
print("MAE: ", mae)

MAE:  0.016


In [188]:
# Compare error: one row per test sample, true price next to the prediction.
# y_test is converted to a bare array so rows pair up with y_pred by position
# rather than by the index labels y_test carried over from df.
comparison = pd.DataFrame({
    "true_price": y_test.to_numpy(),
    "predicted_price": y_pred,
})
comparison["error"] = comparison["predicted_price"] - comparison["true_price"]
# NOTE: a true price of 0 would make this ratio inf (or NaN when the error is also 0).
comparison["absolute % error"] = (
    comparison["error"] / comparison["true_price"] * 100
).abs()

# Series.max()/.min() are vectorised and skip NaN; the builtin max()/min()
# iterate element by element and give order-dependent results when NaN is present.
print(comparison["absolute % error"].max())
print(comparison["absolute % error"].min())

display(comparison)

0.039473684210526314
0.0


Unnamed: 0,true_price,predicted_price,error,absolute % error
0,6500,6500.0,0.0,0.0
1,7700,7700.0,0.0,0.0
2,7200,7200.0,0.0,0.0
3,8200,8200.0,0.0,0.0
4,7600,7600.0,0.0,0.0
...,...,...,...,...
1995,9600,9600.0,0.0,0.0
1996,7000,7000.0,0.0,0.0
1997,7600,7600.0,0.0,0.0
1998,8200,8200.0,0.0,0.0


In [189]:
# Describe one car using the same feature columns the model was trained on.
my_car_specs = {
    "Location": "Connecticut",
    "Manufacturer Name": "Jeep",
    "Car Type": "SUV",
    "Color": "Red",
    "Gearbox": "Automatic",
    "Number of Seats": 5,
    "Number of Doors": 4,
    "Energy": "Gasoline",
    "Manufactured Year": 2004,
    "Mileage-KM": 170000,
    "Engine Power-HP": 195,
}
# A one-element list of dicts yields a single-row frame, as predict() expects.
my_car = pd.DataFrame([my_car_specs])

# predict() returns one value per input row; index 0 is this car's estimate.
my_car_value = model.predict(my_car)

print(f"Estimated value of my car: ${my_car_value[0]:.2f}")

Estimated value of my car: $8452.50


## Real Dataset

In [217]:
# Load vehicles.csv from the working directory, then preview the first rows
# followed by the full list of column names.
real_df = pd.read_csv("vehicles.csv")
display(real_df.head(), real_df.columns)

Unnamed: 0,id,url,region,region_url,price,year,manufacturer,model,condition,cylinders,...,size,type,paint_color,image_url,description,county,state,lat,long,posting_date
0,7222695916,https://prescott.craigslist.org/cto/d/prescott...,prescott,https://prescott.craigslist.org,6000,,,,,,...,,,,,,,az,,,
1,7218891961,https://fayar.craigslist.org/ctd/d/bentonville...,fayetteville,https://fayar.craigslist.org,11900,,,,,,...,,,,,,,ar,,,
2,7221797935,https://keys.craigslist.org/cto/d/summerland-k...,florida keys,https://keys.craigslist.org,21000,,,,,,...,,,,,,,fl,,,
3,7222270760,https://worcester.craigslist.org/cto/d/west-br...,worcester / central MA,https://worcester.craigslist.org,1500,,,,,,...,,,,,,,ma,,,
4,7210384030,https://greensboro.craigslist.org/cto/d/trinit...,greensboro,https://greensboro.craigslist.org,4900,,,,,,...,,,,,,,nc,,,


Index(['id', 'url', 'region', 'region_url', 'price', 'year', 'manufacturer',
       'model', 'condition', 'cylinders', 'fuel', 'odometer', 'title_status',
       'transmission', 'VIN', 'drive', 'size', 'type', 'paint_color',
       'image_url', 'description', 'county', 'state', 'lat', 'long',
       'posting_date'],
      dtype='object')

In [209]:
# Inspect the state column with missing entries removed
state_values = real_df['state'].dropna()
display(state_values)

0         az
1         ar
2         fl
3         ma
4         nc
          ..
426875    wy
426876    wy
426877    wy
426878    wy
426879    wy
Name: state, Length: 426880, dtype: object

In [None]:
# Clean and preprocess
# Keep only the columns of interest, then drop every row that has a missing
# value in any of them. Note that this rebinds real_df to the reduced frame.
keep_cols = [
    'price', 'year', 'manufacturer', 'model', 'condition', 'fuel',
    'odometer', 'drive', 'type', 'paint_color', 'state',
]
real_df = real_df[keep_cols].dropna()


Unnamed: 0,price,year,manufacturer,model,condition,fuel,odometer,drive,type,paint_color,state
31,15000,2013.0,ford,f-150 xlt,excellent,gas,128000.0,rwd,truck,black,al
32,27990,2012.0,gmc,sierra 2500 hd extended cab,good,gas,68696.0,4wd,pickup,black,al
33,34590,2016.0,chevrolet,silverado 1500 double,good,gas,29499.0,4wd,pickup,silver,al
34,35000,2019.0,toyota,tacoma,excellent,gas,43000.0,4wd,truck,grey,al
35,29990,2016.0,chevrolet,colorado extended cab,good,gas,17302.0,4wd,pickup,red,al


Unnamed: 0,price,year,manufacturer,model,condition,fuel,odometer,drive,type,paint_color,state
1120,23900,2017.0,jeep,grand cherokee,good,gas,50543.0,rwd,SUV,black,al
1938,5700,2007.0,jeep,grand cherokee,excellent,gas,164000.0,4wd,SUV,red,al
2026,4000,2001.0,jeep,grand cherokee,fair,gas,117000.0,rwd,SUV,white,al
8568,1700,1993.0,jeep,grand cherokee,fair,gas,278000.0,fwd,SUV,green,az
8991,21995,2013.0,jeep,grand cherokee,like new,gas,83000.0,4wd,SUV,white,az
...,...,...,...,...,...,...,...,...,...,...,...
424876,3000,1998.0,jeep,grand cherokee,good,gas,190000.0,4wd,SUV,purple,wi
424923,123,2011.0,jeep,grand cherokee,like new,gas,1234.0,4wd,pickup,green,wi
425620,7995,2007.0,jeep,grand cherokee,good,gas,164744.0,4wd,SUV,grey,wi
426253,12899,2012.0,jeep,grand cherokee,excellent,gas,90673.0,4wd,SUV,red,wi
