In [32]:
# imports
import os
import pandas as pd
import kagglehub
from IPython.display import display
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression

In [27]:
# Fetch the most recent copy of the dataset; `path` is the local download directory
path = kagglehub.dataset_download("sandeep1080/used-car-sales")
# Locate the CSV inside the download directory and load it into a DataFrame
csv_file = os.path.join(path, "used_car_sales.csv")
df = pd.read_csv(csv_file)

In [28]:
# Render the loaded DataFrame via IPython's display helper to inspect its columns and values.
display(df)

Unnamed: 0,ID,Distributor Name,Location,Car Name,Manufacturer Name,Car Type,Color,Gearbox,Number of Seats,Number of Doors,...,Purchased Date,Car Sale Status,Sold Date,Purchased Price-$,Sold Price-$,Margin-%,Sales Agent Name,Sales Rating,Sales Commission-$,Feedback
0,O2KE17,Carmudi,California,Fortuner,Toyota,SUV,Gray,Automatic,8,5,...,2022-10-26,Un Sold,1970-01-01,8296,0,0,Pranav,1,0,Average
1,EPMPC8,Carousell,Philadelphia,Creta,Hyundai,Hatchback,Blue,Automatic,5,5,...,2017-08-25,Sold,2021-03-03,5659,4770,-16,Vihaan,5,0,Good
2,SQKXAP,Carsome,North Carolina,Scorpio,Mahindra,SUV,Gray,Automatic,5,5,...,2018-06-13,Un Sold,1970-01-01,8430,0,0,Aarush,4,0,Good
3,PWP2QK,Trivett,North Carolina,Plato,Prazo,Convertible,Gray,Automatic,2,2,...,2023-05-14,Sold,2024-04-02,6919,7942,15,Anushka,1,205,Average
4,FNDDKM,Zupps,Portland,Dzire,Maruti,Sedan,Red,Automatic,5,5,...,2022-08-24,Un Sold,1970-01-01,6864,0,0,Pavan,3,0,Poor
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,ZHLCSG,APE,Texas,Yodha,Tata,Truck,Blue,Manual,3,2,...,2023-12-29,Sold,2024-03-23,6102,5041,-17,Supriya,3,0,Excellent
9996,2BJE0Y,Carsome,Portland,Scorpio,Mahindra,SUV,Black,Automatic,5,5,...,2019-06-13,Un Sold,1970-01-01,8108,0,0,Aarush,4,0,Excellent
9997,4OVJ83,Trust,North Carolina,Seltos,Kia,Hatchback,Black,Automatic,5,5,...,2020-02-17,Un Sold,1970-01-01,5945,0,0,Pranav,4,0,Poor
9998,M2ECXT,Carsome,Detroit,Swift,Maruti,Sedan,Black,Automatic,5,4,...,2018-05-03,Un Sold,1970-01-01,6893,0,0,Swathi,2,0,Average


In [29]:
# Choose the model inputs by column name, and the column to predict
feature_columns = [
    'Location',
    'Manufacturer Name',
    'Car Type',
    'Color',
    'Gearbox',
    'Number of Seats',
    'Number of Doors',
    'Energy',
    'Manufactured Year',
    'Mileage-KM',
    'Engine Power-HP',
]
features_X = df[feature_columns]
target_y = df['Price-$']

# Show the dtype of every selected feature column
print(features_X.dtypes)

Location             object
Manufacturer Name    object
Car Type             object
Color                object
Gearbox              object
Number of Seats       int64
Number of Doors       int64
Energy               object
Manufactured Year     int64
Mileage-KM            int64
Engine Power-HP       int64
dtype: object


In [30]:
# Positional hold-out split (no shuffling): the first 8000 rows are used for
# training and every remaining row for testing.
train_rows = slice(None, 8000)
test_rows = slice(8000, None)

X_train, X_test = features_X.iloc[train_rows], features_X.iloc[test_rows]
y_train, y_test = target_y.iloc[train_rows], target_y.iloc[test_rows]

In [31]:
# define categorical and numerical columns
# Partition on "is the column numeric?" instead of matching the exact dtypes
# "object" and "int64". With exact matching, a feature stored as any other
# dtype (e.g. float64 or category) ends up in neither list, and the
# ColumnTransformer built from these lists would drop it under its default
# `remainder` setting. include/exclude="number" puts every column of X_train
# in exactly one list and keeps the original column order.
num_cols = X_train.select_dtypes(include="number").columns.tolist()
cat_cols = X_train.select_dtypes(exclude="number").columns.tolist()

In [None]:
# Column-wise preprocessing:
#   - categorical columns are one-hot encoded (categories unseen during fit are ignored)
#   - numeric columns are passed through unchanged
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
numeric_step = ('numeric', 'passthrough', num_cols)

preprocessor = ColumnTransformer(transformers=[categorical_step, numeric_step])

In [None]:
# Chain the preprocessing step and a linear regressor into a single estimator
pipeline_steps = [
    ('preprocess', preprocessor),
    ('regressor', LinearRegression()),
]
model = Pipeline(steps=pipeline_steps)