In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Load the dataset
file_path = "../dataset/dataset.csv"
df = pd.read_csv(file_path)

# Display basic info
df.info(), df.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1002 entries, 0 to 1001
Data columns (total 17 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   name            1002 non-null   object 
 1   description     946 non-null    object 
 2   make            1002 non-null   object 
 3   model           1002 non-null   object 
 4   year            1002 non-null   int64  
 5   price           979 non-null    float64
 6   engine          1000 non-null   object 
 7   cylinders       897 non-null    float64
 8   fuel            995 non-null    object 
 9   mileage         968 non-null    float64
 10  transmission    1000 non-null   object 
 11  trim            1001 non-null   object 
 12  body            999 non-null    object 
 13  doors           995 non-null    float64
 14  exterior_color  997 non-null    object 
 15  interior_color  964 non-null    object 
 16  drivetrain      1002 non-null   object 
dtypes: float64(4), int64(1), object(1

(None,
                               name  \
 0     2024 Jeep Wagoneer Series II   
 1  2024 Jeep Grand Cherokee Laredo   
 2         2024 GMC Yukon XL Denali   
 3       2023 Dodge Durango Pursuit   
 4            2024 RAM 3500 Laramie   
 
                                          description   make           model  \
 0  \n      \n        Heated Leather Seats, Nav Sy...   Jeep        Wagoneer   
 1  Al West is committed to offering every custome...   Jeep  Grand Cherokee   
 2                                                NaN    GMC        Yukon XL   
 3  White Knuckle Clearcoat 2023 Dodge Durango Pur...  Dodge         Durango   
 4  \n      \n        2024 Ram 3500 Laramie Billet...    RAM            3500   
 
    year    price                                             engine  \
 0  2024  74600.0                            24V GDI DOHC Twin Turbo   
 1  2024  50170.0                                                OHV   
 2  2024  96410.0  6.2L V-8 gasoline direct injection, vari

In [3]:
# Fill missing values in numerical columns with median
num_cols = ['price', 'cylinders', 'mileage', 'doors']
for col in num_cols:
    df[col].fillna(df[col].median(), inplace=True)

# Fill missing values in categorical columns with mode
cat_cols = ['fuel', 'transmission', 'body', 'drivetrain']
for col in cat_cols:
    df[col].fillna(df[col].mode()[0], inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values

In [4]:
# Selecting relevant features
features = ['year', 'mileage', 'cylinders', 'fuel', 'transmission', 'body', 'drivetrain']
X = df[features]
y = df['price']

# Define ColumnTransformer for encoding and scaling
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), ['year', 'mileage', 'cylinders']),
    ('cat', OneHotEncoder(handle_unknown='ignore'), ['fuel', 'transmission', 'body', 'drivetrain'])
])


In [5]:
# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create pipeline
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])

# Train the model
model.fit(X_train, y_train)


In [6]:
# Predictions
y_pred = model.predict(X_test)

# Evaluation Metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f'MAE: {mae}')
print(f'RMSE: {rmse}')
print(f'R² Score: {r2}')


MAE: 7011.721498919088
RMSE: 10714.1012750366
R² Score: 0.5496635649518904


In [8]:
# Save model
pickle.dump(model, open('../models/random_forest.pkl', 'wb'))
print('Model saved successfully!')


Model saved successfully!
