In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
import pickle
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor

In [3]:
# Assemble the CSV location from its parts so the path separator matches the host OS
dataset_dir, dataset_file = "dataset", "dataset.csv"
dataset_path = os.path.join(dataset_dir, dataset_file)

# Read the whole CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer=dataset_path)


In [4]:
# Data Preprocessing
# Keep only the model inputs plus the 'price' target.
selected_columns = ['year', 'mileage', 'cylinders', 'fuel', 'transmission', 'body', 'drivetrain', 'price']

# Handle missing values
# Drop every row that has a missing value in any selected column. The chained,
# non-inplace form followed by .copy() gives the cleaned frame its own data, so
# later column assignments modify this frame unambiguously instead of risking
# pandas' SettingWithCopyWarning on a frame derived by selection.
df = df[selected_columns].dropna().copy()

In [5]:
# Normalise the text columns: blank out missing values, trim surrounding
# whitespace and lower-case, so that spellings differing only in case or
# padding collapse to a single value.
for text_col in ('fuel', 'transmission', 'body', 'drivetrain'):
    cleaned = df[text_col].fillna('')
    cleaned = cleaned.str.strip()
    df[text_col] = cleaned.str.lower()


In [6]:
# Separate the target column from the feature columns
target_column = 'price'
y = df[target_column]
X = df.drop(columns=[target_column])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Column groups, each routed to its own transformer
numerical_features = ['year', 'mileage', 'cylinders']
categorical_features = ['fuel', 'transmission', 'body', 'drivetrain']

# Numeric columns are standardised; categorical columns are one-hot encoded.
# handle_unknown='ignore' lets the encoder accept categories at transform time
# that it did not see during fit.
numeric_step = ('num', StandardScaler(), numerical_features)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)

preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])


In [8]:
# Chain preprocessing and the regressor so both are fitted and applied as one estimator
forest = RandomForestRegressor(n_estimators=100, random_state=42)

model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', forest),
])

In [9]:
# Train the model
# Fits the whole pipeline (preprocessing + regressor) on the training split.
model.fit(X_train, y_train)

# Define model path inside "models" folder
model_dir = "models"
model_path = os.path.join(model_dir, "random_forest.pkl")

# open(..., "wb") does not create parent directories, so make sure the folder
# exists; exist_ok=True keeps this cell safe to re-run.
os.makedirs(model_dir, exist_ok=True)

# Save the model
# The context manager closes the file handle even if pickling raises, so the
# pickle is fully flushed to disk and no descriptor is left open in the kernel.
with open(model_path, "wb") as model_file:
    pickle.dump(model, model_file)
print(f"Model saved successfully at {model_path}!")

Model saved successfully at models\random_forest.pkl!
