STEP 1 — Import Libraries + Load ML-Ready Data

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.metrics import r2_score
import pickle


(414363, 22)

2. Load the FINAL cleaned dataset


In [None]:
# Read the final cleaned dataset; the path is relative to the notebook's
# working directory. The last expression previews the first rows.
csv_path = "../data/cleaned_vehicles_final.csv"
df = pd.read_csv(csv_path)
df.head()

3. Label Encode ALL categorical columns


In [None]:
# Integer-encode every object-dtype column of `df` in place.
#
# A separate LabelEncoder is fitted per column and stored in `encoders`
# (keyed by column name). Reusing one encoder for all columns refits it on
# every iteration, so only the last column's classes would be remembered and
# the string -> integer mapping of the other columns could not be reapplied
# to new data (encoders[col].transform) or undone (inverse_transform).
encoders = {}
le = LabelEncoder()  # keeps `le` defined even when there are no object columns

for col in df.select_dtypes(include="object").columns:
    le = LabelEncoder()  # fresh encoder so earlier mappings are not overwritten
    df[col] = le.fit_transform(df[col])
    encoders[col] = le

4. Prepare X & y

In [None]:

# The target is the `price` column; every remaining column is a feature.
y = df["price"]
X = df.drop(columns=["price"])

5. Train–Test Split


In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split

6. Train all 3 models

Linear Regression

In [None]:


# Baseline: ordinary least squares fitted on the training split and scored
# with R^2 on the held-out split.
lr = LinearRegression().fit(X_train, y_train)  # fit() returns the estimator
lr_pred = lr.predict(X_test)
lr_r2 = r2_score(y_true=y_test, y_pred=lr_pred)

Random Forest


# Random forest with 200 trees and a fixed seed, fitted on the training split
# and scored with R^2 on the held-out split.
rf = RandomForestRegressor(random_state=42, n_estimators=200)
rf = rf.fit(X_train, y_train)  # fit() returns the same estimator instance
rf_pred = rf.predict(X_test)
rf_r2 = r2_score(y_true=y_test, y_pred=rf_pred)

XGBoost Regressor

In [None]:

# XGBoost regressor: the hyperparameters are gathered in one dict, then the
# model is fitted on the training split and scored with R^2 on the held-out
# split.
xgb_params = {
    "n_estimators": 400,
    "learning_rate": 0.05,
    "max_depth": 6,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "random_state": 42,
}
xgb_model = xgb.XGBRegressor(**xgb_params)
xgb_model.fit(X_train, y_train)
xgb_pred = xgb_model.predict(X_test)
xgb_r2 = r2_score(y_true=y_test, y_pred=xgb_pred)

7. Compare Models


In [None]:
# Collect the held-out R^2 of each model under a readable label; the bare
# `results` expression at the end displays the dict as the cell output.
model_labels = ("Linear Regression R2", "Random Forest R2", "XGBoost R2")
model_scores = (lr_r2, rf_r2, xgb_r2)
results = dict(zip(model_labels, model_scores))

results

8. Save BEST model

In [None]:


XGBoost is assumed to be the best model here (confirm against the R² comparison above), so it is the one that gets saved.

# Persist the fitted XGBoost regressor to disk. The `with` block closes (and
# therefore flushes) the file handle even if pickling raises, instead of
# leaving it open until garbage collection.
# NOTE(review): loading this pickle later is only safe from a trusted source.
with open("../models/car_price_prediction.pkl", "wb") as model_file:
    pickle.dump(xgb_model, model_file)
print("Model saved successfully!")