# Project: Car Price Prediction

In [1]:
import pandas as pd

In [13]:
df = pd.read_csv('auto24_cars_clean.csv')

In [14]:
# Save url and brand columns before dropping
url_column = df['url'].copy()

# Separate features and target
X = df.drop(columns=['price', 'url'])
y = df['price']

In [15]:
# Drop engineered features
features_to_drop = ['power_per_engine', 'age_mileage_interaction', 'mileage_per_year']
X_reduced = df.drop(columns=['price', 'url'] + features_to_drop)
y_reduced = df['price']

print(f"Original features: {X.shape[1]}")
print(f"Reduced features: {X_reduced.shape[1]}")
print(f"\nDropped features: {features_to_drop}")

Original features: 85
Reduced features: 82

Dropped features: ['power_per_engine', 'age_mileage_interaction', 'mileage_per_year']


In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [17]:
# --- 7. Split into train and test sets ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print(f"Training set size: {X_train.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")

X_train_reduced, X_test_reduced, y_train_reduced, y_test_reduced = train_test_split(
    X_reduced, y_reduced, test_size=0.2, random_state=42
)

print(f"Reduces training set size: {X_train_reduced.shape[0]}")
print(f"Reduced test set size: {X_test_reduced.shape[0]}")

Training set size: 9122
Test set size: 2281
Reduces training set size: 9122
Reduced test set size: 2281


## Linear Regression Model

In [7]:
# --- 8. Train Linear Regression model ---
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# --- 9. Predict on test set ---
y_pred = lr_model.predict(X_test)

# --- Inverse transform y_test and y_pred back to original price scale ---
y_test_original = np.exp(y_test)
y_pred_original = np.exp(y_pred)

# --- 10. Evaluate model ---
train_r2 = lr_model.score(X_train, y_train)
test_r2 = lr_model.score(X_test, y_test)

# Calculate MAE and RMSE on original price scale
mae = mean_absolute_error(y_test_original, y_pred_original)
rmse = np.sqrt(mean_squared_error(y_test_original, y_pred_original))
median_price = np.median(y_test_original)

mae_pct = (mae / median_price) * 100
rmse_pct = (rmse / median_price) * 100


## As we did log transformation on our "price" value, we need to transform it back to original value to interpret it:

print(f"Training R² (log-transformed price): {train_r2:.3f}")
print(f"Test R² (log-transformed price): {test_r2:.3f}")
print(f"Median actual price: {median_price:,.2f} euros")
print(f"MAE: {mae:,.2f} euros ({mae_pct:.2f}%)")
print(f"RMSE: {rmse:,.2f} euros ({rmse_pct:.2f}%)")

Training R² (log-transformed price): 0.868
Test R² (log-transformed price): 0.864
Median actual price: 16,900.00 euros
MAE: 4,341.85 euros (25.69%)
RMSE: 7,730.41 euros (45.74%)


In [18]:
# --- 8. Train Linear Regression model on reduced data ---
lr_model = LinearRegression()
lr_model.fit(X_train_reduced, y_train_reduced)

# --- 9. Predict on test set ---
y_pred = lr_model.predict(X_test_reduced)

# --- Inverse transform y_test and y_pred back to original price scale ---
y_test_original = np.exp(y_test_reduced)
y_pred_original = np.exp(y_pred)

# --- 10. Evaluate model ---
train_r2 = lr_model.score(X_train_reduced, y_train_reduced)
test_r2 = lr_model.score(X_test_reduced, y_test_reduced)

# Calculate MAE and RMSE on original price scale
mae = mean_absolute_error(y_test_original, y_pred_original)
rmse = np.sqrt(mean_squared_error(y_test_original, y_pred_original))
median_price = np.median(y_test_original)

mae_pct = (mae / median_price) * 100
rmse_pct = (rmse / median_price) * 100


## As we did log transformation on our "price" value, we need to transform it back to original value to interpret it:

print(f"Training R² (log-transformed price): {train_r2:.3f}")
print(f"Test R² (log-transformed price): {test_r2:.3f}")
print(f"Median actual price: {median_price:,.2f} euros")
print(f"MAE: {mae:,.2f} euros ({mae_pct:.2f}%)")
print(f"RMSE: {rmse:,.2f} euros ({rmse_pct:.2f}%)")

Training R² (log-transformed price): 0.862
Test R² (log-transformed price): 0.857
Median actual price: 16,900.00 euros
MAE: 4,428.84 euros (26.21%)
RMSE: 7,539.91 euros (44.61%)


Test R squared is a bit higher than training one, so we could say that the model can generalize on test data. MAE could be better (that means, have a lower value), but this is the first model and we could improve data transformation too. For RMSE, this high value can be affected by outliers and we did not focus on outliers so far.

In [8]:
lr_model.coef_

array([ 3.95228014e+00, -3.29164043e-02, -6.38295498e-02,  1.78201900e+00,
       -6.28699881e-01, -1.28967716e+00, -3.73794203e-02,  1.23315521e-01,
       -1.47904729e+00,  2.70566163e-01,  3.02964257e-01,  7.51707148e-01,
        4.24733114e-01,  8.66342925e-01, -7.58615182e-04, -6.97338522e-02,
       -1.18755630e-01,  3.39136173e-02, -1.63532471e-01,  7.51214245e-02,
        1.10202610e-01,  1.93526948e-01,  2.11862542e-02,  1.43382596e-01,
        2.99970183e-01,  4.60116737e-02,  2.43400950e-01,  3.50796364e-01,
        2.96295640e-01,  2.01651102e-01,  1.32883455e-01,  5.74205185e-01,
        4.03916685e-01,  1.34889241e-01,  5.92526776e-02, -1.83436579e-03,
        3.75573503e-02,  5.04121122e-01,  1.05723861e-01,  8.75727481e-02,
        1.19158978e-01,  1.95170637e-01,  1.36316595e-01,  1.41348255e-01,
       -7.96423901e-02, -1.36111634e-01,  2.02775949e-01,  2.55874392e-01,
        2.06168215e-01,  1.97889379e-02, -1.05788034e-01, -3.48263829e-01,
        8.80928519e-01, -

In [9]:
df.head()

Unnamed: 0,first_reg,mileage,engine_size,power,model_group,age,power_per_engine,age_mileage_interaction,mileage_per_year,brand_Audi,...,color_green,color_light blue,color_light gray,color_orange,color_other,color_red,color_silver,color_white,url,price
0,0.666667,0.016621,0.115993,0.409471,0.979592,0.728271,0.279356,0.947162,0.842367,False,...,False,False,False,False,False,False,False,False,https://www.auto24.ee/soidukid/4243682,6.907755
1,0.508772,0.018117,0.132583,0.352339,0.714286,0.824181,0.202813,0.969492,0.805813,False,...,True,False,False,False,False,False,False,False,https://www.auto24.ee/soidukid/4241103,6.907755
2,0.701754,0.061621,0.074667,0.290308,0.77551,0.700761,0.257303,0.909671,0.806237,False,...,False,False,False,False,False,False,False,False,https://www.auto24.ee/soidukid/4250614,6.907755
3,0.561404,0.033265,0.156243,0.515273,1.0,0.79615,0.292385,0.952302,0.800504,False,...,False,False,False,False,True,False,False,False,https://www.auto24.ee/soidukid/4040436,6.907755
4,0.666667,0.101637,0.037333,0.176102,0.836735,0.728271,0.233666,0.888988,0.755131,False,...,False,False,False,False,True,False,False,False,https://www.auto24.ee/soidukid/4232253,6.907755


## Random Forest Model

In [22]:
from sklearn.ensemble import RandomForestRegressor

rf_model = RandomForestRegressor(
    n_estimators=100,
    max_depth=20,          # Limit depth to prevent overfitting
    min_samples_split=5,
    random_state=42,
    n_jobs=-1
)

rf_model.fit(X_train, y_train)

y_pred_train_rf = rf_model.predict(X_train)
y_pred_test_rf = rf_model.predict(X_test)

# Inverse transform back to original prices
y_pred_train_rf_original = np.exp(y_pred_train_rf)
y_pred_test_rf_original = np.exp(y_pred_test_rf)

train_r2_rf = rf_model.score(X_train, y_train)
test_r2_rf = rf_model.score(X_test, y_test)

mae_rf = mean_absolute_error(y_test_original, y_pred_test_rf_original)
rmse_rf = np.sqrt(mean_squared_error(y_test_original, y_pred_test_rf_original))
median_price_rf = np.median(y_test_original)

mae_pct_rf = (mae_rf / median_price_rf) * 100
rmse_pct_rf = (rmse_rf / median_price_rf) * 100

print(f"\nTraining R² (log-transformed price): {train_r2_rf:.3f}")
print(f"Test R² (log-transformed price): {test_r2_rf:.3f}")
print(f"Median actual price: {median_price_rf:,.2f} euros")
print(f"MAE: {mae_rf:,.2f} euros ({mae_pct_rf:.2f}%)")
print(f"RMSE: {rmse_rf:,.2f} euros ({rmse_pct_rf:.2f}%)")

print("TOP 15 MOST IMPORTANT FEATURES (Random Forest)")

feature_importance = pd.DataFrame({
    'feature': X_train.columns,
    'importance': rf_model.feature_importances_
}).sort_values('importance', ascending=False).head(15)

print(feature_importance.to_string(index=False))


Training R² (log-transformed price): 0.986
Test R² (log-transformed price): 0.933
Median actual price: 16,900.00 euros
MAE: 3,000.57 euros (17.75%)
RMSE: 5,055.60 euros (29.91%)
TOP 15 MOST IMPORTANT FEATURES (Random Forest)
                     feature  importance
                   first_reg    0.303198
                       power    0.197787
                         age    0.177295
     age_mileage_interaction    0.164928
                 model_group    0.068843
drive_type_front-wheel drive    0.015476
                 engine_size    0.012976
            power_per_engine    0.009067
                     mileage    0.008257
            mileage_per_year    0.007666
              gearbox_manual    0.004759
                  brand_Audi    0.001829
                 fuel_petrol    0.001603
            brand_Volkswagen    0.001204
           body_type_minivan    0.001196


In [23]:
from sklearn.ensemble import RandomForestRegressor

rf_model = RandomForestRegressor(
    n_estimators=100,
    max_depth=20,          # Limit depth to prevent overfitting
    min_samples_split=5,
    random_state=42,
    n_jobs=-1
)

rf_model.fit(X_train_reduced, y_train_reduced)

y_pred_train_rf = rf_model.predict(X_train_reduced)
y_pred_test_rf = rf_model.predict(X_test_reduced)

# Inverse transform back to original prices
y_pred_train_rf_original = np.exp(y_pred_train_rf)
y_pred_test_rf_original = np.exp(y_pred_test_rf)

train_r2_rf = rf_model.score(X_train_reduced, y_train_reduced)
test_r2_rf = rf_model.score(X_test_reduced, y_test_reduced)

mae_rf = mean_absolute_error(y_test_original, y_pred_test_rf_original)
rmse_rf = np.sqrt(mean_squared_error(y_test_original, y_pred_test_rf_original))
median_price_rf = np.median(y_test_original)

mae_pct_rf = (mae_rf / median_price_rf) * 100
rmse_pct_rf = (rmse_rf / median_price_rf) * 100

print(f"\nTraining R² (log-transformed price): {train_r2_rf:.3f}")
print(f"Test R² (log-transformed price): {test_r2_rf:.3f}")
print(f"Median actual price: {median_price_rf:,.2f} euros")
print(f"MAE: {mae_rf:,.2f} euros ({mae_pct_rf:.2f}%)")
print(f"RMSE: {rmse_rf:,.2f} euros ({rmse_pct_rf:.2f}%)")

print("TOP 15 MOST IMPORTANT FEATURES (Random Forest)")

feature_importance = pd.DataFrame({
    'feature': X_train_reduced.columns,
    'importance': rf_model.feature_importances_
}).sort_values('importance', ascending=False).head(15)

print(feature_importance.to_string(index=False))


Training R² (log-transformed price): 0.986
Test R² (log-transformed price): 0.934
Median actual price: 16,900.00 euros
MAE: 2,947.94 euros (17.44%)
RMSE: 5,098.95 euros (30.17%)
TOP 15 MOST IMPORTANT FEATURES (Random Forest)
                     feature  importance
                   first_reg    0.375022
                         age    0.227247
                       power    0.207108
                 model_group    0.068034
                     mileage    0.048730
drive_type_front-wheel drive    0.016841
                 engine_size    0.015939
              gearbox_manual    0.005391
                 fuel_petrol    0.001857
                  brand_Audi    0.001844
            brand_Volkswagen    0.001579
           body_type_minivan    0.001368
           body_type_touring    0.001367
             body_type_sedan    0.001220
                 color_black    0.001150


Run cross validation on better model (trained on reduced dataset)

In [24]:
# Random Forest Hyperparameter Tuning with Cross-Validation
from sklearn.model_selection import GridSearchCV
import time

print("="*70)
print("Random Forest Hyperparameter Tuning with GridSearchCV")
print("="*70)
print("This may take several minutes...\n")

# Define parameter grid to search
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [15, 20, 25, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

rf_base = RandomForestRegressor(random_state=42, n_jobs=-1)

# Setup GridSearchCV with 5-fold cross-validation
grid_search_rf = GridSearchCV(
    estimator=rf_base,
    param_grid=param_grid_rf,
    cv=5,
    scoring='r2',
    verbose=1,
    n_jobs=-1
)

start_time = time.time()
grid_search_rf.fit(X_train, y_train)
elapsed_time = time.time() - start_time

print(f"\nGrid Search completed in {elapsed_time:.2f} seconds ({elapsed_time/60:.2f} minutes)")
print(f"\nBest parameters found:")
for param, value in grid_search_rf.best_params_.items():
    print(f"  {param}: {value}")
print(f"\nBest cross-validation R² score: {grid_search_rf.best_score_:.4f}")

# Use the best model
rf_model_tuned = grid_search_rf.best_estimator_

y_pred_train_rf_tuned = rf_model_tuned.predict(X_train)
y_pred_test_rf_tuned = rf_model_tuned.predict(X_test)

# Inverse transform
y_pred_train_rf_tuned_original = np.exp(y_pred_train_rf_tuned)
y_pred_test_rf_tuned_original = np.exp(y_pred_test_rf_tuned)

train_r2_rf_tuned = rf_model_tuned.score(X_train, y_train)
test_r2_rf_tuned = rf_model_tuned.score(X_test, y_test)

mae_rf_tuned = mean_absolute_error(y_test_original, y_pred_test_rf_tuned_original)
rmse_rf_tuned = np.sqrt(mean_squared_error(y_test_original, y_pred_test_rf_tuned_original))

mae_pct_rf_tuned = (mae_rf_tuned / median_price_rf) * 100
rmse_pct_rf_tuned = (rmse_rf_tuned / median_price_rf) * 100

print(f"\nTraining R² (log-transformed price): {train_r2_rf_tuned:.3f}")
print(f"Test R² (log-transformed price): {test_r2_rf_tuned:.3f}")
print(f"Median actual price: {median_price_rf:,.2f} euros")
print(f"MAE: {mae_rf_tuned:,.2f} euros ({mae_pct_rf_tuned:.2f}%)")
print(f"RMSE: {rmse_rf_tuned:,.2f} euros ({rmse_pct_rf_tuned:.2f}%)")

print("\nTOP 15 MOST IMPORTANT FEATURES (Tuned Random Forest)")
feature_importance_rf_tuned = pd.DataFrame({
    'feature': X_train.columns,
    'importance': rf_model_tuned.feature_importances_
}).sort_values('importance', ascending=False).head(15)

print(feature_importance_rf_tuned.to_string(index=False))

Random Forest Hyperparameter Tuning with GridSearchCV
This may take several minutes...

Fitting 5 folds for each of 216 candidates, totalling 1080 fits





Grid Search completed in 157.97 seconds (2.63 minutes)

Best parameters found:
  max_depth: None
  max_features: sqrt
  min_samples_leaf: 1
  min_samples_split: 2
  n_estimators: 300

Best cross-validation R² score: 0.9299

Training R² (log-transformed price): 0.991
Test R² (log-transformed price): 0.934
Median actual price: 16,900.00 euros
MAE: 2,998.71 euros (17.74%)
RMSE: 5,219.42 euros (30.88%)

TOP 15 MOST IMPORTANT FEATURES (Tuned Random Forest)
                     feature  importance
                   first_reg    0.167115
     age_mileage_interaction    0.159150
                         age    0.142395
                     mileage    0.097295
                 model_group    0.089452
                       power    0.076786
            power_per_engine    0.048383
                 engine_size    0.036556
drive_type_front-wheel drive    0.034802
              gearbox_manual    0.033853
            mileage_per_year    0.021039
                 fuel_hybrid    0.011010
          

XGBoost Model

In [25]:
from xgboost import XGBRegressor

xgb_model = XGBRegressor(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    random_state=42,
    n_jobs=-1
)

xgb_model.fit(X_train, y_train)

y_pred_train_xgb = xgb_model.predict(X_train)
y_pred_test_xgb = xgb_model.predict(X_test)

# Inverse transform back to original prices
y_pred_train_xgb_original = np.exp(y_pred_train_xgb)
y_pred_test_xgb_original = np.exp(y_pred_test_xgb)

train_r2_xgb = xgb_model.score(X_train, y_train)
test_r2_xgb = xgb_model.score(X_test, y_test)

mae_xgb = mean_absolute_error(y_test_original, y_pred_test_xgb_original)
rmse_xgb = np.sqrt(mean_squared_error(y_test_original, y_pred_test_xgb_original))
median_price_xgb = np.median(y_test_original)

mae_pct_xgb = (mae_xgb / median_price_xgb) * 100
rmse_pct_xgb = (rmse_xgb / median_price_xgb) * 100

print(f"\nTraining R² (log-transformed price): {train_r2_xgb:.3f}")
print(f"Test R² (log-transformed price): {test_r2_xgb:.3f}")
print(f"Median actual price: {median_price_xgb:,.2f} euros")
print(f"MAE: {mae_xgb:,.2f} euros ({mae_pct_xgb:.2f}%)")
print(f"RMSE: {rmse_xgb:,.2f} euros ({rmse_pct_xgb:.2f}%)")

print("TOP 15 MOST IMPORTANT FEATURES (XGBoost)")

feature_importance_xgb = pd.DataFrame({
    'feature': X_train.columns,
    'importance': xgb_model.feature_importances_
}).sort_values('importance', ascending=False).head(15)

print(feature_importance_xgb.to_string(index=False))


Training R² (log-transformed price): 0.964
Test R² (log-transformed price): 0.937
Median actual price: 16,900.00 euros
MAE: 2,919.85 euros (17.28%)
RMSE: 4,827.64 euros (28.57%)
TOP 15 MOST IMPORTANT FEATURES (XGBoost)
                     feature  importance
                   first_reg    0.198987
     age_mileage_interaction    0.140884
drive_type_front-wheel drive    0.118942
                       power    0.097125
              body_type_open    0.053057
                 model_group    0.051935
              gearbox_manual    0.030344
                 engine_size    0.014492
               fuel_electric    0.011648
                brand_Subaru    0.011625
               brand_Peugeot    0.011566
               brand_Hyundai    0.011555
                 fuel_petrol    0.011267
             body_type_coupe    0.011075
            brand_Land Rover    0.010771


In [30]:
from xgboost import XGBRegressor

xgb_model = XGBRegressor(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    random_state=42,
    n_jobs=-1
)

xgb_model.fit(X_train_reduced, y_train_reduced)

y_pred_train_xgb = xgb_model.predict(X_train_reduced)
y_pred_test_xgb = xgb_model.predict(X_test_reduced)

# Inverse transform back to original prices
y_pred_train_xgb_original = np.exp(y_pred_train_xgb)
y_pred_test_xgb_original = np.exp(y_pred_test_xgb)

train_r2_xgb = xgb_model.score(X_train_reduced, y_train_reduced)
test_r2_xgb = xgb_model.score(X_test_reduced, y_test_reduced)

mae_xgb = mean_absolute_error(y_test_original, y_pred_test_xgb_original)
rmse_xgb = np.sqrt(mean_squared_error(y_test_original, y_pred_test_xgb_original))
median_price_xgb = np.median(y_test_original)

mae_pct_xgb = (mae_xgb / median_price_xgb) * 100
rmse_pct_xgb = (rmse_xgb / median_price_xgb) * 100

print(f"\nTraining R² (log-transformed price): {train_r2_xgb:.3f}")
print(f"Test R² (log-transformed price): {test_r2_xgb:.3f}")
print(f"Median actual price: {median_price_xgb:,.2f} euros")
print(f"MAE: {mae_xgb:,.2f} euros ({mae_pct_xgb:.2f}%)")
print(f"RMSE: {rmse_xgb:,.2f} euros ({rmse_pct_xgb:.2f}%)")

print("TOP 15 MOST IMPORTANT FEATURES (XGBoost)")

feature_importance_xgb = pd.DataFrame({
    'feature': X_train_reduced.columns,
    'importance': xgb_model.feature_importances_
}).sort_values('importance', ascending=False).head(15)

print(feature_importance_xgb.to_string(index=False))


Training R² (log-transformed price): 0.963
Test R² (log-transformed price): 0.939
Median actual price: 16,900.00 euros
MAE: 2,887.56 euros (17.09%)
RMSE: 4,809.35 euros (28.46%)
TOP 15 MOST IMPORTANT FEATURES (XGBoost)
                     feature  importance
                   first_reg    0.247641
drive_type_front-wheel drive    0.139987
                       power    0.102284
                 model_group    0.057485
              gearbox_manual    0.033359
              body_type_open    0.024840
               fuel_electric    0.020576
                     mileage    0.020046
            brand_Land Rover    0.017366
             body_type_coupe    0.017311
               brand_Hyundai    0.017250
                 engine_size    0.016655
                 brand_Tesla    0.015495
                   brand_Kia    0.012835
                 fuel_petrol    0.012599


XGBoost Hyperparameter Tuning with Cross-Validation

We'll use GridSearchCV to find the optimal hyperparameters for XGBoost by testing different combinations of:
- n_estimators: Number of boosting rounds
- max_depth: Maximum tree depth
- learning_rate: Step size shrinkage
- min_child_weight: Minimum sum of instance weight needed in a child
- subsample: Fraction of samples used for fitting trees
- colsample_bytree: Fraction of features used for fitting trees

In [31]:
from sklearn.model_selection import GridSearchCV
import time

print("Starting hyperparameter tuning with GridSearchCV...")
print("This may take several minutes...\n")

# Define parameter grid to search
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [4, 6, 8],
    'learning_rate': [0.01, 0.05, 0.1],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

# Initialize XGBoost model
xgb_base = XGBRegressor(random_state=42, n_jobs=-1)

# Setup GridSearchCV with 5-fold cross-validation
grid_search = GridSearchCV(
    estimator=xgb_base,
    param_grid=param_grid,
    cv=5,
    scoring='r2',
    verbose=1,
    n_jobs=-1
)

# Start timing
start_time = time.time()

# Fit GridSearchCV
grid_search.fit(X_train_reduced, y_train_reduced)

# End timing
end_time = time.time()
elapsed_time = end_time - start_time

print(f"\nGrid Search completed in {elapsed_time:.2f} seconds ({elapsed_time/60:.2f} minutes)")
print(f"\nBest parameters found:")
for param, value in grid_search.best_params_.items():
    print(f"  {param}: {value}")
print(f"\nBest cross-validation R² score: {grid_search.best_score_:.4f}")

Starting hyperparameter tuning with GridSearchCV...
This may take several minutes...

Fitting 5 folds for each of 324 candidates, totalling 1620 fits

Grid Search completed in 74.81 seconds (1.25 minutes)

Best parameters found:
  colsample_bytree: 0.8
  learning_rate: 0.1
  max_depth: 6
  min_child_weight: 1
  n_estimators: 300
  subsample: 0.8

Best cross-validation R² score: 0.9411


In [32]:
# Use the best XGBoost model
xgb_model_tuned = grid_search.best_estimator_

y_pred_train_xgb_tuned = xgb_model_tuned.predict(X_train_reduced)
y_pred_test_xgb_tuned = xgb_model_tuned.predict(X_test_reduced)

# Inverse transform
y_pred_train_xgb_tuned_original = np.exp(y_pred_train_xgb_tuned)
y_pred_test_xgb_tuned_original = np.exp(y_pred_test_xgb_tuned)

train_r2_xgb_tuned = xgb_model_tuned.score(X_train_reduced, y_train_reduced)
test_r2_xgb_tuned = xgb_model_tuned.score(X_test_reduced, y_test_reduced)

mae_xgb_tuned = mean_absolute_error(y_test_original, y_pred_test_xgb_tuned_original)
rmse_xgb_tuned = np.sqrt(mean_squared_error(y_test_original, y_pred_test_xgb_tuned_original))

mae_pct_xgb_tuned = (mae_xgb_tuned / median_price_xgb) * 100
rmse_pct_xgb_tuned = (rmse_xgb_tuned / median_price_xgb) * 100

print(f"\nTraining R² (log-transformed price): {train_r2_xgb_tuned:.3f}")
print(f"Test R² (log-transformed price): {test_r2_xgb_tuned:.3f}")
print(f"Median actual price: {median_price_xgb:,.2f} euros")
print(f"MAE: {mae_xgb_tuned:,.2f} euros ({mae_pct_xgb_tuned:.2f}%)")
print(f"RMSE: {rmse_xgb_tuned:,.2f} euros ({rmse_pct_xgb_tuned:.2f}%)")

print("\nTOP 15 MOST IMPORTANT FEATURES (Tuned XGBoost)")
feature_importance_xgb_tuned = pd.DataFrame({
    'feature': X_train_reduced.columns,
    'importance': xgb_model_tuned.feature_importances_
}).sort_values('importance', ascending=False).head(15)

print(feature_importance_xgb_tuned.to_string(index=False))


Training R² (log-transformed price): 0.981
Test R² (log-transformed price): 0.944
Median actual price: 16,900.00 euros
MAE: 2,642.59 euros (15.64%)
RMSE: 4,294.49 euros (25.41%)

TOP 15 MOST IMPORTANT FEATURES (Tuned XGBoost)
                     feature  importance
drive_type_front-wheel drive    0.209751
                         age    0.149272
                   first_reg    0.132906
                       power    0.045465
                 model_group    0.041542
               brand_Porsche    0.019861
              gearbox_manual    0.019684
                 engine_size    0.015681
               fuel_electric    0.014231
                     mileage    0.013618
                  brand_Opel    0.012774
                brand_Subaru    0.012729
            brand_Land Rover    0.012523
                   brand_Kia    0.011819
                  brand_Audi    0.011778


In [34]:
print("="*70)
print("FINAL MODEL COMPARISON - ALL MODELS")
print("="*70)

comparison_all_final = pd.DataFrame({
    'Model': [
        'Linear Regression', 
        'Random Forest (Baseline)', 
        'Random Forest (Tuned)', 
        'XGBoost (Default)',
        'XGBoost (Tuned)'
    ],
    'Train R²': [train_r2, train_r2_rf, train_r2_rf_tuned, train_r2_xgb, train_r2_xgb_tuned],
    'Test R²': [test_r2, test_r2_rf, test_r2_rf_tuned, test_r2_xgb, test_r2_xgb_tuned],
    'MAE (€)': [mae, mae_rf, mae_rf_tuned, mae_xgb, mae_xgb_tuned],
    'RMSE (€)': [rmse, rmse_rf, rmse_rf_tuned, rmse_xgb, rmse_xgb_tuned],
    'MAE (%)': [mae_pct, mae_pct_rf, mae_pct_rf_tuned, mae_pct_xgb, mae_pct_xgb_tuned],
    'RMSE (%)': [rmse_pct, rmse_pct_rf, rmse_pct_rf_tuned, rmse_pct_xgb, rmse_pct_xgb_tuned]
})

print(comparison_all_final.to_string(index=False))

best_model_idx = comparison_all_final['Test R²'].idxmax()
best_model_name = comparison_all_final.loc[best_model_idx, 'Model']

print("\n" + "="*70)
print(f"BEST MODEL: {best_model_name}")
print("="*70)
print(f"Test R²: {comparison_all_final.loc[best_model_idx, 'Test R²']:.3f}")
print(f"MAE: {comparison_all_final.loc[best_model_idx, 'MAE (€)']:,.2f} euros ({comparison_all_final.loc[best_model_idx, 'MAE (%)']:.2f}%)")
print(f"RMSE: {comparison_all_final.loc[best_model_idx, 'RMSE (€)']:,.2f} euros ({comparison_all_final.loc[best_model_idx, 'RMSE (%)']:.2f}%)")
print("="*70)

FINAL MODEL COMPARISON - ALL MODELS
                   Model  Train R²  Test R²     MAE (€)    RMSE (€)   MAE (%)  RMSE (%)
       Linear Regression  0.862462 0.857406 4428.838674 7539.906956 26.206146 44.614834
Random Forest (Baseline)  0.985606 0.934130 2947.936765 5098.946729 17.443413 30.171282
   Random Forest (Tuned)  0.990835 0.933774 2998.714342 5219.423615 17.743872 30.884163
       XGBoost (Default)  0.963282 0.938772 2887.563873 4809.347072 17.086177 28.457675
         XGBoost (Tuned)  0.980680 0.943612 2642.587424 4294.491207 15.636612 25.411191

BEST MODEL: XGBoost (Tuned)
Test R²: 0.944
MAE: 2,642.59 euros (15.64%)
RMSE: 4,294.49 euros (25.41%)


In [35]:
# --- 1. BASE MODEL DEFINITION (CRITICAL: USE YOUR BEST HYPERPARAMETERS!) ---
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import Ridge
from sklearn.ensemble import StackingRegressor
import xgboost as xgb
from sklearn.metrics import r2_score

rf_model_base = RandomForestRegressor(n_estimators=100, max_depth=20, random_state=42) 

xgb_model_base = xgb.XGBRegressor(n_estimators=300, max_depth=6, random_state=42)

# KRR parameters can stay as is for a first try
krr_model_base = KernelRidge(alpha=0.6, kernel='polynomial', degree=2)

In [36]:
# ===== ENSEMBLE MODEL (ROBUST STACKING - FULL, CORRECTED) =====
print("THIS TAKES SEVERAL MINUTES...\n")

# --- 1. BASE MODEL DEFINITION (Use your best parameters here!) ---
# Ensure these lines reflect the exact parameters of your best RF and XGB models.
rf_model_base = RandomForestRegressor(n_estimators=500, max_depth=10, random_state=42) 
xgb_model_base = xgb.XGBRegressor(n_estimators=500, max_depth=5, random_state=42) 
krr_model_base = KernelRidge(alpha=0.6, kernel='polynomial', degree=2) 


# --- STEP 1: ROBUST STACKING (2 MODELS: RF + XGB) ---
estimators_2_models = [
    ('rf', rf_model_base),
    ('xgb', xgb_model_base)
]

stack_2_models = StackingRegressor(
    estimators=estimators_2_models, 
    final_estimator=Ridge(alpha=1.0),
    cv=5 
)

print("# --- RUNNING ROBUST STACKING (2 MODELS: RF + XGB) --- #")
stack_2_models.fit(X_train, y_train)

y_pred_stacked_log_2 = stack_2_models.predict(X_test)
y_pred_stacked_original_2 = np.exp(y_pred_stacked_log_2)

rmse_ensemble_stack_2 = np.sqrt(mean_squared_error(y_test_original, y_pred_stacked_original_2))
mae_ensemble_stack_2 = mean_absolute_error(y_test_original, y_pred_stacked_original_2)
r2_ensemble_stack_2 = r2_score(y_test_original, y_pred_stacked_original_2)

print(f"# ===== ROBUST STACKING ENSEMBLE PERFORMANCE (RF + XGB) ===== #")
print(f"RMSE: {rmse_ensemble_stack_2:.2f}")
print(f"MAE:  {mae_ensemble_stack_2:.2f}")
print(f"R²:   {r2_ensemble_stack_2:.4f}")
print("-" * 50)


# --- STEP 2: ADD A DIVERSE THIRD MODEL (KRR) ---
estimators_3_models = [
    ('rf', rf_model_base),
    ('xgb', xgb_model_base),
    ('krr', krr_model_base) 
]

stack_3_models = StackingRegressor(
    estimators=estimators_3_models, 
    final_estimator=Ridge(alpha=1.0),
    cv=5 
)

print("# --- RUNNING ROBUST STACKING (3 MODELS: RF + XGB + KRR) --- #")
stack_3_models.fit(X_train, y_train)

y_pred_stacked_log_3 = stack_3_models.predict(X_test)
y_pred_stacked_original_3 = np.exp(y_pred_stacked_log_3)

rmse_ensemble_stack_3 = np.sqrt(mean_squared_error(y_test_original, y_pred_stacked_original_3))
mae_ensemble_stack_3 = mean_absolute_error(y_test_original, y_pred_stacked_original_3)
r2_ensemble_stack_3 = r2_score(y_test_original, y_pred_stacked_original_3)

print(f"# ===== ROBUST STACKING ENSEMBLE PERFORMANCE (RF + XGB + KRR) ===== #")
print(f"RMSE: {rmse_ensemble_stack_3:.2f}")
print(f"MAE:  {mae_ensemble_stack_3:.2f}")
print(f"R²:   {r2_ensemble_stack_3:.4f}")

THIS TAKES SEVERAL MINUTES...

# --- RUNNING ROBUST STACKING (2 MODELS: RF + XGB) --- #
# ===== ROBUST STACKING ENSEMBLE PERFORMANCE (RF + XGB) ===== #
RMSE: 4482.45
MAE:  2749.63
R²:   0.9429
--------------------------------------------------
# --- RUNNING ROBUST STACKING (3 MODELS: RF + XGB + KRR) --- #
# ===== ROBUST STACKING ENSEMBLE PERFORMANCE (RF + XGB + KRR) ===== #
RMSE: 4435.43
MAE:  2733.10
R²:   0.9441
