In [119]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [120]:
# Read the dataset; the relative path is resolved against the working directory
csv_path = 'auto24_cars_clean.csv'
df = pd.read_csv(csv_path)

In [None]:
# Keep standalone copies of the 'url' and 'brand' columns so they stay
# available independently of the feature matrix built below
url_column, brand_column = df['url'].copy(), df['brand'].copy()

# Feature matrix: every column except the target ('price') and 'url'
non_feature_cols = ['price', 'url']
X = df.drop(non_feature_cols, axis=1)

# Target vector
y = df['price']

In [122]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report how many rows ended up on each side of the split
print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")

Training set size: 9122
Test set size: 2281


In [123]:
import xgboost as xgb
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
# Baseline regressor on the full feature set, with hand-picked hyperparameters
baseline_params = dict(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    random_state=42,
)
model = xgb.XGBRegressor(**baseline_params)

print("Training XGBoost model...")
model.fit(X_train, y_train)
print("Model training completed!")

Training XGBoost model...
Model training completed!


In [124]:
# Predict the target for the held-out test features
y_pred = model.predict(X_test)

print(f"Predictions made: {len(y_pred)}")
print("\nSample predictions vs actual:")
# Pair the first few predictions with their targets. Slicing plus zip stops at
# the shorter input, so a test set with fewer than 5 rows is handled without
# an IndexError.
for predicted, actual in zip(y_pred[:5], y_test.iloc[:5]):
    print(f"Predicted: {predicted:.4f}, Actual: {actual:.4f}")

Predictions made: 2281

Sample predictions vs actual:
Predicted: 10.1973, Actual: 10.1616
Predicted: 9.1487, Actual: 8.6993
Predicted: 9.0082, Actual: 9.3057
Predicted: 11.3028, Actual: 11.2885
Predicted: 8.7079, Actual: 8.6123


In [125]:
# R^2 is computed directly on the values the model was trained on and predicts
r2 = r2_score(y_test, y_pred)

# exp() maps targets and predictions back to the price scale
# (assumes 'price' holds natural-log prices -- TODO confirm against the cleaning step)
y_test_original, y_pred_original = np.exp(y_test), np.exp(y_pred)

# Absolute and squared-error metrics on the back-transformed values
mae = mean_absolute_error(y_test_original, y_pred_original)
mse = mean_squared_error(y_test_original, y_pred_original)
rmse = np.sqrt(mse)
median_price = np.median(y_test_original)

# Express both errors as a percentage of the median actual price
mae_pct, rmse_pct = ((err / median_price) * 100 for err in (mae, rmse))

rule = "=" * 70
print(rule)
print("MODEL PERFORMANCE METRICS (XGBoost)")
print(rule)
print(f"R¬≤ Score (log-transformed price):  {r2:.4f}")
print(f"Median actual price:                {median_price:,.2f} euros")
print(f"MAE:                                {mae:,.2f} euros ({mae_pct:.2f}%)")
print(f"RMSE:                               {rmse:,.2f} euros ({rmse_pct:.2f}%)")
print(rule)

MODEL PERFORMANCE METRICS (XGBoost)
R¬≤ Score (log-transformed price):  0.9373
Median actual price:                16,900.00 euros
MAE:                                2,919.85 euros (17.28%)
RMSE:                               4,827.64 euros (28.57%)


In [126]:
# Engineered columns to leave out of the second feature set
features_to_drop = ['power_per_engine', 'age_mileage_interaction', 'mileage_per_year']

# Same exclusions as the full feature matrix (target and url) plus the engineered columns
excluded_cols = ['price', 'url', *features_to_drop]
X_reduced = df.drop(columns=excluded_cols)
y_reduced = df['price']

# Column counts before and after removing the engineered features
print(f"Original features: {len(X.columns)}")
print(f"Reduced features: {len(X_reduced.columns)}")
print(f"\nDropped features: {features_to_drop}")

Original features: 85
Reduced features: 82

Dropped features: ['power_per_engine', 'age_mileage_interaction', 'mileage_per_year']


In [127]:
# Split the reduced feature set with the same test fraction and seed as the
# full-feature split, so both models are evaluated under the same settings
X_train_reduced, X_test_reduced, y_train_reduced, y_test_reduced = train_test_split(
    X_reduced,
    y_reduced,
    test_size=0.2,
    random_state=42,
)

# Report how many rows ended up on each side of the split
print(f"Training set size: {len(X_train_reduced)}")
print(f"Test set size: {len(X_test_reduced)}")

Training set size: 9122
Test set size: 2281


In [128]:
# Same hyperparameters as the full-feature baseline, fitted on the reduced feature set
reduced_params = dict(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    random_state=42,
)
model_reduced = xgb.XGBRegressor(**reduced_params)

print("Training XGBoost model (without engineered features)...")
model_reduced.fit(X_train_reduced, y_train_reduced)
print("Model training completed!")

Training XGBoost model (without engineered features)...
Model training completed!


In [129]:
# Predict on the held-out rows of the reduced feature set
y_pred_reduced = model_reduced.predict(X_test_reduced)

# R^2 on the values the model was trained on and predicts
r2_reduced = r2_score(y_test_reduced, y_pred_reduced)

# exp() maps targets and predictions back to the price scale
# (assumes 'price' holds natural-log prices -- TODO confirm against the cleaning step)
y_test_reduced_original, y_pred_reduced_original = np.exp(y_test_reduced), np.exp(y_pred_reduced)

# Absolute and squared-error metrics on the back-transformed values
mae_reduced = mean_absolute_error(y_test_reduced_original, y_pred_reduced_original)
mse_reduced = mean_squared_error(y_test_reduced_original, y_pred_reduced_original)
rmse_reduced = np.sqrt(mse_reduced)
median_price_reduced = np.median(y_test_reduced_original)

# Express both errors as a percentage of the median actual price
mae_pct_reduced, rmse_pct_reduced = (
    (err / median_price_reduced) * 100 for err in (mae_reduced, rmse_reduced)
)

rule = "=" * 70
print(rule)
print("MODEL PERFORMANCE METRICS (XGBoost - Without Engineered Features)")
print(rule)
print(f"R¬≤ Score (log-transformed price):  {r2_reduced:.4f}")
print(f"Median actual price:                {median_price_reduced:,.2f} euros")
print(f"MAE:                                {mae_reduced:,.2f} euros ({mae_pct_reduced:.2f}%)")
print(f"RMSE:                               {rmse_reduced:,.2f} euros ({rmse_pct_reduced:.2f}%)")
print(rule)

MODEL PERFORMANCE METRICS (XGBoost - Without Engineered Features)
R¬≤ Score (log-transformed price):  0.9388
Median actual price:                16,900.00 euros
MAE:                                2,887.56 euros (17.09%)
RMSE:                               4,809.35 euros (28.46%)


## Model Comparison

In [130]:
# Side-by-side summary of the two models trained with identical hyperparameters
comparison = pd.DataFrame({
    'Model': ['XGBoost (All Features)', 'XGBoost (Without Engineered Features)'],
    'Features': [X.shape[1], X_reduced.shape[1]],
    'R¬≤ Score': [r2, r2_reduced],
    'MAE (‚Ç¨)': [mae, mae_reduced],
    'RMSE (‚Ç¨)': [rmse, rmse_reduced],
    'MAE (%)': [mae_pct, mae_pct_reduced],
    'RMSE (%)': [rmse_pct, rmse_pct_reduced]
})

print("=" * 90)
print("MODEL COMPARISON")
print("=" * 90)
print(comparison.to_string(index=False))
print("=" * 90)

# Differences are (all features) minus (without engineered features).
# The direction of "better" depends on the metric: higher is better for R^2,
# lower is better for MAE and RMSE. A single "positive = better" header would
# be wrong for the two error metrics, so each line states its own reading.
r2_diff = r2 - r2_reduced
mae_diff = mae - mae_reduced
rmse_diff = rmse - rmse_reduced

print("\nPerformance Difference (All Features minus Without Engineered Features):")
print(f"  R¬≤ difference:    {r2_diff:+.4f}  (positive = all features better)")
print(f"  MAE difference:   {mae_diff:+.2f} euros  (negative = all features better)")
print(f"  RMSE difference:  {rmse_diff:+.2f} euros  (negative = all features better)")
print("=" * 90)

MODEL COMPARISON
                                Model  Features  R¬≤ Score     MAE (‚Ç¨)    RMSE (‚Ç¨)   MAE (%)  RMSE (%)
               XGBoost (All Features)        85  0.937262 2919.848738 4827.641115 17.277211 28.565924
XGBoost (Without Engineered Features)        82  0.938772 2887.563873 4809.347072 17.086177 28.457675

Performance Difference (Positive = All Features Better):
  R¬≤ difference:    -0.0015
  MAE difference:   +32.28 euros
  RMSE difference:  +18.29 euros


## Hyperparameter Tuning with GridSearchCV

Now let's find the best hyperparameters for both models using GridSearchCV with 5-fold cross-validation.

In [131]:
from sklearn.model_selection import GridSearchCV
import time

# Candidate values for each XGBoost hyperparameter explored by the grid search
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [4, 6, 8],
    'learning_rate': [0.01, 0.05, 0.1],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

# Number of cross-validation folds used for the cost estimate printed below
cv_folds = 5

# Derive the search size from the grid itself so the printed numbers stay
# correct whenever a value list is edited
n_candidates = 1
for values in param_grid.values():
    n_candidates *= len(values)
n_fits = n_candidates * cv_folds

print("Parameter grid:")
for param, values in param_grid.items():
    print(f"  {param}: {values}")

# Show the per-parameter list sizes that multiply to the total
size_product = " * ".join(str(len(values)) for values in param_grid.values())
print(f"\nTotal combinations: {size_product} = {n_candidates}")
print(f"With {cv_folds}-fold cross-validation: {n_candidates} √ó {cv_folds} = {n_fits:,} model fits")
print("This will take several minutes...")

Parameter grid:
  n_estimators: [100, 200, 300]
  max_depth: [4, 6, 8]
  learning_rate: [0.01, 0.05, 0.1]
  min_child_weight: [1, 3, 5]
  subsample: [0.8, 1.0]
  colsample_bytree: [0.8, 1.0]

Total combinations: 324 = 324
With 5-fold cross-validation: 324 √ó 5 = 1,620 model fits
This will take several minutes...


In [132]:
# Hyperparameter tuning for Model 1 (All Features)
print("=" * 70)
print("TUNING MODEL 1: XGBoost with All Features")
print("=" * 70)

# Parallelism is applied at one level only: GridSearchCV spreads the candidate
# fits across all cores (n_jobs=-1 below), so each XGBoost fit is kept on a
# single thread. Asking for all cores at both levels makes the workers compete
# for the same CPUs.
xgb_base_all = xgb.XGBRegressor(random_state=42, n_jobs=1)

# Exhaustive search over param_grid, scored by 5-fold cross-validated R^2
grid_search_all = GridSearchCV(
    estimator=xgb_base_all,
    param_grid=param_grid,
    cv=5,
    scoring='r2',
    verbose=1,
    n_jobs=-1
)

# perf_counter() is a monotonic clock intended for measuring elapsed time
start_time = time.perf_counter()
grid_search_all.fit(X_train, y_train)
elapsed_time = time.perf_counter() - start_time

print(f"\nGrid Search completed in {elapsed_time:.2f} seconds ({elapsed_time/60:.2f} minutes)")
print("\nBest parameters:")
for param, value in grid_search_all.best_params_.items():
    print(f"  {param}: {value}")
print(f"\nBest cross-validation R¬≤ score: {grid_search_all.best_score_:.4f}")

TUNING MODEL 1: XGBoost with All Features
Fitting 5 folds for each of 324 candidates, totalling 1620 fits





Grid Search completed in 97.46 seconds (1.62 minutes)

Best parameters:
  colsample_bytree: 0.8
  learning_rate: 0.05
  max_depth: 8
  min_child_weight: 3
  n_estimators: 300
  subsample: 0.8

Best cross-validation R¬≤ score: 0.9385


In [133]:
# Hyperparameter tuning for Model 2 (Without Engineered Features)
print("=" * 70)
print("TUNING MODEL 2: XGBoost without Engineered Features")
print("=" * 70)

# Parallelism is applied at one level only: GridSearchCV spreads the candidate
# fits across all cores (n_jobs=-1 below), so each XGBoost fit is kept on a
# single thread. Asking for all cores at both levels makes the workers compete
# for the same CPUs.
xgb_base_reduced = xgb.XGBRegressor(random_state=42, n_jobs=1)

# Exhaustive search over param_grid, scored by 5-fold cross-validated R^2
grid_search_reduced = GridSearchCV(
    estimator=xgb_base_reduced,
    param_grid=param_grid,
    cv=5,
    scoring='r2',
    verbose=1,
    n_jobs=-1
)

# perf_counter() is a monotonic clock intended for measuring elapsed time
start_time = time.perf_counter()
grid_search_reduced.fit(X_train_reduced, y_train_reduced)
elapsed_time = time.perf_counter() - start_time

print(f"\nGrid Search completed in {elapsed_time:.2f} seconds ({elapsed_time/60:.2f} minutes)")
print("\nBest parameters:")
for param, value in grid_search_reduced.best_params_.items():
    print(f"  {param}: {value}")
print(f"\nBest cross-validation R¬≤ score: {grid_search_reduced.best_score_:.4f}")

TUNING MODEL 2: XGBoost without Engineered Features
Fitting 5 folds for each of 324 candidates, totalling 1620 fits





Grid Search completed in 80.46 seconds (1.34 minutes)

Best parameters:
  colsample_bytree: 0.8
  learning_rate: 0.1
  max_depth: 6
  min_child_weight: 1
  n_estimators: 300
  subsample: 0.8

Best cross-validation R¬≤ score: 0.9411


## Evaluate Tuned Models

In [134]:
# Score the tuned all-features estimator on the held-out test rows
tuned_all = grid_search_all.best_estimator_
y_pred_tuned_all = tuned_all.predict(X_test)
y_pred_tuned_all_original = np.exp(y_pred_tuned_all)

# R^2 on the model's own scale; MAE / RMSE on exp-transformed values,
# compared against the y_test_original computed for the baseline model
r2_tuned_all = r2_score(y_test, y_pred_tuned_all)
mae_tuned_all = mean_absolute_error(y_test_original, y_pred_tuned_all_original)
mse_tuned_all = mean_squared_error(y_test_original, y_pred_tuned_all_original)
rmse_tuned_all = np.sqrt(mse_tuned_all)

# Percentages are relative to the same median_price used for the baseline model
mae_pct_tuned_all, rmse_pct_tuned_all = (
    (err / median_price) * 100 for err in (mae_tuned_all, rmse_tuned_all)
)

rule = "=" * 70
print(rule)
print("TUNED MODEL 1 PERFORMANCE (All Features)")
print(rule)
print(f"R¬≤ Score (log-transformed):  {r2_tuned_all:.4f}")
print(f"MAE:                         {mae_tuned_all:,.2f} euros ({mae_pct_tuned_all:.2f}%)")
print(f"RMSE:                        {rmse_tuned_all:,.2f} euros ({rmse_pct_tuned_all:.2f}%)")
print(rule)

TUNED MODEL 1 PERFORMANCE (All Features)
R¬≤ Score (log-transformed):  0.9426
MAE:                         2,667.96 euros (15.79%)
RMSE:                        4,388.70 euros (25.97%)


In [135]:
# Score the tuned reduced-feature estimator on the held-out test rows
tuned_reduced = grid_search_reduced.best_estimator_
y_pred_tuned_reduced = tuned_reduced.predict(X_test_reduced)
y_pred_tuned_reduced_original = np.exp(y_pred_tuned_reduced)

# R^2 on the model's own scale; MAE / RMSE on exp-transformed values,
# compared against the y_test_reduced_original computed for the default reduced model
r2_tuned_reduced = r2_score(y_test_reduced, y_pred_tuned_reduced)
mae_tuned_reduced = mean_absolute_error(y_test_reduced_original, y_pred_tuned_reduced_original)
mse_tuned_reduced = mean_squared_error(y_test_reduced_original, y_pred_tuned_reduced_original)
rmse_tuned_reduced = np.sqrt(mse_tuned_reduced)

# Percentages are relative to the same median used for the default reduced model
mae_pct_tuned_reduced, rmse_pct_tuned_reduced = (
    (err / median_price_reduced) * 100 for err in (mae_tuned_reduced, rmse_tuned_reduced)
)

rule = "=" * 70
print(rule)
print("TUNED MODEL 2 PERFORMANCE (Without Engineered Features)")
print(rule)
print(f"R¬≤ Score (log-transformed):  {r2_tuned_reduced:.4f}")
print(f"MAE:                         {mae_tuned_reduced:,.2f} euros ({mae_pct_tuned_reduced:.2f}%)")
print(f"RMSE:                        {rmse_tuned_reduced:,.2f} euros ({rmse_pct_tuned_reduced:.2f}%)")
print(rule)

TUNED MODEL 2 PERFORMANCE (Without Engineered Features)
R¬≤ Score (log-transformed):  0.9436
MAE:                         2,642.59 euros (15.64%)
RMSE:                        4,294.49 euros (25.41%)


## Final Comparison: All Models

In [136]:
# Collect the test-set metrics of all four fitted models in one table
final_comparison = pd.DataFrame({
    'Model': [
        'XGBoost (All Features - Default)',
        'XGBoost (Reduced Features - Default)',
        'XGBoost (All Features - Tuned)',
        'XGBoost (Reduced Features - Tuned)'
    ],
    'Features': [X.shape[1], X_reduced.shape[1], X.shape[1], X_reduced.shape[1]],
    'R¬≤ Score': [r2, r2_reduced, r2_tuned_all, r2_tuned_reduced],
    'MAE (‚Ç¨)': [mae, mae_reduced, mae_tuned_all, mae_tuned_reduced],
    'RMSE (‚Ç¨)': [rmse, rmse_reduced, rmse_tuned_all, rmse_tuned_reduced],
    'MAE (%)': [mae_pct, mae_pct_reduced, mae_pct_tuned_all, mae_pct_tuned_reduced],
    'RMSE (%)': [rmse_pct, rmse_pct_reduced, rmse_pct_tuned_all, rmse_pct_tuned_reduced]
})

print("=" * 105)
print("FINAL MODEL COMPARISON - ALL MODELS")
print("=" * 105)
print(final_comparison.to_string(index=False))
print("=" * 105)

# Pick the row with the highest R^2.
# NOTE(review): these R^2 values were computed on the test split, so choosing
# the winner by them makes the winner's reported test metrics optimistic.
best_idx = final_comparison['R¬≤ Score'].idxmax()
best_model_name = final_comparison.loc[best_idx, 'Model']
best_r2 = final_comparison.loc[best_idx, 'R¬≤ Score']
best_mae = final_comparison.loc[best_idx, 'MAE (‚Ç¨)']
best_rmse = final_comparison.loc[best_idx, 'RMSE (‚Ç¨)']

print(f"\nüèÜ BEST MODEL: {best_model_name}")
print(f"   R¬≤ Score: {best_r2:.4f}")
print(f"   MAE:      {best_mae:,.2f} euros ({final_comparison.loc[best_idx, 'MAE (%)']:.2f}%)")
print(f"   RMSE:     {best_rmse:,.2f} euros ({final_comparison.loc[best_idx, 'RMSE (%)']:.2f}%)")
print("=" * 105)

# Store the best model for predictions.
# The list is ordered exactly like the rows of final_comparison (which uses the
# default 0..3 index), so the stored estimator is always the one reported above
# rather than a fixed choice.
candidate_models = [
    model,
    model_reduced,
    grid_search_all.best_estimator_,
    grid_search_reduced.best_estimator_,
]
best_model = candidate_models[best_idx]

FINAL MODEL COMPARISON - ALL MODELS
                               Model  Features  R¬≤ Score     MAE (‚Ç¨)    RMSE (‚Ç¨)   MAE (%)  RMSE (%)
    XGBoost (All Features - Default)        85  0.937262 2919.848738 4827.641115 17.277211 28.565924
XGBoost (Reduced Features - Default)        82  0.938772 2887.563873 4809.347072 17.086177 28.457675
      XGBoost (All Features - Tuned)        85  0.942597 2667.961147 4388.701431 15.786752 25.968648
  XGBoost (Reduced Features - Tuned)        82  0.943612 2642.587424 4294.491207 15.636612 25.411191

üèÜ BEST MODEL: XGBoost (Reduced Features - Tuned)
   R¬≤ Score: 0.9436
   MAE:      2,642.59 euros (15.64%)
   RMSE:     4,294.49 euros (25.41%)


## Generate Predictions for All Data

In [143]:
# Use the best model to predict on the entire dataset.
# The feature columns are taken from the names the model recorded during fit,
# so the input always matches whichever estimator is stored in best_model.
# If the estimator does not expose feature names, fall back to the reduced
# feature set.
# NOTE: the full dataset includes the rows the model was trained on, so the gap
# between actual and predicted price is optimistic for those rows.
model_features = list(getattr(best_model, 'feature_names_in_', X_reduced.columns))
predictions_log = best_model.predict(df[model_features])

# Convert predictions back to original price scale
predictions_original = np.exp(predictions_log)
actual_prices_original = np.exp(y_reduced)


# Create a dataframe with url, actual price, and predicted price
results_df = pd.DataFrame({
    'url': url_column,
    'actual_price': actual_prices_original,
    'predicted_price': predictions_original
})

# Sort by predicted_price in ascending order
results_df = results_df.sort_values('predicted_price', ascending=True).reset_index(drop=True)

print("=" * 70)
print("PREDICTIONS GENERATED FOR ALL DATA (Sorted by Predicted Price)")
print("=" * 70)
print(f"Total predictions: {len(results_df)}")
print("\nTop 10 cheapest cars by predicted price:")
print(results_df.head(10).to_string(index=False))
print("=" * 70)

# Save to CSV
results_df.to_csv('predictions_with_urls.csv', index=False)
print("\nPredictions saved to: predictions_with_urls.csv")

PREDICTIONS GENERATED FOR ALL DATA (Sorted by Predicted Price)
Total predictions: 11403

Top 10 cheapest cars by predicted price:
                                   url  actual_price  predicted_price
https://www.auto24.ee/soidukid/4248624        1000.0       971.655762
https://www.auto24.ee/soidukid/4236712        1099.0      1090.134399
https://www.auto24.ee/soidukid/4218207        1090.0      1163.249756
https://www.auto24.ee/soidukid/4232253        1000.0      1192.568481
https://www.auto24.ee/soidukid/4243210        2500.0      1194.049072
https://www.auto24.ee/soidukid/4239747        1250.0      1197.863647
https://www.auto24.ee/soidukid/4249475        1100.0      1199.565918
https://www.auto24.ee/soidukid/4240679        1100.0      1201.640015
https://www.auto24.ee/soidukid/4241103        1000.0      1207.896606
https://www.auto24.ee/soidukid/4224403        1190.0      1213.549194

Predictions saved to: predictions_with_urls.csv
