In [34]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor

from sklearn.metrics import root_mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
import joblib

In [35]:
df = pd.read_parquet('../04_EDA/car_ads_model.parquet')

In [23]:
df.shape

(182826, 20)

In [24]:
# Step 2: Lists to store results
all_results = []
performance_metrics = []

In [25]:
# Function to encode 'cv' (horsepower) column
def encode_cv(group, max_unique_cv=5):
    if group['cv'].nunique() <= max_unique_cv:
        # One-hot encoding if the number of unique 'cv' values is small
        return pd.get_dummies(group['cv'], prefix='cv')
    else:
        # Bin and label-encode if there are too many unique values
        bins = pd.qcut(group['cv'], q=5, duplicates='drop', labels=False)  # Adjust 'q' for the number of bins
        return bins

# Function to encode 'fuel' and other categorical variables
def encode_categorical_features(group):
    # One-hot encoding for 'fuel'
    fuel_encoded = pd.get_dummies(group['fuel'], prefix='fuel')
    
    # Handle CV encoding if 'cv' is present
    if group['cv'].notnull().all():
        cv_encoded = encode_cv(group)
    else:
        cv_encoded = pd.DataFrame(index=group.index)  # Empty if no 'cv'
    
    # Combine encoded features
    return pd.concat([fuel_encoded, cv_encoded], axis=1)

# Function to select a model based on the number of samples
def select_model(n_samples):
    if n_samples < 30:
        return SVR(kernel='poly', degree=3, C=0.2)  # L2 Regularization
    elif n_samples < 2_000:
        return XGBRegressor(n_estimators=500, learning_rate=0.05, max_depth=7)  # Tree-based regularization
    else:
        return RandomForestRegressor(n_estimators=500, max_depth=7)  # Tree-based with max depth

# Function to choose and apply model
def choose_model(group):
    X = group[['km', 'age_years', 'is_automatic']]  # Base features
    
    # Encode categorical features (fuel and cv)
    cat_encoded = encode_categorical_features(group)
    X = pd.concat([X, cat_encoded], axis=1)

    y = group['price']  # Target (Price)
    n_samples = len(group)
    
    # If very few samples, predict the mean price
    if n_samples < 5:
        group['predicted_price'] = [group['price'].mean()] * n_samples
        group['price_diff'] = group['price'] - group['predicted_price']
        rmse = root_mean_squared_error(y, group['predicted_price'])
        mape = mean_absolute_percentage_error(y, group['predicted_price'])
        performance_metrics.append({
            'brand': group['brand'].iloc[0],
            'model': group['model'].iloc[0],
            'rmse': rmse,
            'mape': mape
        })
        all_results.append(group)
        return

    # Train-test split logic
    test_size = 0.3 if n_samples >= 40 else 0.4
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)

    # Select the model based on dataset size
    model = select_model(n_samples)
    
    # Train the model
    model.fit(X_train, y_train)
    
    # Predicting on test set
    y_pred_test = model.predict(X_test)
    
    # Train on the entire dataset to predict for all data
    model.fit(X, y)
    y_pred_all = model.predict(X)

    # Save the trained model to disk
    brand = group['brand'].iloc[0]
    model_name = group['model'].iloc[0]
    model_file_name = f'saved_models/{brand}_{model_name}.pkl'
    joblib.dump(model, model_file_name)
    
    # Calculate performance metrics
    rmse = root_mean_squared_error(y_test, y_pred_test)
    mape = mean_absolute_percentage_error(y_test, y_pred_test) * 100
    performance_metrics.append({
        'brand': brand,
        'model': model_name,
        'rmse': round(rmse, 2),
        'mape': round(mape, 2)
    })

    # Add predictions for all data points
    group['predicted_price'] = y_pred_all
    group['price_diff'] = group['price'] - group['predicted_price']
    
    all_results.append(group)

In [171]:
results_df = pd.DataFrame(performance_metrics)  # Model performance metrics
final_results_df = pd.concat(all_results, ignore_index=True)  # Test data with predictions and price diff

In [172]:
final_cols_order = ['price','predicted_price', 'price_diff', 'brand', 'model', 'is_automatic','km', 'fuel',
                    'year', 'age_years', 'cv', 'kw', 'body_type', 'cubic_capacity', 'is_seller_pro','is_certified', 'offer_type',
                    'location', 'ad_days_creation', 'ad_days_published', 'title']

In [173]:
final_results_df = final_results_df[final_cols_order]

numeric_cols_list = final_results_df.select_dtypes(include='number').columns.values
final_results_df[numeric_cols_list] =  final_results_df[numeric_cols_list].round(0).astype('Int32')


In [174]:
results_df['rmse'].describe(percentiles=np.arange(0,1.1,0.1)).astype(int)

count          1202
mean      137962126
std     -2147483648
min               0
0%                0
10%             250
20%            1131
30%            1504
40%            1909
50%            2409
60%            3147
70%            4051
80%            5831
90%            9224
100%    -2147483648
max     -2147483648
Name: rmse, dtype: int32

In [175]:
final_results_df['price_diff'].abs().describe(percentiles=np.arange(0,1.1,0.1)).astype(int)

count    182826
mean        824
std        2056
min           0
0%            0
10%           3
20%          19
30%          54
40%         114
50%         214
60%         373
70%         633
80%        1083
90%        2054
100%      98815
max       98815
Name: price_diff, dtype: int32

In [190]:
overall_accuracy = (100 * (1- mean_absolute_percentage_error(final_results_df['price'], final_results_df['predicted_price']))).round(2)

In [191]:
f"Model overall accuracy: {overall_accuracy}%"

'Model overall accuracy: 91.32%'

In [192]:
overall_error = mean_absolute_error(final_results_df['price'], final_results_df['predicted_price']).astype(int)

In [193]:
f"Model overall price-vs-prediction difference: {overall_error}€"

'Model overall price-vs-prediction difference: 824€'

In [194]:
root_mean_squared_error(final_results_df['price'], final_results_df['predicted_price']).astype(int)

2215

In [181]:
# Save performance metrics to CSV
performance_df = pd.DataFrame(performance_metrics)
performance_df.to_csv('../app/app_files/performance_metrics.csv', index=False)

In [4]:
final_results_df.to_csv('../app/app_files/final_results_df.csv', index=False)

In [3]:
final_results_df = pd.read_csv('../app/app_files/final_results_df.csv')

In [184]:
performance_df = pd.read_csv('../app/app_files/performance_metrics.csv')