In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor

from sklearn.metrics import root_mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
import joblib

In [2]:
df = pd.read_parquet('../04_EDA/car_ads_model.parquet')

In [3]:
df.shape

(182826, 20)

In [4]:
# Function to apply polynomial regression and add predicted prices
def reg_outliers(group):
    # Independent variable (e.g., km)
    X = group[['km']]
    y = group['price']  # Dependent variable

    # Polynomial transformation (degree 3)
    poly = PolynomialFeatures(degree=3)
    X_poly = poly.fit_transform(X)
    
    # Fit the linear regression model
    model = LinearRegression()
    model.fit(X_poly, y)
    
    # Make predictions
    y_pred = model.predict(X_poly)
    
    # Assign the predicted price to a new column
    group['reg_price'] = y_pred.astype(int)
    
    return group

In [5]:
# Assuming 'df' is your original DataFrame
df = df.groupby(['brand', 'model']).apply(reg_outliers).reset_index(drop=True)

  df = df.groupby(['brand', 'model']).apply(reg_outliers).reset_index(drop=True)


In [6]:
df['reg_diff_abs'] = (df['price'] - df['reg_price']).abs().astype(int)

In [7]:
df = df[((df['reg_diff_abs'] / df['price']) < 1) | (df['reg_diff_abs'] < 7_000)].reset_index(drop=True)

In [9]:
# Step 2: Lists to store results
all_results = []
performance_metrics = []

In [10]:
# Function to encode 'cv' (horsepower) column
def encode_cv(group):
    # Drop null values and convert to int for categorical representation
    non_null_cv = group['cv'].dropna().astype(int)

    # Create categorical labels
    categories = non_null_cv.unique()
    cat_labels = [f'cv_{cat}' for cat in categories]

    # Create a new DataFrame with one-hot encoding for existing categories
    cv_encoded = pd.get_dummies(non_null_cv, prefix='', prefix_sep='', dtype=int)

    # Rename columns to desired format
    cv_encoded.columns = [f'cv_{col}' for col in cv_encoded.columns]

    # Return the encoded DataFrame
    return cv_encoded


# Function to encode 'fuel' and other categorical variables
def encode_categorical_features(group):
    # One-hot encoding for 'fuel'
    fuel_encoded = pd.get_dummies(group['fuel'], prefix='fuel')
    cv_encoded = encode_cv(group)
    # # Handle CV encoding if 'cv' is present
    # if group['cv'].notnull().all():
    #     cv_encoded = encode_cv(group)
    # else:
    #     cv_encoded = pd.DataFrame(index=group.index)  # Empty if no 'cv'
    
    # Combine encoded features
    return pd.concat([fuel_encoded, cv_encoded], axis=1)

# Function to select a model based on the number of samples
def select_model(n_samples):
    if n_samples < 30:
        return SVR(kernel='poly', degree=3, C=0.2)  # L2 Regularization
    elif n_samples < 2_000:
        return XGBRegressor(n_estimators=500, learning_rate=0.05, max_depth=7)  # Tree-based regularization
    else:
        return RandomForestRegressor(n_estimators=500, max_depth=7)  # Tree-based with max depth


# Function to choose and apply model
def choose_model(group):
    X = group[['km', 'age_years', 'is_automatic']]  # Base features
    
    # Encode categorical features (fuel and cv)
    cat_encoded = encode_categorical_features(group)
    X = pd.concat([X, cat_encoded], axis=1)

    # Ensure that there are no NaN values in X
    if X.isnull().values.any():
        # You can choose to fill NaNs or drop them
        X.fillna(0, inplace=True)  # Example: fill NaNs with 0
        
    # Get target variable
    y = group['price']  # Target (Price)
    n_samples = len(group)
    
    # If very few samples, predict the mean price
    if n_samples < 5:
        group['predicted_price'] = [group['price'].median()] * n_samples
        group['price_diff'] = group['price'] - group['predicted_price']
        rmse = root_mean_squared_error(y, group['predicted_price'])
        mape = mean_absolute_percentage_error(y, group['predicted_price'])
        mae = mean_absolute_error(y, group['predicted_price'])
        performance_metrics.append({
            'brand': group['brand'].iloc[0],
            'model': group['model'].iloc[0],
            'rmse': rmse,
            'mape': mape,
            'mae':mae
        })
        all_results.append(group)
        return

    # Train-test split logic
    test_size = 0.3 if n_samples >= 40 else 0.4
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)

    # Select the model based on dataset size
    model = select_model(n_samples)
    
    # Train the model
    model.fit(X_train, y_train)
    
    # Predicting on test set
    y_pred_test = model.predict(X_test)
    
    # Train on the entire dataset to predict for all data
    model.fit(X, y)
    y_pred_all = model.predict(X)

    # Save the trained model, feature names, and other metadata to disk
    brand = group['brand'].iloc[0]
    model_name = group['model'].iloc[0]
    model_file_name = f'saved_models/{brand}_{model_name}.pkl'
    
    # Create a dictionary to store model, feature names, and any other metadata
    model_info = {
        'model': model,
        'feature_names': X.columns.tolist(),  # Save the column names
        'brand': brand,
        'model_name': model_name,
        'training_samples': n_samples  # Optional metadata
    }
    
    # Save the model and additional info
    joblib.dump(model_info, model_file_name)
    
    # Calculate performance metrics
    rmse = root_mean_squared_error(y_test, y_pred_test)
    mape = mean_absolute_percentage_error(y_test, y_pred_test) * 100
    mae = mean_absolute_error(y_test, y_pred_test)
    performance_metrics.append({
        'brand': brand,
        'model': model_name,
        'rmse': round(rmse, 2),
        'mape': round(mape, 2),
        'mae': int(mae)
    })

    # Add predictions for all data points
    group['predicted_price'] = y_pred_all
    group['price_diff'] = group['price'] - group['predicted_price']
    
    all_results.append(group)


In [11]:
df.groupby(['brand','model']).apply(choose_model, include_groups=True)

  df.groupby(['brand','model']).apply(choose_model, include_groups=True)


In [12]:
results_df = pd.DataFrame(performance_metrics)  # Model performance metrics
final_results_df = pd.concat(all_results, ignore_index=True)  # Test data with predictions and price diff

In [13]:
final_cols_order = ['price','predicted_price', 'price_diff', 'brand', 'model', 'is_automatic','km', 'fuel',
                    'year', 'age_years', 'cv', 'kw', 'body_type', 'cubic_capacity', 'is_seller_pro','is_certified', 'offer_type',
                    'location', 'ad_days_creation', 'ad_days_published', 'title']

In [14]:
final_results_df = final_results_df[final_cols_order]

numeric_cols_list = final_results_df.select_dtypes(include='number').columns.values
final_results_df[numeric_cols_list] =  final_results_df[numeric_cols_list].round(0).astype('Int32')

In [15]:
results_df['rmse'].describe(percentiles=np.arange(0,1.1,0.1)).astype(int)

count            1202
mean        137961955
std        4782989627
min                 0
0%                  0
10%               250
20%              1126
30%              1500
40%              1810
50%              2293
60%              2985
70%              3859
80%              5458
90%              8953
100%     165825640328
max      165825640328
Name: rmse, dtype: int64

In [16]:
final_results_df['price_diff'].abs().describe(percentiles=np.arange(0,1.1,0.1)).astype(int)

count    177440
mean        712
std        1870
min           0
0%            0
10%           2
20%          16
30%          45
40%          96
50%         180
60%         314
70%         537
80%         924
90%        1754
100%      98815
max       98815
Name: price_diff, dtype: int64

In [17]:
overall_accuracy = (100 * (1- mean_absolute_percentage_error(final_results_df['price'], final_results_df['predicted_price']))).round(2)

In [18]:
f"Model overall accuracy: {overall_accuracy}%"

'Model overall accuracy: 93.12%'

In [19]:
overall_error = mean_absolute_error(final_results_df['price'], final_results_df['predicted_price']).astype(int)

In [20]:
f"Model overall price-vs-prediction difference: {overall_error}€"

'Model overall price-vs-prediction difference: 712€'

In [21]:
root_mean_squared_error(final_results_df['price'], final_results_df['predicted_price']).astype(int)

np.int64(2001)

In [22]:
performance_df = pd.DataFrame(performance_metrics)

In [23]:
# Save performance metrics to CSV
performance_df.to_csv('../app/app_files/performance_metrics.csv', index=False)

In [24]:
final_results_df.to_csv('../app/app_files/final_results_df.csv', index=False)

In [25]:
final_results_df = pd.read_csv('../app/app_files/final_results_df.csv')

In [26]:
performance_df = pd.read_csv('../app/app_files/performance_metrics.csv')