# Model Selection - CARS

In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.model_selection import RandomizedSearchCV,train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error,r2_score
from scipy.stats import randint, uniform,loguniform
import os
os.chdir("../..")
from sibr_module import BigQuery, Logger, CStorage

import torch
from torch import nn



In [2]:
dataset = 'cars'
logger = Logger(f'model_selection{dataset.capitalize()}')
bq = BigQuery(logger=logger,dataset=dataset)
cs = CStorage(logger=logger, bucket_name='sibr-market-bucket')
logger.debug(f'Dataset: {dataset}')

2025-07-09 14:47:07,171 - model_selectionCars - INFO - Cloud Logging is disabled. Using local logging to /Users/sigvardbratlie/Documents/Projects/sibr_market_training/logfiles/model_selectionCars.log
2025-07-09 14:47:07,175 - model_selectionCars - INFO - BigQuery client initialized with project_id: sibr-market
2025-07-09 14:47:07,178 - model_selectionCars - INFO - Google Cloud Storage client initialized with bucket: sibr-market-bucket
2025-07-09 14:47:07,179 - model_selectionCars - DEBUG - Dataset: cars


In [3]:
df_el = bq.read_preprocessed(table = 'cars_el',random_sample=5000,coordinates=False)
df_fossil = bq.read_preprocessed(table = 'cars_fossil',random_sample=5000,coordinates=False)

2025-07-09 14:47:17,981 - model_selectionCars - INFO - 5000 rows read from cars. Query: SELECT a.* FROM `sibr-market.pre_processed.cars_el` a ORDER BY RAND() LIMIT 5000... (truncated)
2025-07-09 14:47:25,523 - model_selectionCars - INFO - 5000 rows read from cars. Query: SELECT a.* FROM `sibr-market.pre_processed.cars_fossil` a ORDER BY RAND() LIMIT 5000... (truncated)


In [5]:
columns = ['price_excl_transfer','clean_date','pre_processed_date','warranty',
    'color_interior',
    'gearbox_type',
    'warranty_length',
    'condition_report']
df_el.drop(columns = columns,inplace=True,errors='ignore')
df_fossil.drop(columns = columns,inplace=True,errors='ignore')
df_el.dropna(inplace=True)
df_fossil.dropna(inplace=True)

In [35]:
def get_cat(df):
    cat_cols = []
    for col, dtype in df.dtypes.items():  # Use .items() to get (column_name, dtype) pairs
        if dtype == 'object' or dtype == 'category' or dtype == 'string':
            cat_cols.append(col)
    return cat_cols

cat_cols_el = get_cat(df_el)
cat_cols_fossil = get_cat(df_fossil)

In [12]:
params_rf = {
    # RandomForest har allerede 5 parametere, noe som er innenfor grensen.
    # Disse er generelt ansett som viktige for Random Forest.
    'model__n_estimators': randint(100, 1500),  # Litt redusert øvre grense for raskere søk
    'model__max_depth': randint(5, 50),        # Redusert øvre grense
    'model__min_samples_leaf': randint(1, 10),
    'model__bootstrap': [True, False],
    'model__random_state': randint(0, 100)     # Inkludert som i din originale kode
}

params_xgb = {
    # Velger 4 kjerne-ytelsesparametere + random_state (totalt 5)
    'model__n_estimators': randint(100, 1500),
    'model__learning_rate': loguniform(0.01, 0.3),
    'model__max_depth': randint(3, 15),           # Litt justert range
    'model__subsample': uniform(0.6, 0.4),        # Genererer fra 0.6 til 1.0
    # 'model__colsample_bytree': uniform(0.6, 0.4), # Kan legges til hvis du vil ha 5 ytelsesparams + random_state
    # 'model__reg_lambda': loguniform(1e-8, 1.0),   # Alternativ til subsample/colsample
    'model__random_state': randint(0, 100)
}

params_cat = {
    # Velger 4 kjerne-ytelsesparametere. random_state er fast.
    'model__iterations': randint(100, 1500),      # Samme som n_estimators
    'model__learning_rate': loguniform(0.01, 0.3),
    'model__depth': randint(3, 10),               # Samme som max_depth
    'model__l2_leaf_reg': loguniform(1, 10),      # Viktig regulariseringsparameter for CatBoost
    # 'model__bagging_temperature': uniform(0, 1), # Kan legges til for en femte parameter
    'model__random_state': randint(0, 100)                     # Fast verdi, teller ikke mot justerbare
}

models = {
    # 'RandomForest': (RandomForestRegressor(), params_rf),
    # 'XGBoost': (XGBRegressor(), params_xgb),
    'CatBoost': (CatBoostRegressor(silent=True), params_cat),
}

In [38]:
def model_selection(df_sample,models:dict,target ,dataset_name):
    logger.info(f'MODEL SELECTION FOR {dataset_name.upper()} \n \n')
    X_train, X_test, y_train, y_test = train_test_split(
        df_sample.drop(columns=[target], axis=1),
        df_sample[target],
        test_size=0.2,
        # stratify=y_binned,
        random_state=42)
    logger.info(f"Train set size: {X_train.shape[0]}, Test set size: {X_test.shape[0]}")
    logger.info(f'Columns in train set: {X_train.columns.tolist()}')


    for model_name, (model, params) in models.items():
        logger.info(f'Training {model_name} model with hyperparameter tuning. Dataset: {dataset_name}')
        pipe = Pipeline([
            #('impute', SimpleImputer()),
            #('scaler', StandardScaler()),
            ('model', model),
        ])
        cv = RandomizedSearchCV(pipe,
                                    param_distributions=params,
                                    n_iter=50,
                                    cv=3,
                                    scoring='neg_mean_squared_error',
                                    verbose=1,
                                    random_state=42,
                                    n_jobs=-1,
                                refit = 'neg_mean_squared_error',
                                return_train_score=True)
        try:
            cv.fit(X_train, y_train)
            y_pred = cv.predict(X_test)
            mse = mean_squared_error(y_test, y_pred)
            r2 = r2_score(y_test, y_pred)
            logger.info(f'{model_name} model best parameters: {cv.best_params_} on {dataset_name}')
            logger.info(f'Best score for {model_name}: Train: {-cv.best_score_}, Test: {mse}, R2 (test): {r2} on {dataset_name}')
        except Exception as e:
            logger.error(f'Error training {model_name} model: {e}')
    return cv.best_estimator_, cv.best_params_

def param_tuning_catboost(df_sample, params, target, cat_cols, dataset_name):
    logger.info(f'Training CatBoost model for {target}')
    model_name = 'CatBoost'
    X_train, X_test, y_train, y_test = train_test_split(
        df_sample.drop(columns=[target], axis=1),
        df_sample[target],
        test_size=0.2,
        random_state=42)

    # Remove 'model__' prefix from parameters
    clean_params = {}
    for key, value in params.items():
        if key.startswith('model__'):
            clean_params[key.replace('model__', '')] = value
        else:
            clean_params[key] = value

    model = CatBoostRegressor(silent=True, cat_features=cat_cols)
    cv = RandomizedSearchCV(model,
                            param_distributions=clean_params,
                            n_iter=50,
                            cv=3,
                            scoring='neg_mean_squared_error',
                            verbose=0,
                            random_state=42,
                            n_jobs=-1,
                            refit='neg_mean_squared_error',
                            return_train_score=True)
    try:
        cv.fit(X_train, y_train)
        y_pred = cv.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        logger.info(f'{model_name} model best parameters: {cv.best_params_} on {dataset_name}')
        logger.info(f'Best score for {model_name}: Train: {-cv.best_score_}, Test: {mse}, R2 (test): {r2} on {dataset_name}')
    except Exception as e:
        logger.error(f'Error training {model_name} model: {e}')

    return cv.best_estimator_, cv.best_params_

In [39]:
best_model_el,best_params_el = param_tuning_catboost(df_sample=df_el, params=params_cat, target='total_price', cat_cols=cat_cols_el, dataset_name='electric')
print(f'Best params for apartments: {best_params_el}')

2025-06-19 16:52:54,918 - model_selectionCars - INFO - Training CatBoost model for total_price
2025-06-19 16:55:30,086 - model_selectionCars - INFO - CatBoost model best parameters: {'depth': 7, 'iterations': 1223, 'l2_leaf_reg': np.float64(1.3895264494261033), 'learning_rate': np.float64(0.09150423569345205), 'random_state': 52} on electric
2025-06-19 16:55:30,089 - model_selectionCars - INFO - Best score for CatBoost: Train: 2189645470.267216, Test: 1408833243.3819115, R2 (test): 0.9592987893879528 on electric


Best params for apartments: {'depth': 7, 'iterations': 1223, 'l2_leaf_reg': np.float64(1.3895264494261033), 'learning_rate': np.float64(0.09150423569345205), 'random_state': 52}


In [37]:
best_model_fossil,best_params_fossil = param_tuning_catboost(df_sample=df_fossil, params=params_cat, target='total_price', cat_cols=cat_cols_fossil, dataset_name='fossil')
print(f'Best params for houses: {best_params_fossil}')

2025-06-19 16:50:01,303 - model_selectionCars - INFO - Training CatBoost model for total_price


Fitting 3 folds for each of 50 candidates, totalling 150 fits


2025-06-19 16:52:43,832 - model_selectionCars - INFO - CatBoost model best parameters: {'depth': 4, 'iterations': 1463, 'l2_leaf_reg': np.float64(1.7419090119209117), 'learning_rate': np.float64(0.10215580871496355), 'random_state': 43} on fossil
2025-06-19 16:52:43,834 - model_selectionCars - INFO - Best score for CatBoost: Train: 4775219318.095918, Test: 3763327950.501858, R2 (test): 0.9229957916461713 on fossil


Best params for houses: {'depth': 4, 'iterations': 1463, 'l2_leaf_reg': np.float64(1.7419090119209117), 'learning_rate': np.float64(0.10215580871496355), 'random_state': 43}
