# Regression of Used Car Prices Dataset
Run after following notebooks are run:
1. **01_Data_Cleaning**
2. **02_EDA**

## Development Notes/Ideas



## Libraries

In [1]:
import numpy as np
import pandas as pd
import optuna
from optuna_integration import LightGBMPruningCallback

import lightgbm as lgb

## Load Data

In [2]:
train = pd.read_pickle('train_fe.pkl')
test = pd.read_pickle('test_fe.pkl')

## separate in to features and response variable
x_train = train.drop('price', axis=1)
y_train = train['price']

x_test = test

# package specific format
dataset_train = lgb.Dataset(x_train, label=y_train)

## Optuna
### LightGBM

In [3]:
def objective_lgbm(trial):

    param = {
        "verbosity": 0,
        "objective": "regression",
        "metric": "rmse",
        "boosting": "gbdt",
        # learning rate
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 1.0, log=True),
        # L1 regularization weight
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 1.0, log=True),
        # L2 regularization weight
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 1.0, log=True),
        # max leaves in one tree
        "num_leaves": trial.suggest_int("num_leaves", 10, 400),
        # subset of features on each tree
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        # randomly select part of data without resampling
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        # frequency for bagging, at every kth tree
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        # # minimum leaf weight, larger the term more conservative the tree
        # "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 5, 100),
        # "feature_pre_filter": False,
        # maximum depth of the tree, signifies complexity of the tree.
        "max_depth": trial.suggest_int("max_depth", 4, 10, step=2),
    }

    # pruning to increase efficiency by stopping unpromising trials at early stages
    pruning_callback = LightGBMPruningCallback(trial, "valid rmse")
    # cross-validate and return mean of test scores
    lgbm = lgb.cv(train_set=dataset_train, params=param, num_boost_round=1000, nfold=5, stratified=True, seed=123, callbacks=[pruning_callback])
    score = lgbm["valid rmse-mean"][-1]
    return score

In [4]:
# specify sampler, seed and pruner
# note if optimizing a study in distributed or parallel mode, there is inherent non-determinism i.e. won't be able to reproduce results
sampler=optuna.samplers.TPESampler(seed=123) 
pruner=optuna.pruners.MedianPruner(n_warmup_steps=10)

study = optuna.create_study(direction="minimize", sampler=sampler, pruner=pruner, study_name='lgbm cv')
study.optimize(objective_lgbm, n_trials=300, timeout=10000, show_progress_bar=True, n_jobs=8)

print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[I 2024-09-15 13:13:02,194] A new study created in memory with name: lgbm cv


  0%|          | 0/300 [00:00<?, ?it/s]

[I 2024-09-15 13:15:24,127] Trial 0 finished with value: 76650.47519387968 and parameters: {'learning_rate': 0.07289789442461049, 'lambda_l1': 0.010565148600759506, 'lambda_l2': 6.279173555077436e-08, 'num_leaves': 95, 'feature_fraction': 0.5191083603789208, 'bagging_fraction': 0.5794621103245552, 'bagging_freq': 4, 'max_depth': 4}. Best is trial 0 with value: 76650.47519387968.
[I 2024-09-15 13:15:28,241] Trial 4 finished with value: 92034.47916793804 and parameters: {'learning_rate': 0.8204953156195421, 'lambda_l1': 1.969129015994745e-05, 'lambda_l2': 7.826422830908603e-08, 'num_leaves': 280, 'feature_fraction': 0.8676972463609514, 'bagging_fraction': 0.8402074358230696, 'bagging_freq': 7, 'max_depth': 4}. Best is trial 0 with value: 76650.47519387968.
[I 2024-09-15 13:17:44,986] Trial 9 finished with value: 73787.4213424163 and parameters: {'learning_rate': 0.00778449517406473, 'lambda_l1': 0.00010342203014953207, 'lambda_l2': 2.2878539917518365e-05, 'num_leaves': 101, 'feature_frac