# Setup

In [1]:
import gc
import time
import warnings

# Make sure automatic garbage collection is switched on for the session.
gc.enable()
# Blanket-suppress warnings so library notices do not clutter cell output.
# NOTE(review): this also hides deprecation / convergence warnings from the
# libraries used later in the notebook — confirm that is intended.
warnings.filterwarnings('ignore')

import optuna
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.base import clone
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold, StratifiedKFold
from IPython.display import clear_output

# Default random seed; run_experiment() uses it as the default for its `seed` argument.
SEED = 2024

In [2]:
# Check GPU availability: try to run `nvidia-smi`; if it cannot be launched or
# exits with a non-zero status, fall back to the CPU.
import subprocess

try:
    subprocess.run('nvidia-smi', stdout=subprocess.PIPE, check=True)
except Exception:
    DEVICE = 'cpu'
else:
    DEVICE = 'cuda'

print(f'Available device: {DEVICE}')

Available device: cuda


In [3]:
# Directory holding the competition CSV files.
DATA_DIR = '/kaggle/input/autoam-car-price-prediction'

# Read the training set, the test set and the submission template in one pass.
train, test, sample_sub = (
    pd.read_csv(f'{DATA_DIR}/{stem}.csv')
    for stem in ('train', 'test', 'sample_submission')
)

# Data preparation

In [4]:
# Column the models are trained to predict.
TARGET = 'price'

# 'wheel' is removed from both frames; 'Id' is additionally removed from test,
# whose remaining columns later become the feature list.
train = train.drop(columns=['wheel'])
test = test.drop(columns=['Id', 'wheel'])

In [5]:
# Bin edges used to turn the continuous price into a coarse label that can be
# passed as `stratify_col` to the CV helpers.
price_bins = [0, 1e4, 1.5e4, 2e4, 1e5]

# labels=False yields the integer position of the bin rather than an Interval.
# NOTE(review): pd.cut assigns NaN to values outside the outer edges (here a
# price of exactly 0 or above 1e5) — confirm no training row falls out of range.
train['price_range'] = pd.cut(x=train['price'], bins=price_bins, labels=False)

In [6]:
# remove low outlier (462)
# train = train.query("price > 2000")

In [7]:
# test does not have diesel and hybrid motor_type
# train = train.query("motor_type in ['petrol', 'gas', 'petrol and gas']")

In [8]:
# test does not have minivan/minibus type
# train = train.query("type != 'minivan / minibus'")

In [9]:
def convert_miles_to_km(distance):
    """Parse a '<integer> <unit>' odometer string and return the distance in km.

    The leading token (before the first space) is read as an integer. When the
    string ends with 'miles' the value is scaled by 1.609344 and a float is
    returned; any other string returns the integer unchanged.
    """
    magnitude = int(distance.split(' ')[0])
    return magnitude * 1.609344 if distance.endswith('miles') else magnitude
    
# Replace the textual odometer readings with numeric kilometres in both frames.
train['running'] = train['running'].apply(convert_miles_to_km)
test['running'] = test['running'].apply(convert_miles_to_km)

In [10]:
# Model inputs are whatever columns remain in `test` at this point.
FEATURES = list(test.columns)
CAT_FEATURES = ['model', 'motor_type', 'color', 'type', 'status']

# Cast train and test with ONE shared categorical dtype per column.
# Casting each frame with a bare .astype('category') infers the categories
# separately, so when a level is present in only one frame (the notes above
# say test lacks some motor_type / type levels) the same label receives a
# different integer code in train and test. Models that consume the category
# codes directly would then misread test rows. Building the dtype from the
# sorted union of both frames keeps the codes aligned.
_shared_cat_dtypes = {
    col: pd.CategoricalDtype(
        pd.concat([train[col], test[col]], ignore_index=True)
        .astype('category')
        .cat.categories
    )
    for col in CAT_FEATURES
}
train = train.astype(_shared_cat_dtypes)
test = test.astype(_shared_cat_dtypes)

# Hyperparameter tuning

In [11]:
def comp_metric(y_true, y_pred):
    """Score predictions with mean absolute error (lower is better)."""
    score = mean_absolute_error(y_true, y_pred)
    return score

In [12]:
def objective(trial, features, estimator, folds, stratify_col, seed):
    """Optuna objective: out-of-fold score of one sampled hyperparameter set.

    A fresh clone of `estimator` is configured with the sampled parameters and
    fitted on every CV split of the global `train` frame, with the held-out
    fold passed as `eval_set`. Held-out predictions use the trees up to the
    model's `best_iteration` and are scored together with `comp_metric`.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        features: column names of `train` used as model inputs.
        estimator: unfitted model that is cloned for every fold.
        folds: number of CV splits.
        stratify_col: column of `train` to stratify on; a falsy value selects
            plain shuffled K-fold.
        seed: random state of the splitter.

    Returns:
        The `comp_metric` value over all out-of-fold predictions.
    """
    inputs, target = train[features], train[TARGET]

    # One candidate configuration drawn from the search space.
    sampled_params = {
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_child_weight': trial.suggest_int('min_child_weight', 2, 64),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0, step=0.05),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0, step=0.05),
        'gamma': trial.suggest_float('gamma', 0, 20, step=0.1),  # complexity control
        'alpha': trial.suggest_float('alpha', 0, 5, step=0.1),  # L1 regularisation
        'lambda': trial.suggest_float('lambda', 5e-3, 5e3, log=True),  # L2 regularisation
        'max_cat_to_onehot': trial.suggest_categorical('max_cat_to_onehot', [2, 4, 6]),
    }

    # Stratify only when a label column was named.
    if not stratify_col:
        strat_labels = None
        splitter = KFold(n_splits=folds, shuffle=True, random_state=seed)
    else:
        strat_labels = train[stratify_col]
        splitter = StratifiedKFold(n_splits=folds, shuffle=True, random_state=seed)

    # Maps positional row id -> held-out prediction for that row.
    oof_by_row = {}
    for fit_rows, held_rows in splitter.split(inputs, strat_labels):
        held_X, held_y = inputs.iloc[held_rows], target.iloc[held_rows]

        fold_model = clone(estimator)
        fold_model.set_params(**sampled_params)
        fold_model.fit(
            inputs.iloc[fit_rows], target.iloc[fit_rows],
            eval_set=[(held_X, held_y)],
            verbose=0)

        held_pred = fold_model.predict(
            held_X, iteration_range=(0, fold_model.best_iteration + 1))
        oof_by_row.update(zip(held_rows, held_pred))

    # Restore the original row order before scoring against the full target.
    oof_series = pd.Series(oof_by_row).sort_index()
    return comp_metric(target, oof_series)

In [13]:
def tune_params(features, estimator, folds, stratify_col, seed, n_trials, direction):
    """Run an Optuna study over `objective` and return the finished study.

    Args:
        features, estimator, folds, stratify_col, seed: forwarded unchanged to
            `objective` for every trial; `seed` also seeds the TPE sampler.
        n_trials: number of trials to run.
        direction: optimisation direction handed to `optuna.create_study`.

    Returns:
        The `optuna.Study` after `n_trials` trials.
    """
    tpe_sampler = optuna.samplers.TPESampler(
        consider_endpoints=True,
        multivariate=True,
        group=True,
        seed=seed)

    # NOTE(review): a pruner only acts on intermediate values reported through
    # trial.report(); confirm the objective does that, otherwise this setting
    # has no effect on the search.
    study = optuna.create_study(
        sampler=tpe_sampler,
        pruner=optuna.pruners.HyperbandPruner(),
        direction=direction)

    def run_trial(trial):
        # Bind the fixed experiment settings so Optuna only supplies the trial.
        return objective(trial, features, estimator, folds, stratify_col, seed)

    study.optimize(func=run_trial, n_trials=n_trials, gc_after_trial=True)
    return study

# Cross-validation + Prediction

In [14]:
def custom_cv(features, estimator, folds, stratify_col, seed, verbose=True):
    """Cross-validate `estimator` on `train` and predict `test` with every fold model.

    Args:
        features: column names used as model inputs in both frames.
        estimator: configured, unfitted model that is cloned for each fold.
        folds: number of CV splits.
        stratify_col: column of `train` to stratify on; a falsy value selects
            plain shuffled K-fold.
        seed: random state of the splitter.
        verbose: print one score line per fold when true. The summary lines
            are printed regardless.

    Returns:
        (oof_preds, test_preds): a Series of out-of-fold predictions ordered by
        row position, and a DataFrame with one test-prediction column per fold
        ('fold0', 'fold1', ...) plus their row-wise 'mean'.
    """
    holdout_inputs = test[features]
    inputs, target = train[features], train[TARGET]

    if not stratify_col:
        strat_labels = None
        splitter = KFold(n_splits=folds, shuffle=True, random_state=seed)
    else:
        strat_labels = train[stratify_col]
        splitter = StratifiedKFold(n_splits=folds, shuffle=True, random_state=seed)

    oof_by_row, fold_test_preds, fold_scores = {}, {}, []

    for fold_no, (fit_rows, held_rows) in enumerate(splitter.split(inputs, strat_labels)):
        held_X, held_y = inputs.iloc[held_rows], target.iloc[held_rows]

        fold_model = clone(estimator)
        fold_model.fit(
            inputs.iloc[fit_rows], target.iloc[fit_rows],
            eval_set=[(held_X, held_y)],
            verbose=0)

        # Predict with the trees up to (and including) the best iteration.
        best_rounds = (0, fold_model.best_iteration + 1)
        held_pred = fold_model.predict(held_X, iteration_range=best_rounds)
        oof_by_row.update(zip(held_rows, held_pred))
        fold_test_preds[f'fold{fold_no}'] = fold_model.predict(
            holdout_inputs, iteration_range=best_rounds)

        fold_score = comp_metric(held_y, held_pred)
        fold_scores.append(fold_score)
        if verbose:
            print(f'Fold #{fold_no:>2}: {fold_score:.5f} ({fold_model.best_iteration:>4} rounds)')

        gc.collect()

    test_frame = pd.DataFrame.from_dict(fold_test_preds)
    test_frame['mean'] = test_frame.mean(axis=1)  # average of the fold-wise predictions

    oof_series = pd.Series(oof_by_row).sort_index()
    print(f'\nAvg score: {np.mean(fold_scores):.5f} +/- {np.std(fold_scores):.5f}')
    print(f'OOF score: {comp_metric(target, oof_series):.5f}\n')

    return oof_series, test_frame

In [15]:
def run_experiment(
    features=FEATURES,
    folds=5,
    stratify_col=None,
    seed=SEED,
    n_trials=100
):
    """Tune an XGBoost regressor with Optuna, then cross-validate the best setup.

    Args:
        features: column names used as model inputs.
        folds: number of CV splits for both tuning and the final CV.
        stratify_col: column of `train` to stratify on, or None for plain K-fold.
        seed: seed shared by the model, the CV splitter and the Optuna sampler.
        n_trials: number of Optuna trials.

    Returns:
        The (oof_preds, test_preds) pair produced by `custom_cv` with the best
        hyperparameters found.
    """
    # Fixed settings; the tuned parameters are layered on top of these.
    fixed_params = dict(
        booster='gbtree',
        tree_method='hist',
        objective='reg:squarederror',
        eval_metric='mae',
        learning_rate=0.01,
        n_estimators=10000,
        early_stopping_rounds=200,
        device=DEVICE,
        enable_categorical=True,
        verbosity=0,
        n_jobs=None,
        seed=seed,
    )
    estimator = xgb.XGBRegressor(**fixed_params)

    tuning_started = time.time()
    # The metric is MAE, so lower is better.
    study = tune_params(
        features, estimator, folds, stratify_col, seed, n_trials, direction='minimize')
    tuning_finished = time.time()

    # Drop the per-trial log output before printing the summary.
    clear_output(wait=True)
    print('----------Hyperparameter tuning----------')
    print(f'Best trial: {study.best_trial.number} -> Best value: {study.best_value:.5f}')
    print('Best hyperparameters:')
    for param_name, param_value in study.best_params.items():
        print(f'{param_name:20} - {param_value}')
    print(f'\n[Time taken: {tuning_finished - tuning_started:.2f}s]\n')

    print('-----Cross-validation and prediction-----')
    cv_started = time.time()

    estimator.set_params(**study.best_params)
    oof_preds, test_preds = custom_cv(features, estimator, folds, stratify_col, seed)

    cv_finished = time.time()
    print(f'\n[Time taken: {cv_finished - cv_started:.2f}s]\n')

    return oof_preds, test_preds

In [16]:
def create_submission_files(preds, config, notebook='02'):
    """Write a submission CSV named 'nb<notebook>_<config>.csv'.

    The target column of a copy of `sample_sub` is filled with the 'mean'
    column of `preds`, with negative values raised to 0.
    """
    out_path = f'nb{notebook}_{config}.csv'
    submission = sample_sub.copy()
    submission[TARGET] = preds['mean'].clip(lower=0)
    submission.to_csv(out_path, index=False)

**Trial run**

In [17]:
# Smoke test: log at INFO level and run the whole pipeline with only three
# trials before launching the full-length experiments.
optuna.logging.set_verbosity(optuna.logging.INFO)
# Both returned objects are discarded; only the printed summary is of interest.
_ , _ = run_experiment(n_trials=3)

----------Hyperparameter tuning----------
Best trial: 2 -> Best value: 2069.12151
Best hyperparameters:
max_depth            - 5
min_child_weight     - 8
subsample            - 1.0
colsample_bytree     - 0.7
gamma                - 5.6000000000000005
alpha                - 3.9000000000000004
lambda               - 306.5563997676988
max_cat_to_onehot    - 2

[Time taken: 141.08s]

-----Cross-validation and prediction-----
Fold # 0: 1745.54900 (3288 rounds)
Fold # 1: 2030.32073 (3810 rounds)
Fold # 2: 2056.26027 (2085 rounds)
Fold # 3: 2318.54220 (4285 rounds)
Fold # 4: 2196.04014 (2699 rounds)

Avg score: 2069.34247 +/- 192.19201
OOF score: 2069.12151


[Time taken: 56.22s]



# Experiments

In [18]:
# Per-experiment results, keyed by the experiment's config string:
#   op -> out-of-fold predictions
#   tp -> test predictions
op, tp = {}, {}

In [19]:
%%time
# Experiment 1: plain (unstratified) shuffled K-fold.
expt, cv_type = 1, 'KF'
folds, seed = 5, SEED

config = f'expt{expt}_cv{cv_type}_folds{folds}_seed{seed}'
op[config], tp[config] = run_experiment(
    features=FEATURES, folds=folds, stratify_col=None, seed=seed
)

# Persist the fold-averaged test predictions as nb02_<config>.csv
create_submission_files(tp[config], config)

----------Hyperparameter tuning----------
Best trial: 62 -> Best value: 1985.41521
Best hyperparameters:
max_depth            - 3
min_child_weight     - 16
subsample            - 0.9
colsample_bytree     - 0.9
gamma                - 7.9
alpha                - 4.7
lambda               - 18.204304881213627
max_cat_to_onehot    - 4

[Time taken: 2488.46s]

-----Cross-validation and prediction-----
Fold # 0: 1708.00730 ( 789 rounds)
Fold # 1: 1962.66297 (1530 rounds)
Fold # 2: 1984.69564 ( 656 rounds)
Fold # 3: 2151.29753 (1778 rounds)
Fold # 4: 2121.32775 ( 847 rounds)

Avg score: 1985.59824 +/- 157.13836
OOF score: 1985.41521


[Time taken: 15.56s]

CPU times: user 43min 42s, sys: 7.32 s, total: 43min 50s
Wall time: 41min 44s


In [20]:
%%time
# Experiment 2: stratified K-fold on the 'model' column.
expt, cv_type = 2, 'SKF'
folds, seed = 5, SEED

config = f'expt{expt}_cv{cv_type}_MODEL_folds{folds}_seed{seed}'
op[config], tp[config] = run_experiment(
    features=FEATURES, folds=folds, stratify_col='model', seed=seed
)

# Persist the fold-averaged test predictions as nb02_<config>.csv
create_submission_files(tp[config], config)

----------Hyperparameter tuning----------
Best trial: 84 -> Best value: 1975.55395
Best hyperparameters:
max_depth            - 4
min_child_weight     - 7
subsample            - 0.75
colsample_bytree     - 0.95
gamma                - 15.8
alpha                - 2.0
lambda               - 5.706585654646334
max_cat_to_onehot    - 2

[Time taken: 2880.52s]

-----Cross-validation and prediction-----
Fold # 0: 2198.56510 (1316 rounds)
Fold # 1: 2026.34763 ( 693 rounds)
Fold # 2: 1880.31355 ( 844 rounds)
Fold # 3: 1960.96785 ( 544 rounds)
Fold # 4: 1810.74087 ( 615 rounds)

Avg score: 1975.38700 +/- 133.23714
OOF score: 1975.55395


[Time taken: 14.52s]

CPU times: user 50min 20s, sys: 8.33 s, total: 50min 29s
Wall time: 48min 15s


In [21]:
%%time
# Experiment 3: stratified K-fold on the 'motor_type' column.
expt, cv_type = 3, 'SKF'
folds, seed = 5, SEED

# NOTE(review): unlike experiments 1 and 2, this name has no 'folds' label in
# front of the fold count; left as-is because it determines the output filename.
config = f'expt{expt}_cv{cv_type}_MOTOR_{folds}_seed{seed}'
op[config], tp[config] = run_experiment(
    features=FEATURES, folds=folds, stratify_col='motor_type', seed=seed
)

# Persist the fold-averaged test predictions as nb02_<config>.csv
create_submission_files(tp[config], config)

----------Hyperparameter tuning----------
Best trial: 43 -> Best value: 1966.84492
Best hyperparameters:
max_depth            - 3
min_child_weight     - 15
subsample            - 0.95
colsample_bytree     - 0.9
gamma                - 5.300000000000001
alpha                - 0.7000000000000001
lambda               - 45.807831543354894
max_cat_to_onehot    - 2

[Time taken: 3347.01s]

-----Cross-validation and prediction-----
Fold # 0: 1873.17249 (1206 rounds)
Fold # 1: 1874.01557 (2323 rounds)
Fold # 2: 1828.27118 (1090 rounds)
Fold # 3: 2028.62144 (1252 rounds)
Fold # 4: 2230.71250 (1560 rounds)

Avg score: 1966.95864 +/- 148.34882
OOF score: 1966.84492


[Time taken: 20.23s]

CPU times: user 58min 14s, sys: 9.08 s, total: 58min 23s
Wall time: 56min 7s


In [22]:
%%time
# Experiment 4: stratified K-fold on the 'type' column.
expt, cv_type = 4, 'SKF'
folds, seed = 5, SEED

# NOTE(review): unlike experiments 1 and 2, this name has no 'folds' label in
# front of the fold count; left as-is because it determines the output filename.
config = f'expt{expt}_cv{cv_type}_TYPE_{folds}_seed{seed}'
op[config], tp[config] = run_experiment(
    features=FEATURES, folds=folds, stratify_col='type', seed=seed
)

# Persist the fold-averaged test predictions as nb02_<config>.csv
create_submission_files(tp[config], config)

----------Hyperparameter tuning----------
Best trial: 83 -> Best value: 1961.13938
Best hyperparameters:
max_depth            - 4
min_child_weight     - 14
subsample            - 0.9
colsample_bytree     - 0.95
gamma                - 7.1000000000000005
alpha                - 0.0
lambda               - 59.89584209238878
max_cat_to_onehot    - 4

[Time taken: 2573.27s]

-----Cross-validation and prediction-----
Fold # 0: 1787.73609 (1019 rounds)
Fold # 1: 1935.31381 (1527 rounds)
Fold # 2: 2302.07457 ( 976 rounds)
Fold # 3: 1925.48813 ( 782 rounds)
Fold # 4: 1855.69170 (1247 rounds)

Avg score: 1961.26086 +/- 178.52877
OOF score: 1961.13938


[Time taken: 19.02s]

CPU times: user 45min 14s, sys: 7.14 s, total: 45min 21s
Wall time: 43min 12s


In [23]:
%%time
# Experiment 5: stratified K-fold on the 'status' column.
expt, cv_type = 5, 'SKF'
folds, seed = 5, SEED

# NOTE(review): unlike experiments 1 and 2, this name has no 'folds' label in
# front of the fold count; left as-is because it determines the output filename.
config = f'expt{expt}_cv{cv_type}_STATUS_{folds}_seed{seed}'
op[config], tp[config] = run_experiment(
    features=FEATURES, folds=folds, stratify_col='status', seed=seed
)

# Persist the fold-averaged test predictions as nb02_<config>.csv
create_submission_files(tp[config], config)

----------Hyperparameter tuning----------
Best trial: 69 -> Best value: 1965.16845
Best hyperparameters:
max_depth            - 3
min_child_weight     - 17
subsample            - 0.8
colsample_bytree     - 0.9
gamma                - 12.100000000000001
alpha                - 2.5
lambda               - 45.26664424557256
max_cat_to_onehot    - 2

[Time taken: 3505.37s]

-----Cross-validation and prediction-----
Fold # 0: 1951.79807 (2029 rounds)
Fold # 1: 1860.29406 (1206 rounds)
Fold # 2: 2318.40517 (1423 rounds)
Fold # 3: 1747.11949 ( 890 rounds)
Fold # 4: 1948.58597 (1690 rounds)

Avg score: 1965.24055 +/- 191.71535
OOF score: 1965.16845


[Time taken: 19.29s]

CPU times: user 1h 50s, sys: 8.3 s, total: 1h 59s
Wall time: 58min 44s


In [24]:
%%time
# Experiment 6: stratified K-fold on the binned target ('price_range').
expt, cv_type = 6, 'SKF'
folds, seed = 5, SEED

# NOTE(review): unlike experiments 1 and 2, this name has no 'folds' label in
# front of the fold count; left as-is because it determines the output filename.
config = f'expt{expt}_cv{cv_type}_PRICERANGE_{folds}_seed{seed}'
op[config], tp[config] = run_experiment(
    features=FEATURES, folds=folds, stratify_col='price_range', seed=seed
)

# Persist the fold-averaged test predictions as nb02_<config>.csv
create_submission_files(tp[config], config)

----------Hyperparameter tuning----------
Best trial: 99 -> Best value: 1928.54545
Best hyperparameters:
max_depth            - 7
min_child_weight     - 13
subsample            - 0.85
colsample_bytree     - 1.0
gamma                - 19.3
alpha                - 4.800000000000001
lambda               - 3.5497666009659663
max_cat_to_onehot    - 2

[Time taken: 2394.48s]

-----Cross-validation and prediction-----
Fold # 0: 1897.72238 ( 456 rounds)
Fold # 1: 1759.55798 ( 397 rounds)
Fold # 2: 1858.98290 ( 414 rounds)
Fold # 3: 2061.82216 ( 852 rounds)
Fold # 4: 2065.25104 ( 342 rounds)

Avg score: 1928.66729 +/- 118.99311
OOF score: 1928.54545


[Time taken: 14.92s]

CPU times: user 42min 8s, sys: 6.14 s, total: 42min 14s
Wall time: 40min 9s
