# Setup

In [1]:
import gc
gc.enable()

import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.precision', 4)

import catboost as cb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from sklearn.base import clone

# The seed is deliberately drawn at random on each kernel start; it feeds both
# the KFold shuffling and CatBoost's `random_seed`, and is printed further down
# so a given run can be identified. The upper bound (exclusive) is written as
# an integer rather than the float literal 1e5.
SEED = np.random.randint(1, 100_000)

In [2]:
# Install the Kaggle API key: create ~/.kaggle, copy kaggle.json from the
# attached input dataset, and restrict the file to owner read/write.
# Presumably required by the `kaggle competitions submit` call made later in
# this notebook.
!mkdir -p ~/.kaggle
!cp /kaggle/input/kaggle-api-key/kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Data preparation

In [3]:
# Directory holding the competition CSV files.
DATA_DIR = '/kaggle/input/autoam-car-price-prediction'

# `train` and `test` are the modelling frames; `sample_sub` is the submission
# template that is copied and filled in when submission files are written.
train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')
sample_sub = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

In [4]:
# Name of the column to predict.
TARGET = 'price'

# 'wheel' is discarded from both frames. 'Id' is additionally removed from the
# test frame so that every remaining test column can serve as a model feature.
train = train.drop(columns=['wheel'])
test = test.drop(columns=['Id', 'wheel'])

In [5]:
def convert_miles_to_km(distance):
    """Convert a distance string such as '3000 km' or '2500 miles' to kilometres.

    The first whitespace-separated token is parsed as an integer. If the
    string ends with 'miles' (case-insensitive, ignoring surrounding
    whitespace) the value is converted to kilometres; otherwise it is
    returned unchanged.

    Parameters
    ----------
    distance : str
        An integer followed by a unit, e.g. '120000 km' or '75000 miles'.

    Returns
    -------
    int or float
        A float (kilometres) when the unit is miles, otherwise the parsed int.

    Raises
    ------
    ValueError
        If the string is blank or its first token is not an integer.
    """
    km_per_mile = 1.609344

    # split() with no argument tolerates leading/trailing padding and repeated
    # spaces, which split(' ')[0] would turn into an empty first token.
    tokens = distance.split()
    if not tokens:
        raise ValueError(f'empty distance string: {distance!r}')
    value = int(tokens[0])

    # Case-insensitive so that 'Miles' / 'MILES' are not silently read as km.
    if distance.strip().lower().endswith('miles'):
        return value * km_per_mile
    return value
    
# Normalise `running` to kilometres in both frames. The converter expects the
# raw '<number> <unit>' strings, so a frame whose column is already numeric is
# skipped; this keeps the cell safe to re-run without restarting the kernel.
for _frame in (train, test):
    if not pd.api.types.is_numeric_dtype(_frame['running']):
        _frame['running'] = _frame['running'].apply(convert_miles_to_km)

In [6]:
# Every column remaining in `test` is used as a model input.
features = list(test.columns)
# Columns passed to the model's fit() as `cat_features`.
cat_features = ['model', 'motor_type', 'color', 'type', 'status']

# Modeling

In [7]:
def comp_metric(y_true, y_pred):
    """Return the mean absolute error between `y_true` and `y_pred`.

    Thin wrapper around sklearn's `mean_absolute_error`, used as the single
    scoring function for fold and out-of-fold evaluation in this notebook.
    """
    score = mean_absolute_error(y_true, y_pred)
    return score

In [8]:
# Number of CV folds, drawn at random each time this cell runs. randint's upper
# bound is exclusive, so the value is an integer from 5 to 9. It is printed
# together with SEED further down.
NUM_FOLDS = np.random.randint(5, 10)

def custom_cv(estimator, num_folds=NUM_FOLDS, seed=SEED, verbose=True):
    """Run shuffled K-fold cross-validation and collect OOF and test predictions.

    For each fold a fresh clone of `estimator` is fitted on the training part,
    with the validation part passed as `eval_set` and the module-level
    `cat_features` passed as categorical columns. The fitted model then
    predicts the validation part (out-of-fold) and the full test set.

    Reads the module-level `train`, `test`, `features`, `TARGET` and
    `cat_features`.

    Parameters
    ----------
    estimator : estimator object
        Unfitted model; must accept `eval_set`, `cat_features` and `verbose`
        in `fit` and expose `best_iteration_` after fitting (as the CatBoost
        regressor used in this notebook does).
    num_folds : int
        Number of KFold splits. The default is the value of NUM_FOLDS at the
        time this function is defined.
    seed : int
        `random_state` for the KFold shuffling. The default is the value of
        SEED at the time this function is defined.
    verbose : bool
        If True, print the score and best iteration of every fold. The final
        summary lines are printed regardless.

    Returns
    -------
    oof_preds : pandas.Series
        Out-of-fold predictions, indexed by row position in `train`.
    test_preds : pandas.DataFrame
        One column per fold ('fold0', 'fold1', ...) plus 'mean', the row-wise
        average of the fold columns.

    Notes
    -----
    NOTE(review): the validation fold is used both as `eval_set` and for
    scoring. If the estimator uses it for early stopping / best-model
    selection, the reported scores may be somewhat optimistic.
    """
    # Feature/target selection does not depend on the fold, so slice once here
    # instead of re-copying the frame twice per fold.
    X_all, y_all = train[features], train[TARGET]
    X_test = test[features]

    # Preallocated by row position; every row is filled exactly once because
    # the KFold validation parts partition the data.
    oof = np.full(len(train), np.nan)
    test_preds = {}
    scores = []

    cv = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    for fold, (train_ids, val_ids) in enumerate(cv.split(train)):
        X_train, y_train = X_all.iloc[train_ids], y_all.iloc[train_ids]
        X_val, y_val = X_all.iloc[val_ids], y_all.iloc[val_ids]

        # Clone so that each fold starts from an unfitted copy of the estimator.
        model = clone(estimator)
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            cat_features=cat_features,
            verbose=False)

        val_preds = model.predict(X_val)
        oof[val_ids] = val_preds
        test_preds[f'fold{fold}'] = model.predict(X_test)

        score = comp_metric(y_val, val_preds)
        scores.append(score)
        if verbose:
            print(f'Fold #{fold:>2}: {score:.5f} ({model.best_iteration_:>4} rounds)')
        _ = gc.collect()

    test_preds = pd.DataFrame.from_dict(test_preds)
    test_preds['mean'] = test_preds.mean(axis=1) # mean of fold-wise predictions

    oof_preds = pd.Series(oof)
    print(f'\nAvg score: {np.mean(scores):.5f} +/- {np.std(scores):.5f}')
    print(f'OOF score: {comp_metric(y_all, oof_preds):.5f}\n')

    return oof_preds, test_preds

In [9]:
# Keyword arguments for cb.CatBoostRegressor.
BASE_PARAMS = dict(
    # Train on and report mean absolute error, the same metric as comp_metric.
    loss_function='MAE',
    eval_metric='MAE',
    metric_period=1,
    # Up to 5000 boosting rounds at a small learning rate; early stopping on
    # the eval set after 100 rounds without improvement, keeping the best model.
    iterations=5000,
    learning_rate=0.01,
    early_stopping_rounds=100,
    use_best_model=True,
    # CPU training; thread_count=-1 lets CatBoost pick the thread count.
    task_type='CPU',
    thread_count=-1,
    random_seed=SEED,
)

In [10]:
# Both values are drawn at random, so log them to identify this run.
print(f'NUM_FOLDS: {NUM_FOLDS}, SEED: {SEED}')

NUM_FOLDS: 6, SEED: 56631


In [11]:
%%time
# Cross-validate a CatBoost regressor configured with BASE_PARAMS.
# op: out-of-fold predictions for `train`.
# tp: test predictions, one column per fold plus their 'mean'.
model = cb.CatBoostRegressor(**BASE_PARAMS)
op, tp = custom_cv(model)

Fold # 0: 2027.79660 (1183 rounds)
Fold # 1: 1879.81817 (1231 rounds)
Fold # 2: 1814.67850 (1049 rounds)
Fold # 3: 2226.35555 (1826 rounds)
Fold # 4: 1833.59599 (1831 rounds)
Fold # 5: 1768.90657 (2117 rounds)

Avg score: 1925.19190 +/- 157.22286
OOF score: 1925.34286

CPU times: user 1min 47s, sys: 19.3 s, total: 2min 7s
Wall time: 35.2 s


# Submissions

In [12]:
# Display the current timestamp so the cell output records when this run happened.
pd.to_datetime('today')

Timestamp('2024-04-28 04:31:21.109910')

In [13]:
# Today's date formatted as YYYY_MM_DD; embedded in submission file names.
DATE = pd.to_datetime('today').strftime('%Y_%m_%d')

In [14]:
def make_submissions(preds, notebook='01'):
    for col in preds:
        sub = sample_sub.copy()
        sub[TARGET] = preds[col].clip(lower=0)
        config = f"catboost-folds{NUM_FOLDS}-seed{SEED}-{col}"
        filename = f"nb{notebook}_date{DATE}_col{col}.csv"
        sub.to_csv(filename, index=False)
        !kaggle competitions submit autoam-car-price-prediction -f $filename -m $config
        print(f'\nSubmitted file: {filename} with configuration: {config}')
    print(f'\nMade {NUM_FOLDS + 1} submissions.')

In [15]:
# Submit every column of `tp`: each fold's test predictions and their mean.
make_submissions(tp)

100%|██████████████████████████████████████| 8.95k/8.95k [00:00<00:00, 26.5kB/s]
Successfully submitted to Car price prediction
Submitted file: nb01_date2024_04_28_colfold0.csv with configuration: catboost-folds6-seed56631-fold0
100%|██████████████████████████████████████| 8.95k/8.95k [00:00<00:00, 26.1kB/s]
Successfully submitted to Car price prediction
Submitted file: nb01_date2024_04_28_colfold1.csv with configuration: catboost-folds6-seed56631-fold1
100%|██████████████████████████████████████| 8.97k/8.97k [00:00<00:00, 26.6kB/s]
Successfully submitted to Car price prediction
Submitted file: nb01_date2024_04_28_colfold2.csv with configuration: catboost-folds6-seed56631-fold2
100%|██████████████████████████████████████| 8.95k/8.95k [00:00<00:00, 11.3kB/s]
Successfully submitted to Car price prediction
Submitted file: nb01_date2024_04_28_colfold3.csv with configuration: catboost-folds6-seed56631-fold3
100%|██████████████████████████████████████| 8.93k/8.93k [00:00<00:00, 26.4kB/s]

### ----- END -----