# Setup

In [1]:
import gc
import time
import warnings

# Make sure the cyclic garbage collector is on; the CV loop calls gc.collect() explicitly.
gc.enable()
# NOTE(review): this silences every warning category for the whole session,
# including any raised by the libraries imported below.
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
# Display floats with 4 decimal places and never truncate wide frames.
pd.set_option('display.precision', 4)
pd.set_option('display.max_columns', None)

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from IPython.display import clear_output

import optuna
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, roc_auc_score

# Default random seed; run_experiment forwards it to the CV splitter,
# the Optuna sampler and the XGBoost model.
SEED = 2024

In [2]:
# Pick the training device by probing for a working `nvidia-smi` binary.
import subprocess

try:
    subprocess.check_output('nvidia-smi')
except Exception:
    # Any failure (e.g. binary missing or non-zero exit) means: train on CPU.
    DEVICE = 'cpu'
else:
    DEVICE = 'cuda'

print(f'Available device: {DEVICE}')

Available device: cuda


In [3]:
# Kaggle input directory holding the competition CSVs.
DATA_DIR = '/kaggle/input/bitgrit-crypto-price-prediction'

# Read, in order: the training rows, the test rows and the submission template.
train, test, sample_sub = (
    pd.read_csv(f'{DATA_DIR}/{stem}.csv')
    for stem in ('train', 'test', 'solution_format'))

# Data preparation

In [4]:
# Name of the label column read from `train` and written to the submission frame.
TARGET = 'Target'

In [5]:
# Model inputs: every column of `test` except 'ID' and 'feature_10_F'.
# NOTE(review): when cells run top to bottom this is evaluated on the raw test
# frame, i.e. before preprocess_data() adds 'feature_10_F_missing', so that
# engineered flag is not a model input. Re-running this cell after
# preprocessing would include it — confirm which behaviour is intended.
BASE_FEATURES = [
    column for column in test.columns
    if column != 'ID' and column != 'feature_10_F'
]

In [6]:
# 'feature_1_A' .. 'feature_9_A' and 'feature_1_F' .. 'feature_9_F'.
feats_A, feats_F = (
    [f'feature_{idx}_{suffix}' for idx in range(1, 10)]
    for suffix in ('A', 'F'))

# Columns that preprocess_data() casts to pandas `category` dtype.
CAT_FEATURES = [
    'TR_1_EventInd',
    'TR_2_EventInd',
    'TR_3_EventInd',
    'feature_X_A',
    'feature_10_A',
    'feature_10_F_missing',
    'feature_10_G',
    *feats_A,
    *feats_F,
]

In [7]:
def preprocess_data(df):
    """Cast indicator/categorical columns and flag missing ``feature_10_F``.

    The frame is modified in place and also returned, so pass a copy if the
    caller's data must stay untouched.

    Steps:
      * NaNs in the three ``TR_*_EventInd`` columns become 0 and the columns
        are cast to int.
      * ``feature_10_F_missing`` is added: 1 where ``feature_10_F`` is NaN,
        else 0.
      * Every column listed in ``CAT_FEATURES`` is cast to ``category`` dtype.

    NOTE(review): categories are inferred separately for each frame passed in,
    so train and test may end up with different category sets — confirm the
    downstream model tolerates that.
    """
    event_cols = ['TR_1_EventInd', 'TR_2_EventInd', 'TR_3_EventInd']
    events_filled = df[event_cols].fillna(0)
    df[event_cols] = events_filled.astype('int')

    is_missing = df['feature_10_F'].isna()
    df['feature_10_F_missing'] = is_missing.astype('int')

    as_category = df[CAT_FEATURES].astype('category')
    df[CAT_FEATURES] = as_category

    return df


# preprocess_data() mutates its argument, so hand it a copy of each frame.
train, test = (preprocess_data(frame.copy()) for frame in (train, test))

In [8]:
# Subset of BASE_FEATURES whose names start with 'feature_'.
market_features = list(
    filter(lambda name: name.startswith('feature_'), BASE_FEATURES))

# Hyperparameter tuning

In [9]:
def comp_metric(y_true, y_pred):
    """Score predictions with ``roc_auc_score``.

    Args:
        y_true: ground-truth labels.
        y_pred: predicted scores, passed straight through to ``roc_auc_score``.

    Returns:
        Whatever ``roc_auc_score(y_true, y_pred)`` returns.
    """
    auc = roc_auc_score(y_true, y_pred)
    return auc

In [10]:
def objective(trial, features, model, folds, seed):
    """Optuna objective: out-of-fold ``comp_metric`` for one sampled configuration.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        features: column names of the global ``train`` frame used as inputs.
        model: estimator exposing ``set_params``/``fit``/``predict_proba``; it is
            mutated (parameters overwritten, refit once per fold).
        folds: number of stratified CV splits.
        seed: ``random_state`` of the shuffled split.

    Returns:
        ``comp_metric`` computed on the positive-class probabilities collected
        from every validation fold.

    NOTE(review): each validation fold is also passed as ``eval_set``; if the
    model is configured with early stopping, the stopping round is chosen on the
    same rows that are scored here.
    """
    X, y = train[features], train[TARGET]

    # learning_rate is not part of the search space; the model keeps whatever
    # value it was constructed with.
    param_grid = {
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_child_weight': trial.suggest_int('min_child_weight', 2, 64),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0, step=0.05),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0, step=0.05),
        'gamma': trial.suggest_float('gamma', 0, 20, step=0.1),  # complexity-control
        'alpha': trial.suggest_float('alpha', 0, 5, step=0.1),  # L1-reg
        'lambda': trial.suggest_float('lambda', 5e-3, 5e3, log=True),  # L2-reg
        'max_cat_to_onehot': trial.suggest_categorical('max_cat_to_onehot', [1, 2, 6]),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 0.5, 1.5, step=0.05)
    }

    # The sampled parameters are identical for every fold: apply them once
    # instead of on each loop iteration.
    model.set_params(**param_grid)

    # Every row lands in exactly one validation fold, so a positional buffer
    # ends up holding the complete set of out-of-fold probabilities.
    oof_preds = np.empty(len(y), dtype=float)

    cv = StratifiedKFold(n_splits=folds, shuffle=True, random_state=seed)
    for train_ids, val_ids in cv.split(X, y):
        X_train, y_train = X.iloc[train_ids], y.iloc[train_ids]
        X_val, y_val = X.iloc[val_ids], y.iloc[val_ids]

        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            verbose=0)

        oof_preds[val_ids] = model.predict_proba(X_val)[:, 1]

    return comp_metric(y, oof_preds)

In [11]:
def tune_params(features, model, folds, seed, n_trials, direction):
    """Run an Optuna study over ``objective`` and return the finished study.

    Args:
        features, model, folds, seed: forwarded unchanged to ``objective``;
            ``seed`` also seeds the TPE sampler.
        n_trials: number of trials to run.
        direction: optimisation direction passed to ``optuna.create_study``.

    Returns:
        The ``optuna`` study after ``optimize`` has completed.
    """
    sampler = optuna.samplers.TPESampler(
        consider_endpoints=True,
        multivariate=True,
        group=True,
        seed=seed)
    # NOTE(review): a pruner only acts when the objective calls trial.report()
    # and trial.should_prune(); the `objective` in this notebook does neither,
    # so this pruner appears to have no effect — confirm.
    pruner = optuna.pruners.HyperbandPruner()
    study = optuna.create_study(
        sampler=sampler, pruner=pruner, direction=direction)

    def _evaluate(trial):
        return objective(trial, features, model, folds, seed)

    study.optimize(func=_evaluate, n_trials=n_trials, gc_after_trial=True)
    return study

# Cross-validation framework

In [12]:
def probs_to_labels(probs, threshold=0.5):
    """Binarise scores: 1 where ``probs >= threshold``, otherwise 0.

    Args:
        probs: array-like supporting ``>=`` and ``.astype`` (e.g. a NumPy array).
        threshold: inclusive cutoff for the positive label.

    Returns:
        Integer labels with the same shape as ``probs``.
    """
    is_positive = probs >= threshold
    return is_positive.astype('int')

In [13]:
def get_best_threshold(y_true, y_probs):
    """Grid-search the cutoff that maximises F1 against ``y_true``.

    Candidates come from ``np.arange(0.001, 0.999, 0.001)``; when several
    cutoffs share the top F1, the first (lowest) one is returned.

    NOTE(review): ``np.arange`` with a float step can differ by one element at
    the upper end due to rounding — confirm whether ~0.999 is meant to be tried.

    Args:
        y_true: ground-truth binary labels.
        y_probs: predicted positive-class probabilities.

    Returns:
        The selected threshold (an element of the candidate grid).
    """
    grid = np.arange(0.001, 0.999, 0.001)
    f1_per_cutoff = np.array([
        f1_score(y_true, probs_to_labels(y_probs, cutoff))
        for cutoff in grid
    ])
    best_position = f1_per_cutoff.argmax()
    return grid[best_position]

In [14]:
def custom_cv(features, model, folds, seed, verbose=True):
    """Stratified K-fold CV with a per-fold F1-optimal threshold.

    For every fold the model is refit, the threshold maximising F1 on that
    fold's validation rows is selected, and that threshold is used to label
    both the validation rows and the full test set.

    Args:
        features: columns of the global ``train``/``test`` frames used as inputs.
        model: estimator exposing ``fit``/``predict_proba``/``best_iteration``.
        folds: number of stratified splits.
        seed: ``random_state`` of the shuffled split.
        verbose: print one metrics line per fold. The final summary line is
            printed regardless of this flag.

    Returns:
        ``(oof_preds, test_preds)``: a Series of out-of-fold labels sorted by
        row position, and a frame with one ``fold{i}`` label column per fold
        plus a ``'mode'`` column holding the row-wise mode of those columns.

    NOTE(review): the threshold is chosen and scored on the same validation
    fold, so the reported "F1 @best" figures are optimistic.
    NOTE(review): with an even number of folds the row-wise mode can tie; the
    first column of ``DataFrame.mode`` is taken (presumably the smaller label —
    verify).
    """
    X, y = train[features], train[TARGET]
    X_test = test[features]

    oof_preds, test_preds, scores = {}, {}, []

    splitter = StratifiedKFold(n_splits=folds, shuffle=True, random_state=seed)
    for fold, (train_ids, val_ids) in enumerate(splitter.split(X, y)):
        X_val, y_val = X.iloc[val_ids], y.iloc[val_ids]

        model.fit(
            X.iloc[train_ids], y.iloc[train_ids],
            eval_set=[(X_val, y_val)],
            verbose=0)

        # Validation fold: probabilities -> fold-specific threshold -> labels.
        val_probs = model.predict_proba(X_val)[:, 1]
        best_threshold = get_best_threshold(y_val, val_probs)
        val_preds = probs_to_labels(val_probs, best_threshold)
        for row_id, label in zip(val_ids, val_preds):
            oof_preds[row_id] = label

        # Test set is labelled with the threshold found on this fold.
        test_probs = model.predict_proba(X_test)[:, 1]
        test_preds[f'fold{fold}'] = probs_to_labels(test_probs, best_threshold)

        f1_best = f1_score(y_val, val_preds)
        scores.append(f1_best)
        if verbose:
            auc = comp_metric(y_val, val_probs)
            f1 = f1_score(y_val, probs_to_labels(val_probs, 0.5))
            print(f'Fold #{fold}: AUC = {auc:.5f}, F1 @0.5 = {f1: .5f}, F1 @best = {f1_best:.5f}' \
                  f' ({model.best_iteration:>3} rounds, best threshold: {best_threshold:.3f})')

        gc.collect()

    test_preds = pd.DataFrame.from_dict(test_preds)
    # Row-wise mode of the fold-wise labels (computed before the column exists).
    fold_mode = test_preds.mode(axis=1)[0]
    test_preds['mode'] = fold_mode.astype('int')
    oof_preds = pd.Series(oof_preds).sort_index()

    print(f'\nOOF F1 @best: {f1_score(y, oof_preds):.5f}, ' \
          f'Avg F1 @best: {np.mean(scores):.5f} +/- {np.std(scores):.5f}')

    return oof_preds, test_preds

In [15]:
def run_experiment(eval_metric='auc', features=BASE_FEATURES, folds=5, seed=SEED, n_trials=200):
    """Tune an XGBoost classifier with Optuna, then cross-validate and predict.

    Args:
        eval_metric: XGBoost ``eval_metric`` evaluated on each ``eval_set``.
            The Optuna study itself always maximises ``comp_metric`` as
            returned by ``objective``, independent of this value.
        features: input columns of the global ``train``/``test`` frames.
        folds: number of stratified CV splits (tuning and final CV).
        seed: seed for the model, the Optuna sampler and the CV split.
        n_trials: number of Optuna trials.

    Returns:
        ``(oof_preds, test_preds)`` exactly as produced by ``custom_cv``.
    """
    # Fixed settings; the tuned hyperparameters are layered on top later.
    # scale_pos_weight is not fixed here — it is part of the search space.
    model = xgb.XGBClassifier(
        booster='gbtree',
        tree_method='hist',
        objective='binary:logistic',
        eval_metric=eval_metric,
        learning_rate=0.01,
        n_estimators=1000,
        early_stopping_rounds=100,
        device=DEVICE,
        enable_categorical=True,
        verbosity=0,
        n_jobs=-1,
        seed=seed)

    # ---- Stage 1: hyperparameter search (higher objective value is better).
    start = time.time()
    study = tune_params(features, model, folds, seed, n_trials, direction='maximize')
    end = time.time()

    # Drop the per-trial log before printing the summary.
    clear_output(wait=True)
    print('----------Hyperparameter tuning----------')
    print(f'Best trial: {study.best_trial.number} -> Best value: {study.best_value:.5f}')
    print('Best hyperparameters:')
    for k, v in study.best_params.items():
        print(f'{k:20} - {v}')
    print(f'\n[Time taken: {end - start:.2f}s]\n')

    # ---- Stage 2: refit per fold with the best parameters and predict.
    print('-----Cross-validation and prediction-----')
    start = time.time()
    model.set_params(**study.best_params)
    cv_result = custom_cv(features, model, folds, seed)
    end = time.time()
    print(f'\n[Time taken: {end - start:.2f}s]\n')

    oof_preds, test_preds = cv_result
    return oof_preds, test_preds

**Submission files**

In [16]:
def create_submission_files(preds, config, notebook='04'):
    """Write the fold-wise mode predictions to ``nb{notebook}_{config}.csv``.

    Args:
        preds: mapping/frame whose ``'mode'`` entry holds one label per row of
            the submission template, in the template's row order.
        config: experiment tag embedded in the output file name.
        notebook: notebook number embedded in the output file name.

    Raises:
        ValueError: if the number of predictions differs from the number of
            rows in the submission template.
    """
    sub = sample_sub.copy()
    labels = np.asarray(preds['mode'])
    if len(labels) != len(sub):
        raise ValueError(
            f'Expected {len(sub)} predictions, got {len(labels)}')
    # Assign by position: assigning a Series aligns on index labels and would
    # silently write NaN for any row whose label is absent from `preds`.
    sub[TARGET] = labels
    sub.to_csv(f'nb{notebook}_{config}.csv', index=False)

In [17]:
# Result stores keyed by experiment config string:
#   op -> OOF predictions, tp -> test predictions
op, tp = {}, {}

Trial run:

In [18]:
# Smoke test: log Optuna at INFO level and run the whole pipeline with only 3 trials.
optuna.logging.set_verbosity(optuna.logging.INFO)
# Both return values are discarded.
# NOTE(review): binding `_` shadows IPython's last-output variable for the session.
_ , _ = run_experiment(n_trials=3)

----------Hyperparameter tuning----------
Best trial: 2 -> Best value: 0.62580
Best hyperparameters:
max_depth            - 12
min_child_weight     - 17
subsample            - 0.7
colsample_bytree     - 0.9
gamma                - 16.0
alpha                - 2.7
lambda               - 0.9890443998013065
max_cat_to_onehot    - 6
scale_pos_weight     - 0.75

[Time taken: 9.51s]

-----Cross-validation and prediction-----
Fold #0: AUC = 0.62839, F1 @0.5 =  0.39524, F1 @best = 0.73935 ( 51 rounds, best threshold: 0.444)
Fold #1: AUC = 0.65571, F1 @0.5 =  0.37905, F1 @best = 0.73504 (106 rounds, best threshold: 0.486)
Fold #2: AUC = 0.67492, F1 @0.5 =  0.35714, F1 @best = 0.74900 ( 64 rounds, best threshold: 0.454)
Fold #3: AUC = 0.62411, F1 @0.5 =  0.66769, F1 @best = 0.72422 ( 87 rounds, best threshold: 0.001)
Fold #4: AUC = 0.63784, F1 @0.5 =  0.68389, F1 @best = 0.72638 ( 58 rounds, best threshold: 0.434)

OOF F1 @best: 0.73450, Avg F1 @best: 0.73480 +/- 0.00900

[Time taken: 12.08s]



# Experiments

In [19]:
%%time
# Experiment: eval_metric 'auc', BASE feature set, 5 folds, default seed.
eval_metric, feature_set = 'auc', 'BASE'
folds, seed = 5, SEED
config = f'eval{eval_metric.upper()}_feat{feature_set}_folds{folds}_seed{seed}'

# Tune + cross-validate, then store the OOF / test predictions under `config`.
op[config], tp[config] = run_experiment(
    features=BASE_FEATURES, eval_metric=eval_metric, seed=seed, folds=folds)

# Write the fold-mode test labels as a submission CSV.
create_submission_files(preds=tp[config], config=config)

----------Hyperparameter tuning----------
Best trial: 160 -> Best value: 0.80364
Best hyperparameters:
max_depth            - 12
min_child_weight     - 2
subsample            - 0.95
colsample_bytree     - 1.0
gamma                - 0.5
alpha                - 0.1
lambda               - 0.2356584595440954
max_cat_to_onehot    - 1
scale_pos_weight     - 1.15

[Time taken: 2726.78s]

-----Cross-validation and prediction-----
Fold #0: AUC = 0.82836, F1 @0.5 =  0.79755, F1 @best = 0.81009 (381 rounds, best threshold: 0.450)
Fold #1: AUC = 0.82489, F1 @0.5 =  0.78222, F1 @best = 0.80109 (335 rounds, best threshold: 0.386)
Fold #2: AUC = 0.79489, F1 @0.5 =  0.78455, F1 @best = 0.78697 (281 rounds, best threshold: 0.367)
Fold #3: AUC = 0.77730, F1 @0.5 =  0.74320, F1 @best = 0.77792 (345 rounds, best threshold: 0.300)
Fold #4: AUC = 0.79542, F1 @0.5 =  0.75455, F1 @best = 0.77524 (479 rounds, best threshold: 0.341)

OOF F1 @best: 0.78981, Avg F1 @best: 0.79026 +/- 0.01341

[Time taken: 39.32s]


In [20]:
%%time
# Experiment: eval_metric 'aucpr', BASE feature set, 5 folds, default seed.
eval_metric, feature_set = 'aucpr', 'BASE'
folds, seed = 5, SEED
config = f'eval{eval_metric.upper()}_feat{feature_set}_folds{folds}_seed{seed}'

# Tune + cross-validate, then store the OOF / test predictions under `config`.
op[config], tp[config] = run_experiment(
    features=BASE_FEATURES, eval_metric=eval_metric, seed=seed, folds=folds)

# Write the fold-mode test labels as a submission CSV.
create_submission_files(preds=tp[config], config=config)

----------Hyperparameter tuning----------
Best trial: 150 -> Best value: 0.79803
Best hyperparameters:
max_depth            - 12
min_child_weight     - 4
subsample            - 0.95
colsample_bytree     - 0.7
gamma                - 0.0
alpha                - 0.30000000000000004
lambda               - 0.02322180676976179
max_cat_to_onehot    - 2
scale_pos_weight     - 0.8500000000000001

[Time taken: 2820.31s]

-----Cross-validation and prediction-----
Fold #0: AUC = 0.83118, F1 @0.5 =  0.77796, F1 @best = 0.80290 (496 rounds, best threshold: 0.386)
Fold #1: AUC = 0.81948, F1 @0.5 =  0.76656, F1 @best = 0.79614 (493 rounds, best threshold: 0.379)
Fold #2: AUC = 0.79529, F1 @0.5 =  0.75329, F1 @best = 0.78680 (520 rounds, best threshold: 0.342)
Fold #3: AUC = 0.76214, F1 @0.5 =  0.71242, F1 @best = 0.77124 (665 rounds, best threshold: 0.247)
Fold #4: AUC = 0.79126, F1 @0.5 =  0.75520, F1 @best = 0.76923 (388 rounds, best threshold: 0.358)

OOF F1 @best: 0.78487, Avg F1 @best: 0.78526 +/-

In [21]:
%%time
# Experiment: eval_metric 'auc', market ('feature_*') columns only, 5 folds, default seed.
eval_metric, feature_set = 'auc', 'MRKT'
folds, seed = 5, SEED
config = f'eval{eval_metric.upper()}_feat{feature_set}_folds{folds}_seed{seed}'

# Tune + cross-validate, then store the OOF / test predictions under `config`.
op[config], tp[config] = run_experiment(
    features=market_features, eval_metric=eval_metric, seed=seed, folds=folds)

# Write the fold-mode test labels as a submission CSV.
create_submission_files(preds=tp[config], config=config)

----------Hyperparameter tuning----------
Best trial: 112 -> Best value: 0.80477
Best hyperparameters:
max_depth            - 11
min_child_weight     - 2
subsample            - 0.95
colsample_bytree     - 0.85
gamma                - 0.4
alpha                - 0.8
lambda               - 0.08478755463963011
max_cat_to_onehot    - 2
scale_pos_weight     - 1.3

[Time taken: 3140.05s]

-----Cross-validation and prediction-----
Fold #0: AUC = 0.82974, F1 @0.5 =  0.80775, F1 @best = 0.80952 (321 rounds, best threshold: 0.496)
Fold #1: AUC = 0.82390, F1 @0.5 =  0.79886, F1 @best = 0.80668 (322 rounds, best threshold: 0.467)
Fold #2: AUC = 0.79643, F1 @0.5 =  0.77485, F1 @best = 0.78378 (332 rounds, best threshold: 0.525)
Fold #3: AUC = 0.77661, F1 @0.5 =  0.76968, F1 @best = 0.77714 (320 rounds, best threshold: 0.476)
Fold #4: AUC = 0.80062, F1 @0.5 =  0.75780, F1 @best = 0.77657 (544 rounds, best threshold: 0.387)

OOF F1 @best: 0.79060, Avg F1 @best: 0.79074 +/- 0.01443

[Time taken: 37.25s]

In [22]:
%%time
# Experiment: eval_metric 'aucpr', market ('feature_*') columns only, 5 folds, default seed.
eval_metric, feature_set = 'aucpr', 'MRKT'
folds, seed = 5, SEED
config = f'eval{eval_metric.upper()}_feat{feature_set}_folds{folds}_seed{seed}'

# Tune + cross-validate, then store the OOF / test predictions under `config`.
op[config], tp[config] = run_experiment(
    features=market_features, eval_metric=eval_metric, seed=seed, folds=folds)

# Write the fold-mode test labels as a submission CSV.
create_submission_files(preds=tp[config], config=config)

----------Hyperparameter tuning----------
Best trial: 180 -> Best value: 0.80145
Best hyperparameters:
max_depth            - 10
min_child_weight     - 2
subsample            - 0.9
colsample_bytree     - 0.75
gamma                - 0.2
alpha                - 0.2
lambda               - 0.4192195606443289
max_cat_to_onehot    - 1
scale_pos_weight     - 1.25

[Time taken: 3189.34s]

-----Cross-validation and prediction-----
Fold #0: AUC = 0.82627, F1 @0.5 =  0.80120, F1 @best = 0.80519 (353 rounds, best threshold: 0.458)
Fold #1: AUC = 0.82312, F1 @0.5 =  0.79480, F1 @best = 0.80614 (464 rounds, best threshold: 0.461)
Fold #2: AUC = 0.79401, F1 @0.5 =  0.78499, F1 @best = 0.78904 (320 rounds, best threshold: 0.427)
Fold #3: AUC = 0.76955, F1 @0.5 =  0.74854, F1 @best = 0.78411 (327 rounds, best threshold: 0.349)
Fold #4: AUC = 0.79886, F1 @0.5 =  0.75188, F1 @best = 0.77188 (773 rounds, best threshold: 0.297)

OOF F1 @best: 0.79090, Avg F1 @best: 0.79127 +/- 0.01302

[Time taken: 39.35s]


In [23]:
%%time
# Experiment: eval_metric 'aucpr', market ('feature_*') columns only, 5 folds,
# alternative seed 808.
eval_metric, feature_set = 'aucpr', 'MRKT'
folds, seed = 5, 808
config = f'eval{eval_metric.upper()}_feat{feature_set}_folds{folds}_seed{seed}'

# Tune + cross-validate, then store the OOF / test predictions under `config`.
op[config], tp[config] = run_experiment(
    features=market_features, eval_metric=eval_metric, seed=seed, folds=folds)

# Write the fold-mode test labels as a submission CSV.
create_submission_files(preds=tp[config], config=config)

----------Hyperparameter tuning----------
Best trial: 152 -> Best value: 0.80624
Best hyperparameters:
max_depth            - 8
min_child_weight     - 2
subsample            - 1.0
colsample_bytree     - 0.6
gamma                - 0.4
alpha                - 0.0
lambda               - 0.008744495514163888
max_cat_to_onehot    - 1
scale_pos_weight     - 0.95

[Time taken: 3164.51s]

-----Cross-validation and prediction-----
Fold #0: AUC = 0.82032, F1 @0.5 =  0.78684, F1 @best = 0.79635 (203 rounds, best threshold: 0.537)
Fold #1: AUC = 0.80131, F1 @0.5 =  0.75851, F1 @best = 0.79888 (564 rounds, best threshold: 0.369)
Fold #2: AUC = 0.81465, F1 @0.5 =  0.77895, F1 @best = 0.79096 (214 rounds, best threshold: 0.442)
Fold #3: AUC = 0.82276, F1 @0.5 =  0.77691, F1 @best = 0.80580 (370 rounds, best threshold: 0.435)
Fold #4: AUC = 0.79755, F1 @0.5 =  0.76543, F1 @best = 0.77616 (678 rounds, best threshold: 0.305)

OOF F1 @best: 0.79331, Avg F1 @best: 0.79363 +/- 0.00995

[Time taken: 30.72s]


In [24]:
%%time
# Experiment: eval_metric 'aucpr', market ('feature_*') columns only, 7 folds, default seed.
eval_metric, feature_set = 'aucpr', 'MRKT'
folds, seed = 7, SEED
config = f'eval{eval_metric.upper()}_feat{feature_set}_folds{folds}_seed{seed}'

# Tune + cross-validate, then store the OOF / test predictions under `config`.
op[config], tp[config] = run_experiment(
    features=market_features, eval_metric=eval_metric, seed=seed, folds=folds)

# Write the fold-mode test labels as a submission CSV.
create_submission_files(preds=tp[config], config=config)

----------Hyperparameter tuning----------
Best trial: 126 -> Best value: 0.81367
Best hyperparameters:
max_depth            - 12
min_child_weight     - 3
subsample            - 0.9
colsample_bytree     - 0.8
gamma                - 0.8
alpha                - 0.5
lambda               - 0.01033338273929496
max_cat_to_onehot    - 6
scale_pos_weight     - 0.9

[Time taken: 3690.48s]

-----Cross-validation and prediction-----
Fold #0: AUC = 0.87095, F1 @0.5 =  0.80556, F1 @best = 0.83156 (389 rounds, best threshold: 0.449)
Fold #1: AUC = 0.77646, F1 @0.5 =  0.76231, F1 @best = 0.78835 (161 rounds, best threshold: 0.416)
Fold #2: AUC = 0.83954, F1 @0.5 =  0.78959, F1 @best = 0.80608 (258 rounds, best threshold: 0.367)
Fold #3: AUC = 0.82450, F1 @0.5 =  0.77263, F1 @best = 0.79619 (282 rounds, best threshold: 0.361)
Fold #4: AUC = 0.79398, F1 @0.5 =  0.75785, F1 @best = 0.78491 (364 rounds, best threshold: 0.317)
Fold #5: AUC = 0.77501, F1 @0.5 =  0.71163, F1 @best = 0.76983 (472 rounds, best 

In [25]:
%%time
# Experiment: eval_metric 'aucpr', market ('feature_*') columns only, 10 folds, default seed.
# NOTE(review): with an even fold count the row-wise mode taken in custom_cv
# can tie (5 vs 5 votes) — confirm the tie-break is acceptable.
eval_metric, feature_set = 'aucpr', 'MRKT'
folds, seed = 10, SEED
config = f'eval{eval_metric.upper()}_feat{feature_set}_folds{folds}_seed{seed}'

# Tune + cross-validate, then store the OOF / test predictions under `config`.
op[config], tp[config] = run_experiment(
    features=market_features, eval_metric=eval_metric, seed=seed, folds=folds)

# Write the fold-mode test labels as a submission CSV.
create_submission_files(preds=tp[config], config=config)

----------Hyperparameter tuning----------
Best trial: 122 -> Best value: 0.81466
Best hyperparameters:
max_depth            - 12
min_child_weight     - 3
subsample            - 0.95
colsample_bytree     - 0.9
gamma                - 0.4
alpha                - 0.2
lambda               - 0.08371954962359997
max_cat_to_onehot    - 1
scale_pos_weight     - 1.4500000000000002

[Time taken: 5035.66s]

-----Cross-validation and prediction-----
Fold #0: AUC = 0.85799, F1 @0.5 =  0.80851, F1 @best = 0.82799 (512 rounds, best threshold: 0.448)
Fold #1: AUC = 0.83899, F1 @0.5 =  0.79202, F1 @best = 0.81553 (155 rounds, best threshold: 0.624)
Fold #2: AUC = 0.81676, F1 @0.5 =  0.79452, F1 @best = 0.80109 (152 rounds, best threshold: 0.494)
Fold #3: AUC = 0.84331, F1 @0.5 =  0.82486, F1 @best = 0.82486 (216 rounds, best threshold: 0.487)
Fold #4: AUC = 0.83945, F1 @0.5 =  0.80115, F1 @best = 0.80347 (319 rounds, best threshold: 0.503)
Fold #5: AUC = 0.81515, F1 @0.5 =  0.79656, F1 @best = 0.80000 (2