# Setup

In [1]:
# Standard library.
import gc
import warnings

# Make sure automatic garbage collection is on, and silence every warning
# for the rest of the session. The filter is installed before the library
# imports below, so warnings raised while importing them are hidden as well.
gc.enable()
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
# Display frames with 4 decimal places and never truncate the column list.
pd.set_option('display.precision', 4)
pd.set_option('display.max_columns', None)

import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score

# Default random seed, used for both the CV splitter and the models.
SEED = 2024

In [2]:
lgb.__version__

'4.2.0'

In [3]:
# Directory holding the competition CSV files.
DATA_DIR = '/kaggle/input/bitgrit-crypto-price-prediction'

# `train` carries the label column, `test` does not; `sample_sub` is the
# submission template (columns `ID` and `Target`).
train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')
sample_sub = pd.read_csv(f'{DATA_DIR}/solution_format.csv')

# Data overview

In [4]:
sample_sub.columns

Index(['ID', 'Target'], dtype='object')

In [5]:
train.head()

Unnamed: 0,ID,TR_1_EventInd,TR_2_EventInd,TR_3_EventInd,feature_10_A,feature_10_B,feature_10_F,feature_10_G,feature_1_A,feature_1_B,feature_1_C,feature_1_D,feature_1_E,feature_1_F,feature_1_G,feature_2_A,feature_2_B,feature_2_C,feature_2_D,feature_2_E,feature_2_F,feature_2_G,feature_3_A,feature_3_B,feature_3_C,feature_3_D,feature_3_E,feature_3_F,feature_3_G,feature_4_A,feature_4_B,feature_4_C,feature_4_E,feature_4_F,feature_4_G,feature_5_A,feature_5_B,feature_5_C,feature_5_D,feature_5_E,feature_5_F,feature_5_G,feature_6_A,feature_6_B,feature_6_C,feature_6_D,feature_6_E,feature_6_F,feature_6_G,feature_7_A,feature_7_B,feature_7_C,feature_7_D,feature_7_E,feature_7_F,feature_7_G,feature_8_A,feature_8_B,feature_8_C,feature_8_D,feature_8_E,feature_8_F,feature_8_G,feature_9_A,feature_9_B,feature_9_C,feature_9_D,feature_9_E,feature_9_F,feature_9_G,feature_X_A,feature_X_B,feature_X_C,feature_X_D,index_1,index_2,index_3,Target
0,1,,,,0.0,,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1,0.049,1.5425,-0.2747,0.0,0.0,1.0,0
1,2,,,,0.0,,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1,0.015,1.2875,-1.1406,,,,0
2,3,,,,0.0,0.023,0.0,0.0,1.0,0.004,-0.2244,-1.3548,122.7936,1.0,1.0,0.0,0.011,-1.2904,1.2931,41.9933,1.0,1.0,0.0,0.01,-1.7287,0.3817,114.9724,1.0,1.0,0.0,0.01,0.1208,106.7018,0.0,1.0,1.0,0.018,-1.0708,-0.6021,11.4924,1.0,1.0,1.0,0.009,-0.7593,0.2421,110.2917,1.0,1.0,0.0,0.01,-1.7708,0.5285,209.7878,1.0,0.0,1.0,0.016,-0.8865,-0.941,14.8012,0.0,1.0,0.0,0.032,-1.938,0.73,2.1619,0.0,1.0,1,0.006,0.2515,-1.2216,,,,0
3,4,,,,1.0,0.019,0.0,2.0,0.0,0.005,-1.3725,-1.4973,116.5155,0.0,0.0,1.0,0.005,1.891,0.1048,45.5508,1.0,2.0,1.0,0.012,0.0778,-0.9358,134.0235,1.0,1.0,1.0,0.004,1.5798,112.4135,1.0,3.0,1.0,0.068,3.358,3.2775,24.7031,1.0,2.0,0.0,0.004,1.1889,-0.0793,125.4054,1.0,5.0,1.0,0.003,1.0441,-0.761,228.4739,1.0,1.0,1.0,0.023,1.6473,-0.3779,16.2052,1.0,2.0,0.0,0.033,0.657,-0.9823,2.4165,0.0,0.0,1,0.039,1.6738,0.5278,,,,1
4,5,,,1.0,1.0,0.023,0.0,2.0,1.0,0.014,1.6641,2.3542,123.7083,1.0,1.0,1.0,0.014,0.8111,0.7767,56.9611,1.0,2.0,1.0,0.013,0.257,0.1122,150.7831,1.0,1.0,0.0,0.004,0.7826,,0.0,2.0,0.0,0.019,0.7324,-0.8972,52.2727,1.0,0.0,0.0,0.015,-1.1118,0.2605,156.6009,1.0,0.0,0.0,0.011,-1.0811,-0.6182,263.8278,1.0,0.0,1.0,0.039,-1.1132,0.149,22.299,0.0,1.0,1.0,0.084,2.2816,1.2752,3.4386,1.0,2.0,0,0.037,0.2634,-0.302,,,,0


In [6]:
train.shape, test.shape, sample_sub.shape

((2660, 78), (666, 77), (666, 2))

**Target distribution**

In [7]:
# Name of the label column in `train`.
TARGET = 'Target'
# Class balance expressed as fractions of rows.
train[TARGET].value_counts(normalize=True)

Target
1    0.5677
0    0.4323
Name: proportion, dtype: float64

**% of missing values**

In [8]:
(train.isna().sum() / len(train) * 100).to_frame().T

Unnamed: 0,ID,TR_1_EventInd,TR_2_EventInd,TR_3_EventInd,feature_10_A,feature_10_B,feature_10_F,feature_10_G,feature_1_A,feature_1_B,feature_1_C,feature_1_D,feature_1_E,feature_1_F,feature_1_G,feature_2_A,feature_2_B,feature_2_C,feature_2_D,feature_2_E,feature_2_F,feature_2_G,feature_3_A,feature_3_B,feature_3_C,feature_3_D,feature_3_E,feature_3_F,feature_3_G,feature_4_A,feature_4_B,feature_4_C,feature_4_E,feature_4_F,feature_4_G,feature_5_A,feature_5_B,feature_5_C,feature_5_D,feature_5_E,feature_5_F,feature_5_G,feature_6_A,feature_6_B,feature_6_C,feature_6_D,feature_6_E,feature_6_F,feature_6_G,feature_7_A,feature_7_B,feature_7_C,feature_7_D,feature_7_E,feature_7_F,feature_7_G,feature_8_A,feature_8_B,feature_8_C,feature_8_D,feature_8_E,feature_8_F,feature_8_G,feature_9_A,feature_9_B,feature_9_C,feature_9_D,feature_9_E,feature_9_F,feature_9_G,feature_X_A,feature_X_B,feature_X_C,feature_X_D,index_1,index_2,index_3,Target
0,0.0,97.2556,97.7068,96.3534,16.8421,30.7143,16.8421,16.8421,30.6767,30.6767,30.8271,30.8271,34.5113,30.6767,30.6767,30.6767,30.6767,30.8271,30.8271,34.5113,30.6767,30.6767,30.6767,30.6767,30.8271,30.8271,34.5113,30.6767,30.6767,28.0075,28.1203,29.1729,43.1203,28.0075,28.0075,30.7143,30.7143,30.8647,30.8647,34.5489,30.7143,30.7143,30.6767,30.6767,30.8271,30.8271,34.5113,30.6767,30.6767,30.6767,30.6767,30.8271,30.8271,34.5113,30.6767,30.6767,30.6767,30.6767,30.8271,30.8271,34.5113,30.6767,30.6767,30.7143,30.7143,30.8647,30.8647,34.5489,30.7143,30.7143,0.0,0.0,0.0,0.0,96.4286,96.4286,96.4286,0.0


In [9]:
(test.isna().sum() / len(test) * 100).to_frame().T

Unnamed: 0,ID,TR_1_EventInd,TR_2_EventInd,TR_3_EventInd,feature_10_A,feature_10_B,feature_10_F,feature_10_G,feature_1_A,feature_1_B,feature_1_C,feature_1_D,feature_1_E,feature_1_F,feature_1_G,feature_2_A,feature_2_B,feature_2_C,feature_2_D,feature_2_E,feature_2_F,feature_2_G,feature_3_A,feature_3_B,feature_3_C,feature_3_D,feature_3_E,feature_3_F,feature_3_G,feature_4_A,feature_4_B,feature_4_C,feature_4_E,feature_4_F,feature_4_G,feature_5_A,feature_5_B,feature_5_C,feature_5_D,feature_5_E,feature_5_F,feature_5_G,feature_6_A,feature_6_B,feature_6_C,feature_6_D,feature_6_E,feature_6_F,feature_6_G,feature_7_A,feature_7_B,feature_7_C,feature_7_D,feature_7_E,feature_7_F,feature_7_G,feature_8_A,feature_8_B,feature_8_C,feature_8_D,feature_8_E,feature_8_F,feature_8_G,feature_9_A,feature_9_B,feature_9_C,feature_9_D,feature_9_E,feature_9_F,feature_9_G,feature_X_A,feature_X_B,feature_X_C,feature_X_D,index_1,index_2,index_3
0,0.0,98.048,98.3483,95.3453,16.5165,33.033,16.5165,16.5165,32.8829,32.8829,33.033,33.033,37.2372,32.8829,32.8829,32.8829,32.8829,33.033,33.033,37.2372,32.8829,32.8829,32.8829,32.8829,33.033,33.033,37.2372,32.8829,32.8829,30.7808,30.7808,33.033,45.7958,30.7808,30.7808,32.8829,32.8829,33.033,33.033,37.2372,32.8829,32.8829,32.8829,32.8829,33.033,33.033,37.2372,32.8829,32.8829,32.8829,32.8829,33.033,33.033,37.2372,32.8829,32.8829,32.8829,32.8829,33.033,33.033,37.2372,32.8829,32.8829,32.8829,32.8829,33.033,33.033,37.2372,32.8829,32.8829,0.0,0.0,0.0,0.0,97.8979,97.8979,97.8979


**Number of unique values**

In [10]:
train.nunique().to_frame().T  # not including NaN

Unnamed: 0,ID,TR_1_EventInd,TR_2_EventInd,TR_3_EventInd,feature_10_A,feature_10_B,feature_10_F,feature_10_G,feature_1_A,feature_1_B,feature_1_C,feature_1_D,feature_1_E,feature_1_F,feature_1_G,feature_2_A,feature_2_B,feature_2_C,feature_2_D,feature_2_E,feature_2_F,feature_2_G,feature_3_A,feature_3_B,feature_3_C,feature_3_D,feature_3_E,feature_3_F,feature_3_G,feature_4_A,feature_4_B,feature_4_C,feature_4_E,feature_4_F,feature_4_G,feature_5_A,feature_5_B,feature_5_C,feature_5_D,feature_5_E,feature_5_F,feature_5_G,feature_6_A,feature_6_B,feature_6_C,feature_6_D,feature_6_E,feature_6_F,feature_6_G,feature_7_A,feature_7_B,feature_7_C,feature_7_D,feature_7_E,feature_7_F,feature_7_G,feature_8_A,feature_8_B,feature_8_C,feature_8_D,feature_8_E,feature_8_F,feature_8_G,feature_9_A,feature_9_B,feature_9_C,feature_9_D,feature_9_E,feature_9_F,feature_9_G,feature_X_A,feature_X_B,feature_X_C,feature_X_D,index_1,index_2,index_3,Target
0,2660,1,1,1,2,110,1,5,2,34,1840,1840,1742,2,11,2,45,1840,1840,1742,2,12,2,66,1840,1840,1742,2,11,2,35,1884,1513,2,12,2,101,1839,1839,1741,2,11,2,61,1840,1840,1742,2,12,2,56,1840,1840,1742,2,9,2,124,1840,1840,1741,2,14,2,164,1838,1839,1721,2,12,2,189,2660,2660,7,25,35,2


In [11]:
test.nunique().to_frame().T

Unnamed: 0,ID,TR_1_EventInd,TR_2_EventInd,TR_3_EventInd,feature_10_A,feature_10_B,feature_10_F,feature_10_G,feature_1_A,feature_1_B,feature_1_C,feature_1_D,feature_1_E,feature_1_F,feature_1_G,feature_2_A,feature_2_B,feature_2_C,feature_2_D,feature_2_E,feature_2_F,feature_2_G,feature_3_A,feature_3_B,feature_3_C,feature_3_D,feature_3_E,feature_3_F,feature_3_G,feature_4_A,feature_4_B,feature_4_C,feature_4_E,feature_4_F,feature_4_G,feature_5_A,feature_5_B,feature_5_C,feature_5_D,feature_5_E,feature_5_F,feature_5_G,feature_6_A,feature_6_B,feature_6_C,feature_6_D,feature_6_E,feature_6_F,feature_6_G,feature_7_A,feature_7_B,feature_7_C,feature_7_D,feature_7_E,feature_7_F,feature_7_G,feature_8_A,feature_8_B,feature_8_C,feature_8_D,feature_8_E,feature_8_F,feature_8_G,feature_9_A,feature_9_B,feature_9_C,feature_9_D,feature_9_E,feature_9_F,feature_9_G,feature_X_A,feature_X_B,feature_X_C,feature_X_D,index_1,index_2,index_3
0,666,1,1,1,2,72,1,5,2,28,446,446,418,2,9,2,30,446,446,417,2,9,2,47,446,446,418,2,8,2,26,446,361,2,11,2,79,446,446,418,2,9,2,47,446,446,418,2,10,2,41,446,446,417,2,8,2,89,446,446,418,2,10,2,122,446,446,417,2,10,2,126,666,666,6,5,12


# Data preparation

**Feature sets**

In [12]:
# `ID` is a unique row identifier, so it is the only column excluded here.
# Columns with a single distinct non-null value (e.g. TR_*_EventInd) are NOT
# dropped by this filter.
# The comparison is `f != 'ID'` rather than `f not in ('ID')`: the latter is a
# substring test against the string 'ID' (parentheses alone do not make a
# tuple) and would also drop a column named 'I' or 'D'.
all_features = [f for f in test.columns if f != 'ID']

# Fraction of missing values per feature in the training set, computed once
# and reused by the three thresholds below.
missing_frac = train[all_features].isna().sum() / len(train)

# less than 90% missing
features_90 = [f for f in all_features if missing_frac[f] < 0.9]

# less than 25% missing
features_25 = [f for f in all_features if missing_frac[f] < 0.25]

# no values missing
features_0 = [f for f in all_features if missing_frac[f] == 0]

In [13]:
len(all_features), len(features_90), len(features_25), len(features_0)

(76, 70, 7, 4)

**Categorical features**

In [14]:
# Features with exactly two distinct non-null values in `train`; `custom_cv`
# passes the selected ones to LightGBM as categorical features.
CAT_FEAT = list(
    train[all_features].nunique().loc[lambda n_distinct: n_distinct == 2].index
)

# Modeling framework

In [15]:
def comp_metric(y_true, y_pred):
    """Return the F1 score of the hard labels `y_pred` against `y_true`.

    Thin wrapper around `sklearn.metrics.f1_score` with its default settings,
    so every score in the notebook is computed the same way.
    """
    score = f1_score(y_true, y_pred)
    return score

In [16]:
def lgbm_eval_metric(y_true, y_pred):
    """Custom evaluation metric for LightGBM's scikit-learn API: F1 score.

    `y_pred` is thresholded at 0.5 to obtain hard 0/1 labels before scoring.
    NOTE(review): this assumes LightGBM passes positive-class probabilities
    here (not raw scores) -- confirm for the objective in use.

    Returns the `(name, value, is_higher_better)` triple, with
    `is_higher_better` set to True.
    """
    hard_labels = (y_pred >= 0.5).astype('int')
    return 'f1_score', f1_score(y_true, hard_labels), True

In [17]:
def custom_cv(features, model, folds=7, seed=SEED):
    """Stratified K-fold cross-validation on the global `train`/`test` frames.

    For every fold, `model` is re-fitted on the training part (the validation
    part is supplied as `eval_set` with `lgbm_eval_metric`), then used to
    predict labels for the validation rows and for the whole test set.
    A per-fold score line and a final summary are printed.

    Parameters
    ----------
    features : list of str
        Columns of `train`/`test` to model on. Those also listed in
        `CAT_FEAT` are passed to `fit` as categorical features.
    model : estimator
        Object exposing LightGBM's scikit-learn style `fit`, `predict` and
        `best_iteration_`. The same instance is re-fitted for each fold.
    folds : int, default 7
        Number of stratified, shuffled splits.
    seed : int, default SEED
        `random_state` of the splitter.

    Returns
    -------
    oof_preds : pd.Series
        Out-of-fold predictions, indexed by row position in `train` and
        sorted by that position.
    test_preds : pd.DataFrame
        One column of test predictions per fold ('fold0', 'fold1', ...) plus
        a 'mode' column holding the row-wise majority vote.
    """
    X_all, y_all = train[features], train[TARGET]
    X_holdout = test[features]
    categorical = [col for col in features if col in CAT_FEAT]

    oof_parts = []        # one Series of validation predictions per fold
    fold_test_preds = {}  # 'fold{k}' -> predictions on the full test set
    fold_scores = []

    splitter = StratifiedKFold(n_splits=folds, shuffle=True, random_state=seed)
    for fold_no, (fit_idx, val_idx) in enumerate(splitter.split(X_all, y_all)):
        X_val, y_val = X_all.iloc[val_idx], y_all.iloc[val_idx]

        model.fit(
            X_all.iloc[fit_idx], y_all.iloc[fit_idx],
            eval_set=[(X_val, y_val)],
            eval_metric=lgbm_eval_metric,
            feature_name=features,
            categorical_feature=categorical)

        val_labels = model.predict(X_val)
        # Key the predictions by their positional row ids so the folds can be
        # stitched back together in the original row order afterwards.
        oof_parts.append(pd.Series(val_labels, index=val_idx))

        fold_test_preds[f'fold{fold_no}'] = model.predict(X_holdout)

        fold_score = comp_metric(y_val, val_labels)
        fold_scores.append(fold_score)
        print(f'Fold #{fold_no:>2}: {fold_score:.5f} ({model.best_iteration_:>4} rounds)')
        gc.collect()

    test_preds = pd.DataFrame(fold_test_preds)
    # Row-wise majority vote across the fold columns.
    # NOTE(review): on a tie (possible when `folds` is even) `[0]` keeps the
    # first mode pandas returns -- presumably the smaller label; confirm.
    majority_vote = test_preds.mode(axis=1)[0].astype('int')
    test_preds['mode'] = majority_vote

    oof_preds = pd.concat(oof_parts).sort_index()
    print(f'\nAvg: {np.mean(fold_scores):.5f} +/- {np.std(fold_scores):.5f}')
    print(f'OOF: {comp_metric(y_all, oof_preds):.5f}\n')

    return oof_preds, test_preds

In [18]:
def create_submission_files(preds, config, notebook='00'):
    """Write a submission CSV named `nb{notebook}_{config}.csv`.

    A copy of the global `sample_sub` template gets its `TARGET` column
    replaced by `preds['mode']` and is saved to the working directory without
    the index. When `preds['mode']` is a pandas Series the assignment aligns
    on the index, so `preds` is expected to share `sample_sub`'s index.

    Parameters
    ----------
    preds : mapping with a 'mode' entry (e.g. the DataFrame from `custom_cv`)
    config : str
        Experiment identifier embedded in the file name.
    notebook : str, default '00'
        Notebook identifier embedded in the file name.
    """
    out_path = f'nb{notebook}_{config}.csv'
    submission = sample_sub.copy()
    submission[TARGET] = preds['mode']
    submission.to_csv(out_path, index=False)

# Experiments

In [19]:
# Result stores for the experiments, keyed by each run's `config` string:
# `op` holds out-of-fold predictions, `tp` holds test-set predictions.
op, tp = {}, {}

In [20]:
# Hyper-parameters shared by every LightGBM model in the experiments;
# unpacked into `lgb.LGBMClassifier(**BASE_PARAMS, ...)`.
BASE_PARAMS = dict(
    objective='binary',
    n_estimators=1000,
    learning_rate=0.01,
    early_stopping_rounds=200,
    data_sample_strategy='goss',
    max_cat_to_onehot=2,
    verbosity=-1,
    device_type='cpu',
    num_threads=4,
)

In [21]:
%%time

# Baseline: every non-ID feature, 7 stratified folds, default seed.
feature_set = 'ALL'
folds = 7
seed = SEED
# `config` keys the `op`/`tp` result dicts and names the submission file.
config = f'feat{feature_set}_fold{folds}_seed{seed}'

# The same seed is given to the model and to the CV splitter.
model = lgb.LGBMClassifier(**BASE_PARAMS, seed=seed)
op[config], tp[config] = custom_cv(all_features, model, folds=folds, seed=seed)

create_submission_files(tp[config], config)

Fold # 0: 0.79755 (  67 rounds)
Fold # 1: 0.76923 ( 106 rounds)
Fold # 2: 0.78873 ( 117 rounds)
Fold # 3: 0.77642 ( 112 rounds)
Fold # 4: 0.75781 (  40 rounds)
Fold # 5: 0.76074 (  98 rounds)
Fold # 6: 0.76680 ( 101 rounds)

Avg: 0.77390 +/- 0.01357
OOF: 0.77379

CPU times: user 33.8 s, sys: 11.7 s, total: 45.5 s
Wall time: 29.4 s


In [22]:
%%time

# Same setup as the baseline, restricted to features with < 90% missing values.
# NOTE: in the recorded run the fold scores and round counts printed by this
# cell are identical to those of the ALL / 7-fold run.
feature_set = 'F90'
folds = 7
seed = SEED
config = f'feat{feature_set}_fold{folds}_seed{seed}'

model = lgb.LGBMClassifier(**BASE_PARAMS, seed=seed)
op[config], tp[config] = custom_cv(features_90, model, folds=folds, seed=seed)

create_submission_files(tp[config], config)

Fold # 0: 0.79755 (  67 rounds)
Fold # 1: 0.76923 ( 106 rounds)
Fold # 2: 0.78873 ( 117 rounds)
Fold # 3: 0.77642 ( 112 rounds)
Fold # 4: 0.75781 (  40 rounds)
Fold # 5: 0.76074 (  98 rounds)
Fold # 6: 0.76680 ( 101 rounds)

Avg: 0.77390 +/- 0.01357
OOF: 0.77379

CPU times: user 30.7 s, sys: 9.23 s, total: 40 s
Wall time: 24.7 s


In [23]:
%%time

# Same setup, restricted to features with < 25% missing values.
# NOTE: in the recorded run most folds report a best iteration of 1.
feature_set = 'F25'
folds = 7
seed = SEED
config = f'feat{feature_set}_fold{folds}_seed{seed}'

model = lgb.LGBMClassifier(**BASE_PARAMS, seed=seed)
op[config], tp[config] = custom_cv(features_25, model, folds=folds, seed=seed)

create_submission_files(tp[config], config)

Fold # 0: 0.72269 (   1 rounds)
Fold # 1: 0.71986 (  28 rounds)
Fold # 2: 0.72483 (   1 rounds)
Fold # 3: 0.72483 (   1 rounds)
Fold # 4: 0.72483 (   1 rounds)
Fold # 5: 0.72483 (   1 rounds)
Fold # 6: 0.72483 (   1 rounds)

Avg: 0.72382 +/- 0.00177
OOF: 0.72382

CPU times: user 11.9 s, sys: 6.81 s, total: 18.7 s
Wall time: 15 s


In [24]:
%%time

# Same setup, restricted to features with no missing values at all.
# NOTE: in the recorded run most folds report a best iteration of 1.
feature_set = 'F0'
folds = 7
seed = SEED
config = f'feat{feature_set}_fold{folds}_seed{seed}'

model = lgb.LGBMClassifier(**BASE_PARAMS, seed=seed)
op[config], tp[config] = custom_cv(features_0, model, folds=folds, seed=seed)

create_submission_files(tp[config], config)

Fold # 0: 0.72269 (   1 rounds)
Fold # 1: 0.72572 (  38 rounds)
Fold # 2: 0.72483 (   1 rounds)
Fold # 3: 0.72483 (   1 rounds)
Fold # 4: 0.72483 (   1 rounds)
Fold # 5: 0.72483 (   1 rounds)
Fold # 6: 0.72483 (   1 rounds)

Avg: 0.72465 +/- 0.00086
OOF: 0.72465

CPU times: user 11.6 s, sys: 6.61 s, total: 18.2 s
Wall time: 14.8 s


In [25]:
%%time

# All features again, this time with 10 folds instead of 7 (default seed).
feature_set = 'ALL'
folds = 10
seed = SEED
config = f'feat{feature_set}_fold{folds}_seed{seed}'

model = lgb.LGBMClassifier(**BASE_PARAMS, seed=seed)
op[config], tp[config] = custom_cv(all_features, model, folds=folds, seed=seed)

create_submission_files(tp[config], config)

Fold # 0: 0.78466 (  99 rounds)
Fold # 1: 0.78916 ( 119 rounds)
Fold # 2: 0.78338 ( 202 rounds)
Fold # 3: 0.79775 (  49 rounds)
Fold # 4: 0.78824 (  90 rounds)
Fold # 5: 0.76972 ( 417 rounds)
Fold # 6: 0.75532 (  27 rounds)
Fold # 7: 0.76246 ( 118 rounds)
Fold # 8: 0.75921 ( 111 rounds)
Fold # 9: 0.77212 (  31 rounds)

Avg: 0.77620 +/- 0.01368
OOF: 0.77598

CPU times: user 53.8 s, sys: 19.3 s, total: 1min 13s
Wall time: 47.1 s


In [26]:
%%time

# All features with 5 folds (default seed), to compare against 7 and 10 folds.
feature_set = 'ALL'
folds = 5
seed = SEED
config = f'feat{feature_set}_fold{folds}_seed{seed}'

model = lgb.LGBMClassifier(**BASE_PARAMS, seed=seed)
op[config], tp[config] = custom_cv(all_features, model, folds=folds, seed=seed)

create_submission_files(tp[config], config)

Fold # 0: 0.77219 ( 106 rounds)
Fold # 1: 0.79102 ( 109 rounds)
Fold # 2: 0.75543 ( 116 rounds)
Fold # 3: 0.75654 (  24 rounds)
Fold # 4: 0.75637 (  80 rounds)

Avg: 0.76631 +/- 0.01384
OOF: 0.76620

CPU times: user 23.9 s, sys: 8.1 s, total: 32.1 s
Wall time: 20.9 s


In [27]:
%%time

# All features, 10 folds, with a different seed (55) for both model and splits.
feature_set = 'ALL'
folds = 10
seed = 55
config = f'feat{feature_set}_fold{folds}_seed{seed}'

model = lgb.LGBMClassifier(**BASE_PARAMS, seed=seed)
op[config], tp[config] = custom_cv(all_features, model, folds=folds, seed=seed)

create_submission_files(tp[config], config)

Fold # 0: 0.75718 (  27 rounds)
Fold # 1: 0.76571 (  41 rounds)
Fold # 2: 0.76385 ( 105 rounds)
Fold # 3: 0.77059 ( 107 rounds)
Fold # 4: 0.76133 ( 144 rounds)
Fold # 5: 0.79882 ( 108 rounds)
Fold # 6: 0.80251 ( 221 rounds)
Fold # 7: 0.79213 ( 103 rounds)
Fold # 8: 0.78857 (  72 rounds)
Fold # 9: 0.75287 (  82 rounds)

Avg: 0.77536 +/- 0.01739
OOF: 0.77501

CPU times: user 51.5 s, sys: 17.7 s, total: 1min 9s
Wall time: 44.2 s


In [28]:
%%time

# All features, 10 folds, with a third seed (666) for both model and splits.
feature_set = 'ALL'
folds = 10
seed = 666
config = f'feat{feature_set}_fold{folds}_seed{seed}'

model = lgb.LGBMClassifier(**BASE_PARAMS, seed=seed)
op[config], tp[config] = custom_cv(all_features, model, folds=folds, seed=seed)

create_submission_files(tp[config], config)

Fold # 0: 0.78846 ( 428 rounds)
Fold # 1: 0.80912 (  76 rounds)
Fold # 2: 0.78363 ( 123 rounds)
Fold # 3: 0.77128 (  38 rounds)
Fold # 4: 0.75145 ( 109 rounds)
Fold # 5: 0.75862 (  99 rounds)
Fold # 6: 0.76923 (  91 rounds)
Fold # 7: 0.77419 ( 105 rounds)
Fold # 8: 0.77937 (  73 rounds)
Fold # 9: 0.77101 (  90 rounds)

Avg: 0.77564 +/- 0.01524
OOF: 0.77552

CPU times: user 53.2 s, sys: 19.2 s, total: 1min 12s
Wall time: 45.8 s
