# Setup

In [1]:
%%capture
!pip install --upgrade optuna_integration

In [26]:
import gc
gc.enable()

import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)

import optuna.integration.lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.base import clone
from lightgbm import LGBMClassifier

SEED = 2024

In [27]:
DATA_DIR = '/kaggle/input/bitgrit-crypto-price-prediction'

train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')
sample_sub = pd.read_csv(f'{DATA_DIR}/solution_format.csv')

# Data preparation

In [28]:
TARGET = 'Target'

In [29]:
feats_A = [f'feature_{i}_A' for i in range(1, 10)]
feats_F = [f'feature_{i}_F' for i in range(1, 10)]

CAT_FEATURES = [
    'TR_1_EventInd', 'TR_2_EventInd', 'TR_3_EventInd', 'feature_X_A',
    'feature_10_A', 'feature_10_F_missing', 'feature_10_G'
] + feats_A + feats_F

In [30]:
def preprocess_data(df):
    fill_zero_cols = ['TR_1_EventInd', 'TR_2_EventInd', 'TR_3_EventInd']
    df[fill_zero_cols] = df[fill_zero_cols].fillna(0).astype('int')
    
#     fill_neg1_cols = ['index_1', 'index_2', 'index_3']
#     df[fill_neg1_cols] = df[fill_neg1_cols].fillna(-1).astype('int')
    
    df['feature_10_F_missing'] = df.feature_10_F.isna().astype('int')
    df['feats_A_sum'] = df[feats_A].sum(axis=1)
    df['feats_F_sum'] = df[feats_F].sum(axis=1)
    
    return df


train = preprocess_data(train.copy())
test = preprocess_data(test.copy())

In [31]:
FEATURES = [f for f in test.columns if f not in ('ID', 'feature_10_F')]

In [32]:
# LightGBM dataset
dtrain = lgb.Dataset(
    data=train[FEATURES],
    label=train[TARGET],
    feature_name=FEATURES,
    categorical_feature=CAT_FEATURES)

# LightGBMTuner - tuning

In [33]:
base_params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'learning_rate': 0.01,
    'boosting_type': 'gbdt',
    'force_row_wise': True,
    'verbosity': -1,
    'n_jobs': -1,
    'deterministic': True,
    'random_state': SEED
}

early_stopping = lgb.early_stopping(
    stopping_rounds=100,
    first_metric_only=True,
    verbose=False,
    min_delta=1e-4)

In [48]:
BUDGET = 60 * 60 * 3
NUM_FOLDS = 10

tuner = lgb.LightGBMTunerCV(
    time_budget=BUDGET,
    optuna_seed=SEED,
    params=base_params,
    train_set=dtrain,
    num_boost_round=10000,
    folds=StratifiedKFold(n_splits=NUM_FOLDS, shuffle=True, random_state=SEED),
    seed=SEED,
    feature_name=FEATURES,
    categorical_feature=CAT_FEATURES,
    callbacks=[early_stopping])

[I 2024-05-29 12:58:48,265] A new study created in memory with name: no-name-dc4e5aab-7c04-415e-8f0f-1d1a6db0188a


In [49]:
%%time
tuner.run()

feature_fraction, val_score: 0.502641:  14%|#4        | 1/7 [02:17<13:43, 137.20s/it][I 2024-05-29 13:01:05,490] Trial 0 finished with value: 0.5026412211052049 and parameters: {'feature_fraction': 0.5}. Best is trial 0 with value: 0.5026412211052049.
feature_fraction, val_score: 0.502641:  29%|##8       | 2/7 [04:32<11:20, 136.13s/it][I 2024-05-29 13:03:20,876] Trial 1 finished with value: 0.5041602447098251 and parameters: {'feature_fraction': 0.8999999999999999}. Best is trial 0 with value: 0.5026412211052049.
feature_fraction, val_score: 0.501298:  43%|####2     | 3/7 [06:38<08:45, 131.48s/it][I 2024-05-29 13:05:26,818] Trial 2 finished with value: 0.5012981715913597 and parameters: {'feature_fraction': 0.7}. Best is trial 2 with value: 0.5012981715913597.
feature_fraction, val_score: 0.501298:  57%|#####7    | 4/7 [08:50<06:34, 131.63s/it][I 2024-05-29 13:07:38,665] Trial 3 finished with value: 0.5039838147326969 and parameters: {'feature_fraction': 0.8}. Best is trial 2 with valu

CPU times: user 2h 57min 21s, sys: 1h 1min 59s, total: 3h 59min 21s
Wall time: 2h 7min 44s





# Cross validation + Thresholding

In [50]:
def probs_to_labels(probs, threshold=0.5):
    return (probs >= threshold).astype('int')

In [51]:
def get_best_threshold(y_true, y_probs):
    candidates = np.arange(0.001, 0.999, 0.001)
    scores = [
        f1_score(y_true, probs_to_labels(y_probs, t)) 
        for t in candidates
    ]
    
    return candidates[np.argmax(scores)]

In [68]:
def custom_cv(estimator, seed=SEED, verbose=True):
    oof_preds = {}
    test_preds = {}
    scores = []
    
    X, y = train[FEATURES], train[TARGET]
    X_test = test[FEATURES]
    
    cv = StratifiedKFold(n_splits=NUM_FOLDS, shuffle=True, random_state=seed)
    for fold, (train_ids, val_ids) in enumerate(cv.split(X, y)):
        X_train, y_train = X.iloc[train_ids], y.iloc[train_ids]
        X_val, y_val = X.iloc[val_ids], y.iloc[val_ids]
        
        model = clone(estimator)
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            callbacks=[early_stopping])
        
        val_probs = model.predict_proba(X_val)[:, 1]
        best_threshold = get_best_threshold(y_val, val_probs)
        val_preds = probs_to_labels(val_probs, best_threshold)
        oof_preds.update(dict(zip(val_ids, val_preds)))
        
        test_probs = model.predict_proba(X_test)[:, 1]
        test_preds[f'fold{fold}'] = probs_to_labels(test_probs, best_threshold)
        
        
        f1_best = f1_score(y_val, val_preds)
        scores.append(f1_best)
        if verbose:
            auc = roc_auc_score(y_val, val_probs)
            f1 = f1_score(y_val, probs_to_labels(val_probs, 0.5))
            print(f'Fold #{fold}: AUC = {auc:.5f}, F1 @0.5 = {f1: .5f}, F1 @best = {f1_best:.5f}' \
                  f' ({model.best_iteration_:>4} rounds, best threshold: {best_threshold:.3f})')
            
        _ = gc.collect()
    
    test_preds = pd.DataFrame.from_dict(test_preds)
    test_preds['mode'] = test_preds.mode(axis=1)[0].astype('int')  # mode of fold-wise predictions
    oof_preds = pd.Series(oof_preds).sort_index()
    
    print(f'\nOOF F1 @best: {f1_score(y, oof_preds):.5f}, ' \
          f'Avg F1 @best: {np.mean(scores):.5f} +/- {np.std(scores):.5f}')
    
    return oof_preds, test_preds

In [69]:
%%time
model = LGBMClassifier(**tuner.best_params, n_estimators=10000)
op, tp = custom_cv(model)

Fold #0: AUC = 0.87515, F1 @0.5 =  0.81410, F1 @best = 0.84241 (1065 rounds, best threshold: 0.272)
Fold #1: AUC = 0.83772, F1 @0.5 =  0.78864, F1 @best = 0.81609 ( 603 rounds, best threshold: 0.347)
Fold #2: AUC = 0.83922, F1 @0.5 =  0.78395, F1 @best = 0.80220 ( 657 rounds, best threshold: 0.307)
Fold #3: AUC = 0.85062, F1 @0.5 =  0.77170, F1 @best = 0.82386 ( 662 rounds, best threshold: 0.360)
Fold #4: AUC = 0.85016, F1 @0.5 =  0.79344, F1 @best = 0.80226 ( 593 rounds, best threshold: 0.298)
Fold #5: AUC = 0.83075, F1 @0.5 =  0.78864, F1 @best = 0.80466 ( 513 rounds, best threshold: 0.412)
Fold #6: AUC = 0.81843, F1 @0.5 =  0.77341, F1 @best = 0.79545 ( 535 rounds, best threshold: 0.407)
Fold #7: AUC = 0.79752, F1 @0.5 =  0.74214, F1 @best = 0.79365 ( 539 rounds, best threshold: 0.250)
Fold #8: AUC = 0.78491, F1 @0.5 =  0.71565, F1 @best = 0.79144 ( 436 rounds, best threshold: 0.332)
Fold #9: AUC = 0.85338, F1 @0.5 =  0.78769, F1 @best = 0.80656 ( 856 rounds, best threshold: 0.555)


In [70]:
def create_submission_files(preds, notebook='05'):
    for col in preds.columns:
        sub = sample_sub.copy()
        sub[TARGET] = preds[col]
        sub.to_csv(f'nb{notebook}_{col}.csv', index=False)

In [71]:
create_submission_files(tp)

In [72]:
!head nb05_mode.csv

ID,Target
2661,1
2662,1
2663,1
2664,1
2665,1
2666,1
2667,1
2668,1
2669,1
