In [1]:
import time
import gc
gc.enable()

import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)

# from IPython.display import clear_output
from itertools import combinations
from sklearn.metrics import r2_score
from sklearn.base import clone
from sklearn.model_selection import StratifiedKFold, KFold
import xgboost as xgb

SEED = 2024

In [2]:
# Check GPU availability
import subprocess

try:
    subprocess.check_output('nvidia-smi')
    DEVICE = 'cuda'
except Exception:
    DEVICE = 'cpu'

print(f'Available device: {DEVICE}')

Available device: cuda


In [3]:
DATA_DIR = '/kaggle/input/playground-series-s4e5'

train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')
# sample_sub = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

original = pd.read_csv('/kaggle/input/flood-prediction-factors/flood.csv')

In [4]:
train = train.drop('id', axis=1)
test = test.drop('id', axis=1)

In [5]:
features = list(test.columns)
TARGET = 'FloodProbability'

In [6]:
train['FloodLabel'] = (train['FloodProbability'] >= 0.5).astype('int')
original['FloodLabel'] = (original['FloodProbability'] >= 0.5).astype('int')

CAT_TARGET = 'FloodLabel'

In [7]:
train['sum_all'] = train[features].sum(axis=1)
test['sum_all'] = test[features].sum(axis=1)
original['sum_all'] = original[features].sum(axis=1)

features_ext1 = list(test.columns)

In [8]:
%%time
pairs = [list(x) for x in combinations(features, 2)]
for pair in pairs:
    train[f'sum_{pair[0]}_{pair[1]}'] = train[pair].sum(axis=1)
    test[f'sum_{pair[0]}_{pair[1]}'] = test[pair].sum(axis=1)
    original[f'sum_{pair[0]}_{pair[1]}'] = original[pair].sum(axis=1)
    
    train[f'diff_{pair[0]}_{pair[1]}'] = train[pair[0]] - train[pair[1]]
    test[f'diff_{pair[0]}_{pair[1]}'] = test[pair[0]] - test[pair[1]]
    original[f'diff_{pair[0]}_{pair[1]}'] = original[pair[0]] - original[pair[1]]
    
    train[f'prod_{pair[0]}_{pair[1]}'] = train[pair[0]] * train[pair[1]]
    test[f'prod_{pair[0]}_{pair[1]}'] = test[pair[0]] * test[pair[1]]
    original[f'prod_{pair[0]}_{pair[1]}'] = original[pair[0]] * original[pair[1]]
    
features_ext2 = list(test.columns)

del(pairs)
_ = gc.collect()

len(features_ext2)

CPU times: user 1min 13s, sys: 6.65 s, total: 1min 20s
Wall time: 1min 18s


591

In [9]:
# %%time
# triplets = [list(x) for x in combinations(features, 3)]
# for triple in triplets:
#     train[f'sum_{triple[0]}_{triple[1]}_{triple[2]}'] = train[triple].sum(axis=1)
#     test[f'sum_{triple[0]}_{triple[1]}_{triple[2]}'] = test[triple].sum(axis=1)
#     original[f'sum_{triple[0]}_{triple[1]}_{triple[2]}'] = original[triple].sum(axis=1)
    
# features_ext3 = list(test.columns)

# del(triplets)
# _ = gc.collect()

# len(features_ext3)

In [10]:
def comp_metric(y_true, y_pred):
    return r2_score(y_true, y_pred)

In [11]:
def custom_cv(task, feature_set, estimator, extend, folds=10, seed=SEED, verbose=True):
    oof_preds, test_preds = {}, {}
    scores = []
    
    if task == 'clf':
        task_target = CAT_TARGET
    elif task == 'reg':
        task_target = TARGET
    else:
        print('Invalid task.')
        return

    cv = StratifiedKFold(n_splits=folds, shuffle=True, random_state=seed)
    for fold, (train_ids, val_ids) in enumerate(cv.split(train, train[CAT_TARGET])):
        X_train, X_val = train.iloc[train_ids], train.iloc[val_ids]
        if extend: # concat original dataset with training folds and remove duplicates
            X_train = pd.concat([X_train, original], axis=0, ignore_index=True)
        
        y_val = X_val[TARGET]
        y_train = X_train[task_target]
        X_train, X_val = X_train[feature_set], X_val[feature_set]
        _ = gc.collect()
        
        model = clone(estimator)
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            verbose=0)

        if task == 'clf':
            val_preds = model.predict_proba(X_val)[:, 1]
            oof_preds.update(dict(zip(val_ids, val_preds)))
            test_preds[f'fold{fold}'] = model.predict_proba(test[feature_set])[:, 1]
        else:  # 'reg'
            val_preds = model.predict(X_val)
            oof_preds.update(dict(zip(val_ids, val_preds)))
            test_preds[f'fold{fold}'] = model.predict(test[feature_set])

        score = comp_metric(y_val, val_preds)
        scores.append(score)
        if verbose:
            print(f'Fold #{fold:>2}: {score:.5f} ({model.best_iteration:>5} rounds)')
        _ = gc.collect()

    test_preds = pd.DataFrame.from_dict(test_preds)
    test_preds['mean'] = test_preds.mean(axis=1)  # mean of fold-wise predictions
    
    oof_preds = pd.Series(oof_preds).sort_index()
    print(f'\nAvg score: {np.mean(scores):.5f} +/- {np.std(scores):.5f}')
    print(f'OOF score: {comp_metric(train[TARGET], oof_preds):.5f}\n')
    
    return oof_preds, test_preds

In [13]:
op, tp = {}, {}

feature_sets = {
    'f0': features,
    'f1': features_ext1,
    'f2': features_ext2,
#     'f3': features_ext3
}

In [14]:
BASE_PARAMS = {
    'base_score': 0.5,
    'booster': 'gbtree',
    'tree_method': 'hist',
    'n_estimators': 25000,
    'early_stopping_rounds': 100,
    'device': DEVICE,
    'enable_categorical': True,
    'verbosity': 0,
    'n_jobs': -1,
    'seed': SEED
}

In [16]:
reg_model1 = xgb.XGBRegressor(
    **BASE_PARAMS, 
    learning_rate=0.1,
    objective='reg:logistic', 
    eval_metric='mae')

for extend in [False, True]:
    print(f'===== Dataset extended: {extend} =====')
    for fname, fset in feature_sets.items():
        print('=' * 30)
        print(f'Feature set: {fname}')
        _, _ = custom_cv(task='reg', feature_set=fset, estimator=reg_model1, extend=extend)

===== Dataset extended: False =====
Feature set: f0
Fold # 0: 0.83822 ( 2635 rounds)
Fold # 1: 0.84053 ( 2840 rounds)
Fold # 2: 0.84020 ( 3555 rounds)
Fold # 3: 0.84035 ( 3164 rounds)
Fold # 4: 0.84164 ( 2506 rounds)
Fold # 5: 0.84069 ( 3058 rounds)
Fold # 6: 0.84006 ( 2767 rounds)
Fold # 7: 0.84059 ( 2654 rounds)
Fold # 8: 0.83922 ( 3247 rounds)
Fold # 9: 0.83921 ( 2913 rounds)

Avg score: 0.84007 +/- 0.00091
OOF score: 0.84007

Feature set: f1
Fold # 0: 0.86621 ( 1437 rounds)
Fold # 1: 0.86781 ( 1195 rounds)
Fold # 2: 0.86737 ( 1185 rounds)
Fold # 3: 0.86743 ( 1396 rounds)
Fold # 4: 0.86860 ( 1433 rounds)
Fold # 5: 0.86796 ( 1045 rounds)
Fold # 6: 0.86723 ( 1160 rounds)
Fold # 7: 0.86799 ( 1369 rounds)
Fold # 8: 0.86702 ( 1064 rounds)
Fold # 9: 0.86647 ( 1410 rounds)

Avg score: 0.86741 +/- 0.00069
OOF score: 0.86741

Feature set: f2
Fold # 0: 0.86569 (   62 rounds)
Fold # 1: 0.86757 (   75 rounds)
Fold # 2: 0.86682 (   66 rounds)
Fold # 3: 0.86718 (   97 rounds)
Fold # 4: 0.86803 ( 

In [20]:
reg_model2 = xgb.XGBRegressor(
    **BASE_PARAMS, 
    learning_rate=0.1,
    objective='reg:logistic', 
    eval_metric='rmse')

op['reg2'], tp['reg2'] = custom_cv(
    task='reg',
    feature_set=features_ext1,
    estimator=reg_model2,
    extend=False)

Fold # 0: 0.86636 (  702 rounds)
Fold # 1: 0.86794 (  704 rounds)
Fold # 2: 0.86744 (  730 rounds)
Fold # 3: 0.86759 (  729 rounds)
Fold # 4: 0.86875 (  783 rounds)
Fold # 5: 0.86804 (  484 rounds)
Fold # 6: 0.86734 (  544 rounds)
Fold # 7: 0.86821 (  509 rounds)
Fold # 8: 0.86704 (  766 rounds)
Fold # 9: 0.86669 (  660 rounds)

Avg score: 0.86754 +/- 0.00069
OOF score: 0.86754



In [19]:
reg_model3 = xgb.XGBRegressor(
    **BASE_PARAMS, 
    learning_rate=0.1,
    objective='reg:logistic', 
    eval_metric='mape')

op['reg3'], tp['reg3'] = custom_cv(
    task='reg',
    feature_set=features_ext1,
    estimator=reg_model3,
    extend=False)

Fold # 0: 0.86628 ( 1226 rounds)
Fold # 1: 0.86781 ( 1211 rounds)
Fold # 2: 0.86737 ( 1185 rounds)
Fold # 3: 0.86743 ( 1396 rounds)
Fold # 4: 0.86860 ( 1433 rounds)
Fold # 5: 0.86784 ( 1480 rounds)
Fold # 6: 0.86723 ( 1160 rounds)
Fold # 7: 0.86799 ( 1369 rounds)
Fold # 8: 0.86702 ( 1064 rounds)
Fold # 9: 0.86637 ( 1725 rounds)

Avg score: 0.86739 +/- 0.00068
OOF score: 0.86740



In [21]:
reg_model4 = xgb.XGBRegressor(
    **BASE_PARAMS, 
    objective='reg:tweedie',
    learning_rate=0.01,
    tweedie_variance_power=1.05,
    eval_metric='rmse')

op['reg4'], tp['reg4'] = custom_cv(
    task='reg',
    feature_set=features_ext1,
    estimator=reg_model4,
    extend=False)

Fold # 0: 0.86641 ( 5592 rounds)
Fold # 1: 0.86785 ( 4097 rounds)
Fold # 2: 0.86740 ( 4945 rounds)
Fold # 3: 0.86759 ( 4711 rounds)
Fold # 4: 0.86872 ( 5559 rounds)
Fold # 5: 0.86799 ( 5442 rounds)
Fold # 6: 0.86739 ( 4853 rounds)
Fold # 7: 0.86818 ( 3896 rounds)
Fold # 8: 0.86699 ( 4787 rounds)
Fold # 9: 0.86661 ( 3774 rounds)

Avg score: 0.86751 +/- 0.00068
OOF score: 0.86752



In [22]:
reg_model5 = xgb.XGBRegressor(
    **BASE_PARAMS, 
    objective='reg:tweedie',
    learning_rate=0.002,
    tweedie_variance_power=1.1,
    eval_metric='rmse')

op['reg5'], tp['reg5'] = custom_cv(
    task='reg',
    feature_set=features_ext1,
    estimator=reg_model5,
    extend=False)

Fold # 0: 0.86636 (21921 rounds)
Fold # 1: 0.86786 (19685 rounds)
Fold # 2: 0.86735 (19678 rounds)
Fold # 3: 0.86763 (24999 rounds)
Fold # 4: 0.86869 (24349 rounds)
Fold # 5: 0.86797 (23033 rounds)
Fold # 6: 0.86736 (21211 rounds)
Fold # 7: 0.86818 (18792 rounds)
Fold # 8: 0.86695 (18871 rounds)
Fold # 9: 0.86654 (13038 rounds)

Avg score: 0.86749 +/- 0.00069
OOF score: 0.86749



In [23]:
reg_model6 = xgb.XGBRegressor(
    **BASE_PARAMS, 
    objective='reg:tweedie',
    learning_rate=0.01,
    tweedie_variance_power=1.5,
    eval_metric='rmse')

op['reg6'], tp['reg6'] = custom_cv(
    task='reg',
    feature_set=features_ext1,
    estimator=reg_model6,
    extend=False)

Fold # 0: 0.86638 ( 4877 rounds)
Fold # 1: 0.86786 ( 4113 rounds)
Fold # 2: 0.86738 ( 4742 rounds)
Fold # 3: 0.86763 ( 5009 rounds)
Fold # 4: 0.86871 ( 5145 rounds)
Fold # 5: 0.86795 ( 4630 rounds)
Fold # 6: 0.86734 ( 4472 rounds)
Fold # 7: 0.86820 ( 4129 rounds)
Fold # 8: 0.86699 ( 4674 rounds)
Fold # 9: 0.86661 ( 4125 rounds)

Avg score: 0.86750 +/- 0.00068
OOF score: 0.86751



In [26]:
sample_sub = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')
def create_submission_files(preds, config, notebook='06'):
    for col in preds.columns:
        sub = sample_sub.copy()
        sub[TARGET] = preds[col].clip(0, 1)
        sub.to_csv(f'nb{notebook}_{config}_{col}.csv', index=False)

In [27]:
for config in ['reg2', 'reg3', 'reg4', 'reg5', 'reg6']:
    create_submission_files(tp[config], config)

In [28]:
!head nb06_reg4_fold8.csv

id,FloodProbability
1117957,0.5771406
1117958,0.45237368
1117959,0.4490325
1117960,0.47142982
1117961,0.4712417
1117962,0.50763166
1117963,0.53735936
1117964,0.52754396
1117965,0.47246042
