In [1]:
import time
import gc
gc.enable()

import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)

# from IPython.display import clear_output
from itertools import combinations
from sklearn.metrics import r2_score
from sklearn.base import clone
from sklearn.model_selection import StratifiedKFold
import catboost as cb

# Global random seed: the default `seed` of custom_cv (fold shuffling) and the
# `random_seed` handed to the CatBoost parameter dictionaries.
SEED = 2024

In [2]:
# Check GPU availability
import subprocess

# A successful `nvidia-smi` call is taken as evidence of a usable GPU; any
# failure (binary missing, non-zero exit, ...) selects the CPU instead.
try:
    subprocess.check_output('nvidia-smi')
except Exception:
    DEVICE = 'CPU'
else:
    DEVICE = 'GPU'

print(f'Available device: {DEVICE}')

Available device: GPU


In [3]:
DATA_DIR = '/kaggle/input/playground-series-s4e5'

# Competition files: training rows, test rows and the submission template.
train, test, sample_sub = (
    pd.read_csv(f'{DATA_DIR}/{stem}.csv')
    for stem in ('train', 'test', 'sample_submission')
)

# Separate dataset that custom_cv can append to the training folds (`extend=True`).
original = pd.read_csv('/kaggle/input/flood-prediction-factors/flood.csv')

In [4]:
# `id` is not a model input; removing it lets `test.columns` serve as the
# feature list further down.
train, test = (frame.drop(columns='id') for frame in (train, test))

_ = gc.collect()

In [5]:
# Name of the continuous regression target.
TARGET = 'FloodProbability'
# Base feature names: every column left in `test` after `id` was dropped.
features = test.columns.tolist()

In [6]:
# Binary companion target: 1 when FloodProbability is at least 0.5, otherwise 0.
CAT_TARGET = 'FloodLabel'

for frame in (train, original):
    frame['FloodLabel'] = (frame['FloodProbability'] >= 0.5).astype('int')

In [7]:
# Row-wise total of the base features, added to all three frames.
for frame in (train, test, original):
    frame['sum_all'] = frame[features].sum(axis=1)

# Base features plus `sum_all`.
features_ext1 = test.columns.tolist()

In [8]:
%%time
# For every unordered pair of base features, add the pair's sum, difference and
# product as new columns to each of the three frames.
for left, right in combinations(features, 2):
    for frame in (train, test, original):
        frame[f'sum_{left}_{right}'] = frame[[left, right]].sum(axis=1)
        frame[f'diff_{left}_{right}'] = frame[left] - frame[right]
        frame[f'prod_{left}_{right}'] = frame[left] * frame[right]

# Everything in `features_ext1` plus the pairwise columns created above.
features_ext2 = test.columns.tolist()

_ = gc.collect()

len(features_ext2)

CPU times: user 1min 18s, sys: 6.95 s, total: 1min 25s
Wall time: 1min 24s


591

In [9]:
def comp_metric(y_true, y_pred):
    """Return the R2 score of `y_pred` against `y_true` (thin wrapper over `r2_score`)."""
    score = r2_score(y_true, y_pred)
    return score

In [10]:
def custom_cv(task, feature_set, estimator, extend, folds=10, seed=SEED, verbose=True):
    """Run stratified K-fold CV on `train` and predict `test` with every fold model.

    Folds are always stratified on CAT_TARGET, and every fold is scored with
    `comp_metric` against the continuous TARGET, whichever task is selected.

    Parameters
    ----------
    task : {'reg', 'clf'}
        'reg' fits on TARGET and predicts with `predict`; 'clf' fits on
        CAT_TARGET and uses the positive-class column of `predict_proba`.
    feature_set : list of str
        Columns passed to the model (read from `train`, `test` and, when
        `extend` is true, `original`).
    estimator : estimator
        Unfitted model; a fresh clone is fitted per fold. Its `fit` must accept
        `eval_set` and `verbose`; `best_iteration_` is read when `verbose`.
    extend : bool
        If true, `original` is appended to every training fold. Validation
        folds always come from `train` only. No de-duplication is performed.
    folds : int
        Number of CV splits.
    seed : int
        Random state used to shuffle the folds.
    verbose : bool
        Print the score and best iteration of each fold.

    Returns
    -------
    oof_preds : pd.Series
        Out-of-fold predictions, indexed by row position in `train`.
    test_preds : pd.DataFrame
        One column per fold ('fold0', 'fold1', ...) plus their row-wise 'mean'.

    Raises
    ------
    ValueError
        If `task` is neither 'clf' nor 'reg'.
    """
    task_targets = {'clf': CAT_TARGET, 'reg': TARGET}
    if task not in task_targets:
        # Fail loudly: every caller unpacks two return values, so returning
        # None would only surface later as an opaque unpacking error.
        raise ValueError(f"Invalid task: {task!r} (expected 'clf' or 'reg').")
    task_target = task_targets[task]

    def _predict(fitted, X):
        # Classifiers contribute the positive-class probability, regressors the raw prediction.
        if task == 'clf':
            return fitted.predict_proba(X)[:, 1]
        return fitted.predict(X)

    # Positional buffer: each row of `train` lands in exactly one validation fold.
    oof_buffer = np.full(len(train), np.nan)
    test_preds = {}
    scores = []

    cv = StratifiedKFold(n_splits=folds, shuffle=True, random_state=seed)
    for fold, (train_ids, val_ids) in enumerate(cv.split(train, train[CAT_TARGET])):
        X_train, X_val = train.iloc[train_ids], train.iloc[val_ids]
        if extend:  # append the original dataset to the training fold only
            X_train = pd.concat([X_train, original], axis=0, ignore_index=True)

        y_train = X_train[task_target]
        # Labels for the model's own evaluation must be of the same kind as the
        # training labels (binary for 'clf'); the fold score below still uses
        # the continuous target.
        y_val_fit = X_val[task_target]
        y_val = X_val[TARGET]
        X_train, X_val = X_train[feature_set], X_val[feature_set]
        _ = gc.collect()

        model = clone(estimator)
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val_fit)],
            verbose=0)

        val_preds = _predict(model, X_val)
        oof_buffer[val_ids] = val_preds
        test_preds[f'fold{fold}'] = _predict(model, test[feature_set])

        score = comp_metric(y_val, val_preds)
        scores.append(score)
        if verbose:
            print(f'Fold #{fold:>2}: {score:.5f} ({model.best_iteration_:>5} rounds)')
        _ = gc.collect()

    test_preds = pd.DataFrame.from_dict(test_preds)
    test_preds['mean'] = test_preds.mean(axis=1)  # mean of fold-wise predictions

    oof_preds = pd.Series(oof_buffer)
    print(f'\nAvg score: {np.mean(scores):.5f} +/- {np.std(scores):.5f}')
    print(f'OOF score: {comp_metric(train[TARGET], oof_preds):.5f}\n')

    return oof_preds, test_preds

In [11]:
# Result stores keyed by configuration name.
op = {}  # out-of-fold predictions
tp = {}  # test-set predictions

In [12]:
# Baseline CatBoost configuration: RMSE objective, R2 as the evaluation metric.
BASE_PARAMS = dict(
    loss_function='RMSE',
    eval_metric='R2',
    metric_period=1,
    iterations=25000,
    learning_rate=0.01,
    early_stopping_rounds=200,
    use_best_model=True,
    task_type=DEVICE,
    thread_count=-1,
    random_seed=SEED,
)

model = cb.CatBoostRegressor(**BASE_PARAMS)

# cfg1: regression on the base features plus `sum_all`, training folds not extended.
op['cfg1'], tp['cfg1'] = custom_cv(
    estimator=model,
    task='reg',
    feature_set=features_ext1,
    extend=False,
)

Metric R2 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


Fold # 0: 0.86629 (19376 rounds)


Metric R2 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


Fold # 1: 0.86801 (21745 rounds)


Metric R2 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


Fold # 2: 0.86744 (18897 rounds)


Metric R2 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


Fold # 3: 0.86770 (20352 rounds)


Metric R2 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


Fold # 4: 0.86876 (19954 rounds)


Metric R2 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


Fold # 5: 0.86796 (16573 rounds)


Metric R2 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


Fold # 6: 0.86742 (20334 rounds)


Metric R2 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


Fold # 7: 0.86829 (20150 rounds)


Metric R2 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


Fold # 8: 0.86705 (18545 rounds)


Metric R2 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


Fold # 9: 0.86682 (21941 rounds)

Avg score: 0.86757 +/- 0.00069
OOF score: 0.86758



In [13]:
# Identical to BASE_PARAMS except that RMSE replaces R2 as the evaluation metric.
ALT_PARAMS = {**BASE_PARAMS, 'eval_metric': 'RMSE'}

model = cb.CatBoostRegressor(**ALT_PARAMS)

# cfg2: same data setup as cfg1, only the evaluation metric differs.
op['cfg2'], tp['cfg2'] = custom_cv(
    estimator=model,
    task='reg',
    feature_set=features_ext1,
    extend=False,
)

Fold # 0: 0.86634 (24386 rounds)
Fold # 1: 0.86801 (21745 rounds)
Fold # 2: 0.86744 (18894 rounds)
Fold # 3: 0.86771 (20353 rounds)
Fold # 4: 0.86876 (19946 rounds)
Fold # 5: 0.86796 (16576 rounds)
Fold # 6: 0.86742 (20351 rounds)
Fold # 7: 0.86825 (16212 rounds)
Fold # 8: 0.86705 (18496 rounds)
Fold # 9: 0.86681 (22006 rounds)

Avg score: 0.86757 +/- 0.00068
OOF score: 0.86758



In [14]:
def create_submission_files(preds, config, notebook='07'):
    """Write one submission CSV per column of `preds`.

    Each file is a copy of `sample_sub` whose TARGET column holds that column's
    predictions clipped to [0, 1], saved as
    ``nb{notebook}_{config}_{column}.csv`` without the index.
    """
    for name, values in preds.items():
        submission = sample_sub.copy()
        submission[TARGET] = values.clip(0, 1)
        submission.to_csv(f'nb{notebook}_{config}_{name}.csv', index=False)

In [15]:
# Export the per-fold and fold-averaged test predictions of both configurations.
for config in ('cfg1', 'cfg2'):
    create_submission_files(tp[config], config)

In [17]:
# Preview the first lines of the cfg1 submission built from the fold-averaged ('mean') predictions.
!head nb07_cfg1_mean.csv

id,FloodProbability
1117957,0.5774950401596665
1117958,0.45160297973061014
1117959,0.4497330737228634
1117960,0.4708236938088298
1117961,0.4713741348351682
1117962,0.5073519066947646
1117963,0.537319807356057
1117964,0.5277014556806436
1117965,0.4723967242570494
