# Setup

In [1]:
import gc
gc.enable()  # explicit opt-in to automatic garbage collection

import warnings
warnings.filterwarnings('ignore')  # silences every warning for the rest of the session

import numpy as np
import pandas as pd
# Lift pandas' display truncation so frames and series are printed in full.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
import catboost as cb

# Single seed shared by the CV splitter and the CatBoost models.
SEED = 2024

In [2]:
# Probe for an NVIDIA GPU by trying to run `nvidia-smi`.
# Any failure (binary missing, non-zero exit, ...) means we fall back to CPU.
import subprocess

try:
    subprocess.check_output('nvidia-smi')
except Exception:
    DEVICE = 'CPU'
else:
    DEVICE = 'GPU'

print(f'Available device: {DEVICE}')

Available device: CPU


In [3]:
# Directory holding the competition CSVs.
DATA_DIR = '/kaggle/input/ml-olympiad-smoking'

# Read the three CSVs in a fixed order: train, test, sample submission.
train, test, sample_sub = map(
    pd.read_csv,
    (
        f'{DATA_DIR}/train.csv',
        f'{DATA_DIR}/test.csv',
        f'{DATA_DIR}/sample_submission.csv',
    ),
)

# Data overview

In [4]:
# Column names, dtypes and non-null counts of the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159256 entries, 0 to 159255
Data columns (total 24 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   id                   159256 non-null  int64  
 1   age                  159256 non-null  int64  
 2   height(cm)           159256 non-null  int64  
 3   weight(kg)           159256 non-null  int64  
 4   waist(cm)            159256 non-null  float64
 5   eyesight(left)       159256 non-null  float64
 6   eyesight(right)      159256 non-null  float64
 7   hearing(left)        159256 non-null  int64  
 8   hearing(right)       159256 non-null  int64  
 9   systolic             159256 non-null  int64  
 10  relaxation           159256 non-null  int64  
 11  fasting blood sugar  159256 non-null  int64  
 12  Cholesterol          159256 non-null  int64  
 13  triglyceride         159256 non-null  int64  
 14  HDL                  159256 non-null  int64  
 15  LDL              

In [5]:
# Name of the label column in `train`.
TARGET = 'smoking'
# Relative class frequencies of the target (normalize=True -> proportions).
train[TARGET].value_counts(normalize=True)

smoking
0    0.562635
1    0.437365
Name: proportion, dtype: float64

In [6]:
# Model inputs: every column of the test frame except the row identifier.
FEATURES = [col for col in test.columns if col != 'id']

In [7]:
# Number of distinct values per feature column in the training data.
train[FEATURES].nunique()

age                     18
height(cm)              14
weight(kg)              28
waist(cm)              531
eyesight(left)          20
eyesight(right)         17
hearing(left)            2
hearing(right)           2
systolic               112
relaxation              75
fasting blood sugar    229
Cholesterol            227
triglyceride           392
HDL                    108
LDL                    222
hemoglobin             134
Urine protein            6
serum creatinine        28
AST                    140
ALT                    188
Gtp                    362
dental caries            2
dtype: int64

In [8]:
# Columns handed to the model as categorical features (see `custom_cv`).
CAT_FEATURES = ['hearing(left)', 'hearing(right)', 'dental caries']

# Modeling

In [9]:
def comp_metric(y_true, y_pred):
    """Score predictions with ROC AUC.

    Thin wrapper around ``roc_auc_score`` so the metric is defined in a
    single place.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted scores, forwarded unchanged to ``roc_auc_score``.

    Returns
    -------
    The value returned by ``roc_auc_score(y_true, y_pred)``.
    """
    auc = roc_auc_score(y_true, y_pred)
    return auc

**Cross-validation setup**

In [10]:
def custom_cv(features, model, folds=7, seed=SEED):
    """Run stratified K-fold cross-validation of ``model`` on ``train``.

    For each fold the model is fitted on the training split with the
    validation split passed as ``eval_set``. It then predicts the
    validation rows (out-of-fold) and the full ``test`` frame.
    Per-fold scores, their mean/std and the overall OOF score are printed.

    Parameters
    ----------
    features : list of str
        Columns of ``train`` / ``test`` used as model inputs. Those also
        listed in ``CAT_FEATURES`` are passed to ``fit`` as categorical.
    model : estimator
        Object exposing ``fit(X, y, eval_set=..., cat_features=...,
        verbose=...)``, ``predict_proba`` and ``best_iteration_``
        (e.g. a ``CatBoostClassifier``). The same instance is refitted on
        every fold, so after the call it holds the last fold's fit.
    folds : int, default 7
        Number of stratified splits.
    seed : int, default SEED
        ``random_state`` of the shuffled ``StratifiedKFold``.

    Returns
    -------
    oof_probs : pd.Series
        Positive-class probability for every row of ``train``, indexed by
        row position (0 .. len(train) - 1).
    test_probs : pd.DataFrame
        One ``fold{k}`` column of test probabilities per fold plus a
        ``mean`` column holding their row-wise average.
    """
    X, y = train[features], train[TARGET]
    X_test = test[features]
    cat_features = [f for f in features if f in CAT_FEATURES]

    # Preallocated positional buffer: each row is written exactly once (by the
    # fold in which it is a validation row). This avoids building a Python dict
    # with one entry per training row and sorting it afterwards.
    oof_probs = np.full(len(X), np.nan)
    test_probs = {}
    scores = []

    cv = StratifiedKFold(n_splits=folds, shuffle=True, random_state=seed)
    for fold, (train_ids, val_ids) in enumerate(cv.split(X, y)):
        X_train, y_train = X.iloc[train_ids], y.iloc[train_ids]
        X_val, y_val = X.iloc[val_ids], y.iloc[val_ids]

        # NOTE: the validation split doubles as eval_set. If the model uses it
        # for early stopping / best-iteration selection, the fold scores below
        # are slightly optimistic.
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            cat_features=cat_features,
            verbose=False)

        val_probs = model.predict_proba(X_val)[:, 1]
        oof_probs[val_ids] = val_probs
        test_probs[f'fold{fold}'] = model.predict_proba(X_test)[:, 1]

        score = comp_metric(y_val, val_probs)
        scores.append(score)
        print(f'Fold #{fold:>2}: {score:.5f} ({model.best_iteration_:>4} rounds)')
        gc.collect()

    test_probs = pd.DataFrame.from_dict(test_probs)
    test_probs['mean'] = test_probs.mean(axis=1) # mean of fold-wise probabilities

    oof_probs = pd.Series(oof_probs)
    print(f'\nAvg. score: {np.mean(scores):.5f} +/- {np.std(scores):.5f}')
    print(f'OOF score: {comp_metric(y, oof_probs):.5f}\n')

    return oof_probs, test_probs

**Submission files**

In [11]:
def create_submission_files(preds, config, notebook='00'):
    """Write a submission CSV named ``{notebook}_{config}.csv``.

    The file is a copy of ``sample_sub`` whose ``TARGET`` column is
    replaced by the fold-averaged test probabilities.

    Parameters
    ----------
    preds : mapping / pd.DataFrame
        Must provide a ``'mean'`` entry with one prediction per row of
        ``sample_sub``, in the same row order.
    config : str
        Run identifier used in the output file name.
    notebook : str, default '00'
        Prefix used in the output file name.

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of rows in
        ``sample_sub``.
    """
    mean_probs = np.asarray(preds['mean'])
    if len(mean_probs) != len(sample_sub):
        raise ValueError(
            f'Expected {len(sample_sub)} predictions, got {len(mean_probs)}')

    sub = sample_sub.copy()
    # Assign positionally. Assigning a Series would align on index labels and
    # silently insert NaN wherever the two indexes disagree.
    sub[TARGET] = mean_probs
    sub.to_csv(f'{notebook}_{config}.csv', index=False)

In [12]:
# Result registries keyed by run config:
#   op -> out-of-fold predictions, tp -> test predictions.
op, tp = {}, {}

In [13]:
# Shared CatBoost settings for every CV run below.
BASE_PARAMS = dict(
    loss_function='Logloss',
    eval_metric='AUC',
    metric_period=1,
    # Large iteration budget; early stopping decides the actual tree count.
    iterations=10000,
    learning_rate=0.1,
    early_stopping_rounds=100,
    use_best_model=True,
    task_type=DEVICE,  # 'GPU' or 'CPU', as detected earlier
    thread_count=-1,
)

In [14]:
%%time
# 5-fold run: store OOF/test predictions under `config` and write the CSV.
config = 'f05_s2024'
model = cb.CatBoostClassifier(random_state=SEED, **BASE_PARAMS)
op[config], tp[config] = custom_cv(
    features=FEATURES, model=model, folds=5, seed=SEED)

create_submission_files(preds=tp[config], config=config)

Fold # 0: 0.86977 ( 645 rounds)
Fold # 1: 0.86888 (1550 rounds)
Fold # 2: 0.86968 (1559 rounds)
Fold # 3: 0.86580 ( 907 rounds)
Fold # 4: 0.86948 ( 829 rounds)

Avg. score: 0.86872 +/- 0.00149
OOF score: 0.86868

CPU times: user 8min 29s, sys: 21.1 s, total: 8min 50s
Wall time: 2min 23s


In [15]:
%%time
# 7-fold run: store OOF/test predictions under `config` and write the CSV.
config = 'f07_s2024'
model = cb.CatBoostClassifier(random_state=SEED, **BASE_PARAMS)
op[config], tp[config] = custom_cv(
    features=FEATURES, model=model, folds=7, seed=SEED)

create_submission_files(preds=tp[config], config=config)

Fold # 0: 0.86859 (1134 rounds)
Fold # 1: 0.87155 (1077 rounds)
Fold # 2: 0.86967 (1316 rounds)
Fold # 3: 0.87052 ( 962 rounds)
Fold # 4: 0.86804 (1299 rounds)
Fold # 5: 0.86985 ( 761 rounds)
Fold # 6: 0.86908 (1288 rounds)

Avg. score: 0.86962 +/- 0.00110
OOF score: 0.86960

CPU times: user 12min 27s, sys: 30.5 s, total: 12min 57s
Wall time: 3min 28s


In [16]:
%%time
# 10-fold run: store OOF/test predictions under `config` and write the CSV.
config = 'f10_s2024'
model = cb.CatBoostClassifier(random_state=SEED, **BASE_PARAMS)
op[config], tp[config] = custom_cv(
    features=FEATURES, model=model, folds=10, seed=SEED)

create_submission_files(preds=tp[config], config=config)

Fold # 0: 0.86793 ( 963 rounds)
Fold # 1: 0.87349 ( 712 rounds)
Fold # 2: 0.86744 ( 859 rounds)
Fold # 3: 0.87065 ( 816 rounds)
Fold # 4: 0.86981 (1609 rounds)
Fold # 5: 0.87294 (1462 rounds)
Fold # 6: 0.86493 (1021 rounds)
Fold # 7: 0.86797 ( 764 rounds)
Fold # 8: 0.87242 ( 703 rounds)
Fold # 9: 0.86771 (1376 rounds)

Avg. score: 0.86953 +/- 0.00266
OOF score: 0.86950

CPU times: user 16min 45s, sys: 40.7 s, total: 17min 25s
Wall time: 4min 39s


In [17]:
%%time
# 15-fold run: store OOF/test predictions under `config` and write the CSV.
config = 'f15_s2024'
model = cb.CatBoostClassifier(random_state=SEED, **BASE_PARAMS)
op[config], tp[config] = custom_cv(
    features=FEATURES, model=model, folds=15, seed=SEED)

create_submission_files(preds=tp[config], config=config)

Fold # 0: 0.86800 (1229 rounds)
Fold # 1: 0.87047 ( 716 rounds)
Fold # 2: 0.87565 (1216 rounds)
Fold # 3: 0.86790 ( 974 rounds)
Fold # 4: 0.86774 (1000 rounds)
Fold # 5: 0.87260 (1210 rounds)
Fold # 6: 0.86603 ( 795 rounds)
Fold # 7: 0.87428 (1006 rounds)
Fold # 8: 0.87001 ( 932 rounds)
Fold # 9: 0.86402 ( 751 rounds)
Fold #10: 0.86551 ( 956 rounds)
Fold #11: 0.87023 (1230 rounds)
Fold #12: 0.87461 ( 965 rounds)
Fold #13: 0.86931 (1041 rounds)
Fold #14: 0.86856 (1156 rounds)

Avg. score: 0.86966 +/- 0.00331
OOF score: 0.86967

CPU times: user 24min 56s, sys: 58.4 s, total: 25min 54s
Wall time: 6min 54s


In [18]:
# Sanity check: show the first lines of the 10-fold submission file.
!head 00_f10_s2024.csv

id,smoking
159256,0.5857696103986312
159257,0.2969399356423675
159258,0.39862431727908937
159259,0.01761013120731015
159260,0.5594316786546193
159261,0.9256777108144242
159262,0.5094217369105534
159263,0.053544465616838934
159264,0.12749131541443878


**Time to submit!**