# Setup

In [1]:
%%capture
!pip install --upgrade optuna
!pip install --upgrade catboost

In [2]:
import os
import gc
import time
import warnings

gc.enable()
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
pd.set_option('display.precision', 4)
np.set_printoptions(precision=4)

import optuna
from optuna.samplers import TPESampler
from optuna.pruners import HyperbandPruner

import catboost
from catboost import CatBoostClassifier

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, average_precision_score

In [3]:
optuna.__version__, catboost.__version__

('3.0.3', '1.1.1')

In [4]:
SEED = 2311
os.environ['PYTHONHASHSEED'] = str(SEED)
np.random.seed(SEED)

In [5]:
DATA_URL = 'https://raw.githubusercontent.com/stiwari-ds/data-science-competitions/main/analytics_vidhya/dataverse_hack/data'

train = pd.read_csv(f'{DATA_URL}/processed/train.csv') #processed dataset from notebook 00
test = pd.read_csv(f'{DATA_URL}/processed/test.csv')
sample_sub = pd.read_csv(f'{DATA_URL}/raw/sample_submission.csv')

In [6]:
TARGET = 'is_claim'

In [7]:
features = list(test.columns)

num_features = ['policy_tenure', 'age_of_car', 'age_of_policyholder', 
                'population_density', 'airbags', 'displacement', 'turning_radius',
                'length', 'width','height', 'gross_weight', 'ncap_rating']

cat_features = [f for f in features if f not in num_features]

In [8]:
train[cat_features] = train[cat_features].astype('category')
test[cat_features] = test[cat_features].astype('category')

# Baseline

In [9]:
def get_best_threshold(y_true, pred_probs):
    candidate_thresholds = np.arange(0, 1, 0.01)
    candidate_scores = [f1_score(y_true, (pred_probs >= t).astype('int')) 
                        for t in candidate_thresholds]
    best_threshold = candidate_thresholds[np.argmax(candidate_scores)]
    return best_threshold

In [13]:
%%time
scores_f1 = []
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
X, y = train[features], train[TARGET]
cat_features = list(X.select_dtypes(include='category').columns)
for fold, (train_idx, val_idx) in enumerate(cv.split(X, y)):
    X_train, y_train = X.loc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.loc[val_idx], y.iloc[val_idx]

    model = CatBoostClassifier(
        loss_function='Logloss',
        eval_metric='F1',
        langevin=True,
        use_best_model=True,
        verbose=False,
        random_state=SEED
    ) 
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        cat_features=cat_features,
        early_stopping_rounds=100,
        verbose=False
    )
    val_probs = model.predict_proba(X_val)[:, 1]
    best_threshold = get_best_threshold(y_val, val_probs)
    val_preds = (val_probs >= best_threshold).astype('int')

    score = f1_score(y_val, val_preds)
    scores_f1.append(score)
    print(f'Fold #{fold}: ({model.best_iteration_} rounds) F1-score = {score:.5f}')
    _ = gc.collect()

print(f'\nAvg F1-score = {np.mean(scores_f1):.5f} +/- {np.std(scores_f1):.5f}\n')

Fold #0: (0 rounds) F1-score = 0.14746
Fold #1: (0 rounds) F1-score = 0.14957
Fold #2: (0 rounds) F1-score = 0.12016
Fold #3: (0 rounds) F1-score = 0.12016
Fold #4: (0 rounds) F1-score = 0.12031

Avg F1-score = 0.13153 +/- 0.01388

CPU times: user 1min 5s, sys: 1.38 s, total: 1min 7s
Wall time: 38.2 s


# Hyperparameter tuning

In [22]:
def objective(trial, data, base_params, proba):

    scores = []
    X, y = data
    cat_features = list(X.select_dtypes(include='category').columns)
    
    param_grid = {
        'learning_rate': trial.suggest_float('learning_rate', 0.1, 0.5, step=0.05),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 2, 15, step=0.1),
        'depth': trial.suggest_int('depth', 3, 12),
        'rsm': trial.suggest_float('rsm', 0.5, 1.0, step=0.05), #colsample_bylevel
        'random_strength': trial.suggest_float('random_strength', 0.5, 2, step=0.05),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1, 15, step=0.05),
        'one_hot_max_size': trial.suggest_categorical('one_hot_max_size', [3, 6, 11, 22]),
        'bootstrap_type': trial.suggest_categorical(
            'bootstrap_type', ['Bayesian', 'Bernoulli', 'MVS'])
    }
    #conditional hyperparameters
    if param_grid['bootstrap_type'] == 'Bayesian':
        param_grid['bagging_temperature'] = trial.suggest_int('bagging_temperature', 0, 10)
    else:
        param_grid['subsample'] = trial.suggest_float('subsample', 0.25, 1.0, step=0.05)

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
    for fold, (train_idx, val_idx) in enumerate(cv.split(X, y)):
        X_train, y_train = X.loc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.loc[val_idx], y.iloc[val_idx]
        
        model = CatBoostClassifier(**base_params, **param_grid)
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            cat_features=cat_features,
            early_stopping_rounds=100,
            verbose=False
        )

        if proba:
            val_preds = model.predict_proba(X_val)[:, 1]
            scores.append(average_precision_score(y_val, val_preds))
        else:
            val_preds = model.predict(X_val)
            scores.append(f1_score(y_val, val_preds))

    return np.mean(scores)

In [23]:
def tune_params(data, base_params, proba, n_trials=10, direction='maximize'):
    study = optuna.create_study(
        sampler=TPESampler(seed=SEED),
        pruner=HyperbandPruner(),
        direction=direction
    )
    study.optimize(
        func=lambda trial: objective(trial, data, base_params, proba),
        n_trials=n_trials,
        gc_after_trial=True
    )
    return study

# Cross-validation

In [24]:
def cross_validate_predict(data, model_params, proba, n_splits=5):
    oof_preds = {}  #out-of-fold predictions on train set
    test_preds = {} #predictions on test set for each fold
    scores = [] #scores on validation set

    X, y, X_test = data
    cat_features = list(X.select_dtypes(include='category').columns)
       
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)
    for fold, (train_idx, val_idx) in enumerate(cv.split(X, y)):
        X_train, y_train = X.loc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.loc[val_idx], y.iloc[val_idx]
        
        model = CatBoostClassifier(**model_params)
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            cat_features=cat_features,
            early_stopping_rounds=100,
            verbose=False
        )

        if proba:
            val_probs = model.predict_proba(X_val)[:, 1]
            test_probs = model.predict_proba(X_test)[:, 1]
            best_threshold = get_best_threshold(y_val, val_probs)
            val_preds = (val_probs >= best_threshold).astype('int')
            test_preds_fold = (test_probs >= best_threshold).astype('int')
        else:
            val_preds = model.predict(X_val)
            test_preds_fold = model.predict(X_test)
        
        oof_preds.update(dict(zip(val_idx, val_preds)))
        test_preds[f'fold{fold}'] = test_preds_fold

        score = f1_score(y_val, val_preds)
        scores.append(score)
        print(f'Fold #{fold}: F1 = {score:.5f}')
        _ = gc.collect()
    print(f'Avg. F1 = {np.mean(scores):.5f} +/- {np.std(scores):.5f}')
    
    oof_preds = pd.Series(oof_preds).sort_index()
    test_preds = pd.DataFrame.from_dict(test_preds)
    test_preds['mode'] = test_preds.mode(axis=1)[0].astype('int')

    return oof_preds, test_preds

In [25]:
def run_experiment(data, proba, n_trials=10):
    
    X, y, X_test = data

    base_params = {
        'n_estimators': 10000,
        'loss_function': 'Logloss',
        'eval_metric': 'F1',
        'langevin': True,
        'use_best_model': True,
        'verbose': False,
        'random_seed': SEED
    }
        
    print(f'----------Hyperparameter tuning----------')
    start = time.time()
    study = tune_params(
        data=(X, y),
        base_params=base_params,
        proba=proba,
        n_trials=n_trials, 
        direction='maximize' #metric: f1-score/avg_precision -> higher is better
    )
    end = time.time()
    print(f'Best trial: {study.best_trial.number} -> Best value: {study.best_value:.5f}')
    print(f'Best hyperparameters:')
    for k, v in study.best_params.items():
        print(f'{k:15} - {v}')
    print(f'[Time taken: {end - start:.2f}s]\n')
    
    print(f'-----Cross-validation and prediction-----')
    start = time.time()
    model_params = {**base_params, **study.best_params}
    oof_preds, test_preds = cross_validate_predict(data, model_params, proba)
    end = time.time()
    print(f'[Time taken: {end - start:.2f}s]\n')

    return oof_preds, test_preds

**Trial runs**

In [26]:
optuna.logging.set_verbosity(optuna.logging.INFO)

In [27]:
%%time
op, tp = run_experiment(
    data=(train[features], train[TARGET], test[features]),
    proba=False,
    n_trials=3
)

[32m[I 2022-11-13 14:24:42,519][0m A new study created in memory with name: no-name-380320bd-70f9-4dac-ae97-5886dbb17bb9[0m


----------Hyperparameter tuning----------


[32m[I 2022-11-13 14:25:04,805][0m Trial 0 finished with value: 0.16830967061267366 and parameters: {'learning_rate': 0.2, 'l2_leaf_reg': 13.0, 'depth': 5, 'rsm': 0.55, 'random_strength': 0.55, 'scale_pos_weight': 10.9, 'one_hot_max_size': 11, 'bootstrap_type': 'MVS', 'subsample': 0.9}. Best is trial 0 with value: 0.16830967061267366.[0m
[32m[I 2022-11-13 14:25:29,730][0m Trial 1 finished with value: 0.17538463603935872 and parameters: {'learning_rate': 0.5, 'l2_leaf_reg': 9.5, 'depth': 4, 'rsm': 0.8500000000000001, 'random_strength': 1.05, 'scale_pos_weight': 9.8, 'one_hot_max_size': 6, 'bootstrap_type': 'Bernoulli', 'subsample': 0.55}. Best is trial 1 with value: 0.17538463603935872.[0m
[32m[I 2022-11-13 14:27:22,377][0m Trial 2 finished with value: 0.14057423651445017 and parameters: {'learning_rate': 0.35, 'l2_leaf_reg': 5.9, 'depth': 10, 'rsm': 1.0, 'random_strength': 1.05, 'scale_pos_weight': 6.95, 'one_hot_max_size': 3, 'bootstrap_type': 'MVS', 'subsample': 0.95000000000

Best trial: 1 -> Best value: 0.17538
Best hyperparameters:
learning_rate   - 0.5
l2_leaf_reg     - 9.5
depth           - 4
rsm             - 0.8500000000000001
random_strength - 1.05
scale_pos_weight - 9.8
one_hot_max_size - 6
bootstrap_type  - Bernoulli
subsample       - 0.55
[Time taken: 160.00s]

-----Cross-validation and prediction-----
Fold #0: F1 = 0.17692
Fold #1: F1 = 0.17835
Fold #2: F1 = 0.17354
Fold #3: F1 = 0.17355
Fold #4: F1 = 0.17456
Avg. F1 = 0.17538 +/- 0.00193
[Time taken: 34.32s]

CPU times: user 5min 27s, sys: 8.16 s, total: 5min 35s
Wall time: 3min 14s


In [28]:
%%time
op, tp = run_experiment(
    data=(train[features], train[TARGET], test[features]),
    proba=True,
    n_trials=3
)

[32m[I 2022-11-13 14:27:56,882][0m A new study created in memory with name: no-name-565ef050-dec6-4033-8572-31c00e7c6a4d[0m


----------Hyperparameter tuning----------


[32m[I 2022-11-13 14:28:22,633][0m Trial 0 finished with value: 0.10117079573162806 and parameters: {'learning_rate': 0.2, 'l2_leaf_reg': 13.0, 'depth': 5, 'rsm': 0.55, 'random_strength': 0.55, 'scale_pos_weight': 10.9, 'one_hot_max_size': 11, 'bootstrap_type': 'MVS', 'subsample': 0.9}. Best is trial 0 with value: 0.10117079573162806.[0m
[32m[I 2022-11-13 14:28:46,745][0m Trial 1 finished with value: 0.107003485574326 and parameters: {'learning_rate': 0.5, 'l2_leaf_reg': 9.5, 'depth': 4, 'rsm': 0.8500000000000001, 'random_strength': 1.05, 'scale_pos_weight': 9.8, 'one_hot_max_size': 6, 'bootstrap_type': 'Bernoulli', 'subsample': 0.55}. Best is trial 1 with value: 0.107003485574326.[0m
[32m[I 2022-11-13 14:30:32,819][0m Trial 2 finished with value: 0.09535158352950965 and parameters: {'learning_rate': 0.35, 'l2_leaf_reg': 5.9, 'depth': 10, 'rsm': 1.0, 'random_strength': 1.05, 'scale_pos_weight': 6.95, 'one_hot_max_size': 3, 'bootstrap_type': 'MVS', 'subsample': 0.950000000000000

Best trial: 1 -> Best value: 0.10700
Best hyperparameters:
learning_rate   - 0.5
l2_leaf_reg     - 9.5
depth           - 4
rsm             - 0.8500000000000001
random_strength - 1.05
scale_pos_weight - 9.8
one_hot_max_size - 6
bootstrap_type  - Bernoulli
subsample       - 0.55
[Time taken: 156.05s]

-----Cross-validation and prediction-----
Fold #0: F1 = 0.17960
Fold #1: F1 = 0.18009
Fold #2: F1 = 0.17354
Fold #3: F1 = 0.17355
Fold #4: F1 = 0.17456
Avg. F1 = 0.17627 +/- 0.00295
[Time taken: 35.96s]

CPU times: user 5min 28s, sys: 8.08 s, total: 5min 36s
Wall time: 3min 12s


In [29]:
optuna.logging.set_verbosity(optuna.logging.ERROR)

# Exp 1

In [30]:
%%time
op1, tp1 = run_experiment(
    data=(train[features], train[TARGET], test[features]),
    proba=False,
    n_trials=100
)

----------Hyperparameter tuning----------
Best trial: 52 -> Best value: 0.17748
Best hyperparameters:
learning_rate   - 0.5
l2_leaf_reg     - 13.600000000000001
depth           - 8
rsm             - 1.0
random_strength - 1.9000000000000001
scale_pos_weight - 10.15
one_hot_max_size - 6
bootstrap_type  - MVS
subsample       - 0.8
[Time taken: 4201.04s]

-----Cross-validation and prediction-----
Fold #0: F1 = 0.18511
Fold #1: F1 = 0.17574
Fold #2: F1 = 0.18077
Fold #3: F1 = 0.17831
Fold #4: F1 = 0.16748
Avg. F1 = 0.17748 +/- 0.00588
[Time taken: 48.17s]

CPU times: user 2h 6min 39s, sys: 3min 24s, total: 2h 10min 4s
Wall time: 1h 10min 49s


# Exp 2

In [31]:
%%time
op2, tp2 = run_experiment(
    data=(train[features], train[TARGET], test[features]),
    proba=True,
    n_trials=100
)

----------Hyperparameter tuning----------
Best trial: 41 -> Best value: 0.10814
Best hyperparameters:
learning_rate   - 0.15000000000000002
l2_leaf_reg     - 9.3
depth           - 4
rsm             - 0.95
random_strength - 0.9
scale_pos_weight - 8.25
one_hot_max_size - 11
bootstrap_type  - Bernoulli
subsample       - 0.6000000000000001
[Time taken: 3990.32s]

-----Cross-validation and prediction-----
Fold #0: F1 = 0.17694
Fold #1: F1 = 0.18441
Fold #2: F1 = 0.17417
Fold #3: F1 = 0.17249
Fold #4: F1 = 0.16734
Avg. F1 = 0.17507 +/- 0.00562
[Time taken: 62.28s]

CPU times: user 1h 59min 24s, sys: 3min 27s, total: 2h 2min 52s
Wall time: 1h 7min 32s


# Submission files

In [32]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [33]:
NOTEBOOK = '03'
SUBMISSION_PATH = f'/content/drive/MyDrive/data_science_competitions/analytics_vidhya/dataverse_hack/submissions/nb_{NOTEBOOK}'
if not os.path.isdir(SUBMISSION_PATH):
    os.makedirs(SUBMISSION_PATH)

In [34]:
def create_submission_files(test_preds: pd.DataFrame, expt_num: int):
    for col in (test_preds.columns):
        sub = sample_sub.copy()
        sub[TARGET] = test_preds[col]
        sub.to_csv(f'{SUBMISSION_PATH}/{expt_num}_{col}.csv', index=False)

In [35]:
create_submission_files(tp1, '01')
create_submission_files(tp2, '02')