<a href="https://colab.research.google.com/github/stiwari-ds/data-science-competitions/blob/main/machinehack/subscriber_prediction_hackathon/notebooks/07_extratrees.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [2]:
%%capture
!pip install --upgrade optuna

In [3]:
import os
import gc
import time
import warnings

# Make sure automatic garbage collection is switched on.
gc.enable()
# Suppress all Python warnings for the rest of the session.
warnings.filterwarnings('ignore')

In [4]:
import pandas as pd
import numpy as np
# Display every column of a frame and use four decimal places for floats.
pd.set_option('display.max_columns', None)
pd.set_option('display.precision', 4)

In [5]:
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import log_loss

import optuna
from optuna.samplers import TPESampler
from optuna.pruners import HyperbandPruner

In [6]:
#remove cell to run future versions
# Report the version actually installed so a mismatch can be diagnosed from the message.
assert optuna.__version__ == '3.0.4', (
    'Change in Optuna version. Original notebook version: 3.0.4, '
    f'found: {optuna.__version__}'
)

In [7]:
# Single seed shared by NumPy's global RNG and every seeded component in this notebook
# (CV splitter, Optuna sampler, estimator random_state).
SEED = 2311
# NOTE(review): PYTHONHASHSEED is read at interpreter start-up, so setting it here
# presumably only affects child processes — confirm whether it is needed.
os.environ['PYTHONHASHSEED'] = str(SEED)
np.random.seed(SEED)

In [8]:
# Base URL of the competition's data folder in the GitHub repository.
DATA_URL = 'https://raw.githubusercontent.com/stiwari-ds/data-science-competitions/main/machinehack/subscriber_prediction_hackathon/data'

# Read the raw train and test tables plus the sample submission file over HTTP.
# The sample submission is later copied as the template for every submission file.
train = pd.read_csv(f'{DATA_URL}/raw/train.csv')
test = pd.read_csv(f'{DATA_URL}/raw/test.csv')
sample_sub = pd.read_csv(f'{DATA_URL}/raw/submission.csv')

In [9]:
# Name of the target column: selected from `train` for fitting and written to the submissions.
TARGET = 'y_bool'

In [10]:
# Convert the categorical columns of both frames into dummy/indicator columns.
train = pd.get_dummies(train)
test = pd.get_dummies(test)
# The two frames are encoded independently, so a category that occurs only in test
# produces a dummy column train does not have; selecting that column from train
# would then raise a KeyError. Keep only the test columns train also provides.
# When the encoded schemas already agree this selection keeps every column.
test = test.loc[:, test.columns.isin(train.columns)]

In [11]:
# Feature names are taken from the encoded test frame; the same names are used
# to select the model inputs from both `train` and `test`.
features = list(test.columns)

# Hyperparameter tuning

In [12]:
def objective(trial, data, base_params, n_splits):
    """Optuna objective: mean cross-validated log loss of an ExtraTreesClassifier.

    Parameters
    ----------
    trial : optuna trial
        Used to sample one hyperparameter configuration.
    data : tuple
        (X, y): training features and target. Rows are selected positionally,
        so any pandas index is accepted.
    base_params : dict
        Fixed estimator arguments, merged with the sampled hyperparameters.
    n_splits : int
        Number of stratified folds.

    Returns
    -------
    float
        Mean log loss over the validation folds.

    Notes
    -----
    This function never calls ``trial.report``, so a pruner attached to the
    study has no intermediate values to act on.
    """
    X, y = data
    param_grid = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000, step=25),
        'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy']),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'max_features': trial.suggest_float('max_features', 0.55, 0.95, step=0.05),
        'max_samples': trial.suggest_float('max_samples', 0.55, 0.95, step=0.05),
        'class_weight': trial.suggest_categorical(
            'class_weight', ['balanced', 'balanced_subsample', None]
        )
    }
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)
    scores = []
    for train_idx, val_idx in cv.split(X, y):
        # cv.split yields positional indices, so X and y must both be indexed
        # with .iloc; label-based .loc only works by accident on a RangeIndex.
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]
        model = ExtraTreesClassifier(**base_params, **param_grid)
        model.fit(X_train, y_train)
        # Column 1 of predict_proba is the probability of the second class.
        val_preds = model.predict_proba(X_val)[:, 1]
        scores.append(log_loss(y_val, val_preds))
    return np.mean(scores)

In [13]:
def tune_params(data, base_params, n_splits, n_trials, direction):
    """Run an Optuna study over `objective` and return the finished study.

    Parameters
    ----------
    data : tuple
        Forwarded unchanged to `objective`.
    base_params : dict
        Fixed estimator arguments, forwarded to `objective`.
    n_splits : int
        Number of cross-validation folds, forwarded to `objective`.
    n_trials : int
        Number of trials to run.
    direction : str
        Optimization direction passed to `optuna.create_study`.

    Returns
    -------
    The optuna study after optimization.
    """
    # Seeded TPE sampler so that the sequence of suggested configurations is repeatable.
    tpe_sampler = TPESampler(
        consider_endpoints=True,
        multivariate=True,
        group=True,
        seed=SEED
    )

    def evaluate(trial):
        # Bind the fixed arguments; Optuna only supplies the trial.
        return objective(trial, data, base_params, n_splits)

    study = optuna.create_study(
        sampler=tpe_sampler,
        pruner=HyperbandPruner(),
        direction=direction
    )
    study.optimize(func=evaluate, n_trials=n_trials, gc_after_trial=True)
    return study

# Cross-validation

In [14]:
def cross_validate_predict(data, model_params, n_splits):
    """Cross-validate an ExtraTreesClassifier and predict the test set in every fold.

    Parameters
    ----------
    data : tuple
        (X, y, X_test): training features, training target and test features.
        Rows of X and y are selected positionally, so any pandas index is accepted.
    model_params : dict
        Keyword arguments passed to ExtraTreesClassifier.
    n_splits : int
        Number of stratified folds.

    Returns
    -------
    oof_preds : pd.Series
        Out-of-fold predicted probabilities (column 1 of predict_proba),
        keyed by row position in X and sorted by that position.
    test_preds : pd.DataFrame
        One column of test-set probabilities per fold ('fold0', 'fold1', ...)
        plus a 'mean' column holding their row-wise average.
    """
    oof_preds = {}  #out-of-fold predictions on train set
    test_preds = {} #predictions on test set for each fold
    scores = [] #scores on validation set

    X, y, X_test = data

    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)
    for fold, (train_idx, val_idx) in enumerate(cv.split(X, y)):
        # cv.split yields positional indices, so X and y must both be indexed
        # with .iloc; label-based .loc only works by accident on a RangeIndex.
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

        model = ExtraTreesClassifier(**model_params)
        model.fit(X_train, y_train)
        val_preds = model.predict_proba(X_val)[:, 1]
        test_preds[f'fold{fold}'] = model.predict_proba(X_test)[:, 1]
        # Every row is in exactly one validation fold, so the positions never collide.
        oof_preds.update(dict(zip(val_idx, val_preds)))

        score = log_loss(y_val, val_preds)
        scores.append(score)
        print(f'Fold #{fold}: Logloss = {score:.5f}')
        _ = gc.collect()
    print(f'Avg. Logloss = {np.mean(scores):.5f} +/- {np.std(scores):.5f}')
    oof_preds = pd.Series(oof_preds).sort_index()
    test_preds = pd.DataFrame.from_dict(test_preds)
    test_preds['mean'] = test_preds.mean(axis=1)
    return oof_preds, test_preds

In [15]:
def run_experiment(data, n_splits=5, n_trials=10):
    """Tune hyperparameters, then cross-validate and predict with the best ones.

    Parameters
    ----------
    data : tuple
        (X, y, X_test): training features, training target and test features.
    n_splits : int
        Number of folds used for both tuning and the final cross-validation.
    n_trials : int
        Number of tuning trials.

    Returns
    -------
    The (oof_preds, test_preds) pair produced by `cross_validate_predict`.
    """
    # Only the training part is needed for tuning; the test features are used later.
    train_features, train_target, _ = data

    fixed_params = {'bootstrap': True, 'random_state': SEED, 'n_jobs': -1}

    print('----------Hyperparameter tuning----------')
    tuning_started = time.time()
    study = tune_params(
        data=(train_features, train_target),
        base_params=fixed_params,
        n_splits=n_splits,
        n_trials=n_trials,
        direction='minimize' #logloss -> lower is better
    )
    tuning_seconds = time.time() - tuning_started
    print(f'Best trial: {study.best_trial.number} -> Best value: {study.best_value:.5f}')
    print('Best hyperparameters:')
    for name, value in study.best_params.items():
        print(f'{name:15} - {value}')
    print(f'[Time taken: {tuning_seconds:.2f}s]\n')
    gc.collect()

    print('-----Cross-validation and prediction-----')
    cv_started = time.time()
    # Tuned values take precedence over the fixed ones on any key clash.
    tuned_params = {**fixed_params, **study.best_params}
    oof_preds, test_preds = cross_validate_predict(data, tuned_params, n_splits)
    cv_seconds = time.time() - cv_started
    print(f'[Time taken: {cv_seconds:.2f}s]\n')

    return oof_preds, test_preds

**Trial run**

In [16]:
# Show Optuna's per-trial INFO log messages for the short trial run below.
optuna.logging.set_verbosity(optuna.logging.INFO)

In [17]:
%%time
# Short run of the full pipeline: 5 folds with only 3 tuning trials.
op, tp = run_experiment(
    data=(train[features], train[TARGET], test[features]),
    n_splits=5,
    n_trials=3
)

[32m[I 2022-12-13 04:54:35,824][0m A new study created in memory with name: no-name-ed24bc74-d6ad-45ef-b6a5-4fbe04c6961d[0m


----------Hyperparameter tuning----------


[32m[I 2022-12-13 04:54:48,018][0m Trial 0 finished with value: 0.5812081797953861 and parameters: {'n_estimators': 375, 'criterion': 'gini', 'max_depth': 4, 'max_features': 0.55, 'max_samples': 0.8500000000000001, 'class_weight': None}. Best is trial 0 with value: 0.5812081797953861.[0m
[32m[I 2022-12-13 04:55:21,406][0m Trial 1 finished with value: 0.5809474306961713 and parameters: {'n_estimators': 600, 'criterion': 'entropy', 'max_depth': 4, 'max_features': 0.9000000000000001, 'max_samples': 0.95, 'class_weight': None}. Best is trial 1 with value: 0.5809474306961713.[0m
[32m[I 2022-12-13 04:55:52,476][0m Trial 2 finished with value: 0.6693682631532756 and parameters: {'n_estimators': 425, 'criterion': 'gini', 'max_depth': 10, 'max_features': 0.75, 'max_samples': 0.65, 'class_weight': 'balanced_subsample'}. Best is trial 1 with value: 0.5809474306961713.[0m


Best trial: 1 -> Best value: 0.58095
Best hyperparameters:
n_estimators    - 600
criterion       - entropy
max_depth       - 4
max_features    - 0.9000000000000001
max_samples     - 0.95
class_weight    - None
[Time taken: 76.74s]

-----Cross-validation and prediction-----
Fold #0: Logloss = 0.58119
Fold #1: Logloss = 0.58088
Fold #2: Logloss = 0.58007
Fold #3: Logloss = 0.58117
Fold #4: Logloss = 0.58143
Avg. Logloss = 0.58095 +/- 0.00047
[Time taken: 29.09s]

CPU times: user 3min, sys: 2.89 s, total: 3min 3s
Wall time: 1min 45s


In [18]:
# Restrict Optuna logging to errors so the 50-trial runs do not print one line per trial.
optuna.logging.set_verbosity(optuna.logging.ERROR)

# 3-fold

In [19]:
%%time
# 3-fold experiment: 50 tuning trials, then cross-validation and test prediction.
op_3, tp_3 = run_experiment(
    data=(train[features], train[TARGET], test[features]),
    n_splits=3,
    n_trials=50
)

----------Hyperparameter tuning----------
Best trial: 38 -> Best value: 0.58107
Best hyperparameters:
n_estimators    - 525
criterion       - gini
max_depth       - 6
max_features    - 0.95
max_samples     - 0.75
class_weight    - None
[Time taken: 825.30s]

-----Cross-validation and prediction-----
Fold #0: Logloss = 0.58151
Fold #1: Logloss = 0.58069
Fold #2: Logloss = 0.58100
Avg. Logloss = 0.58107 +/- 0.00033
[Time taken: 16.36s]

CPU times: user 25min 3s, sys: 25.7 s, total: 25min 28s
Wall time: 14min 1s


# 5-fold

In [20]:
%%time
# 5-fold experiment: 50 tuning trials, then cross-validation and test prediction.
op_5, tp_5 = run_experiment(
    data=(train[features], train[TARGET], test[features]),
    n_splits=5,
    n_trials=50
)

----------Hyperparameter tuning----------
Best trial: 23 -> Best value: 0.58077
Best hyperparameters:
n_estimators    - 250
criterion       - entropy
max_depth       - 5
max_features    - 0.95
max_samples     - 0.95
class_weight    - None
[Time taken: 2135.44s]

-----Cross-validation and prediction-----
Fold #0: Logloss = 0.58113
Fold #1: Logloss = 0.58060
Fold #2: Logloss = 0.57967
Fold #3: Logloss = 0.58117
Fold #4: Logloss = 0.58126
Avg. Logloss = 0.58077 +/- 0.00059
[Time taken: 15.55s]

CPU times: user 1h 5min 44s, sys: 53 s, total: 1h 6min 37s
Wall time: 35min 51s


# 7-fold

In [21]:
%%time
# 7-fold experiment: 50 tuning trials, then cross-validation and test prediction.
op_7, tp_7 = run_experiment(
    data=(train[features], train[TARGET], test[features]),
    n_splits=7,
    n_trials=50
)

----------Hyperparameter tuning----------
Best trial: 44 -> Best value: 0.58075
Best hyperparameters:
n_estimators    - 725
criterion       - gini
max_depth       - 6
max_features    - 0.95
max_samples     - 0.8500000000000001
class_weight    - None
[Time taken: 2390.39s]

-----Cross-validation and prediction-----
Fold #0: Logloss = 0.58176
Fold #1: Logloss = 0.58080
Fold #2: Logloss = 0.58095
Fold #3: Logloss = 0.57834
Fold #4: Logloss = 0.58130
Fold #5: Logloss = 0.58123
Fold #6: Logloss = 0.58087
Avg. Logloss = 0.58075 +/- 0.00103
[Time taken: 68.29s]

CPU times: user 1h 15min 38s, sys: 55 s, total: 1h 16min 32s
Wall time: 40min 58s


# 10-fold

In [22]:
%%time
# 10-fold experiment: 50 tuning trials, then cross-validation and test prediction.
op_10, tp_10 = run_experiment(
    data=(train[features], train[TARGET], test[features]),
    n_splits=10,
    n_trials=50
)

----------Hyperparameter tuning----------
Best trial: 43 -> Best value: 0.58062
Best hyperparameters:
n_estimators    - 575
criterion       - entropy
max_depth       - 6
max_features    - 0.8500000000000001
max_samples     - 0.8
class_weight    - None
[Time taken: 2833.11s]

-----Cross-validation and prediction-----
Fold #0: Logloss = 0.58271
Fold #1: Logloss = 0.57976
Fold #2: Logloss = 0.58140
Fold #3: Logloss = 0.58039
Fold #4: Logloss = 0.57908
Fold #5: Logloss = 0.58007
Fold #6: Logloss = 0.58163
Fold #7: Logloss = 0.57985
Fold #8: Logloss = 0.58054
Fold #9: Logloss = 0.58078
Avg. Logloss = 0.58062 +/- 0.00100
[Time taken: 74.06s]

CPU times: user 1h 30min, sys: 1min, total: 1h 31min 1s
Wall time: 48min 27s


# Submission files

In [1]:
# Mount Google Drive at /content/drive; the submission directory used below lives under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Notebook number, used to give each notebook its own submission folder.
NOTEBOOK = '07'
SUBMISSION_PATH = f'/content/drive/MyDrive/data_science_competitions/machinehack/subscriber_prediction_hackathon/submissions/nb_{NOTEBOOK}'
# exist_ok=True creates the folder when missing and is a no-op when it already
# exists, replacing the separate isdir check and keeping the cell safe to re-run.
os.makedirs(SUBMISSION_PATH, exist_ok=True)

In [24]:
def create_submission_files(test_preds, model_config, path=SUBMISSION_PATH):
    """Write one submission CSV per column of `test_preds`.

    Each file is a copy of the sample submission whose target column is
    replaced by that column's predictions, saved as
    '<path>/<model_config>_<column>.csv' without the index.
    """
    for pred_name in test_preds.columns:
        # assign() returns a new frame, leaving the sample submission untouched.
        submission = sample_sub.assign(**{TARGET: test_preds[pred_name]})
        out_file = f'{path}/{model_config}_{pred_name}.csv'
        submission.to_csv(out_file, index=False)

In [25]:
# Write the submission files of every cross-validation configuration, in fold order.
for fold_test_preds, config_name in (
    (tp_3, 'et3'),
    (tp_5, 'et5'),
    (tp_7, 'et7'),
    (tp_10, 'et10'),
):
    create_submission_files(fold_test_preds, config_name)