# Imports

In [None]:
import time
import gc
gc.enable()
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import scipy.stats as st

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import #task-dependent

from sklearn.ensemble import ExtraTreesClassifier as XTC
from sklearn.ensemble import ExtraTreesRegressor as XTR

import optuna
from optuna.samplers import TPESampler

In [None]:
# Read the train and test tables (in that order); both CSV paths are blank
# placeholders that have to be filled in before this cell can run.
train, test = (pd.read_csv(csv_path) for csv_path in ('', ''))

# Config

In [None]:
SEED = 2311  # seed passed to the estimators (random_state) and to the Optuna TPESampler

N_FOLDS = 5  # number of fold ids iterated over during tuning
N_THREADS = 4  # number of CPUs; forwarded to the estimators as n_jobs

IS_CLF = True  # True for Classification, False for Regression
TARGET = '----'  # placeholder: name of the target column in `train`
ID_COL = '----'  # placeholder: name of the row-identifier column
# Removes the id column from `test` (so it is not treated as a feature) and
# keeps it for building the submission files.
TEST_INDEX = test.pop(ID_COL)

# Preprocessing

In [None]:
# Column inventories are taken from `test`, i.e. after its id column was popped.
features = test.columns.tolist()
# NOTE(review): columns only have the 'category' dtype if it was set explicitly
# (e.g. via read_csv(dtype=...)); with a plain read_csv this list is
# presumably empty - confirm.
cat_features = test.select_dtypes(include='category').columns.tolist()
num_features = test.select_dtypes(include='number').columns.tolist()

In [None]:
# Cast every categorical feature column to an integer dtype in both frames.
# NOTE(review): assumes the category values are integer-like; string
# categories would make astype('int') fail - confirm against the data.
train[cat_features] = train[cat_features].astype('int')
test[cat_features] = test[cat_features].astype('int')

In [None]:
# Encode the target as integer class ids; `labels` is kept so predictions can
# be mapped back to the original labels for the submissions.
# NOTE(review): this runs regardless of IS_CLF. For a regression target the
# encoding (and the later inverse_transform calls) looks unintended - confirm
# before using the template with IS_CLF = False.
labels = LabelEncoder()
train[TARGET] = labels.fit_transform(train[TARGET])

# Baseline

In [None]:
# Untuned reference model: library defaults apart from the tree count.
baseline_params = {
    'n_estimators': 150,
    'n_jobs': N_THREADS,
    'verbose': 0,
    'random_state': SEED,
}

# The task flag defined in the config cell is IS_CLF; the previously used
# TASK_IS_CLF was never defined and raised NameError.
if IS_CLF:
    baseline = XTC(**baseline_params).fit(train[features], train[TARGET])
else:
    baseline = XTR(**baseline_params).fit(train[features], train[TARGET])

In [None]:
# Predict on the test features with the baseline model.
predictions = baseline.predict(test[features])

# Pair each test id with its prediction, mapped back to the original target
# labels through the fitted LabelEncoder.
# NOTE(review): the frame is only built here, never written to disk - confirm
# whether a to_csv call is intended.
submission_baseline = pd.DataFrame({ID_COL: TEST_INDEX, 
                                    TARGET: labels.inverse_transform(predictions)})

In [None]:
# Drop the fitted baseline model and run a garbage-collection pass before
# the tuning stage.
del baseline
gc.collect()

# Hyperparameter tuning

In [None]:
def objective(trial, train):
    """Evaluate one Optuna trial of an ExtraTrees model with fold-based CV.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.
    train : pandas.DataFrame
        Training frame. It must contain the columns listed in ``features``,
        the ``TARGET`` column and a ``fold`` column whose values are the fold
        ids ``0 .. N_FOLDS - 1``.

    Returns
    -------
    float
        Mean of the per-fold validation scores.
    """
    param_grid = {
        'n_estimators': trial.suggest_int('n_estimators', 200, 2000, step=50),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        # suggest_float(step=...) / suggest_float(...) replace the deprecated
        # suggest_discrete_uniform / suggest_uniform with the same ranges.
        'max_features': trial.suggest_float('max_features', 0.1, 1.0, step=0.1),
        'bootstrap': trial.suggest_categorical('bootstrap', [True, False]),
        'ccp_alpha': trial.suggest_float('ccp_alpha', 0.0, 0.1),
    }

    # These two options are only sampled when bootstrapping is enabled.
    if param_grid['bootstrap']:
        param_grid['oob_score'] = trial.suggest_categorical('oob_score', [True, False])
        param_grid['max_samples'] = trial.suggest_float('max_samples', 0.1, 1.0)

    if IS_CLF:
        param_grid['criterion'] = trial.suggest_categorical('criterion', ['gini', 'entropy'])
        param_grid['class_weight'] = trial.suggest_categorical('class_weight', ['balanced', 'balanced_subsample'])
        model = XTC(**param_grid, verbose=0, n_jobs=N_THREADS, random_state=SEED)
    else:
        param_grid['criterion'] = trial.suggest_categorical('criterion', ['squared_error', 'absolute_error'])
        model = XTR(**param_grid, verbose=0, n_jobs=N_THREADS, random_state=SEED)

    scores = []
    for fold in range(N_FOLDS):
        # Rows of the current fold are held out; all other rows are used to fit.
        xtrain = train[train.fold != fold]
        ytrain = xtrain[TARGET]
        xval = train[train.fold == fold]
        yval = xval[TARGET]
        gc.collect()

        model.fit(xtrain[features], ytrain)

        val_preds = model.predict(xval[features])
        # Alternative for probability-based metrics (positive-class column):
        # val_preds = model.predict_proba(xval[features])[:, 1]

        # `----` is a template placeholder: replace it with the metric
        # function imported from sklearn.metrics.
        score = ----(yval, val_preds)
        scores.append(score)

    return np.mean(scores)

In [None]:
def tune(objective, direction, train, n_trials=100):
    """Run an Optuna study and return the best hyperparameters found.

    Parameters
    ----------
    objective : callable
        Called as ``objective(trial, train)``; must return the score to optimize.
    direction : str
        Optimization direction passed to ``optuna.create_study``
        ('maximize' or 'minimize').
    train : pandas.DataFrame
        Training frame forwarded unchanged to ``objective``.
    n_trials : int, optional
        Number of trials to run (default 100, the previously hard-coded value).

    Returns
    -------
    dict
        ``study.best_params`` of the finished study.
    """
    # A seeded TPE sampler keeps the sequence of suggested parameters reproducible.
    study = optuna.create_study(sampler=TPESampler(seed=SEED),
                                direction=direction)

    study.optimize(lambda trial: objective(trial, train),
                   n_trials=n_trials)

    best_params = study.best_params
    print(f'Best score: {study.best_value:.5f}')
    print('Best params:')
    for key, value in best_params.items():
        print(f'\t{key}: {value}')

    return best_params

In [None]:
# Placeholder: set to 'maximize' or 'minimize' according to the chosen metric.
direction = '----'
# Run the hyperparameter search; the result feeds the final model below.
best_params = tune(objective, direction, train)

In [None]:
# Run a garbage-collection pass before the CV + inference stage.
gc.collect()

# CV + Inference

In [None]:
def custom_cv(train, test, features, model):
    """Fit `model` once per fold, score it, and aggregate test predictions.

    Parameters
    ----------
    train : pandas.DataFrame
        Training frame. It must contain the `features` columns, the ``TARGET``
        and ``ID_COL`` columns and a ``fold`` column whose values are the fold
        ids ``0 .. N_FOLDS - 1``.
    test : pandas.DataFrame
        Test frame containing the `features` columns.
    features : list
        Names of the columns used as model inputs.
    model : estimator
        Object exposing ``fit`` and ``predict``; it is refit on every fold.

    Returns
    -------
    oof_preds : pandas.DataFrame
        Out-of-fold predictions, one row per validation row, built from a
        mapping of ``ID_COL`` value to prediction.
    test_preds : numpy.ndarray
        1-D array holding, for every test row, the most frequent prediction
        across the fold models.
    """
    oof_preds = {}
    test_preds = []
    scores = []

    cv_start = time.time()
    # N_FOLDS is the fold-count constant from the config cell; the previously
    # used N_SPLITS was never defined and raised NameError.
    for fold in range(N_FOLDS):
        print('-' * 40)

        xtrain = train[train.fold != fold].reset_index(drop=True)
        xval = train[train.fold == fold].reset_index(drop=True)
        val_idx = xval[ID_COL].values.tolist()

        fold_start = time.time()

        model.fit(xtrain[features], xtrain[TARGET])
        val_preds = model.predict(xval[features])
        # Alternative for probability-based metrics (positive-class column):
        # val_preds = model.predict_proba(xval[features])[:,1]
        oof_preds.update(dict(zip(val_idx, val_preds)))

        # `----` is a template placeholder: replace it with the metric
        # function imported from sklearn.metrics.
        score = ----(xval[TARGET], val_preds)
        scores.append(score)

        fold_end = time.time()

        print(f'Fold #{fold}: Score = {score:.5f}\t[Time: {fold_end - fold_start:.2f} secs]')

        test_preds.append(model.predict(test[features]))
        # Probability alternative:
        # test_preds.append(model.predict_proba(test[features])[:,1])

    cv_end = time.time()
    print(f'\nAverage score = {np.mean(scores):.5f} with std. dev. = {np.std(scores):.5f}')
    print(f'[Total time: {cv_end - cv_start:.2f} secs]\n')

    oof_preds = pd.DataFrame.from_dict(oof_preds, orient='index').reset_index()
    # Row-wise majority vote over the fold models. np.ravel keeps the result
    # 1-D whether or not the installed SciPy keeps the reduced axis in `.mode`.
    test_preds = np.ravel(st.mode(np.column_stack(test_preds), axis=1).mode)
    # Averaging alternative (e.g. for probabilities or regression outputs):
    # test_preds = np.mean(np.column_stack(test_preds), axis=1)

    return oof_preds, test_preds

In [None]:
# Final estimator: classifier or regressor depending on the task flag,
# configured with the tuned hyperparameters.
model = (XTC if IS_CLF else XTR)(
    **best_params, verbose=0, n_jobs=N_THREADS, random_state=SEED
)

In [None]:
# Cross-validate the tuned model and collect out-of-fold and test predictions.
oof_preds, test_preds = custom_cv(train, test, features, model)

# Postprocessing and Submission

In [None]:
# Any post-processing, if needed, goes here.

# Map the encoded predictions back to the original target labels.
# NOTE(review): only meaningful for integer class predictions; with
# IS_CLF = False or probability outputs this call looks inapplicable - confirm.
test_preds = labels.inverse_transform(test_preds)

In [None]:
# Pair each test id with its final prediction.
submission_xt = pd.DataFrame({ID_COL: TEST_INDEX, 
                              TARGET: test_preds})

# Write the submission file without the DataFrame index column.
submission_xt.to_csv('submission_xt.csv', index=False)