In [1]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/tabular-playground-series-may-2022/sample_submission.csv
/kaggle/input/tabular-playground-series-may-2022/train.csv
/kaggle/input/tabular-playground-series-may-2022/test.csv


In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import loguniform
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import log_loss
from sklearn.model_selection import cross_validate, train_test_split
from sklearn import set_config
set_config(display='diagram')

Assume KFold cross_validation is valid

In [3]:
train_data = pd.read_csv(
    "/kaggle/input/tabular-playground-series-may-2022/train.csv",
)
train_target = train_data.target.copy()
train_data.drop(columns=["id", "target"], inplace=True)

In [4]:
print(train_data.f_27[0])
print(train_data.f_27.nunique())

ABABDADBAB
741354


In [5]:
new_features = pd.DataFrame(
    np.concatenate(
        train_data.f_27.apply(lambda x: [sym for sym in x]).values, axis=0
    ).reshape(-1, 10),
    columns = [f"f_27_{x}" for x in range(10)]
)

In [6]:
train_data = pd.concat(([train_data, new_features]), axis=1)
train_data.drop(columns="f_27", inplace=True)

In [7]:
cat_processor = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=np.nan)
preprocessor = ColumnTransformer(
    [("OrdinalEncoder", cat_processor, [f"f_27_{x}" for x in range(10)])],
    remainder="passthrough"
)
estimator = HistGradientBoostingClassifier()

model = make_pipeline(preprocessor, estimator)

In [None]:
# cv_results = cross_validate(
#     model, train_data, train_target, scoring=["accuracy", "neg_log_loss"],
#     return_train_score=True,cv=5, n_jobs=3, verbose=2
# )

In [None]:
# cv_results = pd.DataFrame(cv_results)
# cv_results

Found with random search:

| l2_regularization | learning_rate | max_leaf_nodes | min_samples_leaf | mean_test_score | std_test_score |
| ----------- | ----------- | ----------- | ----------- | ----------- | ----------- |
| 0.244505 | 0.182073 | 98 | 29 | 0.900514 | 0.000727 | 

In [8]:
def feed_batch(df, size=10000):
    for i in range(0, df.shape[0], size):
        try:
            batch = df.iloc[i : i + size].copy()
        except:
            batch = df.iloc[i:].copy()
        yield batch

In [9]:
data_generator = feed_batch(train_data[-20000:])
target_generator = feed_batch(train_target[-20000:])

In [10]:
try:
    test_batch_data = next(data_generator)
    test_batch_target = next(target_generator)
    model.fit(test_batch_data[:7500], test_batch_target[:7500])
    model.score(test_batch_data[7500:], test_batch_target[7500:])
except StopIteration:
    print("Ops")

In [41]:
(test_batch_target[:7500] == model.predict(test_batch_data[:7500])).value_counts()

True     6885
False     615
Name: target, dtype: int64

In [12]:
log_loss(test_batch_target[7500:], model.predict(test_batch_data[7500:]))

6.49335488955454

In [13]:
print(loguniform.cdf(50,1,100))  # current procentile
print(loguniform.ppf(.80,1,100))  # 80th procentile value
print(loguniform.cdf(39.810717055349734,1,100))

0.8494850021680093
39.810717055349734
0.7999999999999999


In [14]:
def update_int_param(cur_val, min_val=2, max_val=256, by=1):
    # ensure return is int and within the interval
    new_val = max([round(cur_val + by), min_val])
    return min([new_val, max_val])

def update_log_param(cur_val, min_val=1e-3, max_val=1e3, by=0.001):
    cur_p = loguniform.cdf(cur_val, min_val, max_val)
    if (cur_p < 0) | (cur_p > 1):
        return cur_val
    return loguniform.ppf(cur_p + by, min_val, max_val)

In [15]:
model.set_params(**{"histgradientboostingclassifier__l2_regularization":1e-3})

In [16]:
param_update_funcs = {
    'histgradientboostingclassifier__l2_regularization': update_log_param,
    'histgradientboostingclassifier__learning_rate': update_log_param,
    'histgradientboostingclassifier__max_leaf_nodes': update_int_param,
    'histgradientboostingclassifier__min_samples_leaf': update_int_param,
}

In [17]:
pd.DataFrame({key: [model.get_params()[key ]] for key in param_update_funcs.keys()})

Unnamed: 0,histgradientboostingclassifier__l2_regularization,histgradientboostingclassifier__learning_rate,histgradientboostingclassifier__max_leaf_nodes,histgradientboostingclassifier__min_samples_leaf
0,0.001,0.1,31,20


In [61]:
def batch_train(
    model, batch_data, batch_target, param_update_funcs
):
    # data split, initial fit and score
    X_train, X_test, y_train, y_test = train_test_split(batch_data, batch_target)
    model.fit(X_train, y_train)
    loss = log_loss(y_test, model.predict(X_test))

    # initiate log
    params_log = np.array(
        [[model.get_params()[key] for key in param_update_funcs.keys()]])
    loss_log = [loss]

    # forward pass
    for param, update_func in param_update_funcs.items():
        cur_val = model.get_params()[param]
        new_val = update_func(cur_val)  # "step" increase
        model.set_params(**{param: new_val})
        model.fit(X_train, y_train)
        loss = log_loss(y_test, model.predict(X_test))

        # log entry
        params_log = np.append(
            params_log,
            np.array(
                [[model.get_params()[key] for key in param_update_funcs.keys()]]),
            axis=0)
        loss_log.append(loss)

        # return to the original parameter value
        model.set_params(**{param: cur_val})
    
    log = pd.DataFrame(params_log, columns=param_update_funcs.keys())
    log["test_loss"] = loss_log

    return log

In [62]:
log = batch_train(model, test_batch_data, test_batch_target, param_update_funcs)
log.rename(columns={item:item.split("__")[-1] for item in log.columns})

Unnamed: 0,l2_regularization,learning_rate,max_leaf_nodes,min_samples_leaf,test_loss
0,0.001,0.1,31.0,20.0,6.037437
1,0.001014,0.1,31.0,20.0,6.037437
2,0.001,0.101391,31.0,20.0,5.871651
3,0.001,0.1,32.0,20.0,5.857834
4,0.001,0.1,31.0,21.0,6.051253


In [65]:
def update_params(log):
    d_losses = log["test_loss"][1:] - log["test_loss"][0]
    new_params = {}
    for i, dl in enumerate(d_losses):
        if np.allclose(0, dl):
            continue  # go to next iteration
        elif dl < 0:
            new_params[log.iloc[:, i].name] = \
            log.iloc[:, i][0] + log.iloc[:, i].diff().max()
        else:
            new_params[log.iloc[:, i].name] = \
            log.iloc[:, i][0] - log.iloc[:, i].diff().max()
    return new_params

In [66]:
new_params = update_params(log)
new_params

{'histgradientboostingclassifier__learning_rate': 0.101391138573668,
 'histgradientboostingclassifier__max_leaf_nodes': 32.0,
 'histgradientboostingclassifier__min_samples_leaf': 19.0}

In [83]:
def train_epoch(model, data, target, param_update_funcs,
                batch_size=50_000, verbose=None):
    """"""
    params_log = np.array(
        [[model.get_params()[key] for key in param_update_funcs.keys()]])
    loss_log, acc_log = [], []

    data_generator = feed_batch(data, size=batch_size)
    target_generator = feed_batch(target, size=batch_size)

    for batch_idx, (batch_data, batch_target) in enumerate(
        zip(data_generator, target_generator)):
        batch_log = batch_train(model, batch_data, batch_target, param_update_funcs)
        updated_params = update_params(batch_log)
        model.set_params(**updated_params)

        # get log entries
        X_train, X_test, y_train, y_test = train_test_split(batch_data, batch_target)
        model.fit(X_train, y_train)
        loss = log_loss(y_test, model.predict(X_test))
        acc = model.score(X_test, y_test)
    
        # log
        params_log = np.append(
            params_log,
            np.array(
                [[model.get_params()[key] for key in param_update_funcs.keys()]]),
            axis=0)
        loss_log.append(loss), acc_log.append(acc)

        if verbose:
            print(f"Batch {batch_idx}: test_loss: {loss:.4f}, test_accuracy: {acc:.4f}")
        
    total_log = pd.DataFrame(params_log[1:], columns=param_update_funcs.keys())
    total_log["test_loss"] = loss_log
    total_log["test_accuracy"] = acc_log
    return total_log
    # generate a batch
    # train on the batch - get log
    # use log to get new hyperparameters
    # set new hyperparameters

In [None]:
total_log = train_epoch(model,train_data, train_target, param_update_funcs,
                        batch_size=10_000, verbose=True)

Batch 0: test_loss: 5.8164, test_accuracy: 0.8316
Batch 1: test_loss: 5.4157, test_accuracy: 0.8432
Batch 2: test_loss: 6.2032, test_accuracy: 0.8204
Batch 3: test_loss: 5.7888, test_accuracy: 0.8324
Batch 4: test_loss: 6.4934, test_accuracy: 0.8120
Batch 5: test_loss: 5.8716, test_accuracy: 0.8300
Batch 6: test_loss: 5.8855, test_accuracy: 0.8296
Batch 7: test_loss: 5.3467, test_accuracy: 0.8452
Batch 8: test_loss: 5.7335, test_accuracy: 0.8340
Batch 9: test_loss: 5.9822, test_accuracy: 0.8268
Batch 10: test_loss: 5.9545, test_accuracy: 0.8276
Batch 11: test_loss: 5.7749, test_accuracy: 0.8328
Batch 12: test_loss: 5.9546, test_accuracy: 0.8276
Batch 13: test_loss: 6.1756, test_accuracy: 0.8212
Batch 14: test_loss: 5.6782, test_accuracy: 0.8356
Batch 15: test_loss: 6.4381, test_accuracy: 0.8136
Batch 16: test_loss: 6.2032, test_accuracy: 0.8204
Batch 17: test_loss: 5.8440, test_accuracy: 0.8308
Batch 18: test_loss: 5.7611, test_accuracy: 0.8332
Batch 19: test_loss: 6.2585, test_accurac

In [88]:
total_log.rename(columns={item:item.split("__")[-1] for item in log.columns})

Unnamed: 0,l2_regularization,learning_rate,max_leaf_nodes,min_samples_leaf,test_loss,test_accuracy
0,0.001042,0.10407,38.0,21.0,4.990216,0.85552
1,0.001028,0.102623,37.0,22.0,5.034423,0.85424
2,0.001013,0.101195,36.0,21.0,4.970873,0.85608
3,0.001027,0.102603,37.0,22.0,5.042715,0.854
4,0.001042,0.10403,38.0,21.0,4.929428,0.85728
5,0.001056,0.105477,37.0,22.0,4.946003,0.8568
6,0.001071,0.10401,36.0,21.0,4.760875,0.86216
7,0.001086,0.105457,37.0,22.0,4.965349,0.85624
8,0.001101,0.106924,38.0,23.0,4.879693,0.85872
9,0.001086,0.105437,37.0,22.0,4.807846,0.8608


In [92]:
[model.get_params()[key] for key in param_update_funcs.keys()]

[0.0010847447439617773, 0.10535498969018756, 37.0, 20.0]

In [94]:
model.fit(train_data[:37500], train_target[:37500])

In [95]:
model.score(train_data[37500:50000], train_target[37500:50000])

0.85728

In [89]:
total_dummy_log = train_epoch(
    model,train_data, train_target,
    {key: lambda x:x for key in param_update_funcs.keys()}, verbose=True)

KeyboardInterrupt: 