In [None]:
# Enable IPython's autoreload extension in mode 2, so edited modules are
# re-imported automatically before each cell runs.
%load_ext autoreload
%autoreload 2

In [None]:
import sklearn.datasets as sk_data
import sklearn.model_selection as sk_selection
import sklearn.metrics as sk_metrics
import sklearn.ensemble as sk_ensemble
import random_neural_net_models.tabular as rnnm_tab
import torch.optim as optim
import random_neural_net_models.losses as rnnm_loss
import random_neural_net_models.learner as rnnm_learner
import random_neural_net_models.data as rnnm_data
import random_neural_net_models.utils as rnnm_utils
from pathlib import Path
from torch.utils.data import DataLoader

In [None]:
# Pass the seed 42 to the project's determinism helper
# (presumably seeds the relevant RNGs — see `rnnm_utils.make_deterministic`).
rnnm_utils.make_deterministic(42)

In [None]:
# Ask the project helper which compute device to use and display the choice.
device = rnnm_utils.get_device()
device

In [None]:
# Load the KDD Cup 99 data set through scikit-learn, as a pandas feature
# frame `X` and a label series `y`.
X, y = sk_data.fetch_kddcup99(
    as_frame=True,
    return_X_y=True,
    random_state=42,
)

In [None]:
# Names of the feature columns that are treated as categorical downstream
# (string-to-int encoding, baseline `categorical_features`, embedding inputs).
cat_cols = ["protocol_type", "service", "flag"] + [
    "land",
    "logged_in",
    "is_host_login",
    "is_guest_login",
]

In [None]:
# Inspect the categorical columns before they are encoded.
X[cat_cols]

In [None]:
# Every column that is not listed in `cat_cols` is treated as numerical;
# the original column order of `X` is kept.
num_cols = list(filter(lambda name: name not in cat_cols, X.columns))
# Transposed view: one row per numerical column.
X[num_cols].T

In [None]:
# Display the full feature frame (categorical columns still un-encoded here).
X

In [None]:
# Run the project helper over the categorical columns; it returns the updated
# frame plus a second object kept as `maps_str2int_x` (presumably the
# string -> int mapping per column — TODO confirm).
# NOTE(review): `X` is rebound to the helper's output, so the raw frame is no
# longer available after this cell; re-running it feeds already-converted data
# back into the helper.
X, maps_str2int_x = rnnm_tab.make_string_columns_to_int(
    X, categorical_columns=cat_cols
)
# Transposed view of the categorical columns after conversion.
X[cat_cols].T

In [None]:
# Display the target series before it is converted.
y

In [None]:
# Convert the target series with the project helper; the second return value
# is kept as `map_str2int_y` (presumably the label -> int mapping — TODO confirm).
# NOTE(review): `y` is rebound here, so the original labels are only
# recoverable through that mapping.
y, map_str2int_y = rnnm_tab.make_string_series_to_int(y)
y

In [None]:
# Hold out 20% of the rows for evaluation. The split is stratified on the
# target and uses a fixed random state; suffix 0 = train part, 1 = test part.
X0, X1, y0, y1 = sk_selection.train_test_split(
    X.to_numpy(),
    y.to_numpy(),
    stratify=y,
    random_state=42,
    test_size=0.2,
)

## baseline model

In [None]:
# Recall which columns are categorical before deriving their positions.
cat_cols

In [None]:
# Integer positions of the categorical columns within `X.columns`; needed
# because the train/test arrays are plain NumPy arrays without column names.
cat_cols_ids = X.columns.get_indexer_for(cat_cols)
cat_cols_ids

In [None]:
# Integer positions of the numerical columns within `X.columns`.
num_cols_ids = X.columns.get_indexer_for(num_cols)
num_cols_ids

In [None]:
# Baseline: histogram-based gradient boosting with default hyper-parameters;
# the categorical columns are identified by their positional indices.
base_model = sk_ensemble.HistGradientBoostingClassifier(
    categorical_features=cat_cols_ids
)

In [None]:
# Fit the baseline on the training split.
base_model.fit(X0, y0)

In [None]:
# Baseline class predictions for the held-out split.
y_pred_base = base_model.predict(X1)

In [None]:
# Per-class precision / recall / F1 of the baseline on the held-out split.
# Some classes are never predicted or have no support, which makes their
# metrics undefined; `zero_division=0` reports them as 0 explicitly instead of
# emitting an UndefinedMetricWarning for each one.
print(
    sk_metrics.classification_report(
        y_true=y1, y_pred=y_pred_base, zero_division=0
    )
)

              precision    recall  f1-score   support

           0       0.39      0.98      0.55       441
           1       0.00      0.00      0.00         6
           2       0.00      0.00      0.00         2
           3       0.00      0.00      0.00        11
           4       0.00      0.00      0.00         2
           5       0.86      0.88      0.87       249
           6       0.00      0.00      0.00         4
           7       0.00      0.00      0.00         2
           8       0.00      0.00      0.00         1
           9       1.00      0.97      0.99     21440
          10       0.05      0.13      0.07        46
          11       0.99      0.87      0.93     19456
          12       0.00      0.00      0.00         1
          13       0.00      0.00      0.00         1
          14       0.20      0.83      0.32        53
          15       0.73      0.55      0.62       208
          16       0.00      0.00      0.00         2
          17       0.89      0.85      0.87       318
          18       1.00      1.00      1.00     56158
          19       0.00      0.00      0.00         0
          20       0.98      0.98      0.98       196
          21       0.54      0.76      0.63       204
          22       0.00      0.00      0.00         4

    accuracy                           0.97     98805
   macro avg       0.33      0.38      0.34     98805
weighted avg       0.99      0.97      0.98     98805

## neural net model

In [None]:
# NOTE(review): this import sits mid-notebook, while the other dependencies
# are imported in the cell at the top.
import numpy as np

# Explicit int64 casts of the label arrays, currently disabled.
# y0 = y0.astype(np.int64)
# y1 = y1.astype(np.int64)

In [None]:
# Shape of the numerical block of the training split: (rows, numerical columns).
X0[:, num_cols_ids].shape

In [None]:
# Slice each split into its categorical block (cast to int) and its numerical
# block (cast to float), train part first and test part second.
X0_cat, X1_cat = (part[:, cat_cols_ids].astype(int) for part in (X0, X1))
X0_num, X1_num = (part[:, num_cols_ids].astype(float) for part in (X0, X1))

In [None]:
# Sanity check: are all training values (categorical, numerical) finite?
np.isfinite(X0_cat).all(), np.isfinite(X0_num).all()

In [None]:
# Same sanity check for the held-out split.
np.isfinite(X1_cat).all(), np.isfinite(X1_num).all()

In [None]:
# Wrap each split (numerical block, categorical block, labels) in the
# project's dataset class: training split first, held-out split second.
ds_train, ds_valid = (
    rnnm_data.NumpyNumCatTrainingDataset(
        X_numerical=numerical, X_categorical=categorical, y=labels
    )
    for numerical, categorical, labels in (
        (X0_num, X0_cat, y0),
        (X1_num, X1_cat, y1),
    )
)

In [None]:
# Look at the first training sample as produced by the dataset.
ds_train[0]

In [None]:
# Number of rows in the training split.
len(X0)

In [None]:
# Mini-batch size shared by the training and validation loaders.
batch_size = 500

# Training loader: reshuffled on every pass; an incomplete final batch is
# dropped.
dl_train = DataLoader(
    dataset=ds_train,
    collate_fn=rnnm_data.collate_numpy_numcat_dataset_to_xyblock_keep_orig_y,
    batch_size=batch_size,
    drop_last=True,
    shuffle=True,
)

# Validation loader: default (sequential) order, every sample kept.
dl_valid = DataLoader(
    dataset=ds_valid,
    collate_fn=rnnm_data.collate_numpy_numcat_dataset_to_xyblock_keep_orig_y,
    batch_size=batch_size,
)

In [None]:
# Pull one collated batch from the training loader to inspect its structure.
next(iter(dl_train))

In [None]:
# Per-column category counts for the categorical block, computed from the
# training split only.
# NOTE(review): if a category code occurs only in `X1_cat`, a count derived
# from `X0_cat` may be too small — confirm how `calc_n_categories_per_column`
# and the model handle that case.
n_cats_per_col = rnnm_data.calc_n_categories_per_column(X0_cat)
n_cats_per_col

In [None]:
# Architecture settings for the tabular classifier.
n_hidden = [200, 100]
# NOTE(review): `do_impute` and `impute_bias_source` are not passed to the
# model in this cell; they are kept in case other cells read them.
do_impute = False
impute_bias_source = rnnm_tab.BiasSources.zero
n_features = X0.shape[1]
# Size the output layer from the largest label code across both splits rather
# than from the number of distinct labels seen in training. With rare classes,
# a code missing from `y0` would make `len(set(y0))` too small and push
# cross-entropy targets out of range.
n_classes = int(max(y0.max(), y1.max())) + 1

model = rnnm_tab.TabularModelClassification(
    n_features=n_features,
    n_hidden=n_hidden,
    n_classes=n_classes,
    use_batch_norm=True,
    n_categories_per_column=n_cats_per_col,
)

In [None]:
# Optimiser, loss and loss-tracking callback for the learner.
learning_rate = 0.1
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
loss = rnnm_loss.CrossEntropyXy()
loss_callback = rnnm_learner.TrainLossCallback()

# Directory handed to the learner as `save_dir`. Create it up front
# (idempotently) so a fresh checkout does not depend on a manual `mkdir`.
save_dir = Path("./models")
save_dir.mkdir(parents=True, exist_ok=True)

callbacks = [loss_callback]

In [None]:
# Bundle model, optimiser and loss into the project's training driver, which
# runs on `device`, uses `save_dir` and shows per-epoch progress.
learner = rnnm_learner.Learner(
    model,
    optimizer,
    loss,
    device=device,
    save_dir=save_dir,
    callbacks=callbacks,
    show_epoch_progress=True,
)

In [None]:
# Learning-rate search on the training loader. The positional arguments
# (1e-5, 100, 100) are presumably the lower LR bound, the upper LR bound and
# the number of steps — TODO confirm against `LRFinderCallback`'s signature.
lr_find_callback = rnnm_learner.LRFinderCallback(1e-5, 100, 100)

learner.find_learning_rate(
    dl_train, n_epochs=10, lr_find_callback=lr_find_callback
)

In [None]:
# Disabled shell command that would create the `models` directory
# (the path assigned to `save_dir`).
# !mkdir models

In [None]:
# Plot the learning-rate search result with a log-scaled y axis clipped to
# the range 0.3 to 5.
lr_find_callback.plot(yscale="log", ylim=(0.3, 5))

In [None]:
# Training schedule: a one-cycle learning-rate policy peaking at
# `learning_rate`, stepped once per batch over `n_epochs` epochs.
n_epochs = 5
learning_rate = 1e-2

scheduler = optim.lr_scheduler.OneCycleLR(
    optimizer,
    max_lr=learning_rate,
    steps_per_epoch=len(dl_train),
    epochs=n_epochs,
)

# Register the scheduler with the learner through the per-batch callback.
scheduler_callback = rnnm_learner.EveryBatchSchedulerCallback(scheduler)
learner.update_callback(scheduler_callback)

In [None]:
# Train for `n_epochs` epochs, passing the held-out loader for validation.
learner.fit(dl_train, n_epochs=n_epochs, dataloader_valid=dl_valid)

In [None]:
# Plot the losses recorded by the loss callback on a log-scaled y axis.
loss_callback.plot(yscale="log")

In [None]:
# Class probabilities for the held-out split: softmax over the class axis of
# the learner's predictions. `.cpu()` moves the tensor to host memory first,
# because `.numpy()` raises for tensors on an accelerator (CUDA / MPS); it is
# a no-op when the tensor is already on the CPU.
probs = (
    learner.predict(dl_valid)
    .detach()
    .cpu()
    .softmax(dim=1)
    .numpy()
)
# Peek at the first three probability rows.
probs[:3]

In [None]:
# Predicted class = index of the highest probability in each row.
y_pred_nn = probs.argmax(axis=1)
y_pred_nn

In [None]:
# Per-class precision / recall / F1 of the neural net on the held-out split.
# Classes that are never predicted have undefined precision;
# `zero_division=0` reports them as 0 explicitly instead of emitting an
# UndefinedMetricWarning for each one.
print(
    sk_metrics.classification_report(
        y_true=y1, y_pred=y_pred_nn, zero_division=0
    )
)

              precision    recall  f1-score   support

           0       1.00      0.78      0.88       441
           1       1.00      0.17      0.29         6
           2       1.00      0.50      0.67         2
           3       0.71      0.91      0.80        11
           4       1.00      0.50      0.67         2
           5       0.99      0.96      0.98       249
           6       0.27      1.00      0.42         4
           7       0.00      0.00      0.00         2
           8       0.00      0.00      0.00         1
           9       1.00      1.00      1.00     21440
          10       0.89      0.91      0.90        46
          11       0.99      1.00      1.00     19456
          12       0.00      0.00      0.00         1
          13       1.00      1.00      1.00         1
          14       0.98      0.98      0.98        53
          15       1.00      1.00      1.00       208
          16       0.00      0.00      0.00         2
          17       1.00      0.98      0.99       318
          18       1.00      1.00      1.00     56158
          20       1.00      0.99      1.00       196
          21       0.90      0.89      0.89       204
          22       0.00      0.00      0.00         4

    accuracy                           1.00     98805
   macro avg       0.72      0.66      0.66     98805
weighted avg       1.00      1.00      1.00     98805