# UPDATE - 6/12/2021

In today's `Kaggler` v0.9.13 release, I added transfer learning between `DAE` and `SDAE`. You can train `SDAE` with only the training data, then initialize `DAE` with the trained `SDAE` model and train it with both training and test data, or vice versa.

Example code is as follows:
```python
# train supervised DAE only with training data
sdae = SDAE(cat_cols=cat_cols, num_cols=num_cols, encoding_dim=encoding_dim, random_state=RANDOM_SEED)
_ = sdae.fit_transform(trn[feature_cols], trn[TARGET_COL])

# initialize unsupervised DAE and train it with both training and test data
dae = DAE(cat_cols=cat_cols, num_cols=num_cols, encoding_dim=encoding_dim, random_state=RANDOM_SEED,
          pretrained_model=sdae, freeze_embedding=True)
_ = dae.fit_transform(df[feature_cols])

# initialize another supervised DAE and train it with training data
sdae2 = SDAE(cat_cols=cat_cols, num_cols=num_cols, encoding_dim=encoding_dim, random_state=RANDOM_SEED,
             pretrained_model=dae, freeze_embedding=False)
_ = sdae2.fit_transform(trn[feature_cols], trn[TARGET_COL])
```

Hope it helps!

# TPS 6 - Supervised DAE + Keras (GPU)

In this notebook, I will show how to train a neural network model with supervised denoising autoencoder (SDAE) and target encoded features in Keras (GPU).

I added the supervised version of DAE, `SDAE`, to `Kaggler` in today's v0.9.8 release. On Kaggle, DAE is mostly used as an unsupervised feature extraction method. However, it's also possible to train DAE in a supervised manner with a target variable.

To transform features with `SDAE`, you can do as follows:

```python
sdae = SDAE(cat_cols=feature_cols, encoding_dim=encoding_dim, n_layer=1, noise_std=.001, random_state=seed)
sdae.fit(trn[feature_cols], y)
X = sdae.transform(df[feature_cols])
```

The contents of this notebook are organized as follows:

1. Installing and loading libraries: installs `Kaggler` and loads data and libraries
2. Feature engineering: shows how to transform features with target encoding with `Kaggler`
3. Model definition and training: shows how to define and train a NN model with skip connection in `Keras`
4. Submission

Enjoy~!

# Part 1. Loading Libraries and Data

In [None]:
import gc
import joblib
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from pathlib import Path
import seaborn as sns
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import backend as K
from warnings import simplefilter

In [None]:
# Enable GPU memory growth so TensorFlow allocates GPU memory on demand
# instead of reserving it all up front.
gpu = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu))
# Memory growth has to be configured identically on every visible GPU, so
# apply it to all of them rather than only the first one. With no GPU the
# loop is simply skipped.
for device in gpu:
    tf.config.experimental.set_memory_growth(device, True)

In [None]:
# Install the Kaggler package (IPython shell escape; runs `pip` in a subshell).
!pip install kaggler

In [None]:
# Kaggler provides the auto-encoders (DAE/SDAE) and the categorical encoders
# used for feature engineering in this notebook.
import kaggler
from kaggler.model import AutoLGB
from kaggler.preprocessing import DAE, SDAE, TargetEncoder, LabelEncoder
# Show the installed version; the notes at the top of this notebook say that
# DAE/SDAE transfer learning was added in v0.9.13.
print(kaggler.__version__)

In [None]:
# Notebook-wide plotting and display defaults.
plt.style.use('fivethirtyeight')
# Use the fully qualified option name: the bare 'max_columns' pattern matches
# more than one option in recent pandas releases and raises an OptionError.
pd.set_option('display.max_columns', 100)
# Silence all warnings for the rest of the session.
simplefilter('ignore')

In [None]:
# ---------------------------------------------------------------------------
# Experiment identity: used to build the names of all output files.
# ---------------------------------------------------------------------------
algo_name = 'fc'
feature_name = 'le_te_sdae'
version = 11
model_name = f'{algo_name}_{feature_name}_v{version}'

# ---------------------------------------------------------------------------
# Input files.
# ---------------------------------------------------------------------------
data_dir = Path('../input/tabular-playground-series-jun-2021')
train_file = data_dir.joinpath('train.csv')
test_file = data_dir.joinpath('test.csv')
sample_file = data_dir.joinpath('sample_submission.csv')

# ---------------------------------------------------------------------------
# Output files, written to the current working directory.
# ---------------------------------------------------------------------------
build_dir = Path('.')
predict_val_file = build_dir.joinpath(f'{model_name}.val.txt')
predict_tst_file = build_dir.joinpath(f'{model_name}.tst.txt')
submission_file = build_dir.joinpath(f'{model_name}.sub.csv')

# Column names in the competition CSV files.
id_col = 'id'
target_col = 'target'

# ---------------------------------------------------------------------------
# Cross-validation, auto-encoder and network hyper-parameters.
# ---------------------------------------------------------------------------
seed = 42               # random seed shared by CV splits and encoders
n_fold = 5              # number of stratified CV folds
n_class = 9             # size of the softmax output layer
encoding_dim = 128      # passed to DAE/SDAE as `encoding_dim`
n_encoder = 3           # passed to DAE/SDAE as `n_encoder`
n_epoch = 100           # maximum number of epochs for the Keras model
n_stop = 5              # patience of the training callbacks
n_emb = 16              # embedding size per categorical feature
n_hidden_unit = 128     # width of every hidden dense layer
dropout = .3

# Batch size and learning rate are scaled together by the same factor.
ratio = 4
batch_size = 64 * ratio
lr = 0.0001 * ratio

In [None]:
# Read the training set, test set and sample submission, each indexed by the
# id column.
trn, tst, sub = (
    pd.read_csv(csv_path, index_col=id_col)
    for csv_path in (train_file, test_file, sample_file)
)
print(trn.shape, tst.shape, sub.shape)

In [None]:
# Take the integer that follows the underscore in each target label and shift
# it by one so that class ids start at zero.
class_number = trn[target_col].str.split('_').str[1]
y = class_number.astype(int) - 1

# Stack training features (target removed) on top of the test features; the
# first `n_trn` rows of `df` are the training rows.
n_trn = len(trn)
df = pd.concat([trn.drop(columns=target_col), tst], axis=0)
print(df.shape)

In [None]:
# Every column of the combined frame is used as an input feature.
feature_cols = list(df.columns)
print(len(feature_cols))

# Part 2. Feature Engineering

## DAE vs. SDAE Comparison/Transfer Learning

In [None]:
# Step 1 of the transfer-learning chain: a supervised DAE fitted on the
# labelled training rows only, for a few epochs.
sdae_params = dict(
    cat_cols=feature_cols,
    encoding_dim=encoding_dim,
    n_layer=1,
    n_encoder=n_encoder,
    noise_std=.01,
    batch_size=batch_size,
    learning_rate=lr,
    n_epoch=3,
    random_state=seed,
    label_encoding=True,
)
sdae = SDAE(**sdae_params)
sdae.fit(trn[feature_cols], y)

In [None]:
# Step 2 of the transfer-learning chain: an unsupervised DAE initialised from
# the supervised model above (embeddings frozen) and fitted on training and
# test rows together.
dae_params = dict(
    cat_cols=feature_cols,
    encoding_dim=encoding_dim,
    n_layer=1,
    n_encoder=n_encoder,
    noise_std=.01,
    batch_size=batch_size,
    learning_rate=lr,
    n_epoch=10,
    random_state=seed,
    label_encoding=True,
    pretrained_model=sdae,
    freeze_embedding=True,
)
dae = DAE(**dae_params)
dae.fit(df[feature_cols])

In [None]:
# Visual sanity check: encode a reproducible random sample of 1,000 rows with
# the trained DAE and plot the encoded values as a heatmap.
sample_rows = df[feature_cols].sample(n=1_000, random_state=seed)
X = dae.transform(sample_rows)
print(X.shape)
ax = sns.heatmap(X)

In [None]:
# One name per SDAE output feature: `encoding_dim` values for each of the
# `n_encoder` encoders, numbered from 1.
n_sdae_feature = encoding_dim * n_encoder
sdae_cols = [f'sdae_{i + 1}' for i in range(n_sdae_feature)]

In [None]:
# Label-encode every feature over the combined train + test frame.
# NOTE(review): `min_obs=50` presumably groups values seen fewer than 50 times
# into one label -- confirm against the Kaggler documentation.
le = LabelEncoder(min_obs=50)
df_le = le.fit_transform(df[feature_cols])

# Prefix the columns so they can sit next to the target-encoded features.
le_col_names = [f'le_{x}' for x in df.columns]
df_le.columns = le_col_names

In [None]:
# Target-encode every feature. The encoder is given a stratified K-fold
# splitter, is fitted on the labelled training rows and then applied to the
# combined train + test frame.
cv = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)
te = TargetEncoder(cv=cv)
te.fit(trn[feature_cols], y)
df_te = te.transform(df[feature_cols])

# Prefix the columns so they can sit next to the label-encoded features.
te_col_names = [f'te_{x}' for x in df.columns]
df_te.columns = te_col_names

In [None]:
# Place label-encoded and target-encoded features side by side; the
# label-encoded columns come first.
all_df = pd.concat((df_le, df_te), axis='columns')
print(all_df.shape)
all_df.head()

In [None]:
# Split the combined columns into categorical (label-encoded) and numeric
# (everything else) groups and record their sizes.
cat_cols = df_le.columns.tolist()
all_feature_cols = all_df.columns.tolist()

cat_col_set = set(cat_cols)
num_cols = [col for col in all_feature_cols if col not in cat_col_set]

n_feature, n_cat_col, n_num_col = len(all_feature_cols), len(cat_cols), len(num_cols)
print(n_feature, n_cat_col, n_num_col)

# Part 3. Keras NN Model Training

We will use a neural network model with skip connections.

In [None]:
def build_model(n_emb=16, n_hidden_unit=128, dropout=.3):
    """Build and compile the fully connected classifier with skip connections.

    The model takes one scalar input per categorical column in ``cat_cols``
    followed by a single dense input holding the numeric and SDAE features.
    Each categorical input is passed through its own embedding layer; the
    flattened embeddings are concatenated with the dense input and fed to a
    stack of dense layers with two additive skip connections.

    Args:
        n_emb: Output dimension of every categorical embedding.
        n_hidden_unit: Number of units in every hidden dense layer.
        dropout: Dropout rate applied after selected dense layers.

    Returns:
        A compiled ``keras.Model`` with a ``n_class``-way softmax output,
        optimised with Adam (learning rate ``lr``) on sparse categorical
        cross-entropy.
    """
    cat_inputs = []
    embs = []
    for col in cat_cols:
        inp = keras.layers.Input((1,), name=f'{col}')
        # Size the embedding table by the largest code rather than by the
        # number of distinct codes, so that every code is a valid index even
        # if the codes are not contiguous.
        # NOTE(review): assumes the label codes are non-negative integers.
        n_level = int(all_df[col].max()) + 1
        emb = keras.layers.Embedding(input_dim=n_level, output_dim=n_emb)(inp)
        cat_inputs.append(inp)
        # (batch, 1, n_emb) -> (batch, n_emb) so it can be concatenated with
        # the 2-D dense input.
        embs.append(keras.layers.Flatten()(emb))

    num_inputs = keras.layers.Input((len(num_cols) + len(sdae_cols),))

    inputs = cat_inputs + [num_inputs]
    # Feed the learned embeddings into the network. Previously the raw
    # integer codes were concatenated here and the embedding layers were
    # built but never used.
    merged_inputs = keras.layers.Concatenate()(embs + [num_inputs])
    x = keras.layers.Dense(n_hidden_unit, 'relu')(merged_inputs)
    x = keras.layers.Dropout(dropout)(x)
    ox = x

    # First residual block: dense + batch norm, added to its input.
    x = keras.layers.Dense(n_hidden_unit, 'relu')(x)
    x = keras.layers.BatchNormalization()(x)
    x = keras.layers.Add()([ox, x])

    x = keras.layers.Dense(n_hidden_unit, 'relu')(x)
    x = keras.layers.Dropout(dropout)(x)
    ox = x

    # Second residual block.
    x = keras.layers.Dense(n_hidden_unit, 'relu')(x)
    x = keras.layers.BatchNormalization()(x)
    x = keras.layers.Add()([ox, x])

    x = keras.layers.Dense(n_hidden_unit, 'relu')(x)
    x = keras.layers.Dropout(dropout)(x)

    outputs = keras.layers.Dense(n_class, 'softmax')(x)

    model = keras.Model(inputs, outputs)
    model.compile(optimizer=keras.optimizers.Adam(lr), loss='sparse_categorical_crossentropy')
    return model

In [None]:
# Build the network once with the configured hyper-parameters, only to
# inspect its architecture.
model = build_model(n_emb=n_emb, n_hidden_unit=n_hidden_unit, dropout=dropout)
model.summary()

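To avoid overfitting, we will fit `SDAE` separately within each cross-validation fold, using only that fold's training rows, and then use it to transform the fold's validation rows and the test set.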

In [None]:
# Training callbacks, reused by every fold.
# NOTE(review): both callbacks use the same patience (`n_stop`), so the
# learning-rate reduction may rarely fire before early stopping ends the
# run -- confirm this is intended.
es = keras.callbacks.EarlyStopping(patience=n_stop, restore_best_weights=True)
rlr = keras.callbacks.ReduceLROnPlateau(factor=0.5, patience=n_stop, cooldown=0, min_lr=1e-7)

cv = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)

# Label-encoded + target-encoded features; training rows come first.
X = all_df.iloc[:n_trn].values
X_tst = all_df.iloc[n_trn:].values


def _to_model_inputs(features):
    """Split a feature matrix into the list of inputs the model expects.

    The first `n_cat_col` columns become one 1-D input each (categorical
    features); all remaining columns form the single dense input.
    """
    return [features[:, j] for j in range(n_cat_col)] + [features[:, n_cat_col:]]


# Out-of-fold predictions for the training rows, and test predictions
# averaged over the folds.
P = np.zeros((n_trn, n_class), dtype=float)
P_tst = np.zeros((X_tst.shape[0], n_class), dtype=float)
for i, (i_trn, i_val) in enumerate(cv.split(X, y), 1):
    # `cv.split` yields positional indices, while `y` is indexed by the id
    # column, so select by position. Plain `y[i_trn]` is a label lookup that
    # is only correct when ids happen to equal row positions.
    y_trn, y_val = y.iloc[i_trn], y.iloc[i_val]

    # Fit a fresh supervised DAE on this fold's training rows only,
    # initialised from the DAE trained earlier, with embeddings trainable.
    # `n_encoder` must match the value used to size `sdae_cols`.
    sdae = SDAE(cat_cols=df.columns.tolist(), encoding_dim=encoding_dim, n_layer=1, n_encoder=n_encoder,
                noise_std=.01, batch_size=batch_size, learning_rate=lr, n_epoch=20, random_state=seed,
                label_encoding=True, pretrained_model=dae, freeze_embedding=False)
    X_sdae_trn = sdae.fit_transform(trn[feature_cols].iloc[i_trn], y_trn,
                                    validation_data=(trn[feature_cols].iloc[i_val], y_val))
    X_sdae_val = sdae.transform(trn[feature_cols].iloc[i_val])
    X_sdae_tst = sdae.transform(tst[feature_cols])

    # Append the fold-specific SDAE features to the shared features.
    X_trn_i = np.hstack([X[i_trn], X_sdae_trn])
    X_val_i = np.hstack([X[i_val], X_sdae_val])
    X_tst_i = np.hstack([X_tst, X_sdae_tst])

    # Save the per-fold feature matrices.
    joblib.dump(X_trn_i, str(build_dir / f'{feature_name}.trn{i}.joblib'))
    joblib.dump(X_val_i, str(build_dir / f'{feature_name}.val{i}.joblib'))
    joblib.dump(X_tst_i, str(build_dir / f'{feature_name}.tst{i}.joblib'))

    # Use the configured hyper-parameters instead of relying on the
    # function defaults staying in sync with them.
    model = build_model(n_emb, n_hidden_unit, dropout)
    val_inputs = _to_model_inputs(X_val_i)
    history = model.fit(_to_model_inputs(X_trn_i),
                        y_trn,
                        validation_data=(val_inputs, y_val),
                        epochs=n_epoch, batch_size=batch_size, callbacks=[es, rlr], verbose=0)
    P[i_val] = model.predict(val_inputs)
    P_tst += model.predict(_to_model_inputs(X_tst_i)) / n_fold
    print(f'CV #{i} Loss: {log_loss(y_val, P[i_val]):.6f}')

    # Release the fold's model before building the next one.
    del model, history
    gc.collect()
    K.clear_session()

In [None]:
# Overall out-of-fold log loss across all training rows.
print(f'CV Loss: {log_loss(y, P):.6f}')

# Save the out-of-fold and test class probabilities as plain text with six
# decimal places.
for out_path, preds in ((predict_val_file, P), (predict_tst_file, P_tst)):
    np.savetxt(out_path, preds, fmt='%.6f')

# Part 4. Submission

In [None]:
# Overwrite every column of the sample submission with the fold-averaged test
# probabilities.
# NOTE(review): assumes the sample-submission columns are in the same order as
# the class ids 0..n_class-1 and its rows match the test set order -- confirm.
sub[sub.columns] = P_tst
# The id index is written as the first column of the CSV.
sub.to_csv(submission_file)
sub.head()

Hope this helps.