# Combining discrete and continuous features in neural networks
---

## Reference
* [Simple Keras embedding in 10 folds](https://www.kaggle.com/pourchot/simple-keras-embedding-in-10-folds) by [@pourchot](https://www.kaggle.com/pourchot)

## Libraries

In [None]:
import pandas as pd
import numpy as np
import datetime
import random
import time
import os
import gc

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss, confusion_matrix, classification_report
from sklearn.cluster import KMeans
from scipy.stats import mode, skew, kurtosis

from tensorflow.keras import backend as K
import tensorflow as tf
import tensorflow_addons as tfa

import matplotlib.pyplot as plt
import seaborn as sns

# ---------- notebook-wide display and warning settings ----------
# Show at most 50 rows / 50 columns when a DataFrame is rendered.
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 50)

# Suppress every warning for the rest of the session.
import warnings
warnings.simplefilter('ignore')

In [None]:
# Show the installed TensorFlow and TensorFlow Addons versions as the cell output.
tf.__version__, tfa.__version__

## Configuration

In [None]:
# Run settings and hyper-parameters, looked up by key throughout the notebook.
CFG = dict(
    target='target',    # label column read from the train frame
    n_class=9,          # number of classes (softmax width, one-hot width)
    lr=1e-4,            # learning rate passed to the optimizer
    batch_size=256,     # batch size passed to model.fit
    epochs=50,          # maximum epochs per fold
    verbose=1,          # verbosity passed to model.fit
    patience=5,         # patience of the early-stopping callback
    n_splits=10,        # number of stratified cross-validation folds
    seed=2021,          # seed for seed_everything, KMeans and StratifiedKFold
)

In [None]:
def seed_everything(seed=2021):
    """Seed every random number generator the notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and
    TensorFlow's global generator with ``seed``, and exports the same value
    as ``PYTHONHASHSEED``.

    Args:
        seed: Integer seed applied to all generators.
    """
    # NOTE: Python reads PYTHONHASHSEED at interpreter start-up, so this
    # assignment matters for processes launched afterwards rather than for
    # string hashing in the already-running kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)

    for apply_seed in (random.seed, np.random.seed, tf.random.set_seed):
        apply_seed(seed)

# Seed all generators once, up front, with the configured seed.
seed_everything(CFG['seed'])

## Load and check data

In [None]:
# Read the three competition CSV files.
train = pd.read_csv('../input/tabular-playground-series-jun-2021/train.csv')
test = pd.read_csv('../input/tabular-playground-series-jun-2021/test.csv')
submission = pd.read_csv('../input/tabular-playground-series-jun-2021/sample_submission.csv')

# Stack train on top of test under a fresh 0..n-1 index so that feature
# engineering runs on both at once; the first train.shape[0] rows are train.
all_df = pd.concat([train, test], ignore_index=True)

In [None]:
# Columns whose name contains 'feature_' are treated as categorical inputs;
# continuous (engineered) columns are collected in cnt_features later on.
cat_features = [name for name in all_df.columns if 'feature_' in name]
cnt_features = list()

In [None]:
# Row-wise summary statistics over the categorical feature columns, added as
# new continuous columns. Keys are the new column names; dict order is the
# order in which the columns are created and registered.
row_stats = {
    'mean': np.mean,
    'std': np.std,
    'skew': skew,
    'kurtosis': kurtosis,
}
for stat_name, stat_fn in row_stats.items():
    all_df[stat_name] = stat_fn(all_df[cat_features], axis=1)

cnt_features += list(row_stats)

In [None]:
# Min-max scale the continuous columns; the scaler is fitted on the combined
# train + test rows held in all_df and then applied to those same rows.
scaler = MinMaxScaler().fit(all_df[cnt_features])
all_df[cnt_features] = scaler.transform(all_df[cnt_features])

In [None]:
# Add a K-Means cluster id (2 * n_class clusters), computed from the
# categorical feature columns, as one more categorical input.
#
# `n_jobs` is intentionally not passed: it was deprecated in scikit-learn 0.23
# and removed in 1.0, where passing it raises a TypeError.
# `n_init=10` pins the historical default number of restarts so that newer
# releases (whose default is 'auto') keep the original behaviour.
km = KMeans(n_clusters=CFG['n_class'] * 2, n_init=10, random_state=CFG['seed'])
all_df['cluster'] = km.fit_predict(all_df[cat_features])
cat_features += ['cluster']

In [None]:
# Column order matters: categorical columns first, continuous columns last.
# The training loop splits the feature matrix at len(cat_features).
all_features = [*cat_features, *cnt_features]

In [None]:
# The first len(train) rows of all_df are the train rows, the rest are test rows.
train_npy = all_df[all_features].iloc[:len(train)].to_numpy()
test_npy = all_df[all_features].iloc[len(train):].to_numpy()

# Turn each label into a zero-based class index: take the integer after the
# last underscore and subtract one.
target = train[CFG['target']].map(lambda label: int(label.rsplit("_", 1)[-1]) - 1).to_numpy()

### Check train data

In [None]:
# Plot ten randomly chosen train rows (all model input columns) as line
# charts in a 2 x 5 grid.
plt.figure(figsize=(16, 4))

for pos in range(10):
    # range() is already a sequence, so sample from it directly instead of
    # materialising a list of every row index on each iteration.
    row = random.sample(range(train.shape[0]), 1)[0]

    plt.subplot(2, 5, pos + 1)
    plt.plot(train_npy[row].reshape(-1))
    plt.title(f"row {row}")

# Fit the layout once, after every subplot exists, so all ten axes are taken
# into account (it previously ran before each subplot was created).
plt.tight_layout()

### Check test data

In [None]:
# Plot ten randomly chosen test rows (all model input columns) as line
# charts in a 2 x 5 grid.
plt.figure(figsize=(16, 4))

for pos in range(10):
    # range() is already a sequence, so sample from it directly instead of
    # materialising a list of every row index on each iteration.
    row = random.sample(range(test.shape[0]), 1)[0]

    plt.subplot(2, 5, pos + 1)
    plt.plot(test_npy[row].reshape(-1))
    plt.title(f"row {row}")

# Fit the layout once, after every subplot exists, so all ten axes are taken
# into account (it previously ran before each subplot was created).
plt.tight_layout()

### Check target distribution

In [None]:
# Histogram of the zero-based class indices, using CFG['n_class'] bins.
fig, ax = plt.subplots(figsize=(16, 2))
ax.hist(target, bins=CFG['n_class'])
ax.set_title("Target distribution")
ax.set_xlabel('Label')
ax.set_ylabel('freq #')

In [None]:
# Work on copies so the arrays built earlier stay untouched.
X_train, X_test = train_npy.copy(), test_npy.copy()

# One-hot encode the class indices to match the categorical cross-entropy loss.
y_train = tf.keras.utils.to_categorical(target, num_classes=CFG['n_class'])

## Create the model (embedding + dense network)

In [None]:
def _as_shape(shape):
    """Return *shape* as a tuple, accepting a bare int for a 1-D input."""
    if isinstance(shape, (int, np.integer)):
        return (int(shape),)
    return tuple(shape)


def create_model(cat_shape=(76, ), cnt_shape=(4, ), vocab_size=1024, embed_dim=16):
    """Build and compile the two-branch classifier.

    The categorical input is embedded and flattened; the continuous input goes
    through a small dense layer with dropout. Both branches are concatenated
    and fed to a dense stack ending in a softmax over ``CFG['n_class']``
    classes. The model is compiled with AdamW and categorical cross-entropy
    (as both the loss and the reported metric).

    Args:
        cat_shape: Shape of the categorical input without the batch axis.
            A tuple, or a bare int for a 1-D input.
        cnt_shape: Shape of the continuous input without the batch axis.
            A tuple, or a bare int for a 1-D input.
        vocab_size: ``input_dim`` of the embedding layer; every categorical
            value must be an integer in ``[0, vocab_size)``.
        embed_dim: Size of each embedding vector.

    Returns:
        The compiled ``tf.keras.Model`` taking ``[cat_input, cnt_input]``.
    """
    cat_input = tf.keras.layers.Input(shape=_as_shape(cat_shape), name='cat_input')
    cnt_input = tf.keras.layers.Input(shape=_as_shape(cnt_shape), name='cnt_input')

    # Categorical branch: one embedding table shared by all categorical
    # columns, flattened to a single vector per row.
    x = tf.keras.layers.Embedding(vocab_size, embed_dim, name='embedding_1')(cat_input)
    x1 = tf.keras.layers.Flatten(name='flatten')(x)

    # Continuous branch.
    x = tf.keras.layers.Dense(32, activation='relu', name='dense_1')(cnt_input)
    x2 = tf.keras.layers.Dropout(0.2, name='dropout_1')(x)

    # Merge both branches and classify.
    x = tf.keras.layers.Concatenate(axis=1)([x1, x2])

    x = tf.keras.layers.Dropout(0.4, name='dropout_2')(x)
    x = tf.keras.layers.Dense(128, activation='relu', name='dense_2')(x)
    x = tf.keras.layers.Dense(64, activation='relu', name='dense_3')(x)
    x = tf.keras.layers.Dense(32, activation='relu', name='dense_4')(x)
    outputs = tf.keras.layers.Dense(CFG['n_class'], activation='softmax', name='output')(x)

    model = tf.keras.Model([cat_input, cnt_input], outputs)

    # The output layer already applies softmax, hence from_logits=False.
    metric = tf.keras.metrics.CategoricalCrossentropy(
        from_logits=False,
        label_smoothing=0,
        name='categorical_crossentropy'
    )
    loss = tf.keras.losses.CategoricalCrossentropy(
        from_logits=False,
        label_smoothing=0,
        reduction='auto',
        name='categorical_crossentropy'
    )
    optimizer = tfa.optimizers.AdamW(
        weight_decay=1e-7,
        learning_rate=CFG['lr'],
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-07,
        amsgrad=True,
        name='AdamW',
    )
    model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

    return model

# Build one model with the default input shapes just to print its layer summary.
create_model().summary()

## Training

In [None]:
# Callbacks handed to model.fit in the training loop; both watch the
# validation loss.

# Stop a fold once val_loss has not improved for CFG['patience'] epochs and
# restore the best weights seen.
early_stopping_cb = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    mode='auto',
    patience=CFG['patience'],
    min_delta=0,
    baseline=None,
    restore_best_weights=True,
    verbose=1
)

# Halve the learning rate after 2 epochs without a val_loss improvement of
# at least 1e-4.
scheduler_cb = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    mode='auto',
    factor=0.5,
    patience=2,
    min_delta=0.0001,
    cooldown=0,
    min_lr=0,
    verbose=0
)


In [None]:
def _split_inputs(features, n_cat):
    """Split a feature matrix into the [categorical, continuous] model inputs.

    The first ``n_cat`` columns feed the embedding input and the remaining
    columns feed the dense input.
    """
    return [features[:, :n_cat], features[:, n_cat:]]


# Stratified K-fold training: a fresh model per fold, out-of-fold predictions
# for validation, and test predictions averaged over all folds.
kf = StratifiedKFold(n_splits=CFG['n_splits'], shuffle=True, random_state=CFG['seed'])
history = []

nn_oof = np.zeros((X_train.shape[0], CFG['n_class']))
nn_pred = 0

# Loop invariants: the categorical/continuous column boundary, the test
# inputs, and the full label set for log_loss.
n_cat = len(cat_features)
n_cnt = X_train.shape[1] - n_cat
test_inputs = _split_inputs(X_test, n_cat)
class_labels = np.arange(CFG['n_class'])

for fold, (trn_idx, val_idx) in enumerate(kf.split(X=X_train, y=target)):
    print(f"===== FOLD {fold} =====")
    X_tr, y_tr = X_train[trn_idx], y_train[trn_idx]
    X_va, y_va = X_train[val_idx], y_train[val_idx]
    train_inputs = _split_inputs(X_tr, n_cat)
    valid_inputs = _split_inputs(X_va, n_cat)

    start = time.time()
    # Drop the previous fold's graph/state before building a new model.
    K.clear_session()

    # Pass real shape tuples, matching create_model's tuple defaults.
    model = create_model(cat_shape=(n_cat,), cnt_shape=(n_cnt,))

    log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    tensorboard_cb = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

    fold_history = model.fit(
        x=train_inputs,
        y=y_tr,
        batch_size=CFG['batch_size'],
        epochs=CFG['epochs'],
        validation_data=(valid_inputs, y_va),
        callbacks=[scheduler_cb, early_stopping_cb, tensorboard_cb],
        verbose=CFG['verbose']
    )
    history.append(fold_history)

    nn_oof[val_idx] = model.predict(valid_inputs)
    # Each fold contributes 1 / n_splits of the final test prediction.
    nn_pred += model.predict(test_inputs) / CFG['n_splits']

    # Explicit labels keep the column-to-class mapping fixed even if a
    # validation fold happened to miss a class.
    nn_logloss = log_loss(target[val_idx], nn_oof[val_idx], labels=class_labels)
    min_loss = min(fold_history.history['loss'])
    min_val_loss = min(fold_history.history['val_loss'])
    print(f"FOLD {fold:d}: logloss score {nn_logloss:.6f} (train loss={min_loss:.6f}, validation loss={min_val_loss:.6f})")
    print(f"elapsed: {time.time()-start:.2f} sec\n")

    # Release the fold's model before the next one is built.
    del model
    _ = gc.collect()

# Overall out-of-fold score across every training row.
nn_logloss = log_loss(target, nn_oof, labels=class_labels)
print(f"logloss score {nn_logloss}")

## Submission

In [None]:
# Overwrite every column after the first (presumably the id column — verify
# against sample_submission.csv) with the fold-averaged test probabilities,
# then write the submission file without the DataFrame index.
submission.iloc[:, 1:] = nn_pred
submission.to_csv("submission.csv", index=False)

## Check results

In [None]:
# Histogram of the predicted probability for each class in the submission.
# The grid and the loop follow CFG['n_class'] instead of a hard-coded 9.
grid_cols = 3
grid_rows = int(np.ceil(CFG['n_class'] / grid_cols))
# About one bin per 1000 rows, but never fewer than one: the plain integer
# division yields 0 bins for a submission shorter than 1000 rows.
n_bins = max(1, submission.shape[0] // 1000)

plt.figure(figsize=(16, 8), tight_layout=True)
for i in range(CFG['n_class']):
    plt.subplot(grid_rows, grid_cols, i + 1)
    plt.title(f"Class_{i+1}")
    submission[f'Class_{i+1}'].hist(bins=n_bins)

In [None]:
# Confusion matrix of the out-of-fold arg-max predictions against the true
# class indices, drawn as an annotated heatmap and saved to disk.
oof_labels = np.argmax(nn_oof, axis=1)
cm = confusion_matrix(target, oof_labels)

plt.figure(figsize=(16, 8))
sns.heatmap(cm, annot=True, fmt='5d', cmap='Blues')
plt.savefig("confusion_matrix.png")

In [None]:
# Per-class report for the out-of-fold arg-max predictions, printed to four
# decimal places.
print(classification_report(target, np.argmax(nn_oof, axis=1), digits=4))