# Keras NN Tabular Regression Problem

_By Nick Brooks_

Adapted from the Keras structured-data example: https://keras.io/examples/structured_data/structured_data_classification_from_scratch/

In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
import pprint as pp
import matplotlib.pyplot as plt
import time

import itertools
import seaborn as sns

from tensorflow import keras
from tensorflow.keras import layers
from keras.utils import to_categorical
from tensorflow.keras import callbacks
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers.experimental.preprocessing import Normalization
from tensorflow.keras.layers.experimental.preprocessing import CategoryEncoding
from tensorflow.keras.layers.experimental.preprocessing import StringLookup

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# List every file available under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Wall-clock start, used by the final cell to report the total runtime.
notebookstart = time.time()

print("Tensorflow Version: ", tf.__version__)
print("Eager mode enabled: ", tf.executing_eagerly())
# tf.test.is_gpu_available() is deprecated; a non-empty list of physical GPU
# devices is the supported way to ask the same question.
print("GPU available: ", bool(tf.config.list_physical_devices("GPU")))

In [None]:
# --- Notebook-wide configuration -------------------------------------------
TARGETVAR = "target"   # name of the regression target column
IDVAR = 'id'           # name of the row identifier column
NCHANNELS = 1          # number of output units of the network
BATCHSIZE = 128        # rows per tf.data batch
EPOCHS = 100           # upper bound on training epochs

# Feature columns grouped by how they are encoded: cont1 ... cont14 are the
# numeric features; the string and integer categorical groups are empty here.
continuous_cols = [f"cont{position}" for position in range(1, 15)]
string_cols = []
categorical_cols = []

all_cols = continuous_cols + string_cols + categorical_cols

In [None]:
def encode_numerical_feature(feature, name, dataset):
    """Normalize a numeric model input using statistics learned from `dataset`.

    Args:
        feature: the symbolic input the normalization layer is applied to.
        name: key of this feature inside the feature dict yielded by `dataset`.
        dataset: dataset yielding `(features_dict, label)` pairs.

    Returns:
        The output of the adapted `Normalization` layer applied to `feature`.
    """
    scaler = Normalization()

    # Keep only the requested column and append a trailing axis to each batch
    # before handing it to the layer.
    column_ds = dataset.map(lambda features, label: tf.expand_dims(features[name], -1))

    # Learn the statistics of the column, then apply them to the input.
    scaler.adapt(column_ds)
    return scaler(feature)


def encode_string_categorical_feature(feature, name, dataset):
    """Encode a string-valued model input: vocabulary lookup, then category encoding.

    Args:
        feature: the symbolic string input to encode.
        name: key of this feature inside the feature dict yielded by `dataset`.
        dataset: dataset yielding `(features_dict, label)` pairs.

    Returns:
        The output of the adapted `CategoryEncoding` layer applied to the
        integer indices produced by the adapted `StringLookup` layer.
    """
    lookup = StringLookup()

    # Dataset of just this column, with a trailing axis added to every batch.
    raw_values_ds = dataset.map(lambda features, label: features[name]).map(
        lambda values: tf.expand_dims(values, -1)
    )

    # Learn the vocabulary and map the symbolic input to integer indices.
    lookup.adapt(raw_values_ds)
    indexed_feature = lookup(feature)

    # The category encoder is adapted on the *indexed* column so that it
    # learns the space of possible indices.
    one_hot = CategoryEncoding(output_mode="binary")
    index_ds = raw_values_ds.map(lookup)
    one_hot.adapt(index_ds)

    return one_hot(indexed_feature)


def encode_integer_categorical_feature(feature, name, dataset):
    """Encode an integer-coded categorical model input with `CategoryEncoding`.

    Args:
        feature: the symbolic integer input to encode.
        name: key of this feature inside the feature dict yielded by `dataset`.
        dataset: dataset yielding `(features_dict, label)` pairs.

    Returns:
        The output of the adapted `CategoryEncoding` layer applied to `feature`.
    """
    one_hot = CategoryEncoding(output_mode="binary")

    # Dataset of just this column, with a trailing axis added to every batch.
    column_ds = dataset.map(lambda features, label: tf.expand_dims(features[name], -1))

    # Learn the space of possible indices, then encode the input.
    one_hot.adapt(column_ds)
    return one_hot(feature)


def dataframe_to_dataset(dataframe, labels, role, BATCHSIZE):
    """Turn a feature frame and its labels into a batched `tf.data.Dataset`.

    Args:
        dataframe: feature columns; a copy is taken so the caller's frame is
            never touched.
        labels: one label per row of `dataframe`.
        role: dataset role; only "train" changes behaviour (rows are shuffled).
        BATCHSIZE: number of rows per batch.

    Returns:
        A dataset of `(features_dict, labels)` batches.
    """
    frame = dataframe.copy()
    # Every role is built the same way: column-name -> values dict plus labels.
    ds = tf.data.Dataset.from_tensor_slices((dict(frame), labels))
    if role == "train":
        # The buffer spans the whole frame, so the shuffle covers every row.
        ds = ds.shuffle(buffer_size=len(frame))
    return ds.batch(BATCHSIZE)

def ohe_target(arr):
    """One-hot encode an array of class labels.

    Classes are numbered in sorted order, so the label -> index mapping is
    reproducible across runs (iterating a `set` of strings is not). Empty
    input is supported and yields empty outputs.

    Args:
        arr: array-like of hashable, mutually comparable labels.

    Returns:
        A tuple `(train_labels, num_label, label_mapper)`:
        train_labels is a float32 one-hot array with one trailing axis of
        length `n_classes`; num_label holds the integer class index of each
        element of `arr`; label_mapper is a dict mapping each distinct label
        to its class index.
    """
    values = np.asarray(arr)
    classes, num_label = np.unique(values, return_inverse=True)
    # Keep the indices shaped like the input regardless of how the installed
    # NumPy version shapes the inverse array.
    num_label = num_label.reshape(values.shape)

    label_mapper = {name: i for i, name in enumerate(classes)}

    # Row k of the identity matrix is the one-hot vector of class k.
    train_labels = np.eye(len(classes), dtype="float32")[num_label]

    return train_labels, num_label, label_mapper

### From http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html#sphx-glr-auto-examples-model-selection-plot-confusion-matrix-py #
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Draw a confusion matrix as an annotated heat map on the current figure.

    cm        : 2-D array of counts, indexed as cm[true, predicted].
    classes   : tick labels, one per class.
    normalize : when True, each row is divided by its row sum and the cells
                are printed with two decimals instead of as integers.
    title     : plot title.
    cmap      : colormap passed to `plt.imshow`.
    """
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        cm = cm.astype('float') / row_totals

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=45)
    plt.yticks(positions, classes)

    if normalize:
        cell_format = '.2f'
    else:
        cell_format = 'd'

    # Cells above half of the maximum get white text so the annotation stays
    # readable on the darker end of the colormap.
    midpoint = cm.max() / 2.
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            value = cm[row, col]
            text_color = "white" if value > midpoint else "black"
            plt.text(col, row, format(value, cell_format),
                     horizontalalignment="center",
                     color=text_color)

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:
# --- Load the competition files ---------------------------------------------
train = pd.read_csv("../input/tabular-playground-series-jan-2021/train.csv")
# The id column of the training frame is replaced by the positional index.
train[IDVAR] = train.index

display(train.sample(4))
test = pd.read_csv("../input/tabular-playground-series-jan-2021/test.csv")
test_index = test[IDVAR]
submission = pd.read_csv("../input/tabular-playground-series-jan-2021/sample_submission.csv")

# Optional: one-hot targets (not needed for regression)
# train_labels, num_label, label_mapper = ohe_target(train[TARGETVAR].values)
train_labels = train[TARGETVAR].to_numpy()

# --- Hold out 20% of the rows for validation (fixed seed) -------------------
train_features = train[all_cols]
val_dataframe = train_features.sample(frac=0.2, random_state=1337)
train_dataframe = train_features.drop(index=val_dataframe.index)

## Process Training
# train_labels[train_labels <= 0] = 0.001

In [None]:
n_train_rows = len(train_dataframe)
n_val_rows = len(val_dataframe)
print(
    "Using %d samples for training and %d for validation"
    % (n_train_rows, n_val_rows)
)

# Targets are looked up by index so they stay aligned with each feature split.
y_train = train.loc[train_dataframe.index, TARGETVAR]
y_val = train.loc[val_dataframe.index, TARGETVAR]
# The test set has no targets; zeros stand in so all datasets share one structure.
y_test_placeholder = np.zeros((test.shape[0], NCHANNELS))

train_ds = dataframe_to_dataset(train_dataframe, y_train, "train", BATCHSIZE)
val_ds = dataframe_to_dataset(val_dataframe, y_val, "val", BATCHSIZE)
test_ds = dataframe_to_dataset(test[all_cols], y_test_placeholder, "test", BATCHSIZE)

In [None]:
# Pretty-print the features and targets of a single training batch.
print('Look at Data')
for feature_batch, target_batch in train_ds.take(1):
    pp.pprint(feature_batch)
    pp.pprint(target_batch)

In [None]:
# One scalar Keras input per column, named after the column so the feature
# dict yielded by the datasets can be matched to the inputs by name.
continuous_inputs = [keras.Input(shape=(1,), name=col) for col in continuous_cols]
string_inputs = [keras.Input(shape=(1,), name=col, dtype="string") for col in string_cols]
categorical_inputs = [keras.Input(shape=(1,), name=col, dtype="int64") for col in categorical_cols]

all_inputs = continuous_inputs + string_inputs + categorical_inputs
print("All Input Len: {}".format(len(all_inputs)))

# Every encoder is adapted on the training dataset only.
encoded_strings = [
    encode_string_categorical_feature(tensor, col, train_ds)
    for tensor, col in zip(string_inputs, string_cols)
]

encoded_nums = [
    encode_numerical_feature(tensor, col, train_ds)
    for tensor, col in zip(continuous_inputs, continuous_cols)
]

encoded_cats = [
    encode_integer_categorical_feature(tensor, col, train_ds)
    for tensor, col in zip(categorical_inputs, categorical_cols)
]

# Single feature tensor fed to the dense network.
all_features = layers.concatenate(encoded_nums + encoded_strings + encoded_cats)
print("All Feature Feature Len: {}".format(all_features.shape))

In [None]:
def build_model(NCHANNELS=NCHANNELS):
    """Build the regression network on top of the module-level feature graph.

    Two Dense(relu) + Dropout(0.3) stages (64 units, then 3 units) are stacked
    on `all_features`, followed by a linear output layer.

    Args:
        NCHANNELS: number of units in the linear output layer.

    Returns:
        An uncompiled `keras.Model` mapping `all_inputs` to the output layer.
    """
    hidden = all_features
    # NOTE(review): the second stage has only 3 units - confirm this narrow
    # bottleneck is intended.
    for units in (64, 3):
        hidden = layers.Dense(units, activation="relu")(hidden)
        hidden = layers.Dropout(0.3)(hidden)

    prediction = layers.Dense(NCHANNELS, activation="linear")(hidden)
    return keras.Model(all_inputs, prediction)

In [None]:
# Render the architecture diagram; rankdir="LR" lays the graph out
# left-to-right (horizontally).
model = build_model()
diagram_options = dict(show_shapes=True, rankdir="LR")
keras.utils.plot_model(model, **diagram_options)

In [None]:
# Start from a freshly initialised model for the actual training run.
model = build_model()
# checkpoint = callbacks.ModelCheckpoint('model.h5', monitor='val_loss', save_best_only=True)

# Stop once val_loss has not improved by at least 1e-4 for 7 epochs and roll
# back to the best weights seen.
es = callbacks.EarlyStopping(monitor='val_loss', min_delta=0.0001,
                             patience=7, verbose=1, mode='min', baseline=None,
                             restore_best_weights=True)

# Multiply the learning rate by 0.2 after 2 epochs without val_loss
# improvement, never going below 1e-5.
reduce_lr = callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2,
                              patience=2, min_lr=0.00001, verbose=1)

# `learning_rate` is the supported argument name; `lr` is a deprecated alias.
model.compile(optimizer=Adam(learning_rate=1e-3), loss="mse",
              metrics=["mse", "mae", tf.keras.metrics.RootMeanSquaredError()])

# train_ds / val_ds are already batched by dataframe_to_dataset, so no
# batch_size is passed to fit (it must not be specified for dataset inputs).
history = model.fit(train_ds, epochs=EPOCHS,
                    validation_data=val_ds, verbose=1, callbacks=[es, reduce_lr])

In [None]:
# Training vs. validation curves, one panel per tracked metric.
plot_metrics = ['loss', 'root_mean_squared_error']

fig, ax = plt.subplots(1, 2, figsize=[12, 4])
for panel, metric in zip(ax, plot_metrics):
    train_curve = history.history[metric]
    val_curve = history.history['val_' + metric]
    panel.plot(train_curve, label='Train ' + metric)
    panel.plot(val_curve, label='Val ' + metric)
    panel.set_title("Loss Curve - {}".format(metric))
    panel.set_ylabel(metric.title())
    panel.legend()
plt.show()

In [None]:
# Predict on the test set and write the submission file.
pred = model.predict(test_ds)
# The output layer has NCHANNELS (= 1) units, so `pred` is a 2-D column;
# flatten it explicitly rather than relying on pandas to squeeze a 2-D array
# into a single column.
submission['target'] = np.ravel(pred)
submission.to_csv('keras_nn.csv', index=False)

In [None]:
# IPython shell escape: show the first lines of the submission file just written.
!head keras_nn.csv

## EDA

In [None]:
# Overlay the distribution of test predictions on the training targets.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases -
# confirm the installed version still provides it.
fig, ax = plt.subplots(figsize=[8, 5])
for values, series_label in ((pred, "Test"), (train_labels, "Train")):
    sns.distplot(values, label=series_label)
ax.legend()
plt.show()

In [None]:
# Total wall-clock time since the setup cell, in minutes.
elapsed_minutes = (time.time() - notebookstart) / 60
print("Notebook Runtime: %0.2f Minutes" % elapsed_minutes)