In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import math
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from pprint import pprint

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from collections import defaultdict, Counter
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Loading the Data

In [None]:
# Load the competition's training table into a DataFrame.
train = pd.read_csv('/kaggle/input/tabular-playground-series-feb-2021/train.csv')

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Show the column names; the next cell selects features by substring of these names.
train.columns

In [None]:
# Name of the regression target column.
label = 'target'

# Sort the columns into feature groups by substring of their name:
# 'cont' marks a numeric feature, 'cat' a categorical one.
numeric_columns = []
categorical_columns = []
for column_name in train.columns:
    if 'cont' in column_name:
        numeric_columns.append(column_name)
    if 'cat' in column_name:
        categorical_columns.append(column_name)

print(f'numeric_columns = {numeric_columns}')
print(f'categorical_columns = {categorical_columns}')

# Building Simple MLP using only the numeric columns

In [None]:
from tensorflow.keras.layers.experimental.preprocessing import CategoryEncoding
from tensorflow.keras.layers.experimental.preprocessing import StringLookup
from tensorflow.keras.layers.experimental.preprocessing import Normalization

# Model inputs keyed by column name, plus the normalized tensor for each numeric column.
inputs = {}
numeric_normalized_layers = []

# One scalar float input per numeric column, each followed by its own Normalization
# layer whose statistics are adapted from that column of `train`.
# NOTE(review): in cell order `train` is still the unsplit frame here (the train/valid
# split happens in a later cell), so validation rows contribute to these statistics —
# confirm that is acceptable.
for column in numeric_columns:
    numeric_input = layers.Input(name=column, shape=(), dtype=tf.float32)
    scaler = Normalization(name=f'normalized_{column}')
    scaler.adapt(train[column].values)

    inputs[column] = numeric_input
    numeric_normalized_layers.append(scaler(numeric_input))

merged = layers.Concatenate(name='merged')(numeric_normalized_layers)

# Three stacked 20-unit ReLU layers named hidden_0 .. hidden_2.
hidden = merged
for depth in range(3):
    hidden = layers.Dense(20, activation='relu', name=f'hidden_{depth}')(hidden)

# Single linear unit: the regression prediction.
output = layers.Dense(1, activation='linear', name='output')(hidden)

model = keras.Model(inputs=inputs, outputs=output)

learning_rate = 0.001

# Train on MSE and report RMSE as the tracked metric.
model.compile(
    loss=keras.losses.MeanSquaredError(),
    optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
    metrics=[keras.metrics.RootMeanSquaredError()],
)

model.summary()

In [None]:
# Draw the model graph laid out left-to-right ('LR'), with shapes shown.
keras.utils.plot_model(model, rankdir='LR', show_shapes=True)

In [None]:
# Let tf.data choose the prefetch buffer size at runtime.
AUTOTUNE = tf.data.experimental.AUTOTUNE

def df_to_dataset(df, numeric_columns, batch_size, target, shuffle=True):
    """Convert a DataFrame into a batched ``tf.data.Dataset`` of (features, label) pairs.

    Args:
        df: DataFrame containing the feature columns and the ``target`` column.
            It is not modified.
        numeric_columns: Names of the columns to expose as model inputs.
        batch_size: Number of rows per batch.
        target: Name of the label column.
        shuffle: If True, shuffle rows with a buffer as large as ``df``.

    Returns:
        A dataset yielding ``({column_name: values}, labels)`` batches.
    """
    labels = df[target]
    features = dict(df[numeric_columns])

    # Cache the raw slices so later epochs do not redo the slicing.
    ds = tf.data.Dataset.from_tensor_slices((features, labels)).cache()

    if shuffle:
        ds = ds.shuffle(buffer_size=len(df))

    # Batch first and prefetch last: prefetch works on dataset elements, so placing it
    # after batch() buffers whole batches rather than single rows.
    return ds.batch(batch_size).prefetch(buffer_size=AUTOTUNE)

In [None]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the split repeatable.
# NOTE(review): this rebinds `train` to the 80% split, so re-running this cell without
# reloading the data would split an already-split frame.
train, valid = train_test_split(train, random_state=0, test_size=0.2)

# Only the training pipeline is shuffled; validation keeps row order.
train_dataset = df_to_dataset(train, numeric_columns, batch_size=32, target=label, shuffle=True)
valid_dataset = df_to_dataset(valid, numeric_columns, batch_size=32, target=label, shuffle=False)

In [None]:
checkpoint_filepath = './checkpoint/best_model/numeric'

# Save the full model (not just weights), and only when validation RMSE improves.
checkpoint_settings = dict(
    filepath=checkpoint_filepath,
    monitor='val_root_mean_squared_error',
    mode='min',
    save_best_only=True,
    save_weights_only=False,
)
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(**checkpoint_settings)

history = model.fit(
    train_dataset,
    validation_data=valid_dataset,
    epochs=10,
    callbacks=[model_checkpoint_callback],
)

In [None]:
# Side-by-side training curves: loss on the left, RMSE on the right.
f, ax = plt.subplots(1, 2, figsize=(17, 8))

# (axis, title, y-axis label, key in history.history, name used in the legend)
panels = [
    (ax[0], 'model loss', 'loss', 'loss', 'loss'),
    (ax[1], 'model RMSE', 'RMSE', 'root_mean_squared_error', 'RMSE'),
]

for axis, title, y_label, history_key, legend_name in panels:
    axis.set_title(title)
    axis.plot(history.history[history_key], label=f'train {legend_name}')
    # Keras stores validation metrics under the same key with a 'val_' prefix.
    axis.plot(history.history[f'val_{history_key}'], label=f'valid {legend_name}')
    axis.set_xlabel('epoch')
    axis.set_ylabel(y_label)
    axis.legend()
    axis.grid()

In [None]:
# Replace the in-memory model with the one saved at the checkpoint path.
model = keras.models.load_model(checkpoint_filepath)

# evaluate() yields the loss followed by the compiled metric (RMSE).
validation_scores = model.evaluate(valid_dataset)
loss, rmse = validation_scores
print(f'best loss = {loss}, best RMSE = {rmse}')

# Building Simple MLP using numeric and categorical columns

In [None]:
# Reload the full training table; the previous section rebound `train` to its 80% split.
train = pd.read_csv('/kaggle/input/tabular-playground-series-feb-2021/train.csv')

label = 'target'

# Sort the columns into feature groups by substring of their name:
# 'cont' marks a numeric feature, 'cat' a categorical one.
numeric_columns = []
categorical_columns = []
for column_name in train.columns:
    if 'cont' in column_name:
        numeric_columns.append(column_name)
    if 'cat' in column_name:
        categorical_columns.append(column_name)

print(f'numeric_columns = {numeric_columns}')
print(f'categorical_columns = {categorical_columns}')

In [None]:
# For every categorical column, collect its distinct values in order of first appearance.
vocabulary = {}
for column in categorical_columns:
    distinct_values = train[column].unique()
    vocabulary[column] = distinct_values.tolist()

pprint(vocabulary, width=100)

In [None]:
from tensorflow.keras.layers.experimental.preprocessing import CategoryEncoding
from tensorflow.keras.layers.experimental.preprocessing import StringLookup
from tensorflow.keras.layers.experimental.preprocessing import Normalization

# Model inputs keyed by column name, plus the per-branch tensors to be concatenated.
inputs = {}
numeric_normalized_layers = []
categorical_onehot_encoded_layers = []

# Numeric branch: one scalar float input per column, each with its own Normalization
# layer adapted from that column of `train`.
for column in numeric_columns:
    numeric_input = layers.Input(name=column, shape=(), dtype=tf.float32)
    scaler = Normalization(name=f'normalized_{column}')
    scaler.adapt(train[column].values)

    inputs[column] = numeric_input
    numeric_normalized_layers.append(scaler(numeric_input))

# Categorical branch: string input -> integer index (StringLookup) -> binary encoding.
for column in categorical_columns:
    categories = vocabulary[column]
    string_input = layers.Input(name=column, shape=(), dtype=tf.string)
    inputs[column] = string_input

    # No mask token and no out-of-vocabulary slots are reserved in the lookup.
    indexer = StringLookup(vocabulary=categories, mask_token=None, num_oov_indices=0, name=f'label_{column}')
    category_index = indexer(string_input)

    # The encoder is adapted on the indices of the known vocabulary.
    binary_encoder = CategoryEncoding(output_mode="binary", name=f'onehot_{column}')
    binary_encoder.adapt(indexer(categories))

    categorical_onehot_encoded_layers.append(binary_encoder(category_index))

# Numeric tower: three stacked 20-unit ReLU layers named hidden_00 .. hidden_02.
merged_numeric = layers.Concatenate(name='merged_numerical')(numeric_normalized_layers)
numeric_tower = merged_numeric
for depth in range(3):
    numeric_tower = layers.Dense(20, activation='relu', name=f'hidden_0{depth}')(numeric_tower)

# Categorical tower: all encoded columns reduced to a single ReLU unit.
merged_categorical = layers.Concatenate(name='merged_categorical')(categorical_onehot_encoded_layers)
categorical_tower = layers.Dense(1, activation='relu', name='hidden_10')(merged_categorical)

merged_hidden = layers.Concatenate(name='merged_hidden')([numeric_tower, categorical_tower])

# Single linear unit: the regression prediction.
output = layers.Dense(1, activation='linear', name='output')(merged_hidden)

model = keras.Model(inputs=inputs, outputs=output)

learning_rate = 0.001

# Train on MSE and report RMSE as the tracked metric.
model.compile(
    loss=keras.losses.MeanSquaredError(),
    optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
    metrics=[keras.metrics.RootMeanSquaredError()],
)

model.summary()

In [None]:
# Draw the model graph laid out left-to-right ('LR'), with shapes shown.
keras.utils.plot_model(model, rankdir='LR', show_shapes=True)

In [None]:
# Let tf.data choose the prefetch buffer size at runtime.
AUTOTUNE = tf.data.experimental.AUTOTUNE

def df_to_dataset(df, columns, batch_size, target, shuffle=True):
    """Convert a DataFrame into a batched ``tf.data.Dataset`` of (features, label) pairs.

    Args:
        df: DataFrame containing the feature columns and the ``target`` column.
            It is not modified.
        columns: Names of the columns to expose as model inputs.
        batch_size: Number of rows per batch.
        target: Name of the label column.
        shuffle: If True, shuffle rows with a buffer as large as ``df``.

    Returns:
        A dataset yielding ``({column_name: values}, labels)`` batches.
    """
    labels = df[target]
    features = dict(df[columns])

    # Cache the raw slices so later epochs do not redo the slicing.
    ds = tf.data.Dataset.from_tensor_slices((features, labels)).cache()

    if shuffle:
        ds = ds.shuffle(buffer_size=len(df))

    # Batch first and prefetch last: prefetch works on dataset elements, so placing it
    # after batch() buffers whole batches rather than single rows.
    return ds.batch(batch_size).prefetch(buffer_size=AUTOTUNE)

# Hold out 20% of the rows for validation; the fixed random_state keeps the split repeatable.
# NOTE(review): this rebinds `train` to the 80% split, so re-running this cell without
# reloading the data would split an already-split frame.
train, valid = train_test_split(train, random_state=0, test_size=0.2)

# Feed both feature groups to the model; only the training pipeline is shuffled.
feature_columns = numeric_columns + categorical_columns
train_dataset = df_to_dataset(train, feature_columns, batch_size=32, target=label, shuffle=True)
valid_dataset = df_to_dataset(valid, feature_columns, batch_size=32, target=label, shuffle=False)

In [None]:
checkpoint_filepath = './checkpoint/best_model/numeric_categorical'

# Save the full model (not just weights), and only when validation RMSE improves.
checkpoint_settings = dict(
    filepath=checkpoint_filepath,
    monitor='val_root_mean_squared_error',
    mode='min',
    save_best_only=True,
    save_weights_only=False,
)
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(**checkpoint_settings)

history = model.fit(
    train_dataset,
    validation_data=valid_dataset,
    epochs=10,
    callbacks=[model_checkpoint_callback],
)

In [None]:
# Side-by-side training curves: loss on the left, RMSE on the right.
f, ax = plt.subplots(1, 2, figsize=(17, 8))

# (axis, title, y-axis label, key in history.history, name used in the legend)
panels = [
    (ax[0], 'model loss', 'loss', 'loss', 'loss'),
    (ax[1], 'model RMSE', 'RMSE', 'root_mean_squared_error', 'RMSE'),
]

for axis, title, y_label, history_key, legend_name in panels:
    axis.set_title(title)
    axis.plot(history.history[history_key], label=f'train {legend_name}')
    # Keras stores validation metrics under the same key with a 'val_' prefix.
    axis.plot(history.history[f'val_{history_key}'], label=f'valid {legend_name}')
    axis.set_xlabel('epoch')
    axis.set_ylabel(y_label)
    axis.legend()
    axis.grid()

In [None]:
# Replace the in-memory model with the one saved at the checkpoint path.
model = keras.models.load_model(checkpoint_filepath)

# evaluate() yields the loss followed by the compiled metric (RMSE).
validation_scores = model.evaluate(valid_dataset)
loss, rmse = validation_scores
print(f'best loss = {loss}, best RMSE = {rmse}')

# Building NN using numeric and categorical columns and their embeddings

In [None]:
# Reload the full training table; the previous section rebound `train` to its 80% split.
train = pd.read_csv('/kaggle/input/tabular-playground-series-feb-2021/train.csv')

label = 'target'

# Sort the columns into feature groups by substring of their name:
# 'cont' marks a numeric feature, 'cat' a categorical one.
numeric_columns = []
categorical_columns = []
for column_name in train.columns:
    if 'cont' in column_name:
        numeric_columns.append(column_name)
    if 'cat' in column_name:
        categorical_columns.append(column_name)

print(f'numeric_columns = {numeric_columns}')
print(f'categorical_columns = {categorical_columns}')

In [None]:
from tensorflow.keras.layers.experimental.preprocessing import CategoryEncoding
from tensorflow.keras.layers.experimental.preprocessing import StringLookup
from tensorflow.keras.layers.experimental.preprocessing import Normalization

# Model inputs keyed by column name, plus the per-branch tensors to be concatenated.
inputs = {}
numeric_normalized_layers = []
categorical_onehot_encoded_layers = []
categorical_embeddings = []

# Build the vocabulary from the frame loaded for this section, so the cell does not
# depend on a `vocabulary` variable left over from an earlier section of the notebook.
vocabulary = {column: train[column].unique().tolist() for column in categorical_columns}

# Numeric inputs: one scalar float input per column, each with its own Normalization
# layer adapted from that column of `train`.
for column in numeric_columns:
    inputs[column] = layers.Input(name=column, shape=(), dtype=tf.float32)
    normalizer = Normalization(name=f'normalized_{column}')
    normalizer.adapt(train[column].values)
    normalized = normalizer(inputs[column])

    numeric_normalized_layers.append(normalized)

# Categorical inputs: each string column is indexed once, and the index feeds both a
# binary encoding and a learned embedding.
for column in categorical_columns:
    vocab = vocabulary[column]
    inputs[column] = layers.Input(name=column, shape=(), dtype=tf.string)

    # No mask token and no out-of-vocabulary slots are reserved in the lookup.
    label_encoder = StringLookup(vocabulary=vocab, mask_token=None, num_oov_indices=0, name=f'label_{column}')
    label_encoded = label_encoder(inputs[column])

    # The encoder is adapted on the indices of the known vocabulary.
    onehot_encoder = CategoryEncoding(output_mode="binary", name=f'onehot_{column}')
    onehot_encoder.adapt(label_encoder(vocab))
    onehot_encoded = onehot_encoder(label_encoded)

    categorical_onehot_encoded_layers.append(onehot_encoded)

    # Embedding width is the integer square root of the vocabulary size.
    embedding_dims = int(math.sqrt(len(vocab)))
    embedding_encoder = layers.Embedding(input_dim=len(vocab), output_dim=embedding_dims, name=f'embedding_{column}')
    encoded_feature = embedding_encoder(label_encoded)

    categorical_embeddings.append(encoded_feature)

# Dense tower: normalized numerics and all embeddings through three 20-unit ReLU layers.
merged_numeric = layers.Concatenate(name='merged_numerical')(numeric_normalized_layers)
merged_embedding = layers.Concatenate(name='merged_embedding')(categorical_embeddings)
merged_dense = layers.Concatenate(name='merged_dense')([merged_numeric, merged_embedding])
hidden_layer_00 = layers.Dense(20, activation='relu', name='hidden_00')(merged_dense)
hidden_layer_01 = layers.Dense(20, activation='relu', name='hidden_01')(hidden_layer_00)
hidden_layer_02 = layers.Dense(20, activation='relu', name='hidden_02')(hidden_layer_01)

# The binary-encoded categoricals are kept as a second branch reduced to one ReLU unit.
merged_categorical = layers.Concatenate(name='merged_categorical')(categorical_onehot_encoded_layers)
hidden_layer_10 = layers.Dense(1, activation='relu', name='hidden_10')(merged_categorical)

merged_hidden = layers.Concatenate(name='merged_hidden')([hidden_layer_02, hidden_layer_10])

# Single linear unit: the regression prediction.
output = layers.Dense(1, activation='linear', name='output')(merged_hidden)

model = keras.Model(inputs=inputs, outputs=output)

learning_rate = 0.001

# Train on MSE and report RMSE as the tracked metric.
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
    loss=keras.losses.MeanSquaredError(),
    metrics=[keras.metrics.RootMeanSquaredError()])

model.summary()

In [None]:
# Draw the model graph laid out left-to-right ('LR'), with shapes shown.
keras.utils.plot_model(model, rankdir='LR', show_shapes=True)

In [None]:
# Let tf.data choose the prefetch buffer size at runtime.
AUTOTUNE = tf.data.experimental.AUTOTUNE

def df_to_dataset(df, columns, batch_size, target, shuffle=True):
    """Convert a DataFrame into a batched ``tf.data.Dataset`` of (features, label) pairs.

    Args:
        df: DataFrame containing the feature columns and the ``target`` column.
            It is not modified.
        columns: Names of the columns to expose as model inputs.
        batch_size: Number of rows per batch.
        target: Name of the label column.
        shuffle: If True, shuffle rows with a buffer as large as ``df``.

    Returns:
        A dataset yielding ``({column_name: values}, labels)`` batches.
    """
    labels = df[target]
    features = dict(df[columns])

    # Cache the raw slices so later epochs do not redo the slicing.
    ds = tf.data.Dataset.from_tensor_slices((features, labels)).cache()

    if shuffle:
        ds = ds.shuffle(buffer_size=len(df))

    # Batch first and prefetch last: prefetch works on dataset elements, so placing it
    # after batch() buffers whole batches rather than single rows.
    return ds.batch(batch_size).prefetch(buffer_size=AUTOTUNE)

# Hold out 20% of the rows for validation; the fixed random_state keeps the split repeatable.
# NOTE(review): this rebinds `train` to the 80% split, so re-running this cell without
# reloading the data would split an already-split frame.
train, valid = train_test_split(train, random_state=0, test_size=0.2)

# Feed both feature groups to the model; only the training pipeline is shuffled.
feature_columns = numeric_columns + categorical_columns
train_dataset = df_to_dataset(train, feature_columns, batch_size=32, target=label, shuffle=True)
valid_dataset = df_to_dataset(valid, feature_columns, batch_size=32, target=label, shuffle=False)

In [None]:
checkpoint_filepath = './checkpoint/best_model/numeric_categorical_embedding'

# Save the full model (not just weights), and only when validation RMSE improves.
checkpoint_settings = dict(
    filepath=checkpoint_filepath,
    monitor='val_root_mean_squared_error',
    mode='min',
    save_best_only=True,
    save_weights_only=False,
)
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(**checkpoint_settings)

history = model.fit(
    train_dataset,
    validation_data=valid_dataset,
    epochs=10,
    callbacks=[model_checkpoint_callback],
)

In [None]:
# Side-by-side training curves: loss on the left, RMSE on the right.
f, ax = plt.subplots(1, 2, figsize=(17, 8))

# (axis, title, y-axis label, key in history.history, name used in the legend)
panels = [
    (ax[0], 'model loss', 'loss', 'loss', 'loss'),
    (ax[1], 'model RMSE', 'RMSE', 'root_mean_squared_error', 'RMSE'),
]

for axis, title, y_label, history_key, legend_name in panels:
    axis.set_title(title)
    axis.plot(history.history[history_key], label=f'train {legend_name}')
    # Keras stores validation metrics under the same key with a 'val_' prefix.
    axis.plot(history.history[f'val_{history_key}'], label=f'valid {legend_name}')
    axis.set_xlabel('epoch')
    axis.set_ylabel(y_label)
    axis.legend()
    axis.grid()

In [None]:
# Replace the in-memory model with the one saved at the checkpoint path.
model = keras.models.load_model(checkpoint_filepath)

# evaluate() yields the loss followed by the compiled metric (RMSE).
validation_scores = model.evaluate(valid_dataset)
loss, rmse = validation_scores
print(f'best loss = {loss}, best RMSE = {rmse}')