# Categorical Structured Data ML with Keras

[Link to Colab (deprecated)](https://colab.research.google.com/drive/1GmAhxnKVvrhWffospDEe0rc-QB_tjfhE?usp=sharing)

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
import random
from pathlib import Path

from src.custom_types import TypeEnum
from src.tf_layer_constructors import (
    gen_normalization_layer,
    gen_multihot_categorical_encoding_layer
)
from src.tf_utils import df_to_tfds
from src.data_examples.ex1_data_loader import ExampleDataLoader

# Report which TensorFlow build the notebook is running against.
print('Using TensorFlow version', tf.__version__)

In [None]:
# One seed shared by every random number generator the notebook relies on.
RAND_SEED = 1337

np.random.seed(RAND_SEED)
random.seed(RAND_SEED)
# Compact float printing: 3 decimals, no scientific notation.
np.set_printoptions(precision=3, suppress=True)

# Drop any Keras state left over from a previous run of the notebook.
tf.keras.backend.clear_session()
# Seed TensorFlow as well; without this, weight initialisation, dropout and
# dataset shuffling stay non-deterministic even though Python and NumPy are
# seeded above. Done after clear_session() so the reset cannot undo it.
tf.random.set_seed(RAND_SEED)

In [None]:
# Directory (relative to the working directory) that holds saved models.
saved_model_path = Path('saved_models')
# exist_ok=True makes re-running the cell harmless when the folder exists.
saved_model_path.mkdir(exist_ok=True)

## Loading Example Data

In [None]:
# Project-local loader for the example dataset.
data = ExampleDataLoader()

# The three steps are chained, so each call must return an object exposing the
# next method (presumably the loader itself -- confirm in ExampleDataLoader).
data.download().load().clean()
# Summary of the resulting DataFrame (columns, dtypes, non-null counts).
data.df.info()

In [None]:
# Binary target: 1 where 'symboling' is strictly positive, 0 everywhere else.
data.df['symboling_threshold'] = (data.df['symboling'] > 0).astype('int64')

# Name of the column the classifier is trained to predict.
target_feature_label = 'symboling_threshold'

In [None]:
# Columns drawn in the pair plot; the first one is also used as the hue.
pairplot_columns = [
  data.feature_label,
  "curb_weight",
  "engine_size",
  "horsepower",
  # "peak_rpm" is currently excluded from the plot.
  "city_mpg",
  "highway_mpg",
  "price",
]

# One 'hls' colour per distinct value found in the hue column.
hue_level_count = len(data.df[data.feature_label].unique())
hue_palette = sns.color_palette('hls', hue_level_count)

sns.pairplot(
  data.df[pairplot_columns],
  diag_kind='kde',
  hue=data.feature_label,
  palette=hue_palette,
)

In [None]:
def generate_random_sample_from_spec(data_spec, features_override=None):
  """Draw one random sample from a feature specification.

  Args:
    data_spec: Mapping of feature name to a spec value. A tuple is treated as
      a numeric ``(low, high)`` range, a list as a set of choices to pick
      from, and any other value as a constant that is copied through.
    features_override: Optional collection of feature names to keep. When it
      is ``None`` or empty, every feature in ``data_spec`` is sampled.

  Returns:
    A dict mapping each selected feature name to its sampled value.
  """
  ret = {}
  for k, v in data_spec.items():
    if features_override and k not in features_override:
      continue
    if isinstance(v, tuple):
      low, high = v[0], v[1]
      # Offset by `low` so the draw lands inside [low, high]; scaling alone
      # would produce a value in [0, high - low] instead.
      ret[k] = low + random.random() * (high - low)
    elif isinstance(v, list):
      ret[k] = random.choice(v)
    else:
      ret[k] = v
  return ret

In [None]:
# data_spec is queried with dashed names while the model features use
# underscores, so translate the feature names before sampling...
__inference_sample_spec = [
  name.replace('_', '-')
  for name in data.features_categorical + data.features_numeric_continuous
]
__inference_sample = generate_random_sample_from_spec(data.data_spec, __inference_sample_spec)

# ...and translate back, wrapping every value in a one-element tensor.
inference_sample = {
  key.replace('-', '_'): tf.convert_to_tensor([value])
  for key, value in __inference_sample.items()
}

inference_sample

## 1. Centralized (Conventional) Training 

In [None]:
# Training hyper-parameters for the centralized run.
n_epoch = 25
batch_size = 24

# Location under saved_model_path where the trained classifier is stored.
ex1ch1_model_path = saved_model_path / 'ex1ch1_auto_classifier'

In [None]:
# 80 % of the rows go to training; the remainder is halved into test and
# validation. RAND_SEED keeps both random draws repeatable.
df_train = data.df.sample(frac=0.8, random_state=RAND_SEED)
df_val_test = data.df.drop(df_train.index)
df_test = df_val_test.sample(frac=0.5, random_state=RAND_SEED)
df_val = df_val_test.drop(df_test.index)

# Convert each split with the same target column and batch size
# (built in the order train, test, val).
tfds_train, tfds_test, tfds_val = (
  df_to_tfds(frame, target_feature_label, batch_size=batch_size)
  for frame in (df_train, df_test, df_val)
)

df_train.shape, df_test.shape, df_val.shape

In [None]:
# Registries filled by the following cells: group name -> {column name -> ...}.
# all_inputs holds the Keras Input tensors, all_encoded_features the
# corresponding preprocessed tensors.
all_inputs = {}
all_encoded_features = {}

In [None]:
# Register the 'normalization' group up front and fill it through local aliases.
all_inputs['normalization'] = numeric_inputs = {}
all_encoded_features['normalization'] = numeric_encoded = {}

for feature_name in data.features_numeric_continuous:
  # One scalar float32 input per continuous column, passed through a
  # normalization layer built from the training dataset.
  numeric_input = tf.keras.Input(shape=(1,), name=feature_name, dtype='float32')
  normalizer = gen_normalization_layer(tfds_train, feature_name)

  numeric_inputs[feature_name] = numeric_input
  numeric_encoded[feature_name] = normalizer(numeric_input)

In [None]:
# Register the 'categorical' group up front and fill it through local aliases.
all_inputs['categorical'] = categorical_inputs = {}
all_encoded_features['categorical'] = categorical_encoded = {}

for feature_name in data.features_categorical:
  # One scalar string input per categorical column, passed through an
  # encoding layer built from the training dataset (max_tokens=5).
  string_input = tf.keras.Input(shape=(1,), name=feature_name, dtype='string')
  encoder = gen_multihot_categorical_encoding_layer(
    tfds_train, feature_name, TypeEnum.string, max_tokens=5
  )

  categorical_inputs[feature_name] = string_input
  categorical_encoded[feature_name] = encoder(string_input)

In [None]:
# Display the registered input tensors, grouped by preprocessing kind.
all_inputs

In [None]:
def build_categorical_model(input_layers, feature_layers):
  """Build a small feed-forward classifier head on top of encoded features.

  The network is Dense(32, relu) -> Dropout(0.5) -> Dense(1). The final layer
  has no activation, so the model emits a raw (unbounded) score per example.

  Args:
    input_layers: Keras input tensors the resulting model should accept.
    feature_layers: Tensor of preprocessed features fed to the first Dense
      layer.

  Returns:
    An uncompiled ``tf.keras.Model`` mapping ``input_layers`` to the score.
  """
  hidden = tf.keras.layers.Dense(32, activation='relu')(feature_layers)
  hidden_after_dropout = tf.keras.layers.Dropout(0.5)(hidden)
  raw_score = tf.keras.layers.Dense(1)(hidden_after_dropout)
  return tf.keras.Model(inputs=input_layers, outputs=raw_score)


In [None]:
# Flatten the two-level registries (group -> column -> tensor) into plain lists.
all_inputs_layers = [tensor for group in all_inputs.values() for tensor in group.values()]
all_features = [tensor for group in all_encoded_features.values() for tensor in group.values()]
# Join every encoded feature into a single feature vector.
all_feature_layers = tf.keras.layers.concatenate(all_features)

x = tf.keras.layers.Dense(64, activation="relu")(all_feature_layers)
x = tf.keras.layers.Dropout(0.5)(x)
# Sigmoid output: the model is compiled with BinaryCrossentropy() at its
# default from_logits=False and with the "accuracy" metric, both of which
# expect a probability in [0, 1] rather than a raw logit.
output = tf.keras.layers.Dense(1, activation="sigmoid")(x)

model = tf.keras.Model(all_inputs_layers, output)
all_encoded_features

In [None]:
# Adam optimizer with binary cross-entropy. BinaryCrossentropy() is left at
# its default from_logits=False, i.e. it interprets the model output as a
# probability.
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(),
              metrics=["accuracy"])
# Draw the model graph left-to-right, annotated with tensor shapes.
tf.keras.utils.plot_model(model, show_shapes=True, rankdir="LR")

In [None]:
import csv
class MetricsLogger(tf.keras.callbacks.Callback):
    """Keras callback that writes selected per-epoch metrics to a CSV file.

    The file is (re)created when training starts, receives a header row of
    ``epoch`` plus the requested metric names, and gets one row per epoch.
    Metrics missing from the epoch logs are written as NaN.

    Args:
        filename: Path of the CSV file to write.
        _metrics: Names of the log entries to record. Defaults to
            ``['accuracy', 'loss']``.
    """

    def __init__(self, filename, _metrics=None):
        super().__init__()
        self.filename = filename
        self.file = None
        self.writer = None
        # Copy into a fresh list so neither a shared default nor the caller's
        # sequence is aliased by this instance.
        self._metrics = ['accuracy', 'loss'] if _metrics is None else list(_metrics)

    def on_train_begin(self, logs=None):
        """Open the CSV file (truncating it) and write the header row."""
        # Close a handle left open by an earlier, interrupted training run.
        if self.file is not None:
            self.file.close()
        # newline='' lets the csv module control line endings; without it
        # every row is followed by a blank line on Windows.
        self.file = open(self.filename, 'w', newline='')
        self.writer = csv.DictWriter(self.file, ['epoch'] + self._metrics)
        self.writer.writeheader()

    def on_epoch_end(self, epoch, logs=None):
        """Append one row for the finished epoch (1-based epoch number)."""
        print(logs)
        logs = logs or {}  # Keras may pass None; treat it as "no metrics".
        row = {'epoch': epoch + 1}
        for k in self._metrics:
            row[k] = logs.get(k, np.nan)
        self.writer.writerow(row)
        # Flush every epoch so the file is usable while training is running.
        self.file.flush()

    def on_train_end(self, logs=None):
        """Close the CSV file if it is open."""
        if self.file is not None:
            self.file.close()
            self.file = None
            self.writer = None

In [None]:
# Record training and validation loss/accuracy for every epoch in metrics.csv.
# 'val_accuracy' replaces the placeholder 'foo', which never appears in the
# Keras logs and therefore only produced a column of NaN values.
metrics_logger = MetricsLogger('metrics.csv', ['loss', 'accuracy', 'val_loss', 'val_accuracy'])

model.fit(tfds_train, epochs=n_epoch, validation_data=tfds_val, callbacks=[metrics_logger])

In [None]:
# Display the name Keras assigned to the model.
model.name

In [None]:
# Score the trained model on the held-out test split; evaluate() returns the
# loss followed by the compiled metrics (here: accuracy).
loss, accuracy = model.evaluate(tfds_test)
print("Accuracy", accuracy)
# Persist the trained model so it can be reloaded for inference.
model.save(ex1ch1_model_path)

In [None]:
# Reload the saved model from disk and run it on the single random sample
# built earlier (a dict of one-element tensors keyed by feature name).
loaded_model = tf.keras.models.load_model(ex1ch1_model_path)
predictions = loaded_model.predict(inference_sample)
predictions