## Import packages

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing

## Get and clean the data

In [None]:
# Path of the training CSV, relative to the notebook's working directory
train_filename = '../input/distributed-training-with-kubeflow/train_EDA.csv'

# low_memory=False: parse the file in one go instead of in internal chunks
train = pd.read_csv(
    train_filename,
    low_memory=False,
)

In [None]:
# Remove the first two columns (index and week of type string) from the training dataset.
# An explicit copy is taken so that the column assignments below write to an
# independent DataFrame instead of a selection of the original frame
# (this is the pattern that otherwise triggers SettingWithCopyWarning).
columns = list(train.columns)[2:]
train = train[columns].copy()

# Convert boolean inputs (the columns at positions 12..16 of `columns`) to int.
# read_csv may already have parsed 'true'/'false' into real booleans, in which
# case comparing against the string 'true' would silently yield all zeros.
# Going through a lower-cased string form handles both representations;
# every value that is not a "true" (including missing values) becomes 0.
for col in columns[12:17]:
    train[col] = train[col].astype(str).str.lower().eq('true').astype(int)

# Convert date type inputs to int by stripping the '-' separators
# (regex=False: '-' is a literal character, not a pattern)
train['WEEK_LABEL'] = pd.to_numeric(
    train['WEEK_LABEL'].str.replace('-', '', regex=False)
)

train.head()

In [None]:
# Separate the target column from the inputs; `train` itself is left untouched
label = train['UNITS'].copy()
features = train.drop(columns=['UNITS'])

## Normalize the dataset

In [None]:
# Build a normalization layer (axis=-1) and fit its statistics
# on the full feature table converted to a NumPy array
feature_matrix = np.array(features)
normalizer = preprocessing.Normalization(axis=-1)
normalizer.adapt(feature_matrix)

In [None]:
# Show the first row before and after it passes through the normalization layer
first = np.array(features.iloc[:1])
normalized_first = normalizer(first).numpy()

with np.printoptions(precision=2, suppress=True):
    print('First example:', first)
    print()
    print('Normalized:', normalized_first)

## Predict UNITS from ID with a single-variable linear regression

In [None]:
# NOTE: `id` shadows the Python builtin of the same name; the name is kept
# because other cells of this notebook refer to it.
id = np.array(features['id'])

# Normalization layer for the single 'id' input, fitted on the id values
id_normalizer = preprocessing.Normalization(axis=None, input_shape=[1])
id_normalizer.adapt(id)

In [None]:
# Single-variable linear model: normalize the id, then one Dense unit
id_model = tf.keras.Sequential()
id_model.add(id_normalizer)
id_model.add(layers.Dense(units=1))

id_model.summary()

In [None]:
# Run the model on the first 10 ids
# (at this point in the notebook the model has not been fitted yet)
first_ten_ids = id[:10]
id_model.predict(first_ten_ids)

In [None]:
# Compile the model: Adam optimizer (learning rate 0.1), mean absolute error as loss
adam = tf.optimizers.Adam(learning_rate=0.1)
id_model.compile(optimizer=adam, loss='mean_absolute_error')

In [None]:
# Train the model on the 'id' column against the UNITS label
fit_options = dict(
    epochs=100,
    verbose=2,
    # Calculate validation results on 20% of the training data
    validation_split=0.2,
)
history = id_model.fit(features['id'], label, **fit_options)

In [None]:
# Tabulate the per-epoch metrics recorded during training and show the last rows
hist = pd.DataFrame(history.history).assign(epoch=history.epoch)
hist.tail()

In [None]:
def plot_loss(history):
    """Plot the training and validation loss curves on the current figure.

    Args:
        history: object whose ``history`` attribute maps 'loss' and
            'val_loss' to sequences of values (here: the result of
            ``id_model.fit``).
    """
    for curve in ('loss', 'val_loss'):
        plt.plot(history.history[curve], label=curve)
    # Fixed y-range: values above 100 fall outside the visible area
    plt.ylim([0, 100])
    plt.xlabel('Epoch')
    plt.ylabel('Error [UNITS]')
    plt.legend()
    plt.grid(True)


plot_loss(history)

In [None]:
# 251 evenly spaced inputs from 0 to 250, and the model's prediction for each.
# NOTE(review): the range is hard-coded - confirm it covers the ids in the data.
x = tf.linspace(start=0.0, stop=250, num=251)
y = id_model.predict(x)

In [None]:
def plot_id(x, y):
    """Scatter the training data (id vs. UNITS) and overlay a prediction line.

    Reads the notebook-level ``features`` and ``label`` for the scatter points.

    Args:
        x: inputs for which predictions were made.
        y: predictions corresponding to ``x``.
    """
    observed_ids = features['id']
    plt.scatter(observed_ids, label, label='Data')
    plt.plot(x, y, label='Predictions', color='k')
    plt.xlabel('ID')
    plt.ylabel('UNITS')
    plt.legend()


plot_id(x, y)

## Evaluate the performance

In [None]:
# Path of the test CSV, relative to the notebook's working directory
test_filename = '../input/distributed-training-with-kubeflow/test.csv'

test = pd.read_csv(
    test_filename,
)

In [None]:
# Drop the leading index column from the test dataset
test_columns = test.columns[1:].tolist()
test = test[test_columns]

# Separate the target column from the test inputs; `test` itself is left untouched
test_label = test['UNITS'].copy()
test_features = test.drop(columns=['UNITS'])

In [None]:
# Results on the test set, keyed by model name
test_results = {}

# evaluate() returns the loss the model was compiled with; verbose=0 silences progress output
id_model_loss = id_model.evaluate(test_features['id'], test_label, verbose=0)
test_results['id_model'] = id_model_loss

In [None]:
# One row per model, one column holding its test-set loss
results_table = pd.DataFrame(test_results, index=['Mean absolute error [UNITS]'])
results_table.T

### 