<a href="https://colab.research.google.com/github/prometheus-eco-racing/embedded-ai-sfhmmy22-att/blob/master/train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup Python Environment 

The next cell installs and imports the dependencies required by the notebook. Run it.

In [None]:
# xxd is used by the header-generation cell at the end of the notebook to dump model.tflite as a C array
!apt-get -qq install xxd
# Data handling and plotting libraries imported right below
!pip3 install pandas numpy matplotlib
# TensorFlow is pinned to one exact release
# NOTE(review): 2.0.0 is an old release — confirm a wheel exists for the runtime's Python version
!pip3 install tensorflow==2.0.0

from locale import normalize
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf

# Upload Data

Create a `data` folder and upload the `balanced.csv` and `unbalanced.csv` files you just created.

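Sometimes copy-pasting from the Arduino serial monitor adds an extra blank line after each row. Uncomment and run the following cell to strip those blank lines. It writes the cleaned copies next to the originals with a `2` appended to the file name (e.g. `data/balanced.csv2`), so replace the original files with the cleaned ones before continuing: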

In [None]:
# import sys
# for fpath in ["data/balanced.csv", "data/unbalanced.csv"]:
#     with open(fpath) as fin, open(fpath+"2", 'w+') as fout:
#         for line in fin:
#             if not line.isspace():
#                 fout.write(line)

# Plot the data

In [None]:
def plot_acceleration(df, title):
    """Plot the aX, aY and aZ columns of `df` against the row number in a new figure.

    Args:
        df: DataFrame that contains the columns 'aX', 'aY' and 'aZ'.
        title: Text shown as the figure title.
    """
    plt.figure()
    # Colour, line style and marker are passed as keywords only; combining a fmt
    # string such as 'g.' with marker=',' makes matplotlib warn about a redundant marker.
    for column, color, label in (("aX", "g", "x"), ("aY", "b", "y"), ("aZ", "r", "z")):
        plt.plot(df[column], color=color, linestyle="solid", marker=",", label=label)
    plt.title(title)
    plt.xlabel("Sample #")
    # The plotted columns are the a* (accelerometer) channels, not the g* (gyroscope) ones.
    # NOTE(review): the unit is assumed to be G — confirm against the capture sketch.
    plt.ylabel("Acceleration (G)")
    plt.legend()


df_unbalanced = pd.read_csv("data/unbalanced.csv")
df_balanced = pd.read_csv("data/balanced.csv")

plot_acceleration(df_unbalanced, "Unbalanced")
plot_acceleration(df_balanced, "Balanced")
plt.show()

# Train the Neural Network

## Preprocess the dataset

The next cell parses the CSV files and transforms them into the format used to train the fully connected neural network.

In [None]:
# Names of the output classes; each one is read from data/<name>.csv
CLASS = [
    "balanced",
    "unbalanced",
]

# Number of consecutive CSV rows (measurement series) that are grouped into one sample
MSR_SERIES_PER_SAMPLE = 10

# Columns taken from every CSV row, in the order they are laid out inside a sample
FEATURE_COLUMNS = ["aX", "aY", "aZ", "gX", "gY", "gZ"]

inputs = []
outputs = []

# read each csv file and collect one input block and one label block per class
for output_class in CLASS:
  # label convention: 1 for "balanced", 0 for every other class
  output = 1 if output_class == "balanced" else 0
  print(f"Processing output_class '{output_class}'.")

  df = pd.read_csv("data/" + output_class + ".csv", dtype=float)

  # calculate the number of complete dataset samples for the current class;
  # trailing rows that do not fill a whole sample are dropped
  num_samples = df.shape[0] // MSR_SERIES_PER_SAMPLE

  print(f"\tThere are {num_samples} samples of the {output_class} output_class in the dataset.")

  # Each sample is MSR_SERIES_PER_SAMPLE consecutive rows flattened row by row:
  # [aX, aY, aZ, gX, gY, gZ, aX, aY, ...]. A row-major reshape yields exactly that layout.
  readings = df[FEATURE_COLUMNS].to_numpy()[:num_samples * MSR_SERIES_PER_SAMPLE]
  inputs.append(readings.reshape(num_samples, MSR_SERIES_PER_SAMPLE * len(FEATURE_COLUMNS)))
  outputs.append(np.full(num_samples, output))

# stack the per-class blocks into one array of inputs and one array of labels
inputs = np.concatenate(inputs)
outputs = np.concatenate(outputs)

print("Data set parsing and preparation complete.")

## Randomize and split the input and output pairs for training

Randomly split the input and output pairs into three sets of data: 60% for training, 20% for validation, and 20% for testing.

  - the training set is used to train the model
  - the validation set is used to measure how well the model is performing during training
  - the testing set is used to test the model after training

In [None]:
# Shuffle inputs and outputs with one shared random index order, so every
# input stays paired with its output while the classes get mixed
shuffled_order = np.arange(len(inputs))
np.random.shuffle(shuffled_order)
inputs, outputs = inputs[shuffled_order], outputs[shuffled_order]

# Boundaries of the three sets: rows before TRAIN_SPLIT are for training, rows from
# TRAIN_SPLIT up to TEST_SPLIT are for testing, and the remaining rows are for validation
TRAIN_SPLIT = int(0.6 * len(inputs))
TEST_SPLIT = int(0.2 * len(inputs) + TRAIN_SPLIT)
split_points = [TRAIN_SPLIT, TEST_SPLIT]

inputs_train, inputs_test, inputs_validate = np.split(inputs, split_points)
outputs_train, outputs_test, outputs_validate = np.split(outputs, split_points)

# Report the size of each set in the order: training, testing, validation
for subset in (inputs_train, inputs_test, inputs_validate):
  print(len(subset))

## Build & Train the Model

Build and train a [TensorFlow](https://www.tensorflow.org) model using the high-level [Keras](https://www.tensorflow.org/guide/keras) API.

In [None]:
# Fully connected network: two ReLU hidden layers followed by a single sigmoid output unit
model = tf.keras.Sequential([
    tf.keras.layers.Dense(50, activation='relu', kernel_initializer='he_uniform'),
    tf.keras.layers.Dense(15, activation='relu', kernel_initializer='he_uniform'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])
model.compile(optimizer="adam", loss='binary_crossentropy', metrics=['accuracy'])

# Train on the training set; the validation set is evaluated after every epoch
history = model.fit(
    inputs_train,
    outputs_train,
    validation_data=(inputs_validate, outputs_validate),
    epochs=60,
    batch_size=32,
)

## Verify 

Graph the model's performance on the training set versus the validation set.


### Graph the loss

Graph the loss to see when the model stops improving.

In [None]:
# increase the size of the graphs. matplotlib's default size is (6.4, 4.8).
plt.rcParams["figure.figsize"] = (20,10)


def plot_training_curve(metric):
  """Plot the per-epoch training and validation values of one recorded metric.

  Reads `history.history[metric]` and `history.history['val_' + metric]` from
  the `history` object returned by `model.fit` and shows them in one graph.

  Args:
    metric: Name of the recorded metric, e.g. 'loss' or 'accuracy'.
  """
  train_values = history.history[metric]
  val_values = history.history['val_' + metric]
  # epochs are numbered starting from 1 on the x axis
  epochs = range(1, len(train_values) + 1)
  plt.plot(epochs, train_values, 'g.', label='Training ' + metric)
  plt.plot(epochs, val_values, 'b', label='Validation ' + metric)
  plt.title('Training and validation ' + metric)
  plt.xlabel('Epochs')
  plt.ylabel(metric.capitalize())
  plt.legend()
  plt.show()


# graph the loss; the model above is compiled with binary cross-entropy as the loss function
plot_training_curve('loss')

# graph the accuracy metric that was requested when the model was compiled
plot_training_curve('accuracy')

### Run with Test Data
Put our test data into the model and plot the predictions


In [None]:
# use the model to predict the test inputs
predictions = np.array((model.predict(inputs_test) > 0.5)[:], dtype=int)

# Plot the predictions along with to the test data
plt.clf()
plt.title('Training data predicted vs actual values')
plt.plot(inputs_test, outputs_test, 'b.', label='Actual')
plt.plot(inputs_test, predictions, 'r.', label='Predicted')
plt.show()

# Convert the Trained Model to Tensorflow Lite

The next cell converts the model to TFlite format. The size in bytes of the model is also printed out.

In [None]:
# Convert the model to the TensorFlow Lite format without quantization
converter = tf.lite.TFLiteConverter.from_keras_model(model)
tflite_model = converter.convert()

# Save the model to disk
open("model.tflite", "wb").write(tflite_model)
  
import os
basic_model_size = os.path.getsize("model.tflite")
print("Model is %d bytes" % basic_model_size)

## Encode the Model in an Arduino Header File 

The next cell creates a constant byte array that contains the TFlite model. Import it as a tab with the sketch below.

In [None]:
!echo "//Build time:" date "+%T"          >> model.h
!echo "const unsigned char model[] = {"    > model.h
!cat model.tflite | xxd -i                >> model.h
!echo "};"                                >> model.h

import os
model_h_size = os.path.getsize("model.h")
print(f"Header file, model.h, is {model_h_size:,} bytes.")
print("\nOpen the side panel (refresh if needed). Double click model.h to download the file.")