In [None]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import json
import numpy as np
from sklearn.utils import shuffle
from sklearn.preprocessing import MinMaxScaler

import tensorflow as tf

In [None]:
# Report which kind of device TensorFlow will run on: a non-empty list of
# physical GPU devices means a GPU is visible to TensorFlow.
gpu_devices = tf.config.list_physical_devices('GPU')
device_message = "TensorFlow is using GPU" if gpu_devices else "TensorFlow is using CPU"
print(device_message)

In [None]:
# Root folder holding the train / val / test splits (relative to the notebook).
_SPLIT_ROOT = "../data/split"

TRAINING_DATA_DIR = f"{_SPLIT_ROOT}/train"
VALIDATION_DATA_DIR = f"{_SPLIT_ROOT}/val"
TEST_DATA_DIR = f"{_SPLIT_ROOT}/test"

# Number of samples per batch produced by the data generators.
BATCH_SIZE = 16

In [None]:
def get_training_data_size(data_dir=None):
    """
    Count the sample files available in a split directory.

    Only ``.json`` files located directly inside the sub-directories of
    ``data_dir`` are counted, which is the same set of files that
    ``data_generator`` reads. Entries of ``data_dir`` that are not
    directories are ignored instead of raising ``NotADirectoryError``.

    Args:
        data_dir (str, optional): Split directory to scan. Defaults to
            ``TRAINING_DATA_DIR`` (resolved at call time).

    Returns:
        int: The number of ``.json`` sample files found.
    """
    if data_dir is None:
        data_dir = TRAINING_DATA_DIR

    data_size = 0
    for entry in os.listdir(data_dir):
        subdir = os.path.join(data_dir, entry)
        if not os.path.isdir(subdir):
            continue  # stray file next to the label folders
        data_size += sum(1 for f in os.listdir(subdir) if f.endswith('.json'))
    return data_size

# Display the number of training samples as this cell's output.
get_training_data_size()

In [None]:
def data_generator(data_dir, batch_size):
    """
    Endless generator yielding full batches of samples and labels from ``data_dir``.

    Every immediate sub-directory of ``data_dir`` is treated as one class and
    every ``.json`` file inside it as one sample. The file order is reshuffled
    on each pass over the data. A trailing partial batch is dropped, so every
    yielded batch holds exactly ``batch_size`` samples.

    Class indices are assigned by the alphabetical order of the sub-directory
    names. ``os.listdir`` returns entries in arbitrary order, so without the
    sort two splits could map the same class to different indices.
    NOTE(review): this only yields identical mappings across splits if every
    split contains the same set of class folders -- confirm.

    Args:
        data_dir (str): The directory containing one sub-directory per class.
        batch_size (int): The number of samples per batch.

    Yields:
        tuple: ``(batch_data, batch_labels)`` where
        batch_data (list): one flat list of scaled floats per sample.
        batch_labels (list): one scalar ``tf.int32`` tensor per sample.

    Raises:
        ValueError: If ``data_dir`` holds fewer than ``batch_size`` samples,
            in which case no full batch could ever be produced.
    """
    subdirs = sorted(
        os.path.join(data_dir, d)
        for d in os.listdir(data_dir)
        if os.path.isdir(os.path.join(data_dir, d))
    )
    label_to_index = {os.path.basename(subdir): i for i, subdir in enumerate(subdirs)}  # map label to index

    while True:
        # Re-list on every pass so files added between passes are picked up.
        data_files = [
            os.path.join(subdir, f)
            for subdir in subdirs
            for f in os.listdir(subdir)
            if f.endswith('.json')
        ]
        if len(data_files) < batch_size:
            # Without this guard the loop would spin forever without yielding.
            raise ValueError(
                f"{data_dir!r} contains {len(data_files)} sample(s), "
                f"fewer than batch_size={batch_size}"
            )

        # Shuffling the flat file list already randomises the order across classes.
        np.random.shuffle(data_files)

        # Step over full batches only; the trailing partial batch is skipped.
        for start in range(0, len(data_files) - batch_size + 1, batch_size):
            batch_files = data_files[start:start + batch_size]

            batch_data = []
            batch_labels = []
            for data_file in batch_files:
                with open(data_file, 'r') as f:
                    data = json.load(f)

                data = [value for coordinate_dict in data for value in coordinate_dict.values()]  # flatten list of dictionaries
                # Reshaping to a single column makes the scaler normalise all
                # values of this sample jointly into [0, 1].
                data = MinMaxScaler().fit_transform(np.array(data).reshape(-1, 1))
                data = data.flatten().tolist()
                batch_data.append(data)

                # The class label is the name of the folder holding the file.
                label_index = label_to_index[os.path.basename(os.path.dirname(data_file))]
                batch_labels.append(tf.constant(label_index, dtype=tf.int32))
            yield (batch_data, batch_labels)

In [None]:
def make_dataset(data_dir, batch_size=BATCH_SIZE, num_features=63):
    """
    Wrap ``data_generator`` for one split in a ``tf.data.Dataset``.

    All splits share the same output signature so that training, validation
    and test batches have identical static shapes.

    Args:
        data_dir (str): Split directory passed on to ``data_generator``.
        batch_size (int): Number of samples per batch. Defaults to ``BATCH_SIZE``.
        num_features (int): Length of one flattened sample. Defaults to 63
            (presumably 21 points x 3 coordinates -- TODO confirm).

    Returns:
        tf.data.Dataset: Dataset of ``(features, labels)`` batches with shapes
        ``(batch_size, num_features)`` and ``(batch_size,)``.
    """
    return tf.data.Dataset.from_generator(
        lambda: data_generator(data_dir, batch_size),
        output_signature=(
            tf.TensorSpec(shape=(batch_size, num_features), dtype=tf.float32),
            tf.TensorSpec(shape=(batch_size,), dtype=tf.int32),
        ),
    )


training_dataset = make_dataset(TRAINING_DATA_DIR)
validation_dataset = make_dataset(VALIDATION_DATA_DIR)
test_dataset = make_dataset(TEST_DATA_DIR)

In [None]:
# Sanity check: pull a single batch from the test split and print it.
# take(1) limits the dataset to one element, so the loop body runs at most once.
single_batch = test_dataset.take(1)
for data, labels in single_batch:
    print(data)
    print(labels)

In [None]:
# Fully connected classifier: 63 input features, four ReLU hidden layers of
# decreasing width, and a 2-way softmax output.
hidden_layer_widths = (128, 64, 32, 16)

model = tf.keras.Sequential(
    [tf.keras.layers.Input(shape=(63,))]
    + [tf.keras.layers.Dense(width, activation='relu') for width in hidden_layer_widths]
    + [tf.keras.layers.Dense(2, activation='softmax')]
)

# Labels are integer class indices, hence the sparse categorical loss.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
def _count_json_files(data_dir):
    """Return the number of .json files inside the sub-directories of data_dir."""
    total = 0
    for entry in os.listdir(data_dir):
        subdir = os.path.join(data_dir, entry)
        if os.path.isdir(subdir):
            total += sum(1 for f in os.listdir(subdir) if f.endswith('.json'))
    return total


steps_per_epoch = get_training_data_size() // BATCH_SIZE

# The validation generator never terminates, so Keras must be told how many
# batches make up one validation pass; otherwise validation after the first
# epoch would run forever. Floor division matches the generator, which drops
# the trailing partial batch.
validation_steps = _count_json_files(VALIDATION_DATA_DIR) // BATCH_SIZE
if validation_steps < 1:
    raise ValueError(
        f"{VALIDATION_DATA_DIR!r} holds fewer than BATCH_SIZE={BATCH_SIZE} samples; "
        "no full validation batch can be formed"
    )

# Stop once the validation loss has not improved for 3 consecutive epochs.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)

model.fit(training_dataset,
        validation_data=validation_dataset,
        epochs=30,
        steps_per_epoch=steps_per_epoch,
        validation_steps=validation_steps,
        callbacks=[early_stopping]
)