In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import json
import numpy as np
from sklearn.preprocessing import MinMaxScaler

import tensorflow as tf

In [2]:
if tf.config.list_physical_devices('GPU'):
    print("TensorFlow is using GPU")
else:
    print("TensorFlow is using CPU")

TensorFlow is using CPU


In [3]:
PROCESSED_DATA_DIR = "../data/processed"
TRAINING_DATA_DIR = "../data/split/train"
VALIDATION_DATA_DIR = "../data/split/val"
TEST_DATA_DIR = "../data/split/test"

BATCH_SIZE = 5

In [4]:
def get_data_size(data):
    data_size = 0
    for dir in os.listdir(data):
        for _ in os.listdir(os.path.join(data, dir)):
            data_size += 1
    return data_size

print(get_data_size(TRAINING_DATA_DIR))
print(get_data_size(VALIDATION_DATA_DIR))

167
35


In [5]:
def compute_min_max(data_dir):
    """Compute the minimum and maximum values of the data."""
    min_val = float('inf')
    max_val = float('-inf')

    all_data = []
    subdirs = [os.path.join(data_dir, d) for d in os.listdir(data_dir) if os.path.isdir(os.path.join(data_dir, d))]

    for subdir in subdirs:
        data_files = [os.path.join(subdir, f) for f in os.listdir(subdir) if f.endswith('.json')]

        for data_file in data_files:
            with open(data_file, 'r') as f:
                data = json.load(f)

            data = [value for coordinate_dict in data for value in coordinate_dict.values()]  # flatten list of dictionaries
            min_val = min(min_val, min(data))
            max_val = max(max_val, max(data))
            all_data.extend(data)
    return data, min_val, max_val


data, min_val, max_val = compute_min_max(PROCESSED_DATA_DIR)
scaler = MinMaxScaler(feature_range=(min_val, max_val))
scaler.fit(np.array(data).reshape(-1, 1))

In [6]:
def data_generator(data_dir):
    """ 
    Generator function that yields data and labels from the given data directory.

    Args:
        data_dir (str): The directory containing the data.
    
    Yields:
        tuple: A tuple containing the data and the corresponding label. (data, label)
        data (list): A data sample.
        label (int): The label corresponding to the data sample.
    """
    subdirs = [os.path.join(data_dir, d) for d in os.listdir(data_dir) if os.path.isdir(os.path.join(data_dir, d))]
    label_to_index = {os.path.basename(label): i for i, label in enumerate(subdirs)}
    for subdir in subdirs:
        data_files = [os.path.join(subdir, f) for f in os.listdir(subdir) if f.endswith('.json')]
        
        for data_file in data_files:
            with open(data_file, 'r') as f:
                data = json.load(f)

            data = [value for coordinate_dict in data for value in coordinate_dict.values()] 
            data = scaler.transform(np.array(data).reshape(-1, 1))
            data = data.flatten().tolist()
            label_index = label_to_index[os.path.basename(os.path.dirname(data_file))]
            yield (data, tf.constant(label_index, dtype=tf.int32))

In [7]:
training_dataset = tf.data.Dataset.from_generator(
    lambda: data_generator(TRAINING_DATA_DIR),
    output_signature=(
        tf.TensorSpec(shape=(63,), dtype=tf.float32),
        tf.TensorSpec(shape=(), dtype=tf.int32)
    )
)

validation_dataset = tf.data.Dataset.from_generator(
    lambda: data_generator(VALIDATION_DATA_DIR),
    output_signature=(
        tf.TensorSpec(shape=(63,), dtype=tf.float32),
        tf.TensorSpec(shape=(), dtype=tf.int32)
    )
)

test_dataset = tf.data.Dataset.from_generator(
    lambda: data_generator(TEST_DATA_DIR),
    output_signature=(
        tf.TensorSpec(shape=(63,), dtype=tf.float32),
        tf.TensorSpec(shape=(), dtype=tf.int32)
    )
)

In [8]:
training_dataset = training_dataset.shuffle(1000).batch(BATCH_SIZE)
validation_dataset = validation_dataset.shuffle(1000).batch(BATCH_SIZE)
test_dataset = test_dataset.shuffle(1000).batch(BATCH_SIZE)

In [9]:
def build_model():
    model = tf.keras.Sequential([
        tf.keras.layers.Input(shape=(63,)),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(32, activation='relu'),
        tf.keras.layers.Dense(16, activation='relu'),
        tf.keras.layers.Dense(2, activation='softmax')
    ])

    model.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )

    return model

In [10]:
def train_model(model):
        early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5,
                                                        restore_best_weights=True, min_delta= 0.01)

        model.fit(training_dataset.repeat(),
                steps_per_epoch=get_data_size(TRAINING_DATA_DIR) // BATCH_SIZE,
                validation_data=validation_dataset.repeat(),
                validation_steps=get_data_size(VALIDATION_DATA_DIR) // BATCH_SIZE,
                epochs=100,
                callbacks=[early_stopping]
        ) 

        return model

In [11]:
model = train_model(build_model())

loss, accuracy = model.evaluate(validation_dataset)
print(f"Test accuracy: {accuracy}")
print(f"Test loss: {loss}")

Epoch 1/100
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.6125 - loss: 0.6410 - val_accuracy: 0.7143 - val_loss: 0.5774
Epoch 2/100
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.6108 - loss: 0.6805 - val_accuracy: 0.7143 - val_loss: 0.5873
Epoch 3/100
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.7665 - loss: 0.5208 - val_accuracy: 0.7143 - val_loss: 0.5815
Epoch 4/100
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.6943 - loss: 0.5846 - val_accuracy: 0.7143 - val_loss: 0.5686
Epoch 5/100
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.6334 - loss: 0.6441 - val_accuracy: 0.7143 - val_loss: 0.5756
Epoch 6/100
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.7425 - loss: 0.5442 - val_accuracy: 0.7143 - val_loss: 0.6004
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━

  self.gen.throw(typ, value, traceback)
