This notebook generates a zip archive with training / validation data for the Game of Life competition. This data can be converted to a Kaggle dataset and fed to the Game of Life - Training notebook.

In [None]:
%%capture
%pip install datatable
%pip install "seaborn>=0.11.0"

import math
import os
from shutil import make_archive

import numpy as np

import datatable as dt
import pandas as pd

import tensorflow as tf

# directory that holds the competition's input files
comp_folder = os.path.join(os.pardir, "input", "jane-street-market-prediction")

train_path = os.path.join(comp_folder, "train.csv")
test_path = os.path.join(comp_folder, "example_test.csv")
feat_path = os.path.join(comp_folder, "features.csv")

# the training table is parsed by datatable, converted to a pandas
# frame and then indexed by ts_id
train_df = dt.fread(train_path).to_pandas()
train_df.set_index("ts_id", inplace=True)

# the remaining tables are read by pandas directly, indexed on load
test_df = pd.read_csv(test_path, index_col="ts_id")
feat_df = pd.read_csv(feat_path, index_col="feature")

# fill missing values with the column means of the same frame
# (in place, so no second copy of the frame is created)
for frame in (train_df, test_df):
    frame.fillna(frame.mean(), inplace=True)

In [None]:
def generate(n_samples, min_density=0):
    """Sample random boards, evolve them with ``tflife`` and return up to
    ``n_samples`` of the boards that survive the density filter.

    Random 0/1 boards of shape ``[25, 25, 1]`` are created, passed through
    ``tflife`` six times, and then split into five equal batches that
    receive 4, 3, 2, 1 and 0 additional ``tflife`` passes. Boards whose
    mean cell value is not strictly greater than ``min_density`` are
    discarded; the rest are shuffled and truncated to ``n_samples``.

    Args:
        n_samples: maximum number of boards to return.
        min_density: exclusive lower bound on the mean cell value of a
            board. With the default of 0, boards whose cells are all zero
            are dropped.

    Returns:
        A tensor of boards stacked along the first axis, holding at most
        ``n_samples`` entries. It can hold fewer when the filter removes
        more boards than the over-sampling compensates for, so callers
        should check the first dimension.
        NOTE(review): assumes ``tflife`` (defined elsewhere) performs one
        Game of Life step and preserves the ``[batch, 25, 25, 1]`` shape
        -- confirm against its definition.
    """
    # when n_samples is large, we need to drop roughly 28% of
    # the samples because they end up without any live cells.
    # try to compensate ahead of time by actually generating
    # n_samples / 0.7 samples
    guess = int(n_samples / 0.7)

    # since we will split into 5 equal batches, round up to the nearest
    # multiple of 5; -guess % 5 is 0 when guess already is a multiple,
    # so no needless extra boards are generated in that case
    guess += -guess % 5

    # sample random boards of zeros and ones; P is a per-board threshold
    # drawn from (0.01, 0.99), and a cell is alive where U exceeds it
    P = tf.random.uniform(shape=[guess, 1, 1, 1],
                          minval=0.01, maxval=0.99)
    U = tf.random.uniform(shape=[guess, 25, 25, 1])
    X = tf.where(U > P, 1.0, 0.0)

    # evolve all samples for 5 warmup steps + 1 step from delta
    for step in range(6):
        X = tflife(X)

    # now split into 5 equal batches; in round `delta` every batch with
    # index <= delta is evolved once more, so batch 0 receives 4 further
    # steps, batch 1 receives 3, ... and batch 4 receives none
    split = tf.split(X, 5)
    for delta in range(4):
        split = [tflife(sp) if ix <= delta else sp
                 for ix, sp in enumerate(split)]

    # merge the batches back together
    X = tf.concat(split, axis=0)

    # only keep boards with a minimum density of alive cells
    alive = tf.reduce_mean(X, axis=[1, 2, 3]) > min_density
    X = tf.boolean_mask(X, alive)

    # shuffle the samples so the batches with different step counts
    # are mixed before truncation
    X = tf.random.shuffle(X)

    # ignore excess samples
    if X.shape[0] >= n_samples:
        X = X[:n_samples]

    return X

Create a temporary directory structure and generate the samples. Then compress the data directories into a zip archive in the working (output) directory.

In [None]:
# create temporary data directories
temp = os.path.join(os.pardir, "temp", "tempdata")
for subdir in ("train", "valid", "test"):
    os.makedirs(os.path.join(temp, subdir), exist_ok=True)

# per split: set to True if any file ends up with a different number
# of samples than planned
shortfall = {"train": False, "valid": False}

# (split name, number of files, samples per file); both splits are
# produced by exactly the same procedure, so they share one loop
plan = (
    ("train", TRAIN_FILES, TRAIN_SAMPLES),
    ("valid", VALID_FILES, VALID_SAMPLES),
)

for split_name, n_files, n_wanted in plan:
    for file in range(n_files):
        # generate the samples; generate() may return fewer than
        # requested
        X = generate(n_wanted)

        # raise the flag if we did not get the planned number of
        # samples -- compared against THIS split's target (the
        # validation loop used to compare against TRAIN_SAMPLES)
        shortfall[split_name] = (
            shortfall[split_name] or (X.shape[0] != n_wanted)
        )

        # convert the samples to a TF dataset
        ds = tf.data.Dataset.from_tensor_slices(X)

        # serialize the tensors in the data set
        ds = ds.map(tf.io.serialize_tensor)

        # write the serialized data to TF record
        record_path = os.path.join(temp, split_name, f"{file}.tfrec")
        record = tf.data.experimental.TFRecordWriter(record_path)
        record.write(ds)

# keep the original flag names available at module level
train_flag = shortfall["train"]
valid_flag = shortfall["valid"]

# write data to a zip archive in the working (output) directory
arc = make_archive(os.path.join(os.curdir, "data"), "zip", temp)
print(f"Data written to {arc}")

# warn if we have less samples than planned
if train_flag or valid_flag:
    print("Warning: less samples than planned")