# Jane Street - Conversion
This notebook takes the data from the __[Jane Street Market Prediction](https://www.kaggle.com/c/jane-street-market-prediction)__ competition and converts it into a dataset suitable for TPUs.

In [None]:
import json
import os
from shutil import make_archive

import numpy as np
import pandas as pd
import tensorflow as tf

# Scratch directory in which the dataset is assembled before it is archived.
temp = os.path.join(os.pardir, "temp", "tempdata")
if not os.path.isdir(temp):
    # exist_ok keeps the cell re-runnable
    os.makedirs(temp, exist_ok=True)

In [None]:
# number of days available
TOTAL_DAYS = 500

# number of days to put in each tf record
# one day corresponds to ~3 MB on disk
DAYS_PER_FILE = 20

# number of folds to split the data into
FOLDS = 5

# The per-fold and per-file day counts are derived from these values with
# floor division, so a remainder would silently leave days out of the
# written records. Fail fast on an inconsistent configuration instead.
if TOTAL_DAYS % (FOLDS * DAYS_PER_FILE) != 0:
    raise ValueError(
        f"TOTAL_DAYS ({TOTAL_DAYS}) must be a multiple of "
        f"FOLDS * DAYS_PER_FILE ({FOLDS} * {DAYS_PER_FILE})"
    )

# impute missing values with this value
# EDA showed that none of the features is ever very
# negative (<= -10), so we impute missing values with -100
NAN_VALUE = -100.0

In [None]:
# Load the training data with every column as a 32-bit float and replace
# missing values with NAN_VALUE (a sentinel, not a per-column statistic).
train_csv = os.path.join(os.pardir, "input", "jane-street-market-prediction", "train.csv")

# a single row is read first only to learn the column names for the dtype map
dtype = {c: np.float32 for c in pd.read_csv(train_csv, nrows=1).columns}
df = pd.read_csv(train_csv, engine="c", dtype=dtype)

# df is created in this same cell, so filling in place keeps the cell re-runnable
df.fillna(NAN_VALUE, inplace=True)

# store the column name -> column position mapping next to the records
columns = {col: ix for (ix, col) in enumerate(df.columns)}
with open(os.path.join(temp, "columns.json"), "w") as columns_file:
    json.dump(columns, columns_file)

# statistics collected for the dataset; per-fold entries are added later
stats = {"nan_value": str(NAN_VALUE)}

In [None]:
# Contiguous split by day: every fold owns `days_per_fold` consecutive days,
# written as `files_per_fold` TFRecord files of DAYS_PER_FILE days each.
days_per_fold = TOTAL_DAYS // FOLDS
files_per_fold = TOTAL_DAYS // (FOLDS * DAYS_PER_FILE)

for fold in range(FOLDS):
    # make a directory for files in this fold
    fold_dir = os.path.join(temp, f"fold{fold}")
    os.makedirs(fold_dir, exist_ok=True)

    # boolean row mask for this fold (Series.between includes both end points)
    fold_start = fold * days_per_fold
    in_fold = df["date"].between(fold_start, fold_start + days_per_fold - 1)
    fold_df, rest_df = df[in_fold], df[~in_fold]

    # Store the statistics of the remaining (out-of-fold) data.
    # They are computed on `df` as it is at this point, i.e. after missing
    # values were replaced by NAN_VALUE.
    # Each statistic is stored as a {column: float} mapping: str() of a
    # Series is its display repr, which pandas truncates for long Series and
    # which cannot be parsed back reliably.
    stats[fold] = {
        "samples": len(rest_df),
        "mean": {col: float(value) for col, value in rest_df.mean().items()},
        "std": {col: float(value) for col, value in rest_df.std().items()},
    }

    # write the days for this fold into tf records
    for file_ix in range(files_per_fold):
        first_day = fold_start + file_ix * DAYS_PER_FILE
        last_day = first_day + DAYS_PER_FILE - 1
        file_df = fold_df[fold_df["date"].between(first_day, last_day)]

        # one dataset element per row, each serialized to a string tensor
        ds = tf.data.Dataset.from_tensor_slices(file_df.to_numpy())
        ds = ds.map(tf.io.serialize_tensor)

        # write the serialized data to TF record
        # NOTE(review): tf.data.experimental.TFRecordWriter is reported as
        # deprecated in newer TensorFlow releases - confirm against the
        # TensorFlow version this notebook runs on.
        record_path = os.path.join(fold_dir, f"{file_ix}.tfrec")
        tf.data.experimental.TFRecordWriter(record_path).write(ds)

In [None]:
# Persist the collected statistics next to the records ...
stats_path = os.path.join(temp, "stats.json")
with open(stats_path, "w") as stats_file:
    json.dump(stats, stats_file)

# ... and bundle the whole scratch directory into data.zip in the working directory.
make_archive(base_name=os.path.join(os.curdir, "data"), format="zip", root_dir=temp)