In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from scipy import stats

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import Model

In [None]:
class Config:
    """Notebook-wide settings, kept together so they can be tuned in one place."""

    # Flag for switching between training and non-training runs.
    is_training = False

    # Directory holding the per-fold TFRecord files (fold_<k>_train / fold_<k>_test).
    tf_record_dataset_path = "../input/ump-combinatorialpurgedgroupkfold-tf-record/"

    # Directory of a previously produced output dataset.
    output_dataset_path = "../input/ubiquant-market-prediction-with-dnn-output/"


# Single shared settings object used by the cells below.
config = Config()

# Convert TFRecord files to NumPy `.npy` files

In [None]:
def decode_function(record_bytes):
    """Parse one serialized example into a dict of tensors.

    Args:
        record_bytes: A scalar string tensor holding a single serialized
            example, as yielded by ``tf.data.TFRecordDataset``.

    Returns:
        A dict with the keys ``"features"`` (float32, 300 values),
        ``"time_id"`` (int64 scalar), ``"investment_id"`` (int64 scalar) and
        ``"target"`` (float32 scalar).
    """
    # Fixed-length schema: every record must carry all four fields.
    schema = {
        "features": tf.io.FixedLenFeature([300], dtype=tf.float32),
        "time_id": tf.io.FixedLenFeature([], dtype=tf.int64),
        "investment_id": tf.io.FixedLenFeature([], dtype=tf.int64),
        "target": tf.io.FixedLenFeature([], dtype=tf.float32),
    }
    return tf.io.parse_single_example(serialized=record_bytes, features=schema)
def preprocess(item):
    """Turn a parsed example dict into a compact tuple of tensors.

    Args:
        item: Dict with the keys ``"features"``, ``"target"``, ``"time_id"``
            and ``"investment_id"`` (the output of ``decode_function``).

    Returns:
        A 4-tuple ``(features, target, time_id, investment_id)`` where
        ``features`` is float16 with a trailing axis of size 1 appended,
        ``target`` is float32, and both ids are int16.
    """
    # Half precision first, then add the trailing axis (same order as a nested call).
    features = tf.cast(item["features"], dtype=tf.float16)
    features = tf.expand_dims(features, -1)

    target = tf.cast(item["target"], dtype=tf.float32)

    # NOTE(review): int16 assumes every id fits in [-32768, 32767] - confirm against the data.
    time_id = tf.cast(item["time_id"], dtype=tf.int16)
    investment_id = tf.cast(item["investment_id"], dtype=tf.int16)

    return features, target, time_id, investment_id
def make_dataset(file_paths, batch_size=100000, mode="train"):
    """Build a batched ``tf.data`` pipeline over one or more TFRecord files.

    Each serialized record is parsed by ``decode_function`` and converted by
    ``preprocess`` into a ``(features, target, time_id, investment_id)``
    tuple; consecutive records are then grouped into batches.

    Args:
        file_paths: Path, or list of paths, to the TFRecord files to read.
        batch_size: Number of records per batch. The final batch may be
            smaller because no remainder is dropped.
        mode: Accepted so existing call sites keep working, but currently
            unused: the pipeline is identical for every mode (no shuffling,
            no repeating).

    Returns:
        A ``tf.data.Dataset`` that yields batched 4-tuples in file order.
    """
    ds = tf.data.TFRecordDataset(file_paths)
    # Decode and cast records in parallel. Parallel maps keep the input order
    # unless determinism is explicitly disabled, so the output order is unchanged.
    ds = ds.map(decode_function, num_parallel_calls=tf.data.AUTOTUNE)
    ds = ds.map(preprocess, num_parallel_calls=tf.data.AUTOTUNE)
    ds = ds.batch(batch_size)
    # Prepare the next batch while the consumer is still handling the current one.
    ds = ds.prefetch(tf.data.AUTOTUNE)
    return ds

In [None]:
# Export every training fold from TFRecord to NumPy: stream the records in
# batches, stitch the batches together along axis 0, and write one .npy file
# per field (features, target, time_id, investment_id) to the working directory.
for i in range(5):
    path = f"{config.tf_record_dataset_path}fold_{i}_train.tfrecords"
    ds = make_dataset([path], mode="train")
    X_batches = []
    y_batches = []
    tid_batches = []
    iid_batches = []
    # enumerate replaces the hand-maintained counter; the message says which
    # fold and split the batch belongs to instead of printing a bare integer.
    for j, (X, y, tid, iid) in enumerate(ds):
        print(f"fold {i} train: batch {j}")
        X_batches.append(X.numpy())
        y_batches.append(y.numpy())
        tid_batches.append(tid.numpy())
        iid_batches.append(iid.numpy())

    # np.concatenate rejects an empty list with a message that does not name
    # the file; fail early with the offending path instead.
    if not X_batches:
        raise ValueError(f"No records were read from {path}")

    all_X_batches = np.concatenate(X_batches, axis=0)
    all_y_batches = np.concatenate(y_batches, axis=0)
    all_tid_batches = np.concatenate(tid_batches, axis=0)
    all_iid_batches = np.concatenate(iid_batches, axis=0)

    # np.save accepts a filename directly; the names already end in ".npy",
    # so nothing is appended and the files match the previous output.
    np.save(f'fold_{i}_train_X.npy', all_X_batches)
    np.save(f'fold_{i}_train_y.npy', all_y_batches)
    np.save(f'fold_{i}_train_tid.npy', all_tid_batches)
    np.save(f'fold_{i}_train_iid.npy', all_iid_batches)

In [None]:
# Export every held-out (test) fold from TFRecord to NumPy: stream the records
# in batches, stitch the batches together along axis 0, and write one .npy file
# per field (features, target, time_id, investment_id) to the working directory.
for i in range(5):
    path = f"{config.tf_record_dataset_path}fold_{i}_test.tfrecords"
    ds = make_dataset([path], mode="valid")
    X_batches = []
    y_batches = []
    tid_batches = []
    iid_batches = []
    # enumerate replaces the hand-maintained counter; the message says which
    # fold and split the batch belongs to instead of printing a bare integer.
    for j, (X, y, tid, iid) in enumerate(ds):
        print(f"fold {i} test: batch {j}")
        X_batches.append(X.numpy())
        y_batches.append(y.numpy())
        tid_batches.append(tid.numpy())
        iid_batches.append(iid.numpy())

    # np.concatenate rejects an empty list with a message that does not name
    # the file; fail early with the offending path instead.
    if not X_batches:
        raise ValueError(f"No records were read from {path}")

    all_X_batches = np.concatenate(X_batches, axis=0)
    all_y_batches = np.concatenate(y_batches, axis=0)
    all_tid_batches = np.concatenate(tid_batches, axis=0)
    all_iid_batches = np.concatenate(iid_batches, axis=0)

    # np.save accepts a filename directly; the names already end in ".npy",
    # so nothing is appended and the files match the previous output.
    np.save(f'fold_{i}_test_X.npy', all_X_batches)
    np.save(f'fold_{i}_test_y.npy', all_y_batches)
    np.save(f'fold_{i}_test_tid.npy', all_tid_batches)
    np.save(f'fold_{i}_test_iid.npy', all_iid_batches)