#  Create TF-Record for UMP dataset

In this notebook, I create TFRecord files for the UMP dataset. The dataset is divided into 10 folds using a time-series split. You may want to experiment with other data-splitting methods.

## Import Packages

In [None]:
import os
import pandas as pd
import numpy as np
import gc
import matplotlib.pyplot as plt76de7
import tensorflow as tf
import json

## Import dataset

In [None]:
%%time
# Number of feature values stored per row; the TFRecord reader (decode_function)
# must expect the same width.
n_features = 300
# Column names f_0 ... f_299 (presumably the feature columns of `train` — TODO confirm).
features = [f'f_{i}' for i in range(n_features)]
# NOTE(review): unpickling can execute arbitrary code; only load pickles from a trusted source.
train = pd.read_pickle('../input/ubiquant-market-prediction-half-precision-pickle/train.pkl')
# Preview the first rows as the cell output.
train.head()

In [None]:
# Detach the investment_id column; pop() removes it from `train` in place,
# so re-running this cell on the same kernel raises KeyError.
investment_id = train.pop("investment_id")
investment_id.head()

In [None]:
# Detach time_id as well; it is used below to group rows into folds.
time_id = train.pop("time_id")

In [None]:
# Detach the target. Whatever columns remain in `train` are what create_record
# serializes under "features" (expected to be the 300 f_* columns — TODO confirm).
y = train.pop("target")
y.head()

## Create TF-Record

In [None]:
def create_record(i):
    """Serialize the row at position ``i`` into a ``tf.train.Example`` byte string.

    Reads the notebook-level ``train``, ``time_id``, ``investment_id`` and ``y``
    objects positionally (``.iloc``).

    Args:
        i: Zero-based row position.

    Returns:
        The serialized Example (bytes) with the keys "features", "time_id",
        "investment_id" and "target".
    """
    # Float-valued entries go into FloatList, integer ids into Int64List.
    feature_map = {
        "features": tf.train.Feature(
            float_list=tf.train.FloatList(value=list(train.iloc[i]))
        ),
        "time_id": tf.train.Feature(
            int64_list=tf.train.Int64List(value=[time_id.iloc[i]])
        ),
        "investment_id": tf.train.Feature(
            int64_list=tf.train.Int64List(value=[investment_id.iloc[i]])
        ),
        "target": tf.train.Feature(
            float_list=tf.train.FloatList(value=[y.iloc[i]])
        ),
    }
    example = tf.train.Example(features=tf.train.Features(feature=feature_map))
    return example.SerializeToString()
    
def decode_function(record_bytes):
    """Parse one serialized ``tf.train.Example`` into a dict of tensors.

    Args:
        record_bytes: A scalar string tensor holding one serialized Example,
            as produced by ``create_record``.

    Returns:
        A dict with the keys "features" (float32, length ``n_features``),
        "time_id" (int64 scalar), "investment_id" (int64 scalar) and
        "target" (float32 scalar).
    """
    # Use the notebook-level n_features rather than a second hard-coded 300 so
    # the reader's expected width cannot drift from the configured one.
    schema = {
        "features": tf.io.FixedLenFeature([n_features], dtype=tf.float32),
        "time_id": tf.io.FixedLenFeature([], dtype=tf.int64),
        "investment_id": tf.io.FixedLenFeature([], dtype=tf.int64),
        "target": tf.io.FixedLenFeature([], dtype=tf.float32),
    }
    return tf.io.parse_single_example(record_bytes, schema)

Now split the dataset into 10 folds, making sure that all samples with the same time_id end up in the same fold. As a result, the sample counts of the folds differ slightly. At the same time, some useful per-fold information is written out for convenience.

In [None]:
folds = 10
# Number of rows per time_id, and the time_ids in ascending order.
time_id_value_counts = dict(time_id.value_counts())
time_ids = sorted(time_id.unique())
# Target size: a fold is closed as soon as it holds at least this many samples.
sample_per_fold = train.shape[0] // folds
fold = 0
sample_count_for_fold = 0
time_ids_for_fold = []
fold_info = []
total_count = 0
# NOTE(review): start_position/end_position are row offsets derived from
# cumulative counts, which presumes the rows of `train` are ordered by
# ascending time_id — confirm against the input data.
for i, identifier in enumerate(time_ids):
    sample_count_for_fold += time_id_value_counts[identifier]
    time_ids_for_fold.append(identifier)
    # Close the fold once it is full, or when the last time_id has been consumed.
    if sample_count_for_fold >= sample_per_fold or i == len(time_ids) - 1:
        print(f"Sample Count for Fold {fold}", sample_count_for_fold)
        fold_info.append({
            "sample_count": sample_count_for_fold,
            "time_ids": time_ids_for_fold,
            "start_position": total_count,
            # end_position is inclusive.
            "end_position": total_count + sample_count_for_fold - 1,
            "file_name": f"fold_{fold}.tfrecords",
        })
        total_count += sample_count_for_fold
        sample_count_for_fold = 0
        # Start a fresh list for the next fold. The original reset a misspelt
        # variable, so every fold shared one list containing all time_ids.
        time_ids_for_fold = []
        fold += 1
info = pd.DataFrame(fold_info)
info.to_csv("info.csv", index=False)

Let's look at a simple example of writing and reading a TFRecord file.

In [None]:
%%time
# Smoke test: write the first 10,000 rows to a small TFRecord file, then read one batch back.
save_path = "sample.tfrecords"
with tf.io.TFRecordWriter(save_path) as file_writer:
    for i in range(10000):
        record_bytes = create_record(i)
        file_writer.write(record_bytes)
# Re-open the file and decode every record with the schema in decode_function.
dataset = tf.data.TFRecordDataset([save_path])
dataset = dataset.map(decode_function).batch(32)
# Print a single batch (up to 32 decoded examples) to check the round trip.
for item in dataset.take(1):
    print(item)

Now create the whole dataset. This will take more than 1 hour.

In [None]:
%%time
import time
for i, info in enumerate(fold_info):
    begin = time.time()
    save_path = f"fold_{i}.tfrecords"
    print(f"Create {save_path}")
    print(f"Begin position: %d, End Position: %d"%(info["start_position"], info["end_position"]))
    save_path = f"fold_{i}.tfrecords"
    with tf.io.TFRecordWriter(save_path) as file_writer:
        for i in range(info["start_position"], info["end_position"]):
            record_bytes = create_record(i)
            file_writer.write(record_bytes)
    print("Elapsed time: %.2f"%(time.time() - begin))

## Write unique Investment Ids

In [None]:
# Collect the distinct investment ids in order of first appearance.
investment_ids = investment_id.unique()
# Store them as a one-column CSV next to the TFRecord files.
investment_id_df = pd.DataFrame({"investment_id": investment_ids})
investment_id_df.to_csv("investment_ids.csv", index=False)