In [3]:
# %load setup_enviroment.py
import matplotlib
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow.keras as keras
import sys
import os
import time
import sklearn
import numpy as np
import pandas as pd

get_ipython().run_line_magic('matplotlib', 'inline')


In [5]:
# Directory containing the CSV shards that the following cells read.
source_dir = "./generate_csv/"

# Print the raw directory listing as a quick sanity check of what is available.
print(os.listdir(source_dir))

['train_02.csv', 'valid_01.csv', 'test_07.csv', 'valid_03.csv', 'test_03.csv', 'train_12.csv', 'test_04.csv', 'train_04.csv', 'train_00.csv', 'valid_06.csv', 'valid_05.csv', 'train_13.csv', 'test_02.csv', 'train_01.csv', 'train_14.csv', 'train_05.csv', 'valid_00.csv', 'test_09.csv', 'valid_09.csv', 'train_08.csv', 'train_16.csv', 'train_10.csv', 'valid_02.csv', 'train_07.csv', 'train_11.csv', 'train_15.csv', 'valid_07.csv', 'train_19.csv', 'test_08.csv', 'train_06.csv', 'train_18.csv', 'test_01.csv', 'train_03.csv', 'train_17.csv', 'test_05.csv', 'valid_04.csv', 'test_00.csv', 'valid_08.csv', 'train_09.csv', 'test_06.csv']


In [8]:
def get_filename_by_prefix(source_dir,prefix_name):
    """Collect the paths of entries in ``source_dir`` whose name starts with ``prefix_name``.

    Args:
        source_dir: Directory to scan (non-recursive, via ``os.listdir``).
        prefix_name: Prefix an entry name must start with to be selected.

    Returns:
        A list of ``os.path.join(source_dir, name)`` paths, in the order
        ``os.listdir`` returned the matching names.
    """
    return [os.path.join(source_dir,entry)
            for entry in os.listdir(source_dir)
            if entry.startswith(prefix_name)]

# Split the CSV files in source_dir into train / valid / test groups by filename prefix.
train_filenames = get_filename_by_prefix(source_dir,"train")
valid_filenames = get_filename_by_prefix(source_dir,"valid")
test_filenames = get_filename_by_prefix(source_dir,"test")

# NOTE(review): this import sits mid-notebook; consider moving it to the imports cell at the top.
import pprint

# Show the three file lists so the split can be checked by eye.
pprint.pprint(train_filenames)
pprint.pprint(valid_filenames)
pprint.pprint(test_filenames)

['./generate_csv/train_02.csv',
 './generate_csv/train_12.csv',
 './generate_csv/train_04.csv',
 './generate_csv/train_00.csv',
 './generate_csv/train_13.csv',
 './generate_csv/train_01.csv',
 './generate_csv/train_14.csv',
 './generate_csv/train_05.csv',
 './generate_csv/train_08.csv',
 './generate_csv/train_16.csv',
 './generate_csv/train_10.csv',
 './generate_csv/train_07.csv',
 './generate_csv/train_11.csv',
 './generate_csv/train_15.csv',
 './generate_csv/train_19.csv',
 './generate_csv/train_06.csv',
 './generate_csv/train_18.csv',
 './generate_csv/train_03.csv',
 './generate_csv/train_17.csv',
 './generate_csv/train_09.csv']
['./generate_csv/valid_01.csv',
 './generate_csv/valid_03.csv',
 './generate_csv/valid_06.csv',
 './generate_csv/valid_05.csv',
 './generate_csv/valid_00.csv',
 './generate_csv/valid_09.csv',
 './generate_csv/valid_02.csv',
 './generate_csv/valid_07.csv',
 './generate_csv/valid_04.csv',
 './generate_csv/valid_08.csv']
['./generate_csv/test_07.csv',
 './gener

In [12]:
def parse_csv_line(line,n_fields =9):
    """Decode one CSV text line into a ``(features, label)`` pair of tensors.

    Args:
        line: A single CSV record (string tensor) with ``n_fields`` columns.
        n_fields: Total number of columns in the record, label included.

    Returns:
        ``(x, y)`` where ``x`` stacks every column except the last and ``y``
        stacks only the last column, so ``y`` keeps a length-1 leading axis.
    """
    # A NaN float default per column makes decode_csv parse every field as a float.
    nan_default = tf.constant(np.nan)
    record_defaults = [nan_default for _ in range(n_fields)]
    columns = tf.io.decode_csv(line,record_defaults=record_defaults)
    features = tf.stack(columns[:-1])
    label = tf.stack(columns[-1:])
    return features,label

def csv_read_dataset(filenames,n_reads = 5,
                     batch_size = 32,n_parse_threads = 5,
                     shuffle_buffer_size = 10000):
    """Build an endlessly repeating, shuffled, batched dataset from CSV files.

    Args:
        filenames: File paths (or patterns) handed to ``tf.data.Dataset.list_files``.
        n_reads: ``cycle_length`` of the interleave, i.e. how many files are
            read from concurrently.
        batch_size: Number of parsed records per emitted batch.
        n_parse_threads: ``num_parallel_calls`` used when mapping ``parse_csv_line``.
        shuffle_buffer_size: Buffer size passed to ``Dataset.shuffle``.

    Returns:
        A ``tf.data.Dataset`` yielding ``(x, y)`` batches as produced by
        ``parse_csv_line``. The dataset never ends because the file list is
        repeated, so callers must bound iteration (``take`` / ``steps``).
    """
    dataset = tf.data.Dataset.list_files(filenames)
    # Repeat the file list itself so the stream of lines is infinite.
    dataset = dataset.repeat()
    # Read n_reads files at a time; skip(1) drops the first line of each file
    # (presumably the CSV header row -- verify against the generated files).
    dataset = dataset.interleave(
        lambda filename:tf.data.TextLineDataset(filename).skip(1),
        cycle_length = n_reads)
    # Dataset transformations are not in-place: shuffle() returns a new dataset,
    # so the result has to be re-assigned or the shuffle is silently dropped.
    # Because this runs after repeat(), records from adjacent passes can mix.
    dataset = dataset.shuffle(shuffle_buffer_size)
    dataset = dataset.map(parse_csv_line,
                          num_parallel_calls = n_parse_threads)
    dataset = dataset.batch(batch_size)

    return dataset

# Batch size shared by the CSV datasets and by the shard-size arithmetic in later cells.
batch_size = 32

# Each dataset repeats indefinitely (csv_read_dataset calls repeat()), so any
# consumer has to limit how many batches it pulls.
train_set = csv_read_dataset(train_filenames,batch_size = batch_size)
valid_set = csv_read_dataset(valid_filenames,batch_size = batch_size)
test_set = csv_read_dataset(test_filenames,batch_size = batch_size)



In [23]:
def serialize_example(x,y):
    """Pack one ``(x, y)`` pair into a ``tf.train.Example`` and serialize it.

    Args:
        x: Iterable of floats stored under the ``"input_features"`` key.
        y: Iterable of floats stored under the ``"label"`` key.

    Returns:
        The serialized ``tf.train.Example`` as returned by ``SerializeToString()``.
    """
    feature_map = {
        "input_features" : tf.train.Feature(
            float_list = tf.train.FloatList(value = x)),
        "label" : tf.train.Feature(
            float_list = tf.train.FloatList(value = y)),
    }
    example = tf.train.Example(
        features = tf.train.Features(feature = feature_map))
    return example.SerializeToString()

def csv_dataset_to_tfrecords(base_filenames,dataset,
                             n_shards,step_per_shard,
                             compression_type = None):
    """Write batches from ``dataset`` into ``n_shards`` TFRecord files.

    Shard ``i`` is written to ``"{base_filenames}_{i:05d}-of-{n_shards:05d}"``
    and receives the next ``step_per_shard`` batches of the dataset, so the
    shards together hold ``n_shards * step_per_shard`` consecutive batches.

    Args:
        base_filenames: Path prefix shared by all shard files.
        dataset: Iterable ``tf.data.Dataset`` yielding ``(x_batch, y_batch)`` pairs.
        n_shards: Number of output files.
        step_per_shard: Number of batches written to each file.
        compression_type: Forwarded to ``tf.io.TFRecordOptions``
            (e.g. ``None`` or ``"GZIP"``).

    Returns:
        The list of shard file paths, in shard order.
    """
    options = tf.io.TFRecordOptions(compression_type = compression_type)
    # One iterator shared by all shards. Calling dataset.take() inside the
    # shard loop would start a fresh iteration for every shard, so shards would
    # not be consecutive slices of one stream (records repeated or never written).
    batches = iter(dataset)

    all_filenames = []
    for shard_id in range(n_shards):
        filename_fullpath = "{}_{:05d}-of-{:05d}".format(base_filenames,
                                                         shard_id,
                                                         n_shards)
        with tf.io.TFRecordWriter(filename_fullpath,options) as writer:
            # range() comes first so zip stops after step_per_shard batches
            # without pulling (and losing) an extra batch from the iterator.
            for _,(x_batch,y_batch) in zip(range(step_per_shard),batches):
                # Un-batch: every example becomes its own serialized record.
                for x_example,y_example in zip(x_batch,y_batch):
                    writer.write(
                        serialize_example(x_example,y_example))
        all_filenames.append(filename_fullpath)

    return all_filenames

In [24]:
n_shards = 20

# Batches per shard = example count // batch_size // n_shards.
# NOTE(review): the example counts used here (11610 / 3380 / 5170) do not all
# match the ones used for steps_per_epoch, validation_steps and evaluate steps
# later in this notebook (11160 / 3780 / 5160) -- confirm the real split sizes.
train_step_per_shard = 11610 // batch_size // n_shards

valid_step_per_shard = 3380 // batch_size // n_shards

test_step_per_shard = 5170 // batch_size // n_shards

output_dir = "generate_tfrecords"
# exist_ok=True keeps the cell re-runnable without a separate existence check.
os.makedirs(output_dir,exist_ok = True)

train_basename = os.path.join(output_dir,"train")
valid_basename = os.path.join(output_dir,"valid")
test_basename = os.path.join(output_dir,"test")

# Write uncompressed shards (compression_type = None) for each split.
train_tfrecord_filenames = csv_dataset_to_tfrecords(train_basename,
                                                    train_set,
                                                    n_shards,
                                                    train_step_per_shard,
                                                    None)
valid_tfrecord_filenames = csv_dataset_to_tfrecords(valid_basename,
                                                    valid_set,
                                                    n_shards,
                                                    valid_step_per_shard,
                                                    None)
test_tfrecord_filenames = csv_dataset_to_tfrecords(test_basename,
                                                   test_set,
                                                   n_shards,
                                                   test_step_per_shard,
                                                   None)


In [25]:
n_shards = 20

# Batches per shard = example count // batch_size // n_shards.
# NOTE(review): the example counts used here (11610 / 3380 / 5170) do not all
# match the ones used for steps_per_epoch, validation_steps and evaluate steps
# later in this notebook (11160 / 3780 / 5160) -- confirm the real split sizes.
train_step_per_shard = 11610 // batch_size // n_shards

valid_step_per_shard = 3380 // batch_size // n_shards

test_step_per_shard = 5170 // batch_size // n_shards

output_dir = "generate_tfrecords_zip"
# exist_ok=True keeps the cell re-runnable without a separate existence check.
os.makedirs(output_dir,exist_ok = True)

train_basename = os.path.join(output_dir,"train")
valid_basename = os.path.join(output_dir,"valid")
test_basename = os.path.join(output_dir,"test")

# Write GZIP-compressed shards for each split. This rebinds the
# *_tfrecord_filenames variables, so later cells read from this directory.
train_tfrecord_filenames = csv_dataset_to_tfrecords(train_basename,
                                                    train_set,
                                                    n_shards,
                                                    train_step_per_shard,
                                                    compression_type= "GZIP")
valid_tfrecord_filenames = csv_dataset_to_tfrecords(valid_basename,
                                                    valid_set,
                                                    n_shards,
                                                    valid_step_per_shard,
                                                    compression_type= "GZIP")
test_tfrecord_filenames = csv_dataset_to_tfrecords(test_basename,
                                                   test_set,
                                                   n_shards,
                                                   test_step_per_shard,
                                                   compression_type= "GZIP")


In [26]:
# Show the shard paths returned by the most recent csv_dataset_to_tfrecords calls.
pprint.pprint(train_tfrecord_filenames)
pprint.pprint(valid_tfrecord_filenames)
pprint.pprint(test_tfrecord_filenames)

['generate_tfrecords_zip/train_00000-of-00020',
 'generate_tfrecords_zip/train_00001-of-00020',
 'generate_tfrecords_zip/train_00002-of-00020',
 'generate_tfrecords_zip/train_00003-of-00020',
 'generate_tfrecords_zip/train_00004-of-00020',
 'generate_tfrecords_zip/train_00005-of-00020',
 'generate_tfrecords_zip/train_00006-of-00020',
 'generate_tfrecords_zip/train_00007-of-00020',
 'generate_tfrecords_zip/train_00008-of-00020',
 'generate_tfrecords_zip/train_00009-of-00020',
 'generate_tfrecords_zip/train_00010-of-00020',
 'generate_tfrecords_zip/train_00011-of-00020',
 'generate_tfrecords_zip/train_00012-of-00020',
 'generate_tfrecords_zip/train_00013-of-00020',
 'generate_tfrecords_zip/train_00014-of-00020',
 'generate_tfrecords_zip/train_00015-of-00020',
 'generate_tfrecords_zip/train_00016-of-00020',
 'generate_tfrecords_zip/train_00017-of-00020',
 'generate_tfrecords_zip/train_00018-of-00020',
 'generate_tfrecords_zip/train_00019-of-00020']
['generate_tfrecords_zip/valid_00000-of-

In [34]:
# Schema used by parse_example to decode a serialized tf.train.Example:
# 8 float32 values under "input_features" and 1 float32 value under "label".
# The keys match the ones written by serialize_example.
expect_features = {
    "input_features":tf.io.FixedLenFeature([8],dtype = tf.float32),
    "label" : tf.io.FixedLenFeature([1],dtype = tf.float32)
}

def parse_example(serialized_example):
    """Decode one serialized ``tf.train.Example`` into ``(input_features, label)``.

    The record is parsed against the module-level ``expect_features`` schema.

    Args:
        serialized_example: A single serialized example (scalar string tensor).

    Returns:
        A tuple of the parsed ``"input_features"`` and ``"label"`` tensors.
    """
    parsed = tf.io.parse_single_example(serialized_example,expect_features)
    features,label = parsed["input_features"],parsed["label"]
    return features,label

def tfrecords_read_dataset(filenames,n_reads = 5,
                     batch_size = 32,n_parse_threads = 5,
                     shuffle_buffer_size = 10000,
                     compression_type = "GZIP"):
    """Build an endlessly repeating, shuffled, batched dataset from TFRecord files.

    Args:
        filenames: File paths (or patterns) handed to ``tf.data.Dataset.list_files``.
        n_reads: ``cycle_length`` of the interleave, i.e. how many files are
            read from concurrently.
        batch_size: Number of parsed examples per emitted batch.
        n_parse_threads: ``num_parallel_calls`` used when mapping ``parse_example``.
        shuffle_buffer_size: Buffer size passed to ``Dataset.shuffle``.
        compression_type: Compression of the TFRecord files, forwarded to
            ``tf.data.TFRecordDataset``. Defaults to ``"GZIP"`` (the previously
            hard-coded value); pass ``None`` for uncompressed files.

    Returns:
        A ``tf.data.Dataset`` yielding ``(input_features, label)`` batches.
        The dataset never ends because the file list is repeated, so callers
        must bound iteration (``take`` / ``steps``).
    """
    dataset = tf.data.Dataset.list_files(filenames)
    # Repeat the file list itself so the stream of records is infinite.
    dataset = dataset.repeat()
    dataset = dataset.interleave(
        lambda filename:tf.data.TFRecordDataset(
            filename,compression_type = compression_type),
        cycle_length = n_reads)
    # Dataset transformations are not in-place: shuffle() returns a new dataset,
    # so the result has to be re-assigned or the shuffle is silently dropped.
    # Because this runs after repeat(), records from adjacent passes can mix.
    dataset = dataset.shuffle(shuffle_buffer_size)
    dataset = dataset.map(parse_example,
                          num_parallel_calls = n_parse_threads)
    dataset = dataset.batch(batch_size)

    return dataset


# Smoke test: read the training shards back with a tiny batch size and print
# two batches. take(2) is required because the dataset repeats indefinitely.
tfrecords_train = tfrecords_read_dataset(train_tfrecord_filenames,batch_size = 3)
for x_batch,y_batch in tfrecords_train.take(2):
    print(x_batch)
    print(y_batch)

tf.Tensor(
[[-1.118108   -0.84895366 -0.84369123 -0.02051749  0.19971783 -0.00389153
  -1.3487345   1.2535764 ]
 [ 0.84634066 -0.68987757 -0.3091761  -0.42383355 -1.0479033  -0.1685368
   1.077483   -1.2344942 ]
 [ 0.42359042  0.58273077  0.055551   -0.17149982  0.4782607  -0.10523725
   0.965288   -1.2645314 ]], shape=(3, 8), dtype=float32)
tf.Tensor(
[[1.25 ]
 [1.908]
 [2.068]], shape=(3, 1), dtype=float32)
tf.Tensor(
[[ 0.3701508   1.1394969   0.20343983 -0.09859075 -0.6797969   0.05049406
  -0.7129627   0.7880018 ]
 [-0.22520569 -0.05357341 -0.2128554  -0.09616713  0.14239712  0.44052222
  -0.88593006  0.84306973]
 [-0.44235379  0.82134485 -0.5079775  -0.08779322  0.04566842  0.19360971
  -0.7176375   0.66785353]], shape=(3, 8), dtype=float32)
tf.Tensor(
[[2.148]
 [1.595]
 [1.563]], shape=(3, 1), dtype=float32)


In [43]:
# Build the train / valid / test input pipelines from the TFRecord shards.
# All three repeat indefinitely, so fit/evaluate must be given explicit step counts.
batch_size = 32
tfrecord_train_set = tfrecords_read_dataset(train_tfrecord_filenames,
                                            batch_size = batch_size)
tfrecord_valid_set = tfrecords_read_dataset(valid_tfrecord_filenames,
                                            batch_size = batch_size)
tfrecord_test_set = tfrecords_read_dataset(test_tfrecord_filenames,
                                           batch_size = batch_size)


In [44]:
# Small regression network: one 30-unit ReLU hidden layer over the 8 input
# features, followed by a single linear output unit.
model = keras.models.Sequential([
    keras.layers.Dense(30,activation="relu",input_shape=[8]),
    keras.layers.Dense(1),
])

model.compile(loss = "mean_squared_error",
              optimizer = "sgd",)

# Early stopping with patience 5 and a minimum improvement of 1e-2
# (monitored quantity is left at the Keras default).
callbacks = [keras.callbacks.EarlyStopping(patience = 5,
                                           min_delta = 1e-2)]

# The input datasets repeat forever, so steps_per_epoch / validation_steps
# define how many batches make up one epoch / one validation pass.
# NOTE(review): 11160 and 3780 here do not match the 11610 and 3380 used to
# size the TFRecord shards earlier in this notebook -- confirm the real counts.
history = model.fit(tfrecord_train_set,
                    validation_data = tfrecord_valid_set,
                    steps_per_epoch = 11160 // batch_size,
                    validation_steps = 3780 // batch_size,
                    epochs = 100,
                    callbacks = callbacks,)

Train for 348 steps, validate for 118 steps
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100


In [45]:
# Evaluate on the test pipeline; steps bounds the otherwise endless dataset.
# NOTE(review): 5160 here vs 5170 used when sizing the test shards -- confirm the real count.
model.evaluate(tfrecord_test_set,steps = 5160 // batch_size,verbose = 1)



1.1282846205260442