In [1]:
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf

from tensorflow import keras


In [2]:
# Directory holding the CSV shards that the rest of the notebook reads.
source_dir = "./generate_csv/"
# Print the directory listing so the available train/valid/test files are visible.
print(os.listdir(source_dir))

def get_filenames_by_prefix(source_dir, prefix_name):
    """Return paths of the entries in `source_dir` whose name starts with `prefix_name`.

    Args:
        source_dir: Directory to scan (top level only, via `os.listdir`).
        prefix_name: Prefix an entry name must start with to be selected.

    Returns:
        A list of `os.path.join(source_dir, name)` paths, in the order in
        which `os.listdir` reported the matching entries.
    """
    return [
        os.path.join(source_dir, entry)
        for entry in os.listdir(source_dir)
        if entry.startswith(prefix_name)
    ]


# Collect the CSV paths of each split; files are told apart by their name prefix.
train_filenames, valid_filenames, test_filenames = (
    get_filenames_by_prefix(source_dir, split_prefix)
    for split_prefix in ("train", "valid", "test")
)

['train_03.csv', 'train_13.csv', 'test_09.csv', 'train_00.csv', 'train_15.csv', 'test_04.csv', 'test_06.csv', 'test_08.csv', 'train_06.csv', 'train_17.csv', 'train_16.csv', 'train_18.csv', 'train_05.csv', 'train_04.csv', 'valid_08.csv', 'valid_04.csv', 'valid_06.csv', 'valid_07.csv', 'train_14.csv', 'valid_01.csv', 'train_08.csv', 'valid_09.csv', 'valid_00.csv', 'test_03.csv', 'valid_05.csv', 'test_00.csv', 'train_19.csv', 'train_01.csv', 'train_11.csv', 'train_02.csv', 'test_02.csv', 'valid_02.csv', 'test_01.csv', 'test_05.csv', 'train_10.csv', 'valid_03.csv', 'train_12.csv', 'train_09.csv', 'train_07.csv', 'test_07.csv']


In [9]:
def parse_csv_line(line_str, n_fields = 9):
    """Decode one CSV record of floats into an `(x, y)` pair of tensors.

    Args:
        line_str: A single CSV line (string or scalar string tensor).
        n_fields: Number of comma-separated columns in the line.

    Returns:
        `(x, y)` where `x` stacks the first `n_fields - 1` columns and `y`
        stacks the last column on its own (so it has one element).
    """
    # One float NaN default per column: this fixes every column's dtype.
    record_defaults = [tf.constant(np.nan) for _ in range(n_fields)]
    # decode_csv yields a list of n_fields scalar tensors, one per column.
    columns = tf.io.decode_csv(line_str, record_defaults=record_defaults)
    features = tf.stack(columns[:-1])  # tensor with n_fields - 1 elements
    target = tf.stack(columns[-1:])    # slice keeps a list, so y has length 1
    return features, target

# Quick smoke test on a 7-column line. It is not the last expression of the
# cell, so the notebook does not display the returned (x, y) pair.
parse_csv_line('1,2,3,4,5,6,7', 7)

def csv_reader_dataset(file_names, n_readers= 5, batch_size=32, n_parse_threads=5, shuffle_buffer_size=10000):
    """Build an endlessly repeating, shuffled, batched `(x, y)` dataset from CSV files.

    Args:
        file_names: CSV file paths (or glob patterns) accepted by
            `tf.data.Dataset.list_files`.
        n_readers: Number of files read concurrently by `interleave`.
        batch_size: Number of parsed examples per emitted batch.
        n_parse_threads: Parallelism of the line-parsing `map` step.
        shuffle_buffer_size: Size of the buffer used to shuffle lines.

    Returns:
        A `tf.data.Dataset` of `(x_batch, y_batch)` pairs. It never ends
        because the file list is repeated indefinitely, so consumers must
        bound it themselves (`take`, `steps_per_epoch`, ...).
    """
    dataset = tf.data.Dataset.list_files(file_names)
    dataset = dataset.repeat()  # repeat the file list without limit
    dataset = dataset.interleave(
        # skip(1) drops the first line of every file (presumably a header row).
        lambda fn: tf.data.TextLineDataset(fn).skip(1),
        cycle_length = n_readers
    )
    # Dataset transformations return a new dataset instead of modifying the
    # receiver, so the result has to be re-assigned; a bare
    # `dataset.shuffle(...)` call leaves the pipeline unshuffled.
    dataset = dataset.shuffle(shuffle_buffer_size)
    dataset = dataset.map(
        parse_csv_line,
        num_parallel_calls = n_parse_threads
    )
    dataset = dataset.batch(batch_size)
    return dataset

# One CSV-backed dataset per split, all batched identically.
train_set, valid_set, test_set = (
    csv_reader_dataset(split_files, batch_size=32)
    for split_files in (train_filenames, valid_filenames, test_filenames)
)

In [28]:
def serialize_example(x, y):
    """Pack one `(x, y)` pair into a serialized `tf.train.Example`.

    Args:
        x: Iterable of floats stored under the key "input_features".
        y: Iterable of floats stored under the key "label".

    Returns:
        The `bytes` produced by `Example.SerializeToString()`.
    """
    def _float_feature(values):
        # Wrap a sequence of floats in the Feature/FloatList protobuf pair.
        return tf.train.Feature(float_list = tf.train.FloatList(value = values))

    feature_map = {
        "input_features": _float_feature(x),
        "label": _float_feature(y),
    }
    record = tf.train.Example(features = tf.train.Features(feature = feature_map))
    return record.SerializeToString()

def csv_dataset_to_tfrecords(base_filename, dataset, n_shards, steps_per_shard, compression_type=None):
    """Write batches from `dataset` into `n_shards` TFRecord files.

    Args:
        base_filename: Path prefix; shard files are named
            "<base_filename>_<shard:05d>-of-<n_shards:05d>".
        dataset: Iterable of `(x_batch, y_batch)` pairs (e.g. the output of
            `csv_reader_dataset`). It is iterated eagerly.
        n_shards: Number of files to write.
        steps_per_shard: Number of batches written to each shard.
        compression_type: Compression passed to `tf.io.TFRecordOptions`
            (e.g. "GZIP"); `None` writes uncompressed records.

    Returns:
        The list of shard file paths, in shard order.

    Note:
        All shards draw from a single pass over `dataset`, so each shard holds
        the batches that follow the previous shard's. If `dataset` runs out
        early, the remaining shards are created but left empty.
    """
    # The literal string "None" (the former default) is not a compression
    # name; treat it as "no compression" so old call sites keep working.
    # NOTE(review): TFRecordOptions is expected to reject "None" - confirm
    # against the installed TensorFlow version.
    if compression_type == "None":
        compression_type = None
    options = tf.io.TFRecordOptions(compression_type = compression_type)
    all_filenames = []
    # One iterator shared by every shard. Calling dataset.take() inside the
    # loop would start a fresh pass for each shard instead of continuing.
    batch_iter = iter(dataset)
    for shard_id in range(n_shards):
        filename_fullpath = "{}_{:05d}-of-{:05d}".format(
            base_filename, shard_id, n_shards
        )
        with tf.io.TFRecordWriter(filename_fullpath, options) as writer:
            # range() comes first so zip stops as soon as the step budget is
            # used up, *before* pulling (and dropping) one more batch.
            for _, (x_batch, y_batch) in zip(range(steps_per_shard), batch_iter):
                for x_example, y_example in zip(x_batch, y_batch):
                    writer.write(serialize_example(x_example, y_example))
        all_filenames.append(filename_fullpath)

    return all_filenames

In [29]:
n_shards = 20
batch_size = 32
# Whole batches per shard: rows // batch_size // n_shards.
# NOTE(review): 11610 / 3880 / 5170 are presumably the row counts of the
# train / valid / test CSV splits - verify against the data.
train_steps_per_shard = 11610 // batch_size // n_shards
valid_steps_per_shard = 3880 // batch_size // n_shards
test_steps_per_shard = 5170 // batch_size // n_shards

output_dir = "generate_tfrecords"
# exist_ok makes the cell safe to re-run when the directory already exists.
os.makedirs(output_dir, exist_ok=True)

train_basename = os.path.join(output_dir, "train")
valid_basename = os.path.join(output_dir, "valid")
test_basename = os.path.join(output_dir, "test")

# tfrecords_reader_dataset opens the shards with compression_type="GZIP", so
# they must be written GZIP-compressed; shards written with None cannot be
# read back by that reader.
tfrecord_compression_type = "GZIP"

train_tfrecord_filenames = csv_dataset_to_tfrecords(
    train_basename, train_set, n_shards, train_steps_per_shard, tfrecord_compression_type
)

valid_tfrecord_filenames = csv_dataset_to_tfrecords(
    valid_basename, valid_set, n_shards, valid_steps_per_shard, tfrecord_compression_type
)

test_tfrecord_filenames = csv_dataset_to_tfrecords(
    test_basename, test_set, n_shards, test_steps_per_shard, tfrecord_compression_type
)

generate_tfrecords/train_00000-of-00020
generate_tfrecords/train_00001-of-00020
generate_tfrecords/train_00002-of-00020
generate_tfrecords/train_00003-of-00020
generate_tfrecords/train_00004-of-00020
generate_tfrecords/train_00005-of-00020
generate_tfrecords/train_00006-of-00020
generate_tfrecords/train_00007-of-00020
generate_tfrecords/train_00008-of-00020
generate_tfrecords/train_00009-of-00020
generate_tfrecords/train_00010-of-00020
generate_tfrecords/train_00011-of-00020
generate_tfrecords/train_00012-of-00020
generate_tfrecords/train_00013-of-00020
generate_tfrecords/train_00014-of-00020
generate_tfrecords/train_00015-of-00020
generate_tfrecords/train_00016-of-00020
generate_tfrecords/train_00017-of-00020
generate_tfrecords/train_00018-of-00020
generate_tfrecords/train_00019-of-00020
generate_tfrecords/valid_00000-of-00020
generate_tfrecords/valid_00001-of-00020
generate_tfrecords/valid_00002-of-00020
generate_tfrecords/valid_00003-of-00020
generate_tfrecords/valid_00004-of-00020


In [31]:
# pprint is imported here rather than in the notebook's first import cell.
import pprint
# Pretty-print the shard paths returned by csv_dataset_to_tfrecords for the training split.
pprint.pprint(train_tfrecord_filenames)

['generate_tfrecords_zip/train_00000-of-00020',
 'generate_tfrecords_zip/train_00001-of-00020',
 'generate_tfrecords_zip/train_00002-of-00020',
 'generate_tfrecords_zip/train_00003-of-00020',
 'generate_tfrecords_zip/train_00004-of-00020',
 'generate_tfrecords_zip/train_00005-of-00020',
 'generate_tfrecords_zip/train_00006-of-00020',
 'generate_tfrecords_zip/train_00007-of-00020',
 'generate_tfrecords_zip/train_00008-of-00020',
 'generate_tfrecords_zip/train_00009-of-00020',
 'generate_tfrecords_zip/train_00010-of-00020',
 'generate_tfrecords_zip/train_00011-of-00020',
 'generate_tfrecords_zip/train_00012-of-00020',
 'generate_tfrecords_zip/train_00013-of-00020',
 'generate_tfrecords_zip/train_00014-of-00020',
 'generate_tfrecords_zip/train_00015-of-00020',
 'generate_tfrecords_zip/train_00016-of-00020',
 'generate_tfrecords_zip/train_00017-of-00020',
 'generate_tfrecords_zip/train_00018-of-00020',
 'generate_tfrecords_zip/train_00019-of-00020']


In [34]:
# Parsing spec for the records written by serialize_example: a fixed-length
# float vector of 8 inputs and a fixed-length float vector holding 1 label.
expected_features = {
    "input_features": tf.io.FixedLenFeature(shape=[8], dtype=tf.float32),
    "label": tf.io.FixedLenFeature(shape=[1], dtype=tf.float32),
}

def parse_example(serialized_example):
    """Decode one serialized `tf.train.Example` into an `(x, y)` pair.

    Args:
        serialized_example: A single serialized Example (scalar string tensor).

    Returns:
        `(input_features, label)` as parsed according to `expected_features`.
    """
    parsed = tf.io.parse_single_example(serialized_example, expected_features)
    inputs, label = parsed["input_features"], parsed["label"]
    return inputs, label

def tfrecords_reader_dataset(file_names, n_readers= 5, batch_size=32, n_parse_threads=5,
                             shuffle_buffer_size=10000, compression_type="GZIP"):
    """Build an endlessly repeating, shuffled, batched `(x, y)` dataset from TFRecord files.

    Args:
        file_names: TFRecord file paths (or glob patterns) accepted by
            `tf.data.Dataset.list_files`.
        n_readers: Number of files read concurrently by `interleave`.
        batch_size: Number of parsed examples per emitted batch.
        n_parse_threads: Parallelism of the example-parsing `map` step.
        shuffle_buffer_size: Size of the buffer used to shuffle records.
        compression_type: Compression the files were written with; it must
            match the writer's setting. Defaults to "GZIP"; pass `None` (or
            "") for uncompressed files.

    Returns:
        A `tf.data.Dataset` of `(x_batch, y_batch)` pairs. It never ends
        because the file list is repeated indefinitely, so consumers must
        bound it themselves (`take`, `steps_per_epoch`, ...).
    """
    dataset = tf.data.Dataset.list_files(file_names)
    dataset = dataset.repeat()  # repeat the file list without limit
    dataset = dataset.interleave(
        lambda fn: tf.data.TFRecordDataset(fn, compression_type = compression_type),
        cycle_length = n_readers
    )
    # Dataset transformations return a new dataset instead of modifying the
    # receiver, so the result has to be re-assigned; a bare
    # `dataset.shuffle(...)` call leaves the pipeline unshuffled.
    dataset = dataset.shuffle(shuffle_buffer_size)
    dataset = dataset.map(
        parse_example,
        num_parallel_calls = n_parse_threads
    )
    dataset = dataset.batch(batch_size)
    return dataset

# One TFRecord-backed dataset per split, all batched identically.
tfrecords_train_set, tfrecords_valid_set, tfrecords_test_set = (
    tfrecords_reader_dataset(shard_paths, batch_size = 32)
    for shard_paths in (
        train_tfrecord_filenames,
        valid_tfrecord_filenames,
        test_tfrecord_filenames,
    )
)
# Uncomment to inspect a single batch:
#for x_batch, y_batch in tfrecords_train_set.take(1):
#    print(x_batch)
#    print(y_batch)

In [35]:
# Small regression network: 8 inputs -> 30 ReLU units -> 1 linear output.
model = keras.models.Sequential()
model.add(keras.layers.Dense(30, activation="relu", input_shape=[8]))
model.add(keras.layers.Dense(1))
model.compile(loss="mean_squared_error", optimizer="sgd")

# Stop early once the monitored metric has gone 5 epochs without improving
# by at least 1e-2.
callbacks = [keras.callbacks.EarlyStopping(patience=5, min_delta=1e-2)]

# The input datasets repeat forever, so the length of a training epoch and of
# a validation pass must be given explicitly (batch_size = 32).
history = model.fit(
    tfrecords_train_set,
    epochs=100,
    steps_per_epoch=11600 // 32,
    validation_data=tfrecords_valid_set,
    validation_steps=3870 // 32,
    callbacks=callbacks,
)

Train for 362 steps, validate for 120 steps
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100


In [None]:
# The test dataset repeats forever, so bound the evaluation to a fixed
# number of batches (batch_size = 32).
model.evaluate(tfrecords_test_set, steps=5160 // 32)