In [1]:
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf

from tensorflow import keras

# Record the interpreter and library versions used for this run so the
# outputs below can be reproduced.
print(tf.__version__)
print(sys.version_info)
versioned_libs = (mpl, np, pd, sklearn, tf, keras)
for lib in versioned_libs:
    print(lib.__name__, lib.__version__)

2.0.1
sys.version_info(major=3, minor=7, micro=4, releaselevel='final', serial=0)
matplotlib 3.1.3
numpy 1.18.1
pandas 1.0.1
sklearn 0.22.1
tensorflow 2.0.1
tensorflow_core.keras 2.2.4-tf


In [2]:
from sklearn.datasets import fetch_california_housing

# Load the California housing regression dataset.
housing = fetch_california_housing()

# Show the dataset description, then the shapes of the feature matrix and
# of the target vector.
print(housing.DESCR)
for array in (housing.data, housing.target):
    print(array.shape)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block
        - HouseAge      median house age in block
        - AveRooms      average number of rooms
        - AveBedrms     average number of bedrooms
        - Population    block population
        - AveOccup      average house occupancy
        - Latitude      house block latitude
        - Longitude     house block longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
http://lib.stat.cmu.edu/datasets/

The target variable is the median house value for California districts.

This dataset was derived from the 1990 U.S. census, using one row per census
block group. A block group is the smallest geographical unit for which the U.S.
Census Bur

In [3]:
from sklearn.model_selection import train_test_split

# First hold out a test set, then carve a validation set out of what is left.
# No test_size is passed, so sklearn's default split ratio applies; the fixed
# random_state values keep both splits reproducible.
x_train_all, x_test, y_train_all, y_test = train_test_split(
    housing.data, housing.target, random_state=7
)
x_train, x_valid, y_train, y_valid = train_test_split(
    x_train_all, y_train_all, random_state=11
)

# Print (features shape, target shape) for the train, validation and test splits.
for features, targets in ((x_train, y_train), (x_valid, y_valid), (x_test, y_test)):
    print(features.shape, targets.shape)

(11610, 8) (11610,)
(3870, 8) (3870,)
(5160, 8) (5160,)


In [4]:
from sklearn.preprocessing import StandardScaler

# Fit the scaling statistics on the training split only, then apply the same
# transform to every split so validation/test data never influence the fit.
scaler = StandardScaler()
scaler.fit(x_train)
x_train_scaled, x_valid_scaled, x_test_scaled = (
    scaler.transform(split) for split in (x_train, x_valid, x_test)
)

In [6]:
# Directory that receives the generated CSV shards.
output_dir = "./generate_csv"
# makedirs(exist_ok=True) is idempotent on re-runs, creates missing parent
# directories, and avoids the race between an existence check and mkdir.
os.makedirs(output_dir, exist_ok=True)
    
def save_to_csv(output_dir, data, name_prefix, header=None, n_parts=10):
    """Split ``data`` row-wise into ``n_parts`` CSV files and return their paths.

    Rows are divided into consecutive chunks with ``np.array_split``, so the
    chunks differ in length by at most one row. Each chunk is written to
    ``<output_dir>/<name_prefix>_<NN>.csv`` where ``NN`` is the zero-padded
    chunk index.

    Args:
        output_dir: Directory to write into; created if it does not exist.
        data: Indexable sequence of rows (e.g. a 2-D NumPy array). Each row
            must be iterable over its column values.
        name_prefix: File name prefix, e.g. ``"train"``.
        header: Optional header line (without trailing newline) written as
            the first line of every file.
        n_parts: Number of files to produce.

    Returns:
        List of the written file paths, in chunk order.
    """
    # Make the function usable on its own, not only after the caller has
    # created the directory.
    os.makedirs(output_dir, exist_ok=True)

    path_format = os.path.join(output_dir, "{}_{:02d}.csv")
    file_names = []

    row_chunks = np.array_split(np.arange(len(data)), n_parts)
    for file_idx, row_indices in enumerate(row_chunks):
        part_csv = path_format.format(name_prefix, file_idx)
        file_names.append(part_csv)
        with open(part_csv, "wt", encoding="utf-8") as f:
            if header is not None:
                f.write(header + "\n")
            for row_index in row_indices:
                # str() rather than repr(): NumPy >= 2 renders scalar reprs as
                # e.g. "np.float64(1.5)", which would corrupt the CSV. str()
                # yields the plain number on every NumPy version.
                f.write(",".join(str(col_t) for col_t in data[row_index]))
                f.write("\n")

    return file_names

# np.c_ joins its arguments column-wise, so every row holds the scaled
# features followed by that row's target value.
train_data = np.c_[x_train_scaled, y_train]
valid_data = np.c_[x_valid_scaled, y_valid]
test_data = np.c_[x_test_scaled, y_test]

# NOTE(review): "Midian" looks like a typo for "Median"; it is left unchanged
# here because this text is written into the generated CSV files.
header_cols = [*housing.feature_names, "MidianHouseValue"]
header_str = ",".join(header_cols)

# Write each split as a set of CSV shards: 20 for training, 10 each for
# validation and test.
train_filenames = save_to_csv(output_dir, train_data, "train", header=header_str, n_parts=20)
valid_filenames = save_to_csv(output_dir, valid_data, "valid", header=header_str, n_parts=10)
test_filenames = save_to_csv(output_dir, test_data, "test", header=header_str, n_parts=10)

In [12]:
import pprint

# Show the shard paths generated for each split (train, valid, test).
for shard_paths in (train_filenames, valid_filenames, test_filenames):
    pprint.pprint(shard_paths)

['./generate_csv/train_00.csv',
 './generate_csv/train_01.csv',
 './generate_csv/train_02.csv',
 './generate_csv/train_03.csv',
 './generate_csv/train_04.csv',
 './generate_csv/train_05.csv',
 './generate_csv/train_06.csv',
 './generate_csv/train_07.csv',
 './generate_csv/train_08.csv',
 './generate_csv/train_09.csv',
 './generate_csv/train_10.csv',
 './generate_csv/train_11.csv',
 './generate_csv/train_12.csv',
 './generate_csv/train_13.csv',
 './generate_csv/train_14.csv',
 './generate_csv/train_15.csv',
 './generate_csv/train_16.csv',
 './generate_csv/train_17.csv',
 './generate_csv/train_18.csv',
 './generate_csv/train_19.csv']
['./generate_csv/valid_00.csv',
 './generate_csv/valid_01.csv',
 './generate_csv/valid_02.csv',
 './generate_csv/valid_03.csv',
 './generate_csv/valid_04.csv',
 './generate_csv/valid_05.csv',
 './generate_csv/valid_06.csv',
 './generate_csv/valid_07.csv',
 './generate_csv/valid_08.csv',
 './generate_csv/valid_09.csv']
['./generate_csv/test_00.csv',
 './gener

In [13]:
# Pipeline outline:
#   1. file names -> dataset of file names
#   2. read each CSV -> one dataset per file -> merge them into one dataset
#   3. parse each CSV line
# Step 1: build a dataset whose elements are the training shard paths.
filename_dataset = tf.data.Dataset.list_files(train_filenames)
for file_path in filename_dataset:
    print(file_path)

tf.Tensor(b'./generate_csv/train_18.csv', shape=(), dtype=string)
tf.Tensor(b'./generate_csv/train_13.csv', shape=(), dtype=string)
tf.Tensor(b'./generate_csv/train_03.csv', shape=(), dtype=string)
tf.Tensor(b'./generate_csv/train_16.csv', shape=(), dtype=string)
tf.Tensor(b'./generate_csv/train_10.csv', shape=(), dtype=string)
tf.Tensor(b'./generate_csv/train_05.csv', shape=(), dtype=string)
tf.Tensor(b'./generate_csv/train_14.csv', shape=(), dtype=string)
tf.Tensor(b'./generate_csv/train_17.csv', shape=(), dtype=string)
tf.Tensor(b'./generate_csv/train_01.csv', shape=(), dtype=string)
tf.Tensor(b'./generate_csv/train_06.csv', shape=(), dtype=string)
tf.Tensor(b'./generate_csv/train_00.csv', shape=(), dtype=string)
tf.Tensor(b'./generate_csv/train_12.csv', shape=(), dtype=string)
tf.Tensor(b'./generate_csv/train_07.csv', shape=(), dtype=string)
tf.Tensor(b'./generate_csv/train_02.csv', shape=(), dtype=string)
tf.Tensor(b'./generate_csv/train_11.csv', shape=(), dtype=string)
tf.Tensor(

In [14]:
# Step 2: read the shards line by line, interleaving lines from 5 files at a
# time. skip(1) drops the first line of each file (the header row).
dataset = filename_dataset.interleave(
    lambda path: tf.data.TextLineDataset(path).skip(1),
    cycle_length=5,
)

# Peek at the first 15 raw CSV lines.
for line in dataset.take(15):
    print(line)

tf.Tensor(b'0.8115083791797953,-0.04823952235146133,0.5187339067174729,-0.029386394873127775,-0.034064024638222286,-0.05081594842905086,-0.7157356834231196,0.9162751241885168,2.147', shape=(), dtype=string)
tf.Tensor(b'0.04971034572063198,-0.8492418886278699,-0.06214699417830008,0.17878747064657746,-0.8025354230744277,0.0005066066922077538,0.6466457006743215,-1.1060793768010604,2.286', shape=(), dtype=string)
tf.Tensor(b'-0.6672227549433569,-0.04823952235146133,0.34529405473316743,0.5382668657200925,1.8521839533415545,-0.0611253832474835,-0.8417093045554153,1.520484740533546,1.59', shape=(), dtype=string)
tf.Tensor(b'-0.09719300311107498,-1.249743071766074,0.36232962250170797,0.026906080250728295,1.033811814747154,0.045881586971778555,1.3418334617377423,-1.6353869745909178,1.832', shape=(), dtype=string)
tf.Tensor(b'0.09734603446040174,0.7527628439249472,-0.20218964416999152,-0.1954700015215477,-0.4060513603629498,0.006785531677655949,-0.813715166526018,0.656614793197258,1.119', shape=

In [9]:
# tf.io.decode_csv(records, record_defaults): record_defaults supplies, for
# each field, its type and the default value used when the field is empty.
sample_str = "1,2,3,4,5"
record_defaults = [
    tf.constant(0, dtype=tf.int32),  # explicit int32 tensor
    0,                               # Python int
    np.nan,                          # Python float
    "hello",                         # string
    tf.constant([]),                 # empty default: the field is required
]
parsed_fields = tf.io.decode_csv(sample_str, record_defaults)
print(parsed_fields)

[<tf.Tensor: id=20, shape=(), dtype=int32, numpy=1>, <tf.Tensor: id=21, shape=(), dtype=int32, numpy=2>, <tf.Tensor: id=22, shape=(), dtype=float32, numpy=3.0>, <tf.Tensor: id=23, shape=(), dtype=string, numpy=b'4'>, <tf.Tensor: id=24, shape=(), dtype=float32, numpy=5.0>]


In [10]:
# All fields empty: decoding raises because the last field has an empty
# default and is therefore required. The error is caught and displayed.
sample_str = ",,,,"
try:
    parsed_fields = tf.io.decode_csv(sample_str, record_defaults)
except tf.errors.InvalidArgumentError as decode_error:
    print(decode_error)

Field 4 is required but missing in record 0! [Op:DecodeCSV]


In [11]:
# More fields than record_defaults entries: decoding raises, and the error
# is caught and displayed.
sample_str = "1,2,3,4,5,6,7,8"
try:
    parsed_fields = tf.io.decode_csv(sample_str, record_defaults)
except tf.errors.InvalidArgumentError as decode_error:
    print(decode_error)

Expect 5 fields but have 8 in record 0 [Op:DecodeCSV]


In [20]:
def parse_csv_line(line_str, n_fields=9):
    """Parse one CSV text line into a ``(features, target)`` tensor pair.

    Every field is decoded as a float, with NaN as the default for empty
    fields.

    Args:
        line_str: A single CSV record (string tensor or Python string).
        n_fields: Total number of comma-separated fields in the record.

    Returns:
        ``(x, y)`` where ``x`` stacks all fields except the last and ``y`` is
        a one-element tensor holding the last field.
    """
    float_defaults = [tf.constant(np.nan)] * n_fields
    fields = tf.io.decode_csv(line_str, record_defaults=float_defaults)
    features = tf.stack(fields[:-1])
    # Slice (not index) the last field so the target keeps a length-1 axis.
    target = tf.stack(fields[-1:])
    return features, target

print(parse_csv_line)

<function parse_csv_line at 0x7ff12859cc20>


In [22]:
# 1. file_names -> dataset
# 2. read csv -> dataset -> datasets ->merge
# 3. parse csv

def csv_reader_dataset(file_names, n_readers=5, batch_size=32, n_parse_threads=5, shuffle_buffer_size=10000):
    """Build a batched, shuffled, endlessly repeating dataset from CSV shards.

    Pipeline: file names -> (repeat forever) -> interleaved text lines with
    each file's header row skipped -> shuffle -> parse -> batch.

    Args:
        file_names: File paths (or glob patterns) of the CSV shards.
        n_readers: Number of files read concurrently by ``interleave``.
        batch_size: Number of parsed records per batch.
        n_parse_threads: Parallelism used when mapping ``parse_csv_line``.
        shuffle_buffer_size: Size of the buffer used to shuffle lines.

    Returns:
        A ``tf.data.Dataset`` yielding ``(x_batch, y_batch)`` pairs as
        produced by ``parse_csv_line``. Because the file list is repeated
        indefinitely, consumers must bound iteration themselves (e.g. with
        ``take`` or ``steps_per_epoch``).
    """
    dataset = tf.data.Dataset.list_files(file_names)
    dataset = dataset.repeat()  # repeat the file list indefinitely
    dataset = dataset.interleave(
        lambda fn: tf.data.TextLineDataset(fn).skip(1),  # skip(1): drop header
        cycle_length=n_readers,
    )
    # Dataset transformations return a new dataset rather than modifying the
    # receiver, so the result must be reassigned. The original discarded it,
    # which meant no shuffling was applied.
    dataset = dataset.shuffle(shuffle_buffer_size)
    dataset = dataset.map(
        parse_csv_line,
        num_parallel_calls=n_parse_threads,
    )
    dataset = dataset.batch(batch_size)
    return dataset
    
# Sanity check: build a tiny-batch dataset and display its first two batches.
train_set = csv_reader_dataset(train_filenames, batch_size=3)
for x_batch, y_batch in train_set.take(2):
    for label, batch in (("x:", x_batch), ("y:", y_batch)):
        print(label)
        pprint.pprint(batch)

x:
<tf.Tensor: id=352, shape=(3, 8), dtype=float32, numpy=
array([[-1.0591781e+00,  1.3935647e+00, -2.6331969e-02, -1.1006760e-01,
        -6.1381990e-01, -9.6959352e-02,  3.2471311e-01, -3.7477244e-02],
       [ 6.3034356e-01,  1.8741661e+00, -6.7132145e-02, -1.2543367e-01,
        -1.9737554e-01, -2.2722632e-02, -6.9240725e-01,  7.2652334e-01],
       [ 4.9710345e-02, -8.4924191e-01, -6.2146995e-02,  1.7878747e-01,
        -8.0253541e-01,  5.0660671e-04,  6.4664572e-01, -1.1060793e+00]],
      dtype=float32)>
y:
<tf.Tensor: id=353, shape=(3, 1), dtype=float32, numpy=
array([[0.672],
       [2.419],
       [2.286]], dtype=float32)>
x:
<tf.Tensor: id=354, shape=(3, 8), dtype=float32, numpy=
array([[ 0.81150836, -0.04823952,  0.5187339 , -0.0293864 , -0.03406402,
        -0.05081595, -0.7157357 ,  0.91627514],
       [ 0.63636464, -1.0895426 ,  0.09260903, -0.20538124,  1.2025671 ,
        -0.03630123, -0.6784102 ,  0.18223535],
       [-0.22235657,  1.3935647 ,  0.029913  ,  0.0801452 

In [23]:
# Build the input pipelines for all three splits with the same batch size.
train_set, valid_set, test_set = (
    csv_reader_dataset(shard_paths, batch_size=32)
    for shard_paths in (train_filenames, valid_filenames, test_filenames)
)

In [25]:
# Small regression network: one hidden ReLU layer over the 8 input features
# and a single linear output unit.
model = keras.models.Sequential([
    keras.layers.Dense(30, activation="relu", input_shape=[8]),
    keras.layers.Dense(1)
])
model.compile(loss="mean_squared_error", optimizer="sgd")

# Stop early once the monitored metric has not improved by at least min_delta
# for `patience` consecutive epochs.
callbacks = [keras.callbacks.EarlyStopping(patience=5, min_delta=1e-2)]

# The datasets repeat forever, so Keras must be told how many batches make up
# one epoch. Derive the step counts from the actual split sizes instead of
# hard-coding them (the original used 11600 for the 11610 training rows).
batch_size = 32  # must match the batch_size used to build train_set / valid_set
history = model.fit(
    train_set,
    steps_per_epoch=len(x_train) // batch_size,
    validation_data=valid_set,
    validation_steps=len(x_valid) // batch_size,
    epochs=100,
    callbacks=callbacks,
)

Train for 362 steps, validate for 120 steps
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100


In [26]:
# test_set repeats forever, so bound the evaluation to one pass over the test
# split. The step count is derived from the split size rather than a
# hard-coded row count; 32 must match the batch_size used to build test_set.
model.evaluate(test_set, steps=len(x_test) // 32)



0.3910891309565639