In [2]:
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf

from tensorflow import keras

In [3]:
# Titanic CSVs; the URLs below are presumably where the local copies came from:
#   https://storage.googleapis.com/tf-datasets/titanic/train.csv
#   https://storage.googleapis.com/tf-datasets/titanic/eval.csv
train_file = "./data/train.csv"
eval_file = "./data/eval.csv"

# Load the training and evaluation splits into separate DataFrames.
train_df, eval_df = pd.read_csv(train_file), pd.read_csv(eval_file)

# Row count of each split (one per line), then a preview of the training data.
print(len(train_df), len(eval_df), sep="\n")
print(train_df.head())  # head() returns the first 5 rows by default

627
264
   survived     sex   age  n_siblings_spouses  parch     fare  class     deck  \
0         0    male  22.0                   1      0   7.2500  Third  unknown   
1         1  female  38.0                   1      0  71.2833  First        C   
2         1  female  26.0                   0      0   7.9250  Third  unknown   
3         1  female  35.0                   1      0  53.1000  First        C   
4         0    male  28.0                   0      0   8.4583  Third  unknown   

   embark_town alone  
0  Southampton     n  
1    Cherbourg     n  
2  Southampton     y  
3  Southampton     n  
4   Queenstown     y  


In [4]:
# Split the label column off each frame. pop() removes the column in place, so
# train_df / eval_df hold only feature columns afterwards; re-running this cell
# without reloading the data raises KeyError because the column is already gone.
label_column = 'survived'
y_train = train_df.pop(label_column)
y_eval = eval_df.pop(label_column)

In [5]:
# Columns treated as categories (one-hot encoded) versus plain numeric inputs.
categorical_columns = [
    'sex', 'n_siblings_spouses', 'parch', 'class', 'deck', 'embark_town', 'alone',
]
numeric_columns = ['age', 'fare']

feature_columns = []

# Categorical features: the vocabulary is the set of values seen in the
# training data; indicator_column wraps the vocabulary column to one-hot it.
for cc in categorical_columns:
    vocab = train_df[cc].unique()
    print(cc, vocab)
    vocab_column = tf.feature_column.categorical_column_with_vocabulary_list(cc, vocab)
    feature_columns.append(tf.feature_column.indicator_column(vocab_column))

# Numeric features are declared as float32 columns.
for cc in numeric_columns:
    numeric_column = tf.feature_column.numeric_column(cc, dtype=tf.float32)
    feature_columns.append(numeric_column)

# Crossed feature, e.g. age: [1, 2, 3, 4, 5] x gender: [male, female]
#   -> age_x_gender: [(1, male), (2, male), ..., (1, female), (2, female), ...]
# hash_bucket_size maps a large number of crossed values onto a smaller number
# of buckets, e.g. 100000 values with 100 buckets -> hash(value) % 100.
# Crossed features suit linear models better; DNN models tend not to benefit.
# A linear estimator can consume the hashed cross directly, whereas a DNN
# estimator needs it wrapped in an indicator_column first.
age_x_sex = tf.feature_column.crossed_column(['age', 'sex'], hash_bucket_size=30)
feature_columns.append(tf.feature_column.indicator_column(age_x_sex))

sex ['male' 'female']
n_siblings_spouses [1 0 3 4 2 5 8]
parch [0 1 2 5 3 4]
class ['Third' 'First' 'Second']
deck ['unknown' 'C' 'G' 'A' 'B' 'D' 'F' 'E']
embark_town ['Southampton' 'Cherbourg' 'Queenstown' 'unknown']
alone ['n' 'y']


In [6]:
def make_dataset(data_df, label_df, epochs = 10, shuffle=True, batch_size =32):
    """Build a batched tf.data.Dataset of (features, label) pairs.

    Args:
        data_df: Feature table; it is converted with dict(), so each column
            becomes a named feature (callers in this notebook pass a pandas
            DataFrame).
        label_df: Labels, sliced alongside the features.
        epochs: Number of times the data is repeated.
        shuffle: When true, shuffle with a buffer of 10000 elements before
            repeating.
        batch_size: Number of elements per batch.

    Returns:
        The shuffled (optional), repeated and batched dataset.
    """
    features_and_labels = (dict(data_df), label_df)
    ds = tf.data.Dataset.from_tensor_slices(features_and_labels)
    if shuffle:
        ds = ds.shuffle(10000)
    ds = ds.repeat(epochs)
    return ds.batch(batch_size)

In [7]:
output_dir = "baseline_model"
# exist_ok=True makes directory creation idempotent, with no separate
# exists() check (and therefore no check-then-create race).
os.makedirs(output_dir, exist_ok=True)

# BaselineClassifier: per the TF docs it ignores the feature values and
# predicts from the class proportions seen in the labels, so it is a sanity
# baseline rather than a real model.
# NOTE(review): the output recorded for this cell shows train() failing with
# "FailedPreconditionError: GetNext() failed because the iterator has not been
# initialized". The cause is not visible in this code; confirm against the
# installed TensorFlow / tensorflow_estimator version.
baseline_estimator = tf.estimator.BaselineClassifier(
    model_dir=output_dir,
    n_classes=2,
)
baseline_estimator.train(
    input_fn=lambda: make_dataset(train_df, y_train, epochs=100)
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'baseline_model', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fe995cb2190>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
Instructions for updating:
If using Keras pass *_constrain

FailedPreconditionError: GetNext() failed because the iterator has not been initialized. Ensure that you have run the initializer operation for this iterator before getting the next element.
	 [[node IteratorGetNext (defined at /home/zx/anaconda3/envs/tf2/lib/python3.7/site-packages/tensorflow_core/python/framework/ops.py:1751) ]]

Original stack trace for 'IteratorGetNext':
  File "/home/zx/anaconda3/envs/tf2/lib/python3.7/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/home/zx/anaconda3/envs/tf2/lib/python3.7/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/home/zx/anaconda3/envs/tf2/lib/python3.7/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/home/zx/anaconda3/envs/tf2/lib/python3.7/site-packages/traitlets/config/application.py", line 664, in launch_instance
    app.start()
  File "/home/zx/anaconda3/envs/tf2/lib/python3.7/site-packages/ipykernel/kernelapp.py", line 583, in start
    self.io_loop.start()
  File "/home/zx/anaconda3/envs/tf2/lib/python3.7/site-packages/tornado/platform/asyncio.py", line 148, in start
    self.asyncio_loop.run_forever()
  File "/home/zx/anaconda3/envs/tf2/lib/python3.7/asyncio/base_events.py", line 534, in run_forever
    self._run_once()
  File "/home/zx/anaconda3/envs/tf2/lib/python3.7/asyncio/base_events.py", line 1771, in _run_once
    handle._run()
  File "/home/zx/anaconda3/envs/tf2/lib/python3.7/asyncio/events.py", line 88, in _run
    self._context.run(self._callback, *self._args)
  File "/home/zx/anaconda3/envs/tf2/lib/python3.7/site-packages/tornado/ioloop.py", line 690, in <lambda>
    lambda f: self._run_callback(functools.partial(callback, future))
  File "/home/zx/anaconda3/envs/tf2/lib/python3.7/site-packages/tornado/ioloop.py", line 743, in _run_callback
    ret = callback()
  File "/home/zx/anaconda3/envs/tf2/lib/python3.7/site-packages/tornado/gen.py", line 787, in inner
    self.run()
  File "/home/zx/anaconda3/envs/tf2/lib/python3.7/site-packages/tornado/gen.py", line 748, in run
    yielded = self.gen.send(value)
  File "/home/zx/anaconda3/envs/tf2/lib/python3.7/site-packages/ipykernel/kernelbase.py", line 361, in process_one
    yield gen.maybe_future(dispatch(*args))
  File "/home/zx/anaconda3/envs/tf2/lib/python3.7/site-packages/tornado/gen.py", line 209, in wrapper
    yielded = next(result)
  File "/home/zx/anaconda3/envs/tf2/lib/python3.7/site-packages/ipykernel/kernelbase.py", line 268, in dispatch_shell
    yield gen.maybe_future(handler(stream, idents, msg))
  File "/home/zx/anaconda3/envs/tf2/lib/python3.7/site-packages/tornado/gen.py", line 209, in wrapper
    yielded = next(result)
  File "/home/zx/anaconda3/envs/tf2/lib/python3.7/site-packages/ipykernel/kernelbase.py", line 541, in execute_request
    user_expressions, allow_stdin,
  File "/home/zx/anaconda3/envs/tf2/lib/python3.7/site-packages/tornado/gen.py", line 209, in wrapper
    yielded = next(result)
  File "/home/zx/anaconda3/envs/tf2/lib/python3.7/site-packages/ipykernel/ipkernel.py", line 300, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/home/zx/anaconda3/envs/tf2/lib/python3.7/site-packages/ipykernel/zmqshell.py", line 536, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/home/zx/anaconda3/envs/tf2/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 2858, in run_cell
    raw_cell, store_history, silent, shell_futures)
  File "/home/zx/anaconda3/envs/tf2/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 2886, in _run_cell
    return runner(coro)
  File "/home/zx/anaconda3/envs/tf2/lib/python3.7/site-packages/IPython/core/async_helpers.py", line 68, in _pseudo_sync_runner
    coro.send(None)
  File "/home/zx/anaconda3/envs/tf2/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3063, in run_cell_async
    interactivity=interactivity, compiler=compiler, result=result)
  File "/home/zx/anaconda3/envs/tf2/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3254, in run_ast_nodes
    if (await self.run_code(code, result,  async_=asy)):
  File "/home/zx/anaconda3/envs/tf2/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3331, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-7-895487712d34>", line 11, in <module>
    input_fn = lambda : make_dataset(train_df, y_train, epochs=100)
  File "/home/zx/anaconda3/envs/tf2/lib/python3.7/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 370, in train
    loss = self._train_model(input_fn, hooks, saving_listeners)
  File "/home/zx/anaconda3/envs/tf2/lib/python3.7/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 1160, in _train_model
    return self._train_model_default(input_fn, hooks, saving_listeners)
  File "/home/zx/anaconda3/envs/tf2/lib/python3.7/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 1187, in _train_model_default
    input_fn, ModeKeys.TRAIN))
  File "/home/zx/anaconda3/envs/tf2/lib/python3.7/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 1024, in _get_features_and_labels_from_input_fn
    self._call_input_fn(input_fn, mode))
  File "/home/zx/anaconda3/envs/tf2/lib/python3.7/site-packages/tensorflow_estimator/python/estimator/util.py", line 65, in parse_input_fn_result
    result = iterator.get_next()
  File "/home/zx/anaconda3/envs/tf2/lib/python3.7/site-packages/tensorflow_core/python/data/ops/iterator_ops.py", line 426, in get_next
    name=name)
  File "/home/zx/anaconda3/envs/tf2/lib/python3.7/site-packages/tensorflow_core/python/ops/gen_dataset_ops.py", line 2500, in iterator_get_next
    output_shapes=output_shapes, name=name)
  File "/home/zx/anaconda3/envs/tf2/lib/python3.7/site-packages/tensorflow_core/python/framework/op_def_library.py", line 793, in _apply_op_helper
    op_def=op_def)
  File "/home/zx/anaconda3/envs/tf2/lib/python3.7/site-packages/tensorflow_core/python/util/deprecation.py", line 507, in new_func
    return func(*args, **kwargs)
  File "/home/zx/anaconda3/envs/tf2/lib/python3.7/site-packages/tensorflow_core/python/framework/ops.py", line 3360, in create_op
    attrs, op_def, compute_device)
  File "/home/zx/anaconda3/envs/tf2/lib/python3.7/site-packages/tensorflow_core/python/framework/ops.py", line 3429, in _create_op_internal
    op_def=op_def)
  File "/home/zx/anaconda3/envs/tf2/lib/python3.7/site-packages/tensorflow_core/python/framework/ops.py", line 1751, in __init__
    self._traceback = tf_stack.extract_stack()


In [None]:
# Evaluate the baseline on the held-out split: a single pass, in file order.
# The labels are bound to `y_eval` where they are popped from eval_df; the
# previous spelling `yeval` was an undefined name and raised NameError.
baseline_estimator.evaluate(
    input_fn=lambda: make_dataset(
        eval_df, y_eval, epochs=1, shuffle=False, batch_size=20
    )
)

In [8]:
linear_output_dir = "linear_model_new_features"
# exist_ok=True makes directory creation idempotent, with no separate
# exists() check (and therefore no check-then-create race).
os.makedirs(linear_output_dir, exist_ok=True)

# Binary linear classifier over the one-hot, numeric and crossed feature
# columns; checkpoints are written under linear_output_dir.
linear_estimator = tf.estimator.LinearClassifier(
    model_dir=linear_output_dir,
    n_classes=2,
    feature_columns=feature_columns,
)

# Train on the training split repeated for 100 epochs (shuffled by default).
linear_estimator.train(
    input_fn=lambda: make_dataset(train_df, y_train, epochs=100)
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'linear_model_new_features', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fe9941dbd10>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.


To change a

<tensorflow_estimator.python.estimator.canned.linear.LinearClassifierV2 at 0x7fe995ceff50>

In [9]:
def linear_eval_input_fn():
    """Return the evaluation split as a dataset: one pass, unshuffled."""
    return make_dataset(eval_df, y_eval, epochs=1, shuffle=False)


# Report the linear model's metrics on the held-out split.
linear_estimator.evaluate(input_fn=linear_eval_input_fn)

INFO:tensorflow:Calling model_fn.


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2020-02-29T17:07:27Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from linear_model_new_features/model.ckpt-1960
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2020-02-29-17:07:29
INFO:tensorflow:Saving dict for global step 1960: accuracy = 0.79924244, accuracy_baseline = 0.625, auc = 0.8509337, auc_precision_recall = 0.79958504, average_loss = 0.45550686, global_step = 1960, label/mean = 0.375, loss = 0.44758192, precision = 0.7395833, prediction/mean = 0.38257372, recall = 0.7

{'accuracy': 0.79924244,
 'accuracy_baseline': 0.625,
 'auc': 0.8509337,
 'auc_precision_recall': 0.79958504,
 'average_loss': 0.45550686,
 'label/mean': 0.375,
 'loss': 0.44758192,
 'precision': 0.7395833,
 'prediction/mean': 0.38257372,
 'recall': 0.7171717,
 'global_step': 1960}

In [10]:
dnn_output_dir = "./dnn_model_new_features"
# exist_ok=True makes directory creation idempotent, with no separate
# exists() check (and therefore no check-then-create race).
os.makedirs(dnn_output_dir, exist_ok=True)

# Binary DNN classifier over the same feature columns; checkpoints are written
# under dnn_output_dir.
dnn_estimator = tf.estimator.DNNClassifier(
    model_dir=dnn_output_dir,
    n_classes=2,
    feature_columns=feature_columns,
    hidden_units=[30, 30],  # number of units in each hidden layer
    activation_fn=tf.nn.relu,
    optimizer='Adam',
)

# Train on the training split repeated for 100 epochs (shuffled by default).
dnn_estimator.train(
    input_fn=lambda: make_dataset(train_df, y_train, epochs=100)
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': './dnn_model_new_features', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fe94cd2a350>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.


To change al

<tensorflow_estimator.python.estimator.canned.dnn.DNNClassifierV2 at 0x7fe94cd22c10>

In [11]:
def dnn_eval_input_fn():
    """Return the evaluation split as a dataset: one pass, unshuffled."""
    return make_dataset(eval_df, y_eval, epochs=1, shuffle=False)


# Report the DNN model's metrics on the held-out split.
dnn_estimator.evaluate(input_fn=dnn_eval_input_fn)

INFO:tensorflow:Calling model_fn.


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2020-02-29T17:07:44Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./dnn_model_new_features/model.ckpt-1960
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2020-02-29-17:07:45
INFO:tensorflow:Saving dict for global step 1960: accuracy = 0.79924244, accuracy_baseline = 0.625, auc = 0.8654729, auc_precision_recall = 0.81769705, average_loss = 0.4514043, global_step = 1960, label/mean = 0.375, loss = 0.43792456, precision = 0.72115386, prediction/mean = 0.3968539, recall = 0.757

{'accuracy': 0.79924244,
 'accuracy_baseline': 0.625,
 'auc': 0.8654729,
 'auc_precision_recall': 0.81769705,
 'average_loss': 0.4514043,
 'label/mean': 0.375,
 'loss': 0.43792456,
 'precision': 0.72115386,
 'prediction/mean': 0.3968539,
 'recall': 0.75757575,
 'global_step': 1960}