In [1]:
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf

from tensorflow import keras

In [2]:
# Upstream copies of the Titanic CSVs (presumably where the local files came from):
#   https://storage.googleapis.com/tf-datasets/titanic/train.csv
#   https://storage.googleapis.com/tf-datasets/titanic/eval.csv
train_file = "./data/train.csv"
eval_file = "./data/eval.csv"

# Read both splits in one expression; the generator variable does not leak
# into the notebook namespace.
train_df, eval_df = (
    pd.read_csv(csv_path) for csv_path in (train_file, eval_file)
)

# Row count of each split, one per line.
print(len(train_df), len(eval_df), sep="\n")
# head() returns the first 5 rows by default.
print(train_df.head())

627
264
   survived     sex   age  n_siblings_spouses  parch     fare  class     deck  \
0         0    male  22.0                   1      0   7.2500  Third  unknown   
1         1  female  38.0                   1      0  71.2833  First        C   
2         1  female  26.0                   0      0   7.9250  Third  unknown   
3         1  female  35.0                   1      0  53.1000  First        C   
4         0    male  28.0                   0      0   8.4583  Third  unknown   

   embark_town alone  
0  Southampton     n  
1    Cherbourg     n  
2  Southampton     y  
3  Southampton     n  
4   Queenstown     y  


In [3]:
# Split the 'survived' label column off each feature frame.
# DataFrame.pop removes the column from the frame in place, so an unguarded
# second run of this cell would raise KeyError. The membership checks make
# re-running the cell a no-op that keeps the labels extracted the first time.
if 'survived' in train_df.columns:
    y_train = train_df.pop('survived')
if 'survived' in eval_df.columns:
    y_eval = eval_df.pop('survived')

In [4]:
# Columns encoded as one-hot vectors vs. columns fed in as raw floats.
categorical_columns = ['sex', 'n_siblings_spouses', 'parch', 'class', 'deck', 'embark_town', 'alone']
numeric_columns = ['age', 'fare']

feature_columns = []

# Categorical features: the vocabulary is the set of distinct values seen in
# the training frame; indicator_column turns the category into a one-hot vector.
for column_name in categorical_columns:
    vocabulary = train_df[column_name].unique()
    print(column_name, vocabulary)
    categorical = tf.feature_column.categorical_column_with_vocabulary_list(
        column_name, vocabulary
    )
    feature_columns.append(tf.feature_column.indicator_column(categorical))

# Numeric features are appended after the categorical ones, as float32.
feature_columns.extend(
    tf.feature_column.numeric_column(column_name, dtype=tf.float32)
    for column_name in numeric_columns
)

sex ['male' 'female']
n_siblings_spouses [1 0 3 4 2 5 8]
parch [0 1 2 5 3 4]
class ['Third' 'First' 'Second']
deck ['unknown' 'C' 'G' 'A' 'B' 'D' 'F' 'E']
embark_town ['Southampton' 'Cherbourg' 'Queenstown' 'unknown']
alone ['n' 'y']


In [5]:
def make_dataset(data_df, label_df, epochs = 10, shuffle=True, batch_size =32,
                 shuffle_buffer_size=10000):
    """Build a batched ``tf.data.Dataset`` of ``(features, label)`` pairs.

    Args:
        data_df: Feature table. ``dict(data_df)`` must yield a mapping from
            column name to column values (a pandas DataFrame in this notebook).
        label_df: Labels, aligned row-for-row with ``data_df``.
        epochs: Number of times the data is repeated.
        shuffle: Whether to shuffle the examples before repeating.
        batch_size: Number of examples per batch.
        shuffle_buffer_size: Buffer size passed to ``Dataset.shuffle``.
            Defaults to 10000, the value that used to be hard-coded; pass a
            value at least as large as the number of rows when the data is
            bigger than that and a full shuffle is wanted.

    Returns:
        The dataset after the optional shuffle, ``repeat(epochs)`` and
        ``batch(batch_size)``, in that order.
    """
    dataset = tf.data.Dataset.from_tensor_slices((dict(data_df), label_df))
    if shuffle:
        dataset = dataset.shuffle(shuffle_buffer_size)
    # Repeat before batching, so batches may span epoch boundaries.
    return dataset.repeat(epochs).batch(batch_size)

In [6]:
output_dir = "baseline_model"
# makedirs(exist_ok=True) replaces the exists()/mkdir() pair: one call, no gap
# between the check and the creation, and no error if the directory is there.
os.makedirs(output_dir, exist_ok=True)

# BaselineClassifier: predicts classes from the proportion of each class seen
# in the training labels. There is no real model; it is a reference point that
# guesses according to those proportions.
baseline_estimator = tf.estimator.BaselineClassifier(
    model_dir = output_dir,
    n_classes = 2
)
# The input_fn is a zero-argument callable, so the dataset is built when the
# estimator calls it.
baseline_estimator.train(
    input_fn = lambda : make_dataset(train_df, y_train, epochs=100)
)

TypeError: __init__() got an unexpected keyword argument 'feature_columns'

In [None]:
# Evaluate the baseline on the eval split: a single pass, unshuffled,
# in batches of 20.
# Fix: the labels are stored in `y_eval`; the previous `yeval` was an
# undefined name and raised NameError when the input_fn was called.
baseline_estimator.evaluate(
    input_fn = lambda : make_dataset(
        eval_df, y_eval, epochs = 1, shuffle=False, batch_size=20
    )
)

In [30]:
linear_output_dir = "linear_model"
# makedirs(exist_ok=True) replaces the exists()/mkdir() pair: one call, no gap
# between the check and the creation, and no error if the directory is there.
os.makedirs(linear_output_dir, exist_ok=True)

# Two-class linear classifier over the feature columns built earlier;
# model_dir is the directory handed to the estimator for its files.
linear_estimator = tf.estimator.LinearClassifier(
    model_dir = linear_output_dir,
    n_classes=2,
    feature_columns=feature_columns
)

# Train on the training split, repeated for 100 epochs with make_dataset's
# default shuffling and batch size.
linear_estimator.train(
    input_fn = lambda : make_dataset(
        train_df, y_train, epochs=100
    )
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'linear_model', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f54b21394d0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.


To change all layers to 

<tensorflow_estimator.python.estimator.canned.linear.LinearClassifierV2 at 0x7f54b2136d10>

In [32]:
# Score the trained linear model on the eval split: one pass, original row order.
linear_estimator.evaluate(
    input_fn=lambda: make_dataset(eval_df, y_eval, epochs=1, shuffle=False)
)

INFO:tensorflow:Calling model_fn.


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2020-02-29T16:37:36Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from linear_model/model.ckpt-1960
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2020-02-29-16:37:37
INFO:tensorflow:Saving dict for global step 1960: accuracy = 0.78409094, accuracy_baseline = 0.625, auc = 0.83881235, auc_precision_recall = 0.77860093, average_loss = 0.4690346, global_step = 1960, label/mean = 0.375, loss = 0.45286942, precision = 0.71, prediction/mean = 0.38596186, recall = 0.7171717
INFO:tensor

{'accuracy': 0.78409094,
 'accuracy_baseline': 0.625,
 'auc': 0.83881235,
 'auc_precision_recall': 0.77860093,
 'average_loss': 0.4690346,
 'label/mean': 0.375,
 'loss': 0.45286942,
 'precision': 0.71,
 'prediction/mean': 0.38596186,
 'recall': 0.7171717,
 'global_step': 1960}

In [33]:
dnn_output_dir = "./dnn_model"
# makedirs(exist_ok=True) replaces the exists()/mkdir() pair: one call, no gap
# between the check and the creation, and no error if the directory is there.
os.makedirs(dnn_output_dir, exist_ok=True)

# Two-class feed-forward classifier over the same feature columns.
dnn_estimator = tf.estimator.DNNClassifier(
    model_dir = dnn_output_dir,
    n_classes =2 ,
    feature_columns = feature_columns,
    hidden_units = [30, 30],  # number of units in each hidden layer
    activation_fn = tf.nn.relu,
    optimizer = 'Adam',
)
# Train on the training split, repeated for 100 epochs with make_dataset's
# default shuffling and batch size.
dnn_estimator.train(
    input_fn = lambda: make_dataset(
        train_df, y_train, epochs=100
    )
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': './dnn_model', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f553c5f5b10>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.


To change all layers to h

<tensorflow_estimator.python.estimator.canned.dnn.DNNClassifierV2 at 0x7f553c317910>

In [34]:
# Score the trained DNN on the eval split: one pass, original row order.
dnn_estimator.evaluate(
    input_fn=lambda: make_dataset(eval_df, y_eval, epochs=1, shuffle=False)
)

INFO:tensorflow:Calling model_fn.


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2020-02-29T16:41:41Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./dnn_model/model.ckpt-1960
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2020-02-29-16:41:42
INFO:tensorflow:Saving dict for global step 1960: accuracy = 0.7878788, accuracy_baseline = 0.625, auc = 0.8543006, auc_precision_recall = 0.78625816, average_loss = 0.45781255, global_step = 1960, label/mean = 0.375, loss = 0.4404432, precision = 0.72164947, prediction/mean = 0.35271296, recall = 0.7070707
INFO:ten

{'accuracy': 0.7878788,
 'accuracy_baseline': 0.625,
 'auc': 0.8543006,
 'auc_precision_recall': 0.78625816,
 'average_loss': 0.45781255,
 'label/mean': 0.375,
 'loss': 0.4404432,
 'precision': 0.72164947,
 'prediction/mean': 0.35271296,
 'recall': 0.7070707,
 'global_step': 1960}