In [1]:
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf

from tensorflow import keras

In [2]:
#https://storage.googleapis.com/tf-datasets/titanic/train.csv
#https://storage.googleapis.com/tf-datasets/titanic/eval.csv
train_file = "./data/train.csv"
eval_file = "./data/eval.csv"

train_df = pd.read_csv(train_file)
eval_df = pd.read_csv(eval_file)
print(len(train_df))
print(len(eval_df))
print(train_df.head())  # head() 默认取前５条数据

627
264
   survived     sex   age  n_siblings_spouses  parch     fare  class     deck  \
0         0    male  22.0                   1      0   7.2500  Third  unknown   
1         1  female  38.0                   1      0  71.2833  First        C   
2         1  female  26.0                   0      0   7.9250  Third  unknown   
3         1  female  35.0                   1      0  53.1000  First        C   
4         0    male  28.0                   0      0   8.4583  Third  unknown   

   embark_town alone  
0  Southampton     n  
1    Cherbourg     n  
2  Southampton     y  
3  Southampton     n  
4   Queenstown     y  


In [3]:
y_train = train_df.pop('survived')
y_eval = eval_df.pop('survived')

In [4]:
categorical_columns = ['sex', 'n_siblings_spouses', 'parch', 'class', 'deck', 'embark_town', 'alone']
numeric_columns = ['age', 'fare']

feature_columns = []
for cc in categorical_columns:
    vocab = train_df[cc].unique()
    print(cc, vocab)
    feature_columns.append(
        tf.feature_column.indicator_column( # indicator_column: can do one hot
            tf.feature_column.categorical_column_with_vocabulary_list(
                cc, vocab
            )
        )
    )
    
for cc in numeric_columns:
    feature_columns.append(
        tf.feature_column.numeric_column(
            cc, dtype=tf.float32
        )
    )

sex ['male' 'female']
n_siblings_spouses [1 0 3 4 2 5 8]
parch [0 1 2 5 3 4]
class ['Third' 'First' 'Second']
deck ['unknown' 'C' 'G' 'A' 'B' 'D' 'F' 'E']
embark_town ['Southampton' 'Cherbourg' 'Queenstown' 'unknown']
alone ['n' 'y']


In [5]:
def make_dataset(data_df, label_df, epochs = 10, shuffle=True, batch_size =32):
    dataset = tf.data.Dataset.from_tensor_slices(
        (dict(data_df), label_df)
    )
    if shuffle:
        dataset = dataset.shuffle(10000)
    dataset = dataset.repeat(epochs).batch(batch_size)
    return dataset.make_one_shot_iterator().get_next()

In [7]:
output_dir = "./customized_estimator"
if not os.path.exists(output_dir):
    os.mkdir(output_dir)

def model_fn(features, labels, mode, params):
    # mode: model runtime state : Train, Eval, Predict
    input_for_next_layer = tf.feature_column.input_layer(
        features, params['feature_columns']
    )
    for n_unit in params['hidden_units']:
        input_for_next_layer = tf.layers.dense(
            input_for_next_layer,
            units = n_unit,
            activation = tf.nn.relu
        )
    logits = tf.layers.dense(input_for_next_layer,
                            params['n_classes'],
                            activation=None)
    predicted_classes = tf.argmax(logits, 1)
    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            "class_ids": predicted_classes[:, tf.newaxis],
            "probabilities": tf.nn.softmax(logits),
            "logits": logits
            
        }
        return tf.estimator.EstimatorSpec(mode, predictions = predictions)
    loss = tf.losses.sparse_softmax_cross_entropy(labels = labels, logits = logits)
    accuracy = tf.metrics.accuracy(labels = labels,
                                  predictions  = predicted_classes,
                                  name = "acc_op")
    metrics = {"accuracy": accuracy}
    if mode == tf.estimator.ModeKeys.EVAL:
        return tf.estimator.EstimatorSpec(mode, loss=loss, eval_metric_ops = metrics)
    
    optimizer = tf.train.AdamOptimizer()
    train_op = optimizer.minize(loss, global_step=tf.train.get_global_step())
    if mode == tf.estimator.ModeKeys.TRAIN:
        return tf.estimator.EstimatorSpec(mode, loss=loss, train_op = train_op)

estimator = tf.estimator.Estimator(
    model_fn = model_fn,
    model_dir = output_dir,
    params = {
        "feature_columns": feature_columns,
        "hidden_units": [30, 50],
        "n_classes": 2
    }
)

estimator.train(input_fn = lambda : make_dataset(
    train_df, y_train, epochs = 100
  ))

AttributeError: module 'os' has no attribute 'make_dir'

In [None]:
estimator.evaluate(lambda: make_dataset(eval_df, y_eval, epochs=1))