<a href="https://colab.research.google.com/github/shinchan75034/tensorflow-pocket-ref/blob/main/chapter03/OReilly_C3_titanic_example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import functools
import numpy as np
import tensorflow as tf
import pandas as pd
from tensorflow import feature_column
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split

In [None]:
# Public copies of the Titanic training and evaluation CSV files.
TRAIN_DATA_URL = "https://storage.googleapis.com/tf-datasets/titanic/train.csv"
TEST_DATA_URL = "https://storage.googleapis.com/tf-datasets/titanic/eval.csv"

# Fetch each file (training first, then evaluation) and keep the resulting local paths.
train_file_path, test_file_path = (
    tf.keras.utils.get_file(fname, origin)
    for fname, origin in (("train.csv", TRAIN_DATA_URL), ("eval.csv", TEST_DATA_URL))
)

Downloading data from https://storage.googleapis.com/tf-datasets/titanic/train.csv
Downloading data from https://storage.googleapis.com/tf-datasets/titanic/eval.csv


In [None]:
# Show the first lines of the downloaded training CSV with the shell `head` command.
!head {train_file_path}

survived,sex,age,n_siblings_spouses,parch,fare,class,deck,embark_town,alone
0,male,22.0,1,0,7.25,Third,unknown,Southampton,n
1,female,38.0,1,0,71.2833,First,C,Cherbourg,n
1,female,26.0,0,0,7.925,Third,unknown,Southampton,y
1,female,35.0,1,0,53.1,First,C,Southampton,n
0,male,28.0,0,0,8.4583,Third,unknown,Queenstown,y
0,male,2.0,3,1,21.075,Third,unknown,Southampton,n
1,female,27.0,0,2,11.1333,Third,unknown,Southampton,n
1,female,14.0,1,0,30.0708,Second,unknown,Cherbourg,n
1,female,4.0,1,1,16.7,Third,G,Southampton,n


In [None]:
# Load both CSV files into DataFrames; header='infer' takes column names from the first row.
train_df, test_df = (
    pd.read_csv(csv_path, header='infer')
    for csv_path in (train_file_path, test_file_path)
)


In [None]:
# Display the training DataFrame (the rendered output elides the middle rows).
train_df

Unnamed: 0,survived,sex,age,n_siblings_spouses,parch,fare,class,deck,embark_town,alone
0,0,male,22.0,1,0,7.2500,Third,unknown,Southampton,n
1,1,female,38.0,1,0,71.2833,First,C,Cherbourg,n
2,1,female,26.0,0,0,7.9250,Third,unknown,Southampton,y
3,1,female,35.0,1,0,53.1000,First,C,Southampton,n
4,0,male,28.0,0,0,8.4583,Third,unknown,Queenstown,y
...,...,...,...,...,...,...,...,...,...,...
622,0,male,28.0,0,0,10.5000,Second,unknown,Southampton,y
623,0,male,25.0,0,0,7.0500,Third,unknown,Southampton,y
624,1,female,19.0,0,0,30.0000,First,B,Southampton,y
625,0,female,28.0,1,2,23.4500,Third,unknown,Southampton,n


In [None]:
LABEL_COLUMN = 'survived'
LABELS = [0, 1]

# Reader options for streaming the training CSV as small (features, label) batches.
train_csv_options = dict(
      batch_size=3,
      label_name=LABEL_COLUMN,
      na_value="?",
      num_epochs=1,
      ignore_errors=True)
train_ds = tf.data.experimental.make_csv_dataset(train_file_path, **train_csv_options)

In [None]:
# Stream the evaluation CSV as (features, label) batches of three rows, one pass only.
test_csv_options = {
      'batch_size': 3,
      'label_name': LABEL_COLUMN,
      'na_value': "?",
      'num_epochs': 1,
      'ignore_errors': True,
}
test_ds = tf.data.experimental.make_csv_dataset(test_file_path, **test_csv_options)

In [None]:
# Print the label tensor of one batch, then every feature column as a NumPy array.
for features, target in train_ds.take(1):
  print(target)
  for name in features:
    print("{}: {}".format(name, features[name].numpy()))


tf.Tensor([1 1 1], shape=(3,), dtype=int32)
sex: [b'female' b'female' b'male']
age: [23. 47. 20.]
n_siblings_spouses: [0 1 0]
parch: [0 1 0]
fare: [ 7.55   52.5542  7.2292]
class: [b'Third' b'First' b'Third']
deck: [b'unknown' b'D' b'unknown']
embark_town: [b'Southampton' b'Southampton' b'Cherbourg']
alone: [b'y' b'n' b'y']


In [None]:
# numeric cols: one numeric_column per continuous/count feature
feature_columns = list(map(
    feature_column.numeric_column,
    ['age', 'n_siblings_spouses', 'parch', 'fare']))

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
train_df.describe()

Unnamed: 0,survived,age,n_siblings_spouses,parch,fare
count,627.0,627.0,627.0,627.0,627.0
mean,0.38756,29.631308,0.545455,0.379585,34.385399
std,0.487582,12.511818,1.15109,0.792999,54.59773
min,0.0,0.75,0.0,0.0,0.0
25%,0.0,23.0,0.0,0.0,7.8958
50%,0.0,28.0,0.0,0.0,15.0458
75%,1.0,35.0,1.0,0.0,31.3875
max,1.0,80.0,8.0,5.0,512.3292


In [None]:
# Pull one (features, label) element from train_ds and keep only the features;
# this batch is used to demonstrate several types of feature columns.
first_element = next(iter(train_ds))
example_batch = first_element[0]
# Utility: run example_batch through a feature column and print the dense result.
def demo(feature_column):
  """Print example_batch after transformation by the given feature column(s).

  Note: the parameter name shadows the imported `feature_column` module
  inside this function body.
  """
  dense_output = layers.DenseFeatures(feature_column)(example_batch)
  print(dense_output.numpy())

In [None]:
# Bucketize age at the boundaries 23, 28 and 35 and show the encoding of the example batch.
age_boundaries = [23, 28, 35]
age = feature_column.numeric_column('age')
age_buckets = feature_column.bucketized_column(age, boundaries=age_boundaries)
demo(age_buckets)

[[0. 0. 0. 1.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]]


In [None]:
# Collect (and print) the distinct values of each categorical column, keyed by column name.
h = {}
for col in train_df.columns:
  if col not in ('sex', 'class', 'deck', 'embark_town', 'alone'):
    continue
  h[col] = train_df[col].unique()
  print(col, ':', h[col])

sex : ['male' 'female']
class : ['Third' 'First' 'Second']
deck : ['unknown' 'C' 'G' 'A' 'B' 'D' 'F' 'E']
embark_town : ['Southampton' 'Cherbourg' 'Queenstown' 'unknown']
alone : ['n' 'y']


In [None]:
# Check that .tolist() turns the array of unique values into a plain Python list.
type(h.get('sex').tolist())

list

In [None]:
# One-hot (indicator) columns for every categorical feature.

# Vocabulary given as a literal list. The items MUST be comma-separated:
# adjacent string literals such as ['male' 'female'] are silently concatenated
# by Python into the single entry 'malefemale'.
sex_type = feature_column.categorical_column_with_vocabulary_list(
      'sex', ['male', 'female'])
sex_type_one_hot = feature_column.indicator_column(sex_type)

## use key to lookup value and pass the value to one-hot encoding.
sex_type = feature_column.categorical_column_with_vocabulary_list(
      'sex', h.get('sex').tolist())
sex_type_one_hot = feature_column.indicator_column(sex_type)

# Literal vocabulary again, with the commas that were previously missing
# (the old code produced the one-item vocabulary 'ThirdFirstSecond').
class_type = feature_column.categorical_column_with_vocabulary_list(
      'class', ['Third', 'First', 'Second'])
class_type_one_hot = feature_column.indicator_column(class_type)

deck_type = feature_column.categorical_column_with_vocabulary_list(
      'deck', h.get('deck').tolist())
deck_type_one_hot = feature_column.indicator_column(deck_type)

embark_town_type = feature_column.categorical_column_with_vocabulary_list(
      'embark_town', h.get('embark_town').tolist())
embark_town_type_one_hot = feature_column.indicator_column(embark_town_type)

alone_type = feature_column.categorical_column_with_vocabulary_list(
      'alone', h.get('alone').tolist())
alone_one_hot = feature_column.indicator_column(alone_type)

In [None]:
# The 'deck' vocabulary is taken from the values present in the training data.
deck_vocabulary = train_df['deck'].unique()
deck = feature_column.categorical_column_with_vocabulary_list('deck', deck_vocabulary)
# Represent each deck category with a 3-dimensional embedding.
deck_embedding = feature_column.embedding_column(deck, dimension=3)

In [None]:
# Hash the 'class' values into 4 buckets, then show the indicator encoding of the example batch.
class_hashed = feature_column.categorical_column_with_hash_bucket(
      key='class', hash_bucket_size=4)
class_hashed_one_hot = feature_column.indicator_column(class_hashed)
demo(class_hashed_one_hot)

[[0. 0. 0. 1.]
 [0. 1. 0. 0.]
 [0. 0. 1. 0.]]


In [None]:
# Inspect which module object the name `feature_column` is bound to.
feature_column

<module 'tensorflow._api.v2.feature_column' from '/usr/local/lib/python3.6/dist-packages/tensorflow/_api/v2/feature_column/__init__.py'>

In [None]:
# Cross the sex and class categorical columns into 5 hash buckets and show the indicator encoding.
crossed_feature = feature_column.crossed_column(
      keys=[sex_type, class_type], hash_bucket_size=5)
crossed_one_hot = feature_column.indicator_column(crossed_feature)
demo(crossed_one_hot)

[[0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 1.]
 [0. 1. 0. 0. 0.]]


In [None]:
# Inspect the crossed column's repr, including the vocabulary lists of its source columns.
crossed_feature

CrossedColumn(keys=(VocabularyListCategoricalColumn(key='sex', vocabulary_list=('male', 'female'), dtype=tf.string, default_value=-1, num_oov_buckets=0), VocabularyListCategoricalColumn(key='class', vocabulary_list=('ThirdFirstSecond',), dtype=tf.string, default_value=-1, num_oov_buckets=0)), hash_bucket_size=5, hash_key=None)

In [None]:
# Start the final feature list from scratch with the numeric cols.
feature_columns = [
    feature_column.numeric_column(numeric_name)
    for numeric_name in ('age', 'n_siblings_spouses', 'parch', 'fare')
]

In [None]:
# bucketized cols: age split at the boundaries 23, 28 and 35
age = feature_column.numeric_column('age')
age_bucket_edges = [23, 28, 35]
age_buckets = feature_column.bucketized_column(age, boundaries=age_bucket_edges)

In [None]:
# Add the bucketized age column to the model's feature list (in place).
feature_columns.extend([age_buckets])

In [None]:
# indicator_columns: one-hot encode each categorical feature, using the
# values observed in the training data as its vocabulary.
indicator_column_names = ['sex', 'class', 'deck', 'embark_town', 'alone']
feature_columns.extend(
    feature_column.indicator_column(
        feature_column.categorical_column_with_vocabulary_list(
            name, train_df[name].unique()))
    for name in indicator_column_names)

In [None]:
# append embedding columns: 'deck' mapped to a 3-dimensional embedding
deck_values = train_df['deck'].unique()
deck = feature_column.categorical_column_with_vocabulary_list('deck', deck_values)
deck_embedding = feature_column.embedding_column(deck, dimension=3)
feature_columns.extend([deck_embedding])

In [None]:
# append crossed columns: sex x class hashed into 5 buckets, then one-hot encoded
cross_type_feature = feature_column.crossed_column(
      keys=['sex', 'class'], hash_bucket_size=5)
cross_type_one_hot = feature_column.indicator_column(cross_type_feature)
feature_columns.append(cross_type_one_hot)

In [None]:
# Input layer that turns a dict of raw features into one dense tensor via feature_columns.
feature_layer = layers.DenseFeatures(feature_columns)

In [None]:
# A utility method to create a tf.data dataset from a Pandas Dataframe
def pandas_to_dataset(dataframe, shuffle=True, batch_size=32, label_name='survived'):
  """Convert a DataFrame into a batched tf.data.Dataset of (features, label) pairs.

  Args:
    dataframe: pandas DataFrame holding the feature columns and the label column.
    shuffle: if True, shuffle with a buffer as large as the DataFrame.
    batch_size: number of rows per batch.
    label_name: column to split off as the label (defaults to 'survived').

  Returns:
    A tf.data.Dataset built from (dict of feature columns keyed by name, labels),
    optionally shuffled, then batched.
  """
  features = dataframe.copy()  # pop() below must not alter the caller's DataFrame
  labels = features.pop(label_name)
  ds = tf.data.Dataset.from_tensor_slices((dict(features), labels))
  if shuffle:
    ds = ds.shuffle(buffer_size=len(features))
  return ds.batch(batch_size)

In [None]:
# Split the evaluation data 60/40 into validation and test sets.
# random_state pins the shuffle so the split is the same on every fresh run.
# NOTE: this rebinds test_df, so re-running the cell without reloading the
# evaluation CSV splits the already-reduced test set again.
val_df, test_df = train_test_split(test_df, test_size=0.4, random_state=42)

In [None]:
# Report the number of rows in each split.
for split_df, split_label in ((train_df, 'train examples'),
                              (val_df, 'validation examples'),
                              (test_df, 'test examples')):
  print(len(split_df), split_label)

627 train examples
158 validation examples
106 test examples


In [None]:
# Display the training DataFrame again before it is converted to a tf.data dataset.
train_df

Unnamed: 0,survived,sex,age,n_siblings_spouses,parch,fare,class,deck,embark_town,alone
0,0,male,22.0,1,0,7.2500,Third,unknown,Southampton,n
1,1,female,38.0,1,0,71.2833,First,C,Cherbourg,n
2,1,female,26.0,0,0,7.9250,Third,unknown,Southampton,y
3,1,female,35.0,1,0,53.1000,First,C,Southampton,n
4,0,male,28.0,0,0,8.4583,Third,unknown,Queenstown,y
...,...,...,...,...,...,...,...,...,...,...
622,0,male,28.0,0,0,10.5000,Second,unknown,Southampton,y
623,0,male,25.0,0,0,7.0500,Third,unknown,Southampton,y
624,1,female,19.0,0,0,30.0000,First,B,Southampton,y
625,0,female,28.0,1,2,23.4500,Third,unknown,Southampton,n


In [None]:
# Disabled alternative, kept as a bare string literal so it is not executed.
# NOTE(review): it calls df_to_dataset, but the helper defined in this notebook
# is named pandas_to_dataset.
'''
batch_size = 33
train_ds = df_to_dataset(train_df, batch_size=batch_size)
'''

'\nbatch_size = 33\ntrain_ds = df_to_dataset(train_df, batch_size=batch_size)\n'

In [None]:
# Build the training dataset step by step (the inlined form of pandas_to_dataset).
batch_size = 33
# Work on a copy: popping the label from train_df itself would strip 'survived'
# from the shared DataFrame and make this cell fail with KeyError when re-run.
train_features = train_df.copy()
labels = train_features.pop('survived')
working_ds = tf.data.Dataset.from_tensor_slices((dict(train_features), labels))
# A buffer as large as the data set shuffles across all rows.
working_ds = working_ds.shuffle(buffer_size=len(train_features))
train_ds = working_ds.batch(batch_size)

In [None]:
# Show the column-name -> Series mapping that dict() produces from the DataFrame.
dict(train_df)

{'age': 0      22.0
 1      38.0
 2      26.0
 3      35.0
 4      28.0
        ... 
 622    28.0
 623    25.0
 624    19.0
 625    28.0
 626    32.0
 Name: age, Length: 627, dtype: float64, 'alone': 0      n
 1      n
 2      y
 3      n
 4      y
       ..
 622    y
 623    y
 624    y
 625    n
 626    y
 Name: alone, Length: 627, dtype: object, 'class': 0       Third
 1       First
 2       Third
 3       First
 4       Third
         ...  
 622    Second
 623     Third
 624     First
 625     Third
 626     Third
 Name: class, Length: 627, dtype: object, 'deck': 0      unknown
 1            C
 2      unknown
 3            C
 4      unknown
         ...   
 622    unknown
 623    unknown
 624          B
 625    unknown
 626    unknown
 Name: deck, Length: 627, dtype: object, 'embark_town': 0      Southampton
 1        Cherbourg
 2      Southampton
 3      Southampton
 4       Queenstown
           ...     
 622    Southampton
 623    Southampton
 624    Southampton
 625    Southamp

In [None]:
# Validation and test sets keep their row order (no shuffling) and use the training batch size.
val_ds, test_ds = (
    pandas_to_dataset(frame, shuffle=False, batch_size=batch_size)
    for frame in (val_df, test_df)
)

In [None]:
# Feature layer, two 128-unit ReLU layers, light dropout and a single-unit output.
model = tf.keras.Sequential([
  feature_layer,
  layers.Dense(128, activation='relu'),
  layers.Dense(128, activation='relu'),
  layers.Dropout(.1),
  layers.Dense(1)
])

# The output layer has no activation, so the model emits logits. The loss is told
# so via from_logits=True; the accuracy metric must likewise threshold at 0.0
# (i.e. probability 0.5) instead of the default 0.5 that the 'accuracy' string
# would apply to the raw logits. name='accuracy' keeps the history/log keys unchanged.
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.0)])

# Train for 10 epochs, evaluating on the validation set after each epoch.
model.fit(train_ds,
          validation_data=val_ds,
          epochs=10)

Epoch 1/10
Consider rewriting this model with the Functional API.


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

Consider rewriting this model with the Functional API.
Consider rewriting this model with the Functional API.
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f934feb16d8>

In [None]:
# Print the layer-by-layer architecture and parameter counts of the trained model.
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_features_7 (DenseFeatu multiple                  24        
_________________________________________________________________
dense_9 (Dense)              multiple                  4608      
_________________________________________________________________
dense_10 (Dense)             multiple                  16512     
_________________________________________________________________
dropout_3 (Dropout)          multiple                  0         
_________________________________________________________________
dense_11 (Dense)             multiple                  129       
Total params: 21,273
Trainable params: 21,273
Non-trainable params: 0
_________________________________________________________________
