In [34]:
import numpy as np 
import tensorflow as tf
import matplotlib.pyplot as plt  
import pandas as pd 
np.set_printoptions(precision=3, suppress=True)

import functools
from __future__ import absolute_import, division, print_function, unicode_literals

In [35]:
TRAIN_DATA_URL = "https://storage.googleapis.com/tf-datasets/titanic/train.csv"
TEST_DATA_URL = "https://storage.googleapis.com/tf-datasets/titanic/eval.csv"

train_file_path = tf.keras.utils.get_file('train.csv', TRAIN_DATA_URL)
test_file_path = tf.keras.utils.get_file('eval.csv', TEST_DATA_URL)

In [36]:
LABEL_COLUMN = 'survived'
LABELS = {0, 1}

# makes dataset from csv file and returns dataset 
def get_dataset(file_path, **kwargs) : 
    dataset = tf.data.experimental.make_csv_dataset(
        file_path, batch_size=5, label_name=LABEL_COLUMN, 
        na_value="?", num_epochs=1, ignore_errors=True, **kwargs
    )

    return dataset

raw_train_data = get_dataset(train_file_path)
raw_test_data = get_dataset(test_file_path)

In [37]:
def show_batch(dataset) : 
    for batch, label in dataset.take(1) : 
        for key, value in batch.items() : 
            print("{:20s} : {}".format(key, value.numpy()))

show_batch(raw_train_data) # batch size 5

sex                  : [b'female' b'male' b'female' b'female' b'male']
age                  : [29. 18. 24. 28. 31.]
n_siblings_spouses   : [1 0 0 1 1]
parch                : [1 0 0 0 1]
fare                 : [10.462  8.3   69.3   15.5   37.004]
class                : [b'Third' b'Third' b'First' b'Third' b'Second']
deck                 : [b'G' b'unknown' b'B' b'unknown' b'unknown']
embark_town          : [b'Southampton' b'Southampton' b'Cherbourg' b'Queenstown' b'Cherbourg']
alone                : [b'n' b'y' b'y' b'n' b'n']


In [38]:
SELECT_COLUMNS = {"survived", "age", "n_siblings_spouses", "class", "deck", "alone"}
temp_dataset = get_dataset(train_file_path, select_columns=SELECT_COLUMNS)
show_batch(temp_dataset) # only selected features 

age                  : [29.  24.  55.5 30.  26. ]
n_siblings_spouses   : [0 0 0 0 1]
class                : [b'First' b'Third' b'Third' b'Second' b'Second']
deck                 : [b'B' b'unknown' b'unknown' b'unknown' b'unknown']
alone                : [b'y' b'n' b'y' b'y' b'n']


if data is already numeric we can pack it into a vector before passing to the model 

In [39]:
SELECT_COLUMNS = ["survived", "age", "n_siblings_spouses", "parch", "fare"]
DEFAULTS = [0, 0.0, 0.0, 0.0, 0.0]
temp_dataset = get_dataset(
    train_file_path, select_columns=SELECT_COLUMNS, column_defaults = DEFAULTS
)
show_batch(temp_dataset) # only selected features 

age                  : [28. 28. 36. 24. 22.]
n_siblings_spouses   : [0. 1. 0. 0. 0.]
parch                : [0. 0. 0. 0. 0.]
fare                 : [  7.896  24.15  135.633  79.2     7.75 ]


In [40]:
example_batch, labels_batch = next(iter(temp_dataset))

In [41]:
def pack(feature, label) : 
    return tf.stack(list(feature.values()), axis = 1), label

packed_dataset = temp_dataset.map(pack)
for feature, labels in packed_dataset.take(1) :
    print(feature.numpy())
    print()
    print(labels.numpy())

[[27.     0.     0.     8.663]
 [28.     8.     2.    69.55 ]
 [16.     0.     0.     7.75 ]
 [22.     1.     1.    29.   ]
 [31.     0.     0.     7.775]]

[1 0 1 1 0]


In [42]:
class PackNumericFeatures(object) : 
    def __init__(self, names) : 
        self.names = names  

    def __call__(self, features, labels) :  
        numeric_features = [features.pop(name) for name in self.names]
        numeric_features = [tf.cast(feat, tf.float32) for feat in numeric_features]
        numeric_features = tf.stack(numeric_features, axis = -1)
        features.update({'numeric' : numeric_features}) 

        return features, labels

In [43]:
NUMERIC_FEATURES = ["age", "n_siblings_spouses", "parch", "fare"]

packed_train_data = raw_train_data.map(
    PackNumericFeatures(NUMERIC_FEATURES)
)
packed_test_data = raw_test_data.map(
    PackNumericFeatures(NUMERIC_FEATURES)
)


 all the numeric features are packed in a tensor where others are kept seperate 

In [44]:
show_batch(packed_train_data)

sex                  : [b'male' b'male' b'male' b'male' b'male']
class                : [b'Third' b'Second' b'First' b'Third' b'First']
deck                 : [b'unknown' b'unknown' b'unknown' b'unknown' b'A']
embark_town          : [b'Southampton' b'Southampton' b'Southampton' b'Southampton'
 b'Southampton']
alone                : [b'y' b'y' b'y' b'n' b'y']
numeric              : [[18.     0.     0.     7.775]
 [24.     0.     0.    13.   ]
 [34.     0.     0.    26.55 ]
 [ 1.     5.     2.    46.9  ]
 [39.     0.     0.     0.   ]]


In [45]:
desc = pd.read_csv(train_file_path)[NUMERIC_FEATURES].describe()
desc

Unnamed: 0,age,n_siblings_spouses,parch,fare
count,627.0,627.0,627.0,627.0
mean,29.631308,0.545455,0.379585,34.385399
std,12.511818,1.15109,0.792999,54.59773
min,0.75,0.0,0.0,0.0
25%,23.0,0.0,0.0,7.8958
50%,28.0,0.0,0.0,15.0458
75%,35.0,1.0,0.0,31.3875
max,80.0,8.0,5.0,512.3292


normalizing nnumerical data 

In [46]:
MEAN = np.array(desc.T["mean"])
STD = np.array(desc.T["std"])

Old method 

```py
def normalize_numeric_data(data, mean, std) : 
    return (data - mean)/std

normalizer = functools.partial(normalize_numeric_data, MEAN, STD)

numeric_column = tf.feature_column.numeric_column(
    "numeric", normalizer_fn=normalizer, shape=[len(NUMERIC_FEATURES)]
)
numeric_columns = [numeric_column]
numeric_columns

numeric_layer = tf.keras.layers.DenseFeatures(numeric_columns)
numeric_layer(example_batch).numpy()
```

In [47]:
example_batch, labels_batch = next(iter(packed_train_data))
example_batch['numeric'].numpy()

array([[20.   ,  0.   ,  0.   ,  9.846],
       [28.   ,  0.   ,  0.   ,  6.95 ],
       [47.   ,  0.   ,  0.   , 25.587],
       [30.   ,  1.   ,  0.   , 24.   ],
       [28.   ,  0.   ,  0.   ,  7.896]], dtype=float32)

In [48]:
numeric_layer = tf.keras.layers.Normalization(axis = -1, mean = MEAN, variance=STD**2)
numeric_layer(example_batch['numeric'].numpy())

<tf.Tensor: shape=(5, 4), dtype=float32, numpy=
array([[-0.77 , -0.474, -0.479, -0.449],
       [-0.13 , -0.474, -0.479, -0.503],
       [ 1.388, -0.474, -0.479, -0.161],
       [ 0.029,  0.395, -0.479, -0.19 ],
       [-0.13 , -0.474, -0.479, -0.485]], dtype=float32)>

Categorical data 

In [49]:
CATEGORIES = {
    'sex' : ["male", "female"], 
    "class" : ["First", "Second", "Third"], 
    "deck" : [chr(i) for i in range(65, 75)], 
    "embark_town" : ["Cherbourg", "Southhampton", "Queenstown"], 
    "alone" : ['y', 'n']
}

In [50]:
categorical_columns = []
for feature, vocab in CATEGORIES.items() : 
    cat_col = tf.feature_column.categorical_column_with_vocabulary_list(
        key = feature, vocabulary_list = vocab
    )
    categorical_columns.append(tf.feature_column.indicator_column(cat_col))

categorical_columns # indicator columns have been created 

[IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='sex', vocabulary_list=('male', 'female'), dtype=tf.string, default_value=-1, num_oov_buckets=0)),
 IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='class', vocabulary_list=('First', 'Second', 'Third'), dtype=tf.string, default_value=-1, num_oov_buckets=0)),
 IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='deck', vocabulary_list=('A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'), dtype=tf.string, default_value=-1, num_oov_buckets=0)),
 IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='embark_town', vocabulary_list=('Cherbourg', 'Southhampton', 'Queenstown'), dtype=tf.string, default_value=-1, num_oov_buckets=0)),
 IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='alone', vocabulary_list=('y', 'n'), dtype=tf.string, default_value=-1, num_oov_buckets=0))]