In [2]:
import numpy as np 
import tensorflow as tf
import matplotlib.pyplot as plt  
import pandas as pd 
np.set_printoptions(precision=3, suppress=True)

import functools
from __future__ import absolute_import, division, print_function, unicode_literals

In [3]:
# Locations of the Titanic training and evaluation CSV files.
TRAIN_DATA_URL = "https://storage.googleapis.com/tf-datasets/titanic/train.csv"
TEST_DATA_URL = "https://storage.googleapis.com/tf-datasets/titanic/eval.csv"

# Fetch each CSV through tf.keras.utils.get_file (training file first) and keep
# the value it returns, which the rest of the notebook uses as the file path.
train_file_path, test_file_path = (
    tf.keras.utils.get_file(file_name, origin_url)
    for file_name, origin_url in (
        ("train.csv", TRAIN_DATA_URL),
        ("eval.csv", TEST_DATA_URL),
    )
)

In [4]:
# Name of the CSV column that get_dataset passes to make_csv_dataset as `label_name`.
LABEL_COLUMN = 'survived'
# Values the label can take; presumably 0 = did not survive, 1 = survived
# (not referenced anywhere in this part of the notebook).
LABELS = {0, 1}

def get_dataset(file_path, **kwargs) : 
    """Build a batched dataset from a CSV file with ``make_csv_dataset``.

    Parameters
    ----------
    file_path
        Path of the CSV file; forwarded unchanged as the first argument of
        ``tf.data.experimental.make_csv_dataset``.
    **kwargs
        Extra keyword arguments for ``make_csv_dataset`` (for example
        ``select_columns`` or ``column_defaults``).  They are merged over the
        notebook defaults listed below, so a caller may also override
        ``batch_size``, ``label_name``, ``na_value``, ``num_epochs`` or
        ``ignore_errors``.

    Returns
    -------
    The object returned by ``make_csv_dataset``; the rest of the notebook
    iterates it as ``(features, label)`` batches.
    """
    # Notebook defaults: batches of 5 rows keep the printed examples short.
    options = {
        "batch_size": 5,
        "label_name": LABEL_COLUMN,
        "na_value": "?",
        "num_epochs": 1,
        "ignore_errors": True,
    }
    # Caller-supplied values win.  Passing any of the keys above used to fail
    # with "got multiple values for keyword argument".
    options.update(kwargs)

    return tf.data.experimental.make_csv_dataset(file_path, **options)

# Build the training and evaluation datasets from the two CSV files fetched
# earlier, using get_dataset without any extra options.
raw_train_data = get_dataset(train_file_path)
raw_test_data = get_dataset(test_file_path)

Instructions for updating:
Use `tf.data.Dataset.ignore_errors` instead.


In [5]:
def show_batch(dataset) : 
    """Print each feature of the first batch of ``dataset``.

    ``dataset.take(1)`` must yield ``(features, labels)`` pairs in which
    ``features`` maps a column name to an object with a ``.numpy()`` method.
    One line is printed per feature: the name left-aligned in a 20-character
    field, then `` : `` and the value.  The labels are not printed and
    nothing is returned.
    """
    first_batch = dataset.take(1)
    for features, _labels in first_batch : 
        for name, column in features.items() : 
            line = "{:20s} : {}".format(name, column.numpy())
            print(line)

# Peek at one raw batch: every feature prints as an array of 5 values because
# get_dataset builds batches of 5 rows by default.
show_batch(raw_train_data) # batch size 5

sex                  : [b'male' b'male' b'male' b'male' b'female']
age                  : [28. 28. 35. 27. 16.]
n_siblings_spouses   : [0 0 0 0 0]
parch                : [0 0 0 2 0]
fare                 : [ 26.55    0.     26.288 211.5     7.75 ]
class                : [b'First' b'Second' b'First' b'First' b'Third']
deck                 : [b'unknown' b'unknown' b'E' b'C' b'unknown']
embark_town          : [b'Southampton' b'Southampton' b'Southampton' b'Cherbourg' b'Queenstown']
alone                : [b'y' b'y' b'y' b'n' b'y']


In [6]:
# Load only a subset of the CSV columns; the label column 'survived' is kept
# alongside the chosen features.
# A list is used here, like the later SELECT_COLUMNS definition: a set has no
# defined order, and make_csv_dataset documents `select_columns` as a list.
SELECT_COLUMNS = ["survived", "age", "n_siblings_spouses", "class", "deck", "alone"]
temp_dataset = get_dataset(train_file_path, select_columns=SELECT_COLUMNS)
show_batch(temp_dataset) # only selected features 

age                  : [49. 28. 30. 36. 34.]
n_siblings_spouses   : [0 0 1 0 0]
class                : [b'Third' b'Third' b'Third' b'First' b'Third']
deck                 : [b'unknown' b'unknown' b'unknown' b'B' b'unknown']
alone                : [b'y' b'y' b'n' b'n' b'y']


If the data is already numeric, we can pack it into a single vector before passing it to the model.

In [7]:
# Numeric-only view of the CSV: the label plus four numeric feature columns.
SELECT_COLUMNS = ["survived", "age", "n_siblings_spouses", "parch", "fare"]
# One default per selected column, in the same order: integer 0 for the label
# and 0.0 for every feature (the features then print as floats in the batch).
DEFAULTS = [0] + [0.0] * (len(SELECT_COLUMNS) - 1)
temp_dataset = get_dataset(
    train_file_path,
    column_defaults=DEFAULTS,
    select_columns=SELECT_COLUMNS,
)
show_batch(temp_dataset)  # only the selected numeric features

age                  : [21. 28. 60. 45. 14.]
n_siblings_spouses   : [0. 1. 1. 0. 1.]
parch                : [0. 0. 1. 0. 0.]
fare                 : [ 7.796 24.    79.2   35.5   11.242]


In [8]:
# Pull a single (features, labels) batch out of the numeric-only dataset so it
# can be inspected directly.
example_batch, labels_batch = next(iter(temp_dataset))

In [9]:
def pack(feature, label) : 
    """Combine every feature column of a batch into one tensor.

    ``feature`` is a mapping from column name to tensor.  Its values are
    stacked along axis 1 in the mapping's iteration order, so for the 1-D
    per-column tensors used in this notebook each row of the result holds
    one example.  ``label`` is returned unchanged as the second element.
    """
    columns = [column for column in feature.values()]
    stacked = tf.stack(columns, axis = 1)
    return stacked, label

# Apply `pack` to every batch, then print the stacked feature matrix of the
# first batch, a blank line, and that batch's labels.
packed_dataset = temp_dataset.map(pack)
for feature, labels in packed_dataset.take(1) :
    print(feature.numpy(), "", labels.numpy(), sep="\n")

[[ 19.      0.      0.      7.775]
 [ 31.      0.      0.      7.854]
 [ 39.      0.      0.      7.925]
 [ 28.      0.      0.      7.896]
 [ 28.      1.      0.    133.65 ]]

[0 0 1 0 1]


In [10]:
class PackNumericFeatures(object) : 
    """Callable that merges selected numeric columns into one ``'numeric'`` entry.

    Intended for ``Dataset.map``: given a ``(features, labels)`` pair it
    removes the columns listed in ``names`` from the feature mapping, casts
    each to ``tf.float32``, stacks them along the last axis and stores the
    result under the key ``'numeric'``.  All other columns are kept as-is.

    Parameters
    ----------
    names
        Column names to pack, in the order they should appear along the last
        axis of the packed tensor.
    """

    def __init__(self, names) : 
        self.names = names  

    def __call__(self, features, labels) :  
        """Return ``(packed_features, labels)`` without modifying ``features``.

        ``features`` must be a dict-like mapping providing ``copy()``.  A
        ``KeyError`` is raised if one of ``self.names`` is missing from it.
        """
        # Work on a shallow copy: popping from the caller's mapping would
        # strip the numeric columns from it, so calling this object twice on
        # the same batch would fail with KeyError.
        features = features.copy()
        numeric_features = [
            tf.cast(features.pop(name), tf.float32) for name in self.names
        ]
        features['numeric'] = tf.stack(numeric_features, axis = -1)

        return features, labels

In [11]:
# Columns that get merged into the single 'numeric' feature.
NUMERIC_FEATURES = ["age", "n_siblings_spouses", "parch", "fare"]

# Run the same packing step over both splits (training first, then test),
# with a separate PackNumericFeatures instance for each.
packed_train_data, packed_test_data = (
    split.map(PackNumericFeatures(NUMERIC_FEATURES))
    for split in (raw_train_data, raw_test_data)
)


All the numeric features are now packed into a single tensor, while the other features are kept separate.

In [12]:
# Show one packed batch: the four numeric columns now appear together under
# the single 'numeric' key.
show_batch(packed_train_data)

sex                  : [b'male' b'female' b'male' b'male' b'male']
class                : [b'First' b'Third' b'Third' b'Second' b'Second']
deck                 : [b'C' b'unknown' b'unknown' b'unknown' b'unknown']
embark_town          : [b'Southampton' b'Southampton' b'Queenstown' b'Cherbourg' b'Southampton']
alone                : [b'n' b'n' b'y' b'y' b'y']
numeric              : [[ 64.      1.      4.    263.   ]
 [ 30.      1.      1.     24.15 ]
 [ 28.      0.      0.      7.725]
 [ 23.      0.      0.     15.046]
 [ 35.      0.      0.     10.5  ]]


In [13]:
# Summary statistics (count, mean, std, quartiles, ...) of the numeric columns,
# computed with pandas over the whole training CSV.
desc = (
    pd.read_csv(train_file_path)
    .loc[:, NUMERIC_FEATURES]
    .describe()
)
desc

Unnamed: 0,age,n_siblings_spouses,parch,fare
count,627.0,627.0,627.0,627.0
mean,29.631308,0.545455,0.379585,34.385399
std,12.511818,1.15109,0.792999,54.59773
min,0.75,0.0,0.0,0.0
25%,23.0,0.0,0.0,7.8958
50%,28.0,0.0,0.0,15.0458
75%,35.0,1.0,0.0,31.3875
max,80.0,8.0,5.0,512.3292


Normalizing numerical data

In [14]:
# Per-feature mean and standard deviation, taken from the "mean" and "std"
# rows of the describe() table, in NUMERIC_FEATURES order.
MEAN = np.array(desc.loc["mean"])
STD = np.array(desc.loc["std"])

Old method (using `tf.feature_column`), kept for reference:

```py
def normalize_numeric_data(data, mean, std) : 
    """Standardise ``data`` by subtracting ``mean`` and dividing by ``std``.

    Only ``-`` and ``/`` are applied, so any operands that support those
    operators (and broadcast against each other) can be used.
    """
    centered = data - mean
    return centered / std

# Bind the statistics by keyword so that the tensor the feature column passes
# in positionally is received as `data`.  Binding MEAN and STD positionally
# would make them `data` and `mean`, computing (MEAN - STD) / tensor instead.
normalizer = functools.partial(normalize_numeric_data, mean=MEAN, std=STD)

# A single numeric column that spans all packed features and normalizes them.
numeric_column = tf.feature_column.numeric_column(
    "numeric", normalizer_fn=normalizer, shape=[len(NUMERIC_FEATURES)]
)
numeric_columns = [numeric_column]
numeric_columns

numeric_layer = tf.keras.layers.DenseFeatures(numeric_columns)
numeric_layer(example_batch).numpy()
```

In [37]:
# Take one batch from the packed training data (this rebinds example_batch,
# which an earlier cell drew from temp_dataset) and display its stacked
# numeric features.
# NOTE(review): every next(iter(...)) appears to yield a different batch (the
# batches printed by earlier cells all differ), so any cell that uses
# example_batch must be run after this one for its output to match.
example_batch, labels_batch = next(iter(packed_train_data))
example_batch['numeric'].numpy()

array([[ 31.   ,   0.   ,   0.   ,  13.   ],
       [ 14.   ,   1.   ,   2.   , 120.   ],
       [ 25.   ,   0.   ,   0.   ,  13.   ],
       [ 27.   ,   1.   ,   0.   ,  21.   ],
       [ 47.   ,   1.   ,   1.   ,  52.554]], dtype=float32)

In [36]:
# Normalization layer with fixed statistics: the mean and variance (STD squared)
# computed from the training CSV are supplied directly to the layer.
numeric_layer = tf.keras.layers.Normalization(
    axis=-1,
    mean=MEAN,
    variance=np.square(STD),
)
# Apply the layer to the numeric block of the example batch.
numeric_layer(example_batch['numeric'].numpy())

<tf.Tensor: shape=(5, 4), dtype=float32, numpy=
array([[-0.13 , -0.474, -0.479, -0.404],
       [-1.01 , -0.474,  2.043,  1.401],
       [-1.089, -0.474, -0.479, -0.482],
       [ 0.989,  0.395, -0.479, -0.154],
       [ 1.788, -0.474, -0.479, -0.071]], dtype=float32)>

Categorical data 