# TF Dataset을 이용한 데이터 처리 해보기

### import packages

In [45]:
import pathlib
import os
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import tensorflow as tf
np.set_printoptions(precision=3, suppress=True)

### data Load

In [46]:
TRAIN_DATA_URL = "https://storage.googleapis.com/tf-datasets/titanic/train.csv"
TEST_DATA_URL = "https://storage.googleapis.com/tf-datasets/titanic/eval.csv"

train_file_path = tf.keras.utils.get_file("train.csv", TRAIN_DATA_URL)
test_file_path = tf.keras.utils.get_file("eval.csv", TEST_DATA_URL)

In [47]:
LABEL_COLUMN = 'survived'
LABELS = [0, 1]

In [48]:
def get_dataset(file_path, **kwargs):
    dataset = tf.data.experimental.make_csv_dataset(
        file_path, # 파일 경로
        batch_size=5, # 데이터 배치 크기
        label_name=LABEL_COLUMN, # 라벨 이름 설정
        na_value="?", # 
        num_epochs=1,
        ignore_errors=True,
        **kwargs)
    return dataset

raw_train_data = get_dataset(train_file_path)

In [49]:
def show_batch(dataset):
    for batch, label in dataset.take(1):
        for key, value in batch.items():
            print("{:20s}: {}".format(key, value.numpy()))

In [50]:
show_batch(raw_train_data)

sex                 : [b'male' b'female' b'female' b'male' b'male']
age                 : [28. 42. 27. 44. 22.]
n_siblings_spouses  : [8 1 1 0 0]
parch               : [2 0 0 0 0]
fare                : [69.55  26.    13.858  7.925  8.05 ]
class               : [b'Third' b'Second' b'Second' b'Third' b'Third']
deck                : [b'unknown' b'unknown' b'unknown' b'unknown' b'unknown']
embark_town         : [b'Southampton' b'Southampton' b'Cherbourg' b'Southampton' b'Southampton']
alone               : [b'n' b'n' b'n' b'y' b'y']


### data preprocessing

In [51]:
CSV_COLUMNS = ['survived', 'sex', 'age', 'n_siblings_spouses', 'parch', 'fare', 'class', 'deck', 'embark_town', 'alone']

temp_dataset = get_dataset(train_file_path, column_names = CSV_COLUMNS)

show_batch(temp_dataset)

sex                 : [b'female' b'male' b'female' b'female' b'female']
age                 : [28.  3. 28. 24. 38.]
n_siblings_spouses  : [0 1 0 0 0]
parch               : [2 1 0 0 0]
fare                : [15.246 26.     7.879 83.158 80.   ]
class               : [b'Third' b'Second' b'Third' b'First' b'First']
deck                : [b'unknown' b'F' b'unknown' b'C' b'B']
embark_town         : [b'Cherbourg' b'Southampton' b'Queenstown' b'Cherbourg' b'unknown']
alone               : [b'n' b'n' b'y' b'y' b'y']


In [52]:
SELECT_COLUMN = ['survived', 'age', 'n_siblings_spouses', 'parch', 'fare']
DEFAULTS = [0, 0.0, 0.0, 0.0, 0.0]
temp_dataset = get_dataset(train_file_path,
                          select_columns=SELECT_COLUMN,
                          column_defaults=DEFAULTS)
show_batch(temp_dataset)

age                 : [28. 35. 28. 44. 28.]
n_siblings_spouses  : [0. 0. 0. 0. 0.]
parch               : [0. 0. 2. 1. 0.]
fare                : [  7.896 512.329  22.358  16.1    15.5  ]


In [53]:
example_batch, labels_batch = next(iter(temp_dataset))

In [54]:
def pack(features, label):
    return tf.stack(list(features.values()), axis=-1), label

In [55]:
packed_dataset = temp_dataset.map(pack)

for features, labels in packed_dataset.take(1):
    print(features.numpy())
    print()
    print(labels.numpy())

[[ 25.      1.      2.    151.55 ]
 [ 24.      2.      0.     73.5  ]
 [ 36.      1.      2.    120.   ]
 [ 35.      0.      0.    135.633]
 [ 33.      0.      0.      7.775]]

[0 0 1 1 0]


In [56]:
show_batch(raw_train_data)

sex                 : [b'female' b'female' b'male' b'male' b'female']
age                 : [35. 28. 35. 41. 42.]
n_siblings_spouses  : [1 0 0 0 0]
parch               : [0 1 0 0 0]
fare                : [ 90.     55.      7.896   7.125 227.525]
class               : [b'First' b'First' b'Third' b'Third' b'First']
deck                : [b'C' b'E' b'unknown' b'unknown' b'unknown']
embark_town         : [b'Southampton' b'Southampton' b'Cherbourg' b'Southampton' b'Cherbourg']
alone               : [b'n' b'n' b'y' b'y' b'y']


In [57]:
example_batch, labels_batch = next(iter(temp_dataset))

In [58]:
class PackNumericFeatures(object):
    def __init__(self, names):
        self.names=names
    def __call__(self, features, labels):
        numeric_features = [features.pop(name) for name in self.names]
        numeric_features = [tf.cast(feat, tf.float32) for feat in numeric_features]
        numeric_features = tf.stack(numeric_features, axis=-1)
        features['numeric'] = numeric_features
        
        return features, labels

In [59]:
NUMERIC_FEATURES = ['age', 'n_siblings_spouses', 'parch', 'fare']
packed_train_data =  raw_train_data.map(PackNumericFeatures(NUMERIC_FEATURES))

In [60]:
show_batch(packed_train_data)

sex                 : [b'male' b'female' b'male' b'female' b'male']
class               : [b'Third' b'First' b'Third' b'Third' b'Third']
deck                : [b'unknown' b'C' b'unknown' b'unknown' b'unknown']
embark_town         : [b'Southampton' b'Cherbourg' b'Cherbourg' b'Queenstown' b'Queenstown']
alone               : [b'y' b'n' b'y' b'y' b'y']
numeric             : [[22.     0.     0.     7.25 ]
 [28.     1.     0.    89.104]
 [28.     0.     0.     7.229]
 [28.     0.     0.     7.75 ]
 [28.     0.     0.     7.829]]


In [65]:
example_batch, labels_batch = next(iter(temp_dataset))

In [66]:
desc = pd.read_csv(train_file_path)[NUMERIC_FEATURES].describe()
desc

Unnamed: 0,age,n_siblings_spouses,parch,fare
count,627.0,627.0,627.0,627.0
mean,29.631308,0.545455,0.379585,34.385399
std,12.511818,1.15109,0.792999,54.59773
min,0.75,0.0,0.0,0.0
25%,23.0,0.0,0.0,7.8958
50%,28.0,0.0,0.0,15.0458
75%,35.0,1.0,0.0,31.3875
max,80.0,8.0,5.0,512.3292


In [74]:
MEAN = np.array(desc.T['mean'])
STD = np.array(desc.T['std'])

In [75]:
def normalize_numeric_data(data, mean, std):
    return (data-mean)/std

In [76]:
import functools

normalizer = partial(normalize_numeric_data, mean=MEAN, std=STD)

numeric_column = tf.feature_column.numeric_column('numeric', normalizer_fn=normalizer, shape=[len(NUMERIC_FEATURES)])
numeric_columns = [numeric_column]
numeric_column

NameError: name 'normailze_numeric_data' is not defined