In [8]:
import os
import matplotlib.pyplot as plt
import tensorflow as tf
import pandas as pd
import numpy as np
print(tf.__version__)

2.8.0


In [9]:
TRAIN_DATA_URL = "https://storage.googleapis.com/tf-datasets/titanic/train.csv"
TEST_DATA_URL = "https://storage.googleapis.com/tf-datasets/titanic/eval.csv"

train_file_path = tf.keras.utils.get_file("train.csv", TRAIN_DATA_URL)
test_file_path = tf.keras.utils.get_file("eval.csv", TEST_DATA_URL)

column_names = ['survived', 'sex', 'age', 'n_siblings_spouses', 'parch', 'fare',
       'class', 'deck', 'embark_town', 'alone']

df = pd.read_csv(TRAIN_DATA_URL)
# pd.set_option('display.max_rows', None)
print(df)
# 데이터 정보 확인
print(df.info())
# 수치형 데이터 확인
print(df.describe())
# 범주형 데이터 확인
df.describe(include = np.object_)
# 결측치 확인
df.isnull().sum()

     survived     sex   age  n_siblings_spouses  parch     fare   class  \
0           0    male  22.0                   1      0   7.2500   Third   
1           1  female  38.0                   1      0  71.2833   First   
2           1  female  26.0                   0      0   7.9250   Third   
3           1  female  35.0                   1      0  53.1000   First   
4           0    male  28.0                   0      0   8.4583   Third   
..        ...     ...   ...                 ...    ...      ...     ...   
622         0    male  28.0                   0      0  10.5000  Second   
623         0    male  25.0                   0      0   7.0500   Third   
624         1  female  19.0                   0      0  30.0000   First   
625         0  female  28.0                   1      2  23.4500   Third   
626         0    male  32.0                   0      0   7.7500   Third   

        deck  embark_town alone  
0    unknown  Southampton     n  
1          C    Cherbourg     n

survived              0
sex                   0
age                   0
n_siblings_spouses    0
parch                 0
fare                  0
class                 0
deck                  0
embark_town           0
alone                 0
dtype: int64

In [10]:
# 데이터 요약

print("전체 데이터 수:", df.shape[0] * df.shape[1])
print(f"결측치 수: {df.isnull().sum().sum()}")
print("총 인원 수:", df["age"].count())
print("중복된 데이터:",df.duplicated().sum())

전체 데이터 수: 6270
결측치 수: 0
총 인원 수: 627
중복된 데이터: 69


In [12]:
feature_names = column_names[1:]
label_name = column_names[0]

print(feature_names)
print(label_name)

batch_size = 32
train_dataset = tf.data.experimental.make_csv_dataset(
    train_file_path,
    batch_size,
    column_names=column_names,
    label_name=label_name,
    num_epochs=1)

test_dataset = tf.data.experimental.make_csv_dataset(
    train_file_path,
    batch_size,
    column_names=column_names,
    label_name=label_name,
    num_epochs=1)

features, labels = next(iter(train_dataset))

print(f"##features \n {features}")


['sex', 'age', 'n_siblings_spouses', 'parch', 'fare', 'class', 'deck', 'embark_town', 'alone']
survived
##features 
 OrderedDict([('sex', <tf.Tensor: shape=(32,), dtype=string, numpy=
array([b'male', b'male', b'male', b'male', b'male', b'female', b'male',
       b'male', b'male', b'female', b'male', b'male', b'female',
       b'female', b'male', b'male', b'female', b'female', b'male',
       b'female', b'female', b'female', b'female', b'male', b'female',
       b'female', b'female', b'male', b'female', b'female', b'male',
       b'male'], dtype=object)>), ('age', <tf.Tensor: shape=(32,), dtype=float32, numpy=
array([48., 28., 28., 34., 16., 35., 28., 32., 11., 25., 28., 20., 50.,
       30., 20., 28., 52., 23., 40., 38., 35., 40.,  2., 29., 28., 36.,
       22., 23., 25., 29., 30., 43.], dtype=float32)>), ('n_siblings_spouses', <tf.Tensor: shape=(32,), dtype=int32, numpy=
array([1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 3, 0, 0, 1, 1, 1, 0, 1, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0], dtyp

In [13]:
age_max = df['age'].max()
age_min = df['age'].min()
n_siblings_spouses_max = df['n_siblings_spouses'].max()
n_siblings_spouses_min = df['n_siblings_spouses'].min()
parch_max = df['parch'].max()
parch_min = df['parch'].min()
fare_max = df['fare'].max()
fare_min = df['fare'].min()


# 불필요한 컬럼 삭제 및 데이터 정규화 -> embark_town
def transform(features, labels):
    features['age'] = (features['age'] - age_min) / (age_max - age_min)
    features['n_siblings_spouses'] = (features['n_siblings_spouses'] - n_siblings_spouses_min) / (n_siblings_spouses_max - n_siblings_spouses_min)
    features['parch'] = (features['parch'] - parch_min) / (parch_max - parch_min)
    features['fare'] = (features['fare'] - fare_min) / (fare_max - fare_min)
    del(features['embark_town'])
    return features, labels

train_dataset = train_dataset.map(transform)
test_dataset = test_dataset.map(transform)
features, labels = next(iter(train_dataset))

print(f"features \n {features}")

features 
 OrderedDict([('sex', <tf.Tensor: shape=(32,), dtype=string, numpy=
array([b'male', b'female', b'male', b'female', b'female', b'male',
       b'male', b'male', b'male', b'male', b'female', b'male', b'male',
       b'male', b'male', b'female', b'female', b'male', b'male', b'male',
       b'male', b'male', b'male', b'male', b'male', b'female', b'female',
       b'male', b'male', b'male', b'female', b'female'], dtype=object)>), ('age', <tf.Tensor: shape=(32,), dtype=float32, numpy=
array([0.3438486 , 0.39432177, 0.3438486 , 0.3438486 , 0.3564669 ,
       0.6214511 , 0.36908516, 0.3438486 , 0.44479495, 0.3438486 ,
       0.17981073, 0.3438486 , 0.50157726, 0.43217665, 0.07886435,
       0.43217665, 0.48264983, 0.5835962 , 0.21766561, 0.31861198,
       0.49526814, 0.3438486 , 0.3438486 , 0.39432177, 0.59621453,
       0.5835962 , 0.06624606, 0.2933754 , 0.43217665, 0.20504732,
       0.3438486 , 0.3438486 ], dtype=float32)>), ('n_siblings_spouses', <tf.Tensor: shape=(32,), dtype=

In [14]:
# 문자열 처리 -> sex, deck, alone, class를 문자열로
CAT_COLUMNS = ['sex', 'deck', 'alone', 'class']
NUM_COLUMNS = ['age', 'fare', 'n_siblings_spouses', 'parch']

feature_cols = []

# Create IndicatorColumn for categorical features
for feature in CAT_COLUMNS:
  vocab = df[feature].unique()
  feature_cols.append(tf.feature_column.indicator_column(
      tf.feature_column.categorical_column_with_vocabulary_list(feature, vocab)))

# Create NumericColumn for numerical features
for feature in NUM_COLUMNS:
  feature_cols.append(tf.feature_column.numeric_column(feature, dtype=tf.float32))

print(feature_cols)

[IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='sex', vocabulary_list=('male', 'female'), dtype=tf.string, default_value=-1, num_oov_buckets=0)), IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='deck', vocabulary_list=('unknown', 'C', 'G', 'A', 'B', 'D', 'F', 'E'), dtype=tf.string, default_value=-1, num_oov_buckets=0)), IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='alone', vocabulary_list=('n', 'y'), dtype=tf.string, default_value=-1, num_oov_buckets=0)), IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='class', vocabulary_list=('Third', 'First', 'Second'), dtype=tf.string, default_value=-1, num_oov_buckets=0)), NumericColumn(key='age', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), NumericColumn(key='fare', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), NumericColumn(key='n_siblings_spouses', shape=(1,), default_value=None, dtype=tf.float32, n

In [15]:
# 모델 생성
model = tf.keras.Sequential()
model.add(tf.keras.layers.DenseFeatures(feature_cols))
model.add(tf.keras.layers.Dense(10, activation='relu'))
model.add(tf.keras.layers.Dense(8, activation='relu'))
model.add(tf.keras.layers.Dense(5, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model.compile(optimizer='sgd', loss='binary_crossentropy', metrics=['binary_accuracy'])

model.fit(train_dataset, epochs=100)
        

Epoch 1/100


2022-05-16 23:04:17.848302: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 7

<keras.callbacks.History at 0x14f7cc340>

In [16]:
# 모델 평가하기
model.predict(test_dataset)



2022-05-16 23:04:42.480479: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


array([[0.58731997],
       [0.7441584 ],
       [0.12575173],
       [0.1218008 ],
       [0.11045069],
       [0.10860484],
       [0.10928388],
       [0.10934852],
       [0.2952905 ],
       [0.91014296],
       [0.46740618],
       [0.8543506 ],
       [0.12125544],
       [0.9263537 ],
       [0.4676793 ],
       [0.8539536 ],
       [0.46751022],
       [0.90712965],
       [0.10865665],
       [0.10924384],
       [0.11004953],
       [0.3678554 ],
       [0.12717405],
       [0.10940309],
       [0.12078246],
       [0.86317945],
       [0.10968216],
       [0.89505965],
       [0.12676176],
       [0.9424716 ],
       [0.10779478],
       [0.1154998 ],
       [0.92956096],
       [0.17661929],
       [0.12717405],
       [0.70610017],
       [0.16169126],
       [0.9245759 ],
       [0.1095496 ],
       [0.19135894],
       [0.11001623],
       [0.40158287],
       [0.10973606],
       [0.19797698],
       [0.13960436],
       [0.92257917],
       [0.91637754],
       [0.122