In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd

In [2]:
# load data
train_data = pd.read_csv(r"./train.csv")
test_data = pd.read_csv(r"./test.csv")

In [3]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [5]:
def drop_not_concerned_columns(data, columns):
    return data.drop(columns, axis=1)

not_concerned_columns = ["PassengerId","Name", "Ticket", "Fare", "Cabin", "Embarked"]
train_data = drop_not_concerned_columns(train_data, not_concerned_columns)
test_data = drop_not_concerned_columns(test_data, not_concerned_columns)

In [6]:
train_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch
0,0,3,male,22.0,1,0
1,1,1,female,38.0,1,0
2,1,3,female,26.0,0,0
3,1,1,female,35.0,1,0
4,0,3,male,35.0,0,0


In [7]:
test_data.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch
0,3,male,34.5,0,0
1,3,female,47.0,1,0
2,2,male,62.0,0,0
3,3,male,27.0,0,0
4,3,female,22.0,1,1


In [8]:
def clean_nan_columns(data, columns):
    for column in columns:
        print("start clean ", column)
        nan_column = np.isnan(data[column])
        if len(nan_column[nan_column]) != 0:
            data = data[nan_column != True]
        print("finish clean ", column)
    return data


nan_columns = ["Age", "SibSp", "Parch"]

train_data = clean_nan_columns(train_data, nan_columns)
print(len(train_data))
test_data = clean_nan_columns(test_data, nan_columns)
print(len(test_data))

start clean  Age
finish clean  Age
start clean  SibSp
finish clean  SibSp
start clean  Parch
finish clean  Parch
714
start clean  Age
finish clean  Age
start clean  SibSp
finish clean  SibSp
start clean  Parch
finish clean  Parch
332


In [9]:
def dummy_data(data, columns):
    for column in columns:
        data = pd.concat([data, pd.get_dummies(data[column], prefix=column)], axis=1)
        data = data.drop(column, axis=1)
    return data


dummy_columns = ["Pclass"]
train_data=dummy_data(train_data, dummy_columns)
test_data=dummy_data(test_data, dummy_columns)

In [10]:
train_data.head()

Unnamed: 0,Survived,Sex,Age,SibSp,Parch,Pclass_1,Pclass_2,Pclass_3
0,0,male,22.0,1,0,0,0,1
1,1,female,38.0,1,0,1,0,0
2,1,female,26.0,0,0,0,0,1
3,1,female,35.0,1,0,1,0,0
4,0,male,35.0,0,0,0,0,1


In [11]:
test_data.head()

Unnamed: 0,Sex,Age,SibSp,Parch,Pclass_1,Pclass_2,Pclass_3
0,male,34.5,0,0,0,0,1
1,female,47.0,1,0,0,0,1
2,male,62.0,0,0,0,1,0
3,male,27.0,0,0,0,0,1
4,female,22.0,1,1,0,0,1


In [12]:
from sklearn.preprocessing import LabelEncoder
def sex_to_int(data):
    le = LabelEncoder()
    le.fit(["male","female"])
    data["Sex"]=le.transform(data["Sex"]) 
    return data

train_data = sex_to_int(train_data)
test_data = sex_to_int(test_data)
train_data.head()

Unnamed: 0,Survived,Sex,Age,SibSp,Parch,Pclass_1,Pclass_2,Pclass_3
0,0,1,22.0,1,0,0,0,1
1,1,0,38.0,1,0,1,0,0
2,1,0,26.0,0,0,0,0,1
3,1,0,35.0,1,0,1,0,0
4,0,1,35.0,0,0,0,0,1


In [13]:
from sklearn.preprocessing import Normalizer

def normalize_age(data):
    max_age = data["Age"].max()
    min_age = data["Age"].min()
    data["Age"] = (data["Age"] - min_age) / (max_age - min_age)
    return data
train_data = normalize_age(train_data)
test_data = normalize_age(test_data)
train_data.head()

Unnamed: 0,Survived,Sex,Age,SibSp,Parch,Pclass_1,Pclass_2,Pclass_3
0,0,1,0.271174,1,0,0,0,1
1,1,0,0.472229,1,0,1,0,0
2,1,0,0.321438,0,0,0,0,1
3,1,0,0.434531,1,0,1,0,0
4,0,1,0.434531,0,0,0,0,1


In [27]:
from sklearn.preprocessing import LabelBinarizer

def split_valid_test_data(data, fraction=0.8):
    data_y = data["Survived"]
    lb = LabelBinarizer()
    data_y = lb.fit_transform(data_y)

    data_x = data.drop(["Survived"], axis=1)

    train_valid_split_idx = int(len(data_x) * fraction)
    train_x = data_x[:train_valid_split_idx]
    train_y = data_y[:train_valid_split_idx]

    valid_x = data_x[train_valid_split_idx:]
    valid_y = data_y[train_valid_split_idx:]

    valid_test_split_idx = train_valid_split_idx // 2
    test_x = valid_x[valid_test_split_idx:]
    test_y = valid_y[valid_test_split_idx:]

    valid_x = valid_x[:valid_test_split_idx]
    valid_y = valid_y[:valid_test_split_idx]

    return train_x.values, train_y, valid_x.values, valid_y, test_x.values, test_y

train_x, train_y, valid_x, valid_y, test_x, test_y = split_valid_test_data(train_data)
print("train_x:{}".format(train_x.shape))
print("train_y:{}".format(train_y.shape))
print("train_y content:{}".format(train_y[:3]))

print("valid_x:{}".format(valid_x.shape))
print("valid_y:{}".format(valid_y.shape))

print("test_x:{}".format(valid_x.shape))
print("test_y:{}".format(valid_y.shape))

train_x:(571, 7)
train_y:(571, 1)
train_y content:[[0]
 [1]
 [1]]
valid_x:(143, 7)
valid_y:(143, 1)
test_x:(143, 7)
test_y:(143, 1)


In [24]:
hidden_units=20

inputs = tf.placeholder(tf.float32, shape=[None, train_x.shape[1]])
labels = tf.placeholder(tf.float32, shape=[None, 1])

fc = tf.contrib.layers.fully_connected(inputs, hidden_units)
logits = tf.contrib.layers.fully_connected(fc, 1, activation_fn=None)
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(labels=labels, logits=logits)
cost = tf.reduce_mean(cross_entropy)

optimizer = tf.train.AdamOptimizer().minimize(cost)

predicted = tf.nn.softmax(logits)
correct_pred = tf.equal(tf.argmax(predicted, 1), tf.argmax(labels, 1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [25]:
epochs = 50
iteration = 0

saver = tf.train.Saver()
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    for e in range(epochs):
        feed = {inputs: train_x,
                labels: train_y}
        
        loss, _ = sess.run([cost, optimizer], feed_dict=feed)
        print("Epoch: {}/{}".format(e + 1, epochs),
              "Training loss: {:.5f}".format(loss))

        if e % 5 == 0:
            feed = {inputs: valid_x,
                    labels: valid_y}
            val_acc = sess.run(accuracy, feed_dict=feed)
            print("Epoch: {}/{}".format(e + 1, epochs),
                  "Validation Acc: {:.4f}".format(val_acc))
    saver.save(sess, "./titanic.ckpt")

Epoch: 1/50 Training loss: 0.00000
Epoch: 1/50 Validation Acc: 1.0000
Epoch: 2/50 Training loss: 0.00000
Epoch: 3/50 Training loss: 0.00000
Epoch: 4/50 Training loss: 0.00000
Epoch: 5/50 Training loss: 0.00000
Epoch: 6/50 Training loss: 0.00000
Epoch: 6/50 Validation Acc: 1.0000
Epoch: 7/50 Training loss: 0.00000
Epoch: 8/50 Training loss: 0.00000
Epoch: 9/50 Training loss: 0.00000
Epoch: 10/50 Training loss: 0.00000
Epoch: 11/50 Training loss: 0.00000
Epoch: 11/50 Validation Acc: 1.0000
Epoch: 12/50 Training loss: 0.00000
Epoch: 13/50 Training loss: 0.00000
Epoch: 14/50 Training loss: 0.00000
Epoch: 15/50 Training loss: 0.00000
Epoch: 16/50 Training loss: 0.00000
Epoch: 16/50 Validation Acc: 1.0000
Epoch: 17/50 Training loss: 0.00000
Epoch: 18/50 Training loss: 0.00000
Epoch: 19/50 Training loss: 0.00000
Epoch: 20/50 Training loss: 0.00000
Epoch: 21/50 Training loss: 0.00000
Epoch: 21/50 Validation Acc: 1.0000
Epoch: 22/50 Training loss: 0.00000
Epoch: 23/50 Training loss: 0.00000
Epo