In [1]:
from __future__ import print_function
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

In [2]:
def format_data(df):
    df.drop(labels=["Name", "Ticket"], axis=1, inplace=True)
    df.Sex.replace(to_replace=["male","female"], value=[1,0], inplace=True)

    # Convert categorical Embarked values into one hot encoding.
    df = df.join(pd.get_dummies(df.Embarked))
    df.drop(labels="Embarked", axis=1, inplace=True)

    # Replace NaNs in Age with mean.
    df.Age.replace(to_replace=np.nan, value=round(df.Age.mean()), inplace=True)

    # Replace NaNs in Fare with mean.
    df.Fare.replace(to_replace=np.nan, value=round(df.Fare.mean()), inplace=True)

    # Replace NaNs in Cabin with 0 and with 1 for anything else.
    df.Cabin.replace(to_replace=np.nan, value=0,inplace=True)
    df.loc[df['Cabin'] != 0] = 1
    
    return df



In [3]:
# Split the data into X and y.
# Read data
df_train = pd.read_csv("train.csv", index_col=0)
df = format_data(df_train)
df_X = df.drop(labels="Survived", axis=1)
df_X = df_X.astype('float32') # Float32 data type in all the columns.
df_y = pd.DataFrame(data=df.Survived, columns=["Survived"]) # Int64 datatype in all the columns.
df_y = df_y.astype('float32')

X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, test_size=0.3, random_state=100)

# Tensorflow graph construction phase.
tf.reset_default_graph() # Reset default graph if you want to execute this cell multiple times. 

n_inputs = df_X.shape[1]
n_hidden1 = 10
n_hidden2 = 5
n_outputs = df_y.shape[1]
X = tf.placeholder(tf.float32, shape=(None,n_inputs), name="X")
y = tf.placeholder(tf.float32, shape=(None), name="y")

    
# Define network architecture
hidden1 = tf.layers.dense(X, n_hidden1, activation=tf.nn.relu,
                          kernel_initializer = tf.contrib.layers.xavier_initializer(),
                          name="hidden1")

# This is how you can get the weights or any other property saved in the graph variables.
with tf.variable_scope("hidden1", reuse=True):
    weights1 = tf.get_variable("kernel")

hidden2 = tf.layers.dense(hidden1, n_hidden2, activation=tf.nn.relu,
                          kernel_regularizer=tf.contrib.layers.l2_regularizer(weights1), 
                          name="hidden2")

with tf.variable_scope("hidden2", reuse=True):
    weights2 = tf.get_variable("kernel")
    
hidden3 = tf.layers.dense(hidden2, n_hidden2, activation=tf.nn.relu,
                          kernel_regularizer=tf.contrib.layers.l2_regularizer(weights2), 
                          name="hidden3")

logits = tf.layers.dense(hidden3, n_outputs, name="logits")

with tf.name_scope("loss"):
    loss = tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=y)

with tf.name_scope("train"):
    learning_rate = 0.01
    trainer = tf.train.AdamOptimizer(learning_rate).minimize(loss)
    
with tf.name_scope("eval"):
    # Define the accuracy
    #The default threshold is 0.5, rounded off directly
    prediction = tf.round(tf.sigmoid(logits))
    #prediction = tf.round(logits)
    prediction_no_round = tf.sigmoid(logits)
    #prediction_no_round = logits
    # Bool into float32 type
    correct = tf.cast(tf.equal(prediction, y), dtype=tf.float32)
    # Average
    accuracy = tf.reduce_mean(correct)
    
    # Using tf.metrics.accuracy method to calculate accuracy
    acc, acc_update = tf.metrics.accuracy(labels=y, predictions=prediction_no_round)
    
    # Using tf.metrics.auc method to calculate auc
    auc = tf.metrics.auc(labels=y,predictions=prediction_no_round)

# Assign variable initializer
init = tf.global_variables_initializer()
init_local = tf.local_variables_initializer()

# Assign model saver
saver = tf.train.Saver()

In [7]:
# TensorFlow execution phase.
n_epochs = 10000

#writer = tf.summary.FileWriter('./graphs', tf.get_default_graph())
with tf.Session(config=tf.ConfigProto(log_device_placement=True)) as sess:
    init.run()
    init_local.run()
    for epoch in range(n_epochs):
        X_train = df_X
        y_train = df_y
        t, loss_value = sess.run([trainer, loss], feed_dict={X:X_train, y:y_train})
        acc_train = accuracy.eval(feed_dict={X:X_train, y:y_train})
        acc_test = accuracy.eval(feed_dict={X:X_test, y:y_test})
        acc1_train = acc_update.eval(feed_dict={X:X_train, y:y_train})
        acc1_test = acc_update.eval(feed_dict={X:X_test, y:y_test})
        auc_compute = sess.run(auc, feed_dict={X:X_test, y:y_test})
        sigmoid_output = sess.run(prediction_no_round, feed_dict={X:X_test, y:y_test})
        avg_loss = np.mean(loss_value) 
        auc_skleran = roc_auc_score(list(y_test.Survived), list(sigmoid_output))
    print("Epoc {0:d} Loss {1:.5f} Training and test accuracy {2:.5f} {3:.5f} Test AUC {4:.3f} AUC SK {5:.5f} {6:.3f} {7:.3f}"
              .format(epoch,avg_loss, acc_train, acc_test, auc_compute[0], auc_skleran, acc1_train, acc1_test))
    
    save_path = saver.save(sess, "./titanic.ckpt")
    
#writer.close()

Epoc 9999 Loss 0.21591 Training and test accuracy 0.91358 0.90299 Test AUC 0.940 AUC SK 0.95332 0.228 0.228


In [5]:
prediction_no_round

<tf.Tensor 'eval/Sigmoid_1:0' shape=(?, 1) dtype=float32>

In [6]:
print("{0:.5f}".format(float(sigmoid_output[15])))

0.13623
