In [1]:
# Import libraries
import pandas as pd
import tensorflow as tf # Here I use tensorflow version 2.2
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# Read csv using pandas
data = pd.read_csv('./train_clean.csv', index_col=False)
print('Length : ', len(data))
data.head() # Print top 5 rows 

Length :  891


Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket,Title,Family_Size
0,22.0,,S,7.25,"Braund, Mr. Owen Harris",0,1,3,male,1,0.0,A/5 21171,Mr,1
1,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,female,1,1.0,PC 17599,Mrs,1
2,26.0,,S,7.925,"Heikkinen, Miss. Laina",0,3,3,female,0,1.0,STON/O2. 3101282,Miss,0
3,35.0,C123,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,female,1,1.0,113803,Mrs,1
4,35.0,,S,8.05,"Allen, Mr. William Henry",0,5,3,male,0,0.0,373450,Mr,0


In [3]:
# Split the data into training and testing data 
# 80% for training and 20% for testing

train_data, test_data = train_test_split(data, test_size=0.2)
print('Number of training samples : ',len(train_data))
print('Number of test samples : ', len(test_data))

Number of training samples :  712
Number of test samples :  179


In [4]:
# Count the number of null values in each column for both training and testing data
print('For training ...')
print(train_data.isna().sum())

print('For testing ...')
print(test_data.isna().sum())

For training ...
Age              0
Cabin          555
Embarked         0
Fare             0
Name             0
Parch            0
PassengerId      0
Pclass           0
Sex              0
SibSp            0
Survived         0
Ticket           0
Title            0
Family_Size      0
dtype: int64
For testing ...
Age              0
Cabin          132
Embarked         0
Fare             0
Name             0
Parch            0
PassengerId      0
Pclass           0
Sex              0
SibSp            0
Survived         0
Ticket           0
Title            0
Family_Size      0
dtype: int64


In [5]:
# Remove null valued column "Cabin"
# Here we remove the entire column since we have more null values than the actual values
train_data.drop('Cabin', axis=1, inplace=True)
test_data.drop('Cabin', axis=1, inplace=True)

In [6]:
# Count the number of null values in each column for both training and testing data
print('For training ...')
print(train_data.isna().sum())

print('\nFor testing ...')
print(test_data.isna().sum())

# From the below result we can conclude that there is no null values left in out data

For training ...
Age            0
Embarked       0
Fare           0
Name           0
Parch          0
PassengerId    0
Pclass         0
Sex            0
SibSp          0
Survived       0
Ticket         0
Title          0
Family_Size    0
dtype: int64

For testing ...
Age            0
Embarked       0
Fare           0
Name           0
Parch          0
PassengerId    0
Pclass         0
Sex            0
SibSp          0
Survived       0
Ticket         0
Title          0
Family_Size    0
dtype: int64


In [7]:
# Remove other unneccesary columns

# Remove Name column since it is different for different people and have nothing to do with survival
train_data.drop('Name', axis=1, inplace=True)
test_data.drop('Name', axis=1, inplace=True)

# Similarly drop Fare, PassengerId, Ticket
train_data.drop('Fare', axis=1, inplace=True)
test_data.drop('Fare', axis=1, inplace=True)

train_data.drop('PassengerId', axis=1, inplace=True)
test_data.drop('PassengerId', axis=1, inplace=True)

train_data.drop('Ticket', axis=1, inplace=True)
test_data.drop('Ticket', axis=1, inplace=True)

In [8]:
# Remove the target column from the data and store it in a separate variable 
train_label = train_data.pop('Survived')
test_label = test_data.pop('Survived')

In [9]:
# Print top 5 labels in test set
test_label.head()

359    1.0
747    1.0
370    1.0
659    0.0
637    0.0
Name: Survived, dtype: float64

In [10]:
print('Length of traing data: ', len(train_data))
print('Length of training label : ', len(train_label))
print('Length of test data: ', len(test_data))
print('Length of test label : ', len(test_label))

Length of traing data:  712
Length of training label :  712
Length of test data:  179
Length of test label :  179


In [11]:
#Create train input function
def create_train_input_fn():
    '''
    Returns input function that would feed pandas Dataframe into model for training
    '''
    return tf.compat.v1.estimator.inputs.pandas_input_fn(
        x = train_data,
        y = train_label,
        batch_size = 8, # Update parameters for every 8 training samples
        num_epochs = None, # Since we pass it multiple times through the network
        shuffle= True 
    )

#Create test input function
def create_test_input_fn():
    '''
    Returns input function that would feed pandas Dataframe into model for testing
    '''
    return tf.compat.v1.estimator.inputs.pandas_input_fn(
        x = test_data,
        y = test_label,
        num_epochs = 1, # Pass only one iteration while testing
        shuffle= False # No need of shuffling test set
    )

In [12]:
# Base feature columns
feature_list = []

CATEGORICAL_COLUMN = ['Embarked', 'Sex', 'Title'] # Column name containing string values
NUMERICAL_COLUMN = ['Age', 'Parch', 'Pclass', 'SibSp', 'Family_Size'] # Column name containing numerical values

for feature_name in CATEGORICAL_COLUMN:
    vocab = train_data[feature_name].unique() # This returns a list of unique values in the column
    feature_list.append(tf.feature_column.categorical_column_with_vocabulary_list(feature_name, vocab))

for feature_name in NUMERICAL_COLUMN:
    feature_list.append(tf.feature_column.numeric_column(feature_name, dtype = tf.int32)) # Represents numerical features

In [13]:
# Create an instance of input funtion
input_train_fn = create_train_input_fn()

# Create an instance of Linear Classifier estimator
# This writes log to the graphs/linear folder
# Every time you train, it resumes from the latest check point
# For training from beginning, make sure you clean the log directory before you train
estimator = tf.estimator.LinearClassifier(feature_list, model_dir='./graphs/linear', n_classes=2)

# Train the estimator
estimator.train(input_train_fn, steps=1000)


INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': './graphs/linear', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Use Varia

<tensorflow_estimator.python.estimator.canned.linear.LinearClassifierV2 at 0x1ef872c2b48>

In [14]:
# Evaluate the model

# Create an instance of test input function
input_test_fn = create_test_input_fn()

# Pass input_test_fn as an argument to the evaluate function
# This will print the accuracy along with other performance measures
estimator.evaluate(input_test_fn)

INFO:tensorflow:Calling model_fn.


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2020-07-04T20:07:54Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./graphs/linear\model.ckpt-1000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Inference Time : 1.57819s
INFO:tensorflow:Finished evaluation at 2020-07-04-20:07:56
INFO:tensorflow:Saving dict for global step 1000: accuracy = 0.8156425, accuracy_baseline = 0.61452514, auc = 0.90013176, auc_precision_recall = 0.8701625, average_loss = 0.45590118, global_step = 1000, label/mean = 0.38547486, loss = 0.469847, precision = 0.70454544, pr

{'accuracy': 0.8156425,
 'accuracy_baseline': 0.61452514,
 'auc': 0.90013176,
 'auc_precision_recall': 0.8701625,
 'average_loss': 0.45590118,
 'label/mean': 0.38547486,
 'loss': 0.469847,
 'precision': 0.70454544,
 'prediction/mean': 0.52929664,
 'recall': 0.89855075,
 'global_step': 1000}

In [15]:
# Predict output for test set
input_test_fn = create_test_input_fn()
result = estimator.predict(input_test_fn)

# Verify results
i = 0
for res,actual_result in zip(result, test_label):
    predicted_result = res['class_ids'][0]
    print("\n Example : {} \t Actual : {} \t Predicted : {}".format(i, actual_result, predicted_result))

    i += 1
    if i == 10:
        break

INFO:tensorflow:Calling model_fn.


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./graphs/linear\model.ckpt-1000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.

 Example : 0 	 Actual : 1.0 	 Predicted : 1

 Example : 1 	 Actual : 1.0 	 Predicted : 1

 Example : 2 	 Actual : 1.0 	 Predicted : 1

 Example : 3 	 Actual : 0.0 	 Predicted : 1

 Example : 4 	 Actual : 0.0 	 Predicted : 0

 Example : 5 	 Actual : 1.0 	 Predicted : 1

 Example : 6 	 Actual : 1.0 	 Predicted : 1

 Example : 7 	 Actual : 1.0 	 Predicted : 1

 Example : 8 	 Actual : 0.0 	 Predicted : 0

 Example : 9 	 Actual : 0.

In [16]:
# Train a DNN Classifier 

# Let's create a vocabulary list for features with fewer categorical values

Embarked = tf.feature_column.categorical_column_with_vocabulary_list('Embarked',train_data['Embarked'].unique())

Sex = tf.feature_column.categorical_column_with_vocabulary_list('Sex',train_data['Sex'].unique())

Title = tf.feature_column.categorical_column_with_vocabulary_list('Title',train_data['Title'].unique())

In [17]:
# Create feature column
feature_columns = [
    # Indicator column represents multi-hot representation of given categorical column
    tf.feature_column.indicator_column(Embarked),
    tf.feature_column.indicator_column(Sex),
    tf.feature_column.indicator_column(Title),

    # Let's include numerical columns as well
    tf.feature_column.numeric_column('Age'),
    tf.feature_column.numeric_column('Parch'),
    tf.feature_column.numeric_column('Pclass'),
    tf.feature_column.numeric_column('SibSp'),
    tf.feature_column.numeric_column('Family_Size')
]

In [18]:
# Create DNN estimator

estimator = tf.estimator.DNNClassifier(
    hidden_units=[256, 128, 64], # Number of units in each hidden layer
    # Here we have 3 hidden layers
    feature_columns=feature_columns,
    n_classes=2, # Since we have 2 output classes
    model_dir='./graphs/dnn' # For storing the checkpoints
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': './graphs/dnn', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [19]:
# Train the DNN estimator

#Create an instance of input train function
input_train_fn = create_train_input_fn()
# Train DNN estimator in input train function
# Here training resumes from the latest check point
# Inorder to start training from the beginning, make sure that you clear the graph/dnn directory before training
estimator.train(input_train_fn, steps=2000)

INFO:tensorflow:Calling model_fn.


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Calling checkpoint listeners before saving checkpoint 0...
INFO:tensorflow:Saving checkpoints for 0 into ./graphs/dnn\model.ckpt.
INFO:tensorflow:Calling checkpoint listeners after saving checkpoint 0...
INFO:tensorflow:loss = 0.5733564, step = 0
INFO:tensorflow:global_step/sec: 214.097
INFO:tensorflow:loss = 0.5432509, step = 100 (0.472 sec)
INFO:tensorflow:global_step/sec: 287.06
INFO:tensorflow:loss = 0.65536857, step = 200 (0.348 sec)
INF

<tensorflow_estimator.python.estimator.canned.dnn.DNNClassifierV2 at 0x1ef9b9d9508>

In [20]:
# Evaluate the model
input_test_fn = create_test_input_fn()
estimator.evaluate(input_test_fn) # This will print the test accuracy along with other performance measures

INFO:tensorflow:Calling model_fn.


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2020-07-04T20:08:21Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./graphs/dnn\model.ckpt-2000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Inference Time : 0.92568s
INFO:tensorflow:Finished evaluation at 2020-07-04-20:08:22
INFO:tensorflow:Saving dict for global step 2000: accuracy = 0.83798885, accuracy_baseline = 0.61452514, auc = 0.902635, auc_precision_recall = 0.87680256, average_loss = 0.4595023, global_step = 2000, label/mean = 0.38547486, loss = 0.4433565, precision = 0.82258064, predi

{'accuracy': 0.83798885,
 'accuracy_baseline': 0.61452514,
 'auc': 0.902635,
 'auc_precision_recall': 0.87680256,
 'average_loss': 0.4595023,
 'label/mean': 0.38547486,
 'loss': 0.4433565,
 'precision': 0.82258064,
 'prediction/mean': 0.3826476,
 'recall': 0.73913044,
 'global_step': 2000}

In [21]:
# Predict output
input_test_fn = create_test_input_fn()
result = estimator.predict(input_test_fn)

# Verify results
i = 0
for res,actual_result in zip(result, test_label):
    predicted_result = res['class_ids'][0]
    print("\n Example : {} \t Actual : {} \t Predicted : {}".format(i, actual_result, predicted_result))

    i += 1
    if i == 10:
        break

INFO:tensorflow:Calling model_fn.


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./graphs/dnn\model.ckpt-2000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.

 Example : 0 	 Actual : 1.0 	 Predicted : 1

 Example : 1 	 Actual : 1.0 	 Predicted : 1

 Example : 2 	 Actual : 1.0 	 Predicted : 0

 Example : 3 	 Actual : 0.0 	 Predicted : 0

 Example : 4 	 Actual : 0.0 	 Predicted : 0

 Example : 5 	 Actual : 1.0 	 Predicted : 1

 Example : 6 	 Actual : 1.0 	 Predicted : 1

 Example : 7 	 Actual : 1.0 	 Predicted : 1

 Example : 8 	 Actual : 0.0 	 Predicted : 0

 Example : 9 	 Actual : 0.0 	

In [22]:
# Here we can see DNN Classifier performing well over the test set than the Linear Classifier