# Classification problem: survival on the Titanic



The sigmoid function is applied element-wise to the output $(x_1,\ldots,x_n)$ of the neural network, mapping $\mathbb{R}^n \rightarrow (0,1)^n$.

## Cross-entropy loss for binary classification (where labels are either $0$ or $1$):
$\text{loss} = -y \cdot \log \hat{y} - (1-y) \cdot \log(1-\hat{y})$

where $y$ is the label and $\hat{y}$ is the prediction.

In [0]:
!pip install -U -q PyDrive

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client.
# The sign-in has to happen first; the application-default credentials read
# below are presumably the ones it provides (Colab behaviour -- confirm).
auth.authenticate_user()
gauth = GoogleAuth()
# Hand the application-default credentials to the PyDrive auth object.
gauth.credentials = GoogleCredentials.get_application_default()
# `drive` is the client the later cells use to list and download files.
drive = GoogleDrive(gauth)


In [0]:
# Set the ID of the Google Drive folder that holds train.csv and test.csv.
# The folder ID is the long string of letters and digits in the URL of the
# folder in Google Drive.
folder_query = "'1nim_rYfPJPC1kR0B2AWn2-3hFKbgBlfq' in parents and trashed=false"
file_list = drive.ListFile({'q': folder_query}).GetList()

# Print the title and ID of every file found by the query.
for entry in file_list:
  print('title: %s, id: %s' % (entry['title'], entry['id']))

title: TitanicPrediction.ipynb, id: 12FS_uTJfTzB5BDzxAQhI9IFQpUQD8TCl
title: train.csv, id: 1B0IAhU3nLrSBvUTNRjRPDHYJHvv_PeRx
title: test.csv, id: 1pVLoe3p6rra06KLbNqfKQUp0Z0ZeqPQZ


In [0]:
# Set the Drive file IDs of train.csv and test.csv here. The IDs are printed
# in the output of the cell above.
train_file_id = '1B0IAhU3nLrSBvUTNRjRPDHYJHvv_PeRx'
test_file_id = '1pVLoe3p6rra06KLbNqfKQUp0Z0ZeqPQZ'

# Create a handle for each Drive file and save its content under a local name.
train_downloaded = drive.CreateFile({'id': train_file_id})
train_downloaded.GetContentFile('train.csv')
test_downloaded = drive.CreateFile({'id': test_file_id})
test_downloaded.GetContentFile('test.csv')

In [0]:
# List the working directory, then show the first rows of train.csv and test.csv
!ls
!head train.csv
!head test.csv

In [0]:
# Import pandas, which is the library for the data structures being used
import pandas as pd

# Load both CSV files into pandas DataFrames:
# df holds the contents of train.csv, df1 the contents of test.csv.
train_csv = 'train.csv'
test_csv = 'test.csv'
df = pd.read_csv(train_csv)
df1 = pd.read_csv(test_csv)


In [0]:
# Show the (rows, columns) shape and the column names of both DataFrames.
# Each print call gets one pre-formatted argument, so the cell runs and prints
# the same text on Python 2 and Python 3 (a multi-argument print(...) would
# print a tuple on Python 2, and a bare `print x` is a SyntaxError on Python 3).
print("Train data shape: %s" % (df.shape,))
print("Test data shape: %s" % (df1.shape,))
print(list(df))
print(list(df1))

('Train data shape:', (891, 12))
('Test data shape:', (418, 11))
['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']
['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']


In [0]:
# Drop the columns that are not used as model inputs. The list is shared so
# the training and test frames always lose exactly the same columns.
# `columns=` already names the axis, so the redundant `axis=1` is omitted.
unused_columns = ['Ticket', 'Cabin', 'Name', 'Embarked']
df = df.drop(columns=unused_columns)
df1 = df1.drop(columns=unused_columns)

# Show the remaining column names (single-argument print works on both
# Python 2 and Python 3).
print(list(df))
print(list(df1))

['PassengerId', 'Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']
['PassengerId', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']


In [0]:
# Import math
import math

# How many rows we are using for training (the rest are for evaluation)
num_training = 800

# Default value substituted for a missing (NaN) entry in each numeric column.
missing_value_defaults = {
    'Age': 30.0,
    'Pclass': 2,
    'Fare': 10.0,
    'SibSp': 0.0,
    'Parch': 0.0,
}

# Clean the training frame (df) and the test frame (df1) with the same rules,
# in one loop, so the two can never be preprocessed differently.
for frame in (df, df1):
  for column, default in missing_value_defaults.items():
    frame[column] = frame[column].fillna(default)
  # Encode Sex as 0 for "female" and 1 for every other value; a missing value
  # is not equal to "female", so it is encoded as 1 as well.
  frame['Sex'] = (frame['Sex'] != 'female').astype('int64')

# Split the labelled data by row position: the first `num_training` rows are
# for training, all remaining rows are for evaluation.
train_rows = df.iloc[:num_training]
eval_rows = df.iloc[num_training:]

# Labels: the column at position 1 ('Survived'), as pandas Series
labels_df = train_rows.iloc[:, 1]
eval_labels_df = eval_rows.iloc[:, 1]

# Features: every column from position 2 onwards, as DataFrames
features_df = train_rows.iloc[:, 2:]
eval_features_df = eval_rows.iloc[:, 2:]
# The test frame has no label column, so its features start at position 1
features_df1 = df1.iloc[:, 1:]

# Labels as NumPy arrays
labels = labels_df.values
eval_labels = eval_labels_df.values

# Show the label arrays and the feature frames as a sanity check.
# Parenthesised single-argument print works on both Python 2 and Python 3.
print(labels)
print(eval_labels)

print(features_df)
print(features_df1)

In [0]:
# Import tensorflow
import tensorflow as tf

# Get the feature names: the training frame's columns from position 2 onwards
# and the test frame's columns from position 1 onwards.
headers = list(df)
headers1 = list(df1)
feature_names = headers[2:]
feature_names1 = headers1[1:]

# Print the number of labels / feature names followed by the values.
# The text is formatted first so the output is the same on Python 2 and 3.
print("%d %s" % (len(labels), labels))
print("%d %s" % (len(feature_names), feature_names))

NameError: ignored

In [0]:
!rm -r model_dir

In [0]:
!mkdir model_dir

# Create a DNNClassifier with one numeric feature column per input feature
# (one column for every name in feature_names).
# tf.feature_column.numeric_column is the feature-column API that the
# tf.estimator models are documented to take; it replaces the older
# tf.contrib.layers.real_valued_column.
features = [tf.feature_column.numeric_column(f) for f in feature_names]
# There is a single hidden layer with 16 neurons
# The number of classes is 2 (the label is either 0 or 1)
classifier = tf.estimator.DNNClassifier(feature_columns=features,
                                        hidden_units=[16],
                                        n_classes=2,
                                        model_dir='model_dir')


In [0]:
# How many examples are being processed at a time
batch_size = 100
# How many train-then-evaluate rounds to run, and training steps per round
num_rounds = 200
steps_per_round = 200


# The input functions do not depend on the loop variable, so they are defined
# once here instead of being re-created on every round.
def train_input_fn():
  """Return the training data as a shuffled, endlessly repeating, batched dataset."""
  dataset = tf.data.Dataset.from_tensor_slices((dict(features_df), labels))
  return dataset.shuffle(batch_size*7).repeat().batch(batch_size)


def eval_input_fn():
  """Return one unshuffled pass over the evaluation rows, in batches.

  The dataset is deliberately neither shuffled nor repeated: drawing a
  repeated batch that is larger than the evaluation set would count some
  rows twice and make the metrics change from run to run.
  """
  dataset = tf.data.Dataset.from_tensor_slices((dict(eval_features_df), eval_labels))
  return dataset.batch(batch_size)


for i in range(num_rounds):
  # Train for `steps_per_round` steps each time
  classifier.train(input_fn=train_input_fn, steps=steps_per_round)
  # No `steps` argument: evaluate until eval_input_fn is exhausted, i.e. on
  # every evaluation row exactly once.
  evaluation = classifier.evaluate(input_fn=eval_input_fn)
  # Keep the latest step count and loss available for inspection
  num_training_steps = evaluation.get('global_step', '?')
  loss = evaluation.get('loss', '?')

In [0]:
# Input function used when predicting on the test set
def eval_input_fn1():
  """Return the test-set features as a dataset made of one single batch.

  No labels are attached: the dataset is built from the feature columns of
  features_df1 only, and the batch size equals the number of rows.
  """
  test_features = dict(features_df1)
  row_count = len(features_df1)
  return tf.data.Dataset.from_tensor_slices(test_features).batch(row_count)

In [0]:
# Ask the classifier for a prediction for every row of the test set.
predictions = classifier.predict(
    input_fn=eval_input_fn1)

# Build the submission as a list of CSV lines: a header line followed by one
# "<PassengerId>,<predicted class>" line per prediction.
submission = ["PassengerId,Survived"]

# The column at position 0 of df1 holds the passenger IDs. Pairing the IDs
# with the predictions replaces the hand-maintained row counter.
passenger_ids = df1.iloc[:, 0]
for pid, pred_dict in zip(passenger_ids, predictions):
  class_id = pred_dict['class_ids'][0]
  submission.append("%s,%s" % (pid, class_id))

# Single-argument print works on both Python 2 and Python 3.
print(submission)