# Section 3-1 - Deep Learning

For detailed steps on extracting and cleaning data, please review Sections 1-0 to 1-2.

## Pandas - Extracting data

In [1]:
import pandas as pd
import numpy as np

# Load the training set into a DataFrame; the path is relative to the notebook.
train_csv_path = '../data/train.csv'
df = pd.read_csv(train_csv_path)

## Pandas - Cleaning data

In [2]:
# These three columns are not used as model inputs, so remove them up front.
unused_columns = ['Name', 'Ticket', 'Cabin']
df = df.drop(unused_columns, axis=1)

# Replace missing ages with the mean age. age_mean is kept as its own variable
# because the test-set cell reuses it to impute ages the same way.
age_mean = df['Age'].mean()
df['Age'] = df['Age'].fillna(age_mean)

from scipy.stats import mode

# Fill missing ports of embarkation with the most frequent port.
# Series.mode() skips missing values and works on string columns, whereas
# scipy.stats.mode targets numeric arrays and rejects string/object data
# in current SciPy releases.
mode_embarked = df['Embarked'].mode()[0]
df['Embarked'] = df['Embarked'].fillna(mode_embarked)

# Encode sex as an integer column: 0 for female, 1 for male.
df['Gender'] = df['Sex'].map({'female': 0, 'male': 1}).astype(int)

# One-hot encode the port of embarkation into Embarked_* indicator columns
# and append them to the frame.
embarked_dummies = pd.get_dummies(df['Embarked'], prefix='Embarked')
df = pd.concat([df, embarked_dummies], axis=1)

# The original string columns are redundant once encoded.
df = df.drop(['Sex', 'Embarked'], axis=1)

# Swap the first two columns so the column originally at index 1 comes first.
# The training cells use column 0 as the target and skip column 1.
# NOTE(review): presumably index 1 is 'Survived' and index 0 is 'PassengerId'
# in train.csv -- confirm against the file.
cols = df.columns.tolist()
cols = [cols[1]] + cols[0:1] + cols[2:]

df = df[cols]

train_data = df.values

  flag = np.concatenate(([True], aux[1:] != aux[:-1]))


## TensorFlow - Training the model

In [3]:
import skflow

# Three hidden layers (10, 20 and 10 units), two output classes, 200 training steps.
model = skflow.TensorFlowDNNClassifier(
    hidden_units=[10, 20, 10],
    n_classes=2,
    steps=200,
)

# Fit on the first 800 rows: inputs are columns 2 onward, the target is column 0.
train_features = train_data[:800, 2:]
train_labels = train_data[:800, 0]
model.fit(train_features, train_labels)

Step #1, avg. loss: 0.79639
Step #21, avg. loss: 1.40718
Step #41, avg. loss: 0.64990
Step #61, avg. loss: 0.64236
Step #81, avg. loss: 0.63074
Step #101, avg. loss: 0.62585
Step #121, avg. loss: 0.61650
Step #141, avg. loss: 0.62555
Step #161, avg. loss: 0.64157
Step #181, avg. loss: 0.61136


TensorFlowDNNClassifier(batch_size=32, continue_training=False,
            hidden_units=None, learning_rate=0.1, n_classes=2,
            optimizer='SGD', steps=200, tf_master='', tf_random_seed=42)

## TensorFlow - Making predictions

In [4]:
# Score the fitted model on the rows from index 800 onward, which the
# preceding fit call (rows 0-799) did not see.
y_test = train_data[800:, 0]
y_prediction = model.predict(train_data[800:, 2:])

# Fraction of these rows whose predicted label equals the true label.
accuracy = np.sum(y_test == y_prediction) * 1. / len(y_test)
# Single-argument print(...) emits the same text under Python 2 and Python 3;
# the bare print statement is a SyntaxError on Python 3.
print("prediction accuracy: %s" % accuracy)

prediction accuracy: 0.725274725275


In [8]:
# Experiment: same network as before, trained for 1000 steps.
model = skflow.TensorFlowDNNClassifier(
    hidden_units=[10, 20, 10],
    n_classes=2,
    steps=1000,
)
# Fit on the first 800 rows: inputs are columns 2 onward, the target is column 0.
model.fit(train_data[:800, 2:], train_data[:800, 0])

# Evaluate on the remaining rows, which were not used for fitting.
y_test = train_data[800:, 0]
y_prediction = model.predict(train_data[800:, 2:])

# Fraction of evaluation rows predicted correctly.
accuracy = np.sum(y_test == y_prediction) * 1. / len(y_test)
# Single-argument print(...) emits the same text under Python 2 and Python 3;
# the bare print statement is a SyntaxError on Python 3.
print("prediction accuracy: %s" % accuracy)

Step #1, avg. loss: 2.40378
Step #101, avg. loss: 0.77554
Step #201, avg. loss: 0.66494
Step #301, avg. loss: 0.66361
Step #401, avg. loss: 0.66820
Step #501, avg. loss: 0.66479
Step #601, avg. loss: 0.66905
Step #701, avg. loss: 0.67118
Step #801, avg. loss: 0.67094
Step #901, avg. loss: 0.66608
prediction accuracy: 0.626373626374


In [9]:
# Experiment: 1000 steps with the learning rate set to 0.05.
model = skflow.TensorFlowDNNClassifier(
    hidden_units=[10, 20, 10],
    n_classes=2,
    steps=1000,
    learning_rate=0.05,
)
# Fit on the first 800 rows: inputs are columns 2 onward, the target is column 0.
model.fit(train_data[:800, 2:], train_data[:800, 0])

# Evaluate on the remaining rows, which were not used for fitting.
y_test = train_data[800:, 0]
y_prediction = model.predict(train_data[800:, 2:])

# Fraction of evaluation rows predicted correctly.
accuracy = np.sum(y_test == y_prediction) * 1. / len(y_test)
# Single-argument print(...) emits the same text under Python 2 and Python 3;
# the bare print statement is a SyntaxError on Python 3.
print("prediction accuracy: %s" % accuracy)

Step #1, avg. loss: 1.27684
Step #101, avg. loss: 0.78364
Step #201, avg. loss: 0.61797
Step #301, avg. loss: 0.61116
Step #401, avg. loss: 0.61044
Step #501, avg. loss: 0.60140
Step #601, avg. loss: 0.60690
Step #701, avg. loss: 0.59867
Step #801, avg. loss: 0.60085
Step #901, avg. loss: 0.58348
prediction accuracy: 0.791208791209


In [10]:
# Experiment: learning rate 0.05 with the batch size set to 128.
model = skflow.TensorFlowDNNClassifier(
    hidden_units=[10, 20, 10],
    n_classes=2,
    steps=1000,
    learning_rate=0.05,
    batch_size=128,
)
# Fit on the first 800 rows: inputs are columns 2 onward, the target is column 0.
model.fit(train_data[:800, 2:], train_data[:800, 0])

# Evaluate on the remaining rows, which were not used for fitting.
y_test = train_data[800:, 0]
y_prediction = model.predict(train_data[800:, 2:])

# Fraction of evaluation rows predicted correctly.
accuracy = np.sum(y_test == y_prediction) * 1. / len(y_test)
# Single-argument print(...) emits the same text under Python 2 and Python 3;
# the bare print statement is a SyntaxError on Python 3.
print("prediction accuracy: %s" % accuracy)

Step #1, avg. loss: 1.08632
Step #101, avg. loss: 0.75644
Step #201, avg. loss: 0.60330
Step #301, avg. loss: 0.59194
Step #401, avg. loss: 0.59376
Step #501, avg. loss: 0.57827
Step #601, avg. loss: 0.58494
Step #701, avg. loss: 0.57669
Step #801, avg. loss: 0.56969
Step #901, avg. loss: 0.57965
prediction accuracy: 0.813186813187


In [11]:
# Experiment: learning rate 0.05 with the batch size set to 256.
model = skflow.TensorFlowDNNClassifier(
    hidden_units=[10, 20, 10],
    n_classes=2,
    steps=1000,
    learning_rate=0.05,
    batch_size=256,
)
# Fit on the first 800 rows: inputs are columns 2 onward, the target is column 0.
model.fit(train_data[:800, 2:], train_data[:800, 0])

# Evaluate on the remaining rows, which were not used for fitting.
y_test = train_data[800:, 0]
y_prediction = model.predict(train_data[800:, 2:])

# Fraction of evaluation rows predicted correctly.
accuracy = np.sum(y_test == y_prediction) * 1. / len(y_test)
# Single-argument print(...) emits the same text under Python 2 and Python 3;
# the bare print statement is a SyntaxError on Python 3.
print("prediction accuracy: %s" % accuracy)

Step #1, avg. loss: 1.00320
Step #101, avg. loss: 0.74897
Step #201, avg. loss: 0.60120
Step #301, avg. loss: 0.59732
Step #401, avg. loss: 0.58784
Step #501, avg. loss: 0.57602
Step #601, avg. loss: 0.58052
Step #701, avg. loss: 0.57117
Step #801, avg. loss: 0.56389
Step #901, avg. loss: 0.56997
prediction accuracy: 0.813186813187


In [12]:
# Experiment: 10000 steps, learning rate 0.001, batch size 512.
model = skflow.TensorFlowDNNClassifier(
    hidden_units=[10, 20, 10],
    n_classes=2,
    steps=10000,
    learning_rate=0.001,
    batch_size=512,
)
# Fit on the first 800 rows: inputs are columns 2 onward, the target is column 0.
model.fit(train_data[:800, 2:], train_data[:800, 0])

# Evaluate on the remaining rows, which were not used for fitting.
y_test = train_data[800:, 0]
y_prediction = model.predict(train_data[800:, 2:])

# Fraction of evaluation rows predicted correctly.
accuracy = np.sum(y_test == y_prediction) * 1. / len(y_test)
# Single-argument print(...) emits the same text under Python 2 and Python 3;
# the bare print statement is a SyntaxError on Python 3.
print("prediction accuracy: %s" % accuracy)

Step #1, avg. loss: 0.83791
Step #1001, avg. loss: 0.62424
Step #2001, avg. loss: 0.59769
Step #3001, avg. loss: 0.58587
Step #4001, avg. loss: 0.57555
Step #5001, avg. loss: 0.56656
Step #6001, avg. loss: 0.55524
Step #7001, avg. loss: 0.54483
Step #8001, avg. loss: 0.53159
Step #9001, avg. loss: 0.51746
prediction accuracy: 0.846153846154


In [15]:
# Experiment: repeat of the 10000-step, learning rate 0.001, batch size 512 run.
model = skflow.TensorFlowDNNClassifier(
    hidden_units=[10, 20, 10],
    n_classes=2,
    steps=10000,
    learning_rate=0.001,
    batch_size=512,
)
# Fit on the first 800 rows: inputs are columns 2 onward, the target is column 0.
model.fit(train_data[:800, 2:], train_data[:800, 0])

# Evaluate on the remaining rows, which were not used for fitting.
y_test = train_data[800:, 0]
y_prediction = model.predict(train_data[800:, 2:])

# Fraction of evaluation rows predicted correctly.
accuracy = np.sum(y_test == y_prediction) * 1. / len(y_test)
# Single-argument print(...) emits the same text under Python 2 and Python 3;
# the bare print statement is a SyntaxError on Python 3.
print("prediction accuracy: %s" % accuracy)

Step #1, avg. loss: 1.07668
Step #1001, avg. loss: 0.62318
Step #2001, avg. loss: 0.59618
Step #3001, avg. loss: 0.58308
Step #4001, avg. loss: 0.57318
Step #5001, avg. loss: 0.56236
Step #6001, avg. loss: 0.55232
Step #7001, avg. loss: 0.54212
Step #8001, avg. loss: 0.52699
Step #9001, avg. loss: 0.51558
prediction accuracy: 0.835164835165


In [19]:
# Experiment: 10000 steps, learning rate 0.001, batch size raised to 1024.
model = skflow.TensorFlowDNNClassifier(
    hidden_units=[10, 20, 10],
    n_classes=2,
    steps=10000,
    learning_rate=0.001,
    batch_size=1024,
)
# Fit on the first 800 rows: inputs are columns 2 onward, the target is column 0.
model.fit(train_data[:800, 2:], train_data[:800, 0])

# Evaluate on the remaining rows, which were not used for fitting.
y_test = train_data[800:, 0]
y_prediction = model.predict(train_data[800:, 2:])

# Fraction of evaluation rows predicted correctly.
accuracy = np.sum(y_test == y_prediction) * 1. / len(y_test)
# Single-argument print(...) emits the same text under Python 2 and Python 3;
# the bare print statement is a SyntaxError on Python 3.
print("prediction accuracy: %s" % accuracy)

Step #1, avg. loss: 0.94292
Step #1001, avg. loss: 0.62205
Step #2001, avg. loss: 0.59644
Step #3001, avg. loss: 0.58520
Step #4001, avg. loss: 0.57487
Step #5001, avg. loss: 0.56464
Step #6001, avg. loss: 0.55412
Step #7001, avg. loss: 0.54279
Step #8001, avg. loss: 0.52879
Step #9001, avg. loss: 0.51528
prediction accuracy: 0.868131868132


In [20]:
# Experiment: wider hidden layers (20, 40 and 20 units) with the batch size 1024 settings.
model = skflow.TensorFlowDNNClassifier(
    hidden_units=[20, 40, 20],
    n_classes=2,
    steps=10000,
    learning_rate=0.001,
    batch_size=1024,
)
# Fit on the first 800 rows: inputs are columns 2 onward, the target is column 0.
model.fit(train_data[:800, 2:], train_data[:800, 0])

# Evaluate on the remaining rows, which were not used for fitting.
y_test = train_data[800:, 0]
y_prediction = model.predict(train_data[800:, 2:])

# Fraction of evaluation rows predicted correctly.
accuracy = np.sum(y_test == y_prediction) * 1. / len(y_test)
# Single-argument print(...) emits the same text under Python 2 and Python 3;
# the bare print statement is a SyntaxError on Python 3.
print("prediction accuracy: %s" % accuracy)

Step #1, avg. loss: 11.46145
Step #1001, avg. loss: 0.98286
Step #2001, avg. loss: 0.57078
Step #3001, avg. loss: 0.54324
Step #4001, avg. loss: 0.52076
Step #5001, avg. loss: 0.49876
Step #6001, avg. loss: 0.48330
Step #7001, avg. loss: 0.47172
Step #8001, avg. loss: 0.46327
Step #9001, avg. loss: 0.45581
prediction accuracy: 0.846153846154


In [21]:
# Final model: the (20, 40, 20) network, this time fitted on every row of
# train_data rather than only the first 800.
model = skflow.TensorFlowDNNClassifier(
    hidden_units=[20, 40, 20],
    n_classes=2,
    steps=10000,
    learning_rate=0.001,
    batch_size=1024,
)

# Inputs are columns 2 onward; the target is column 0.
all_features = train_data[:, 2:]
all_labels = train_data[:, 0]
model.fit(all_features, all_labels)

Step #1, avg. loss: 10.09338
Step #1001, avg. loss: 0.97243
Step #2001, avg. loss: 0.56455
Step #3001, avg. loss: 0.53610
Step #4001, avg. loss: 0.51265
Step #5001, avg. loss: 0.48926
Step #6001, avg. loss: 0.47181
Step #7001, avg. loss: 0.45922
Step #8001, avg. loss: 0.45077
Step #9001, avg. loss: 0.44543


TensorFlowDNNClassifier(batch_size=1024, continue_training=False,
            hidden_units=None, learning_rate=0.001, n_classes=2,
            optimizer='SGD', steps=10000, tf_master='', tf_random_seed=42)

In [22]:
# Load the test set and apply the same cleaning steps used on the training set.
df_test = pd.read_csv('../data/test.csv')

df_test = df_test.drop(['Name', 'Ticket', 'Cabin'], axis=1)

# Impute missing ages with the mean age computed from the training set.
df_test['Age'] = df_test['Age'].fillna(age_mean)

# Mean training-set fare per passenger class, as a Series indexed by Pclass.
# groupby(...).mean() is used instead of pivot_table because pivot_table
# returns a DataFrame in current pandas, where fare_means[pclass] would be
# treated as a column lookup and raise KeyError.
fare_means = df.groupby('Pclass')['Fare'].mean()
# Fill each missing fare with the mean fare of that passenger's class.
df_test['Fare'] = df_test['Fare'].fillna(df_test['Pclass'].map(fare_means))

# Encode sex as an integer column: 0 for female, 1 for male.
df_test['Gender'] = df_test['Sex'].map({'female': 0, 'male': 1}).astype(int)
# One-hot encode the port of embarkation into Embarked_* indicator columns.
# NOTE(review): assumes the test set yields the same Embarked_* columns, in the
# same order, as the training set -- confirm.
df_test = pd.concat([df_test, pd.get_dummies(df_test['Embarked'], prefix='Embarked')],
                axis=1)

df_test = df_test.drop(['Sex', 'Embarked'], axis=1)

test_data = df_test.values

# Column 0 is excluded from the model inputs; the next cell writes it out
# as PassengerId.
output = model.predict(test_data[:,1:])

In [23]:
# Pair each test row's identifier (column 0 of test_data) with its predicted
# label, both cast to integers, as a two-column array.
passenger_ids = test_data[:, 0].astype(int)
predicted_labels = output.astype(int)
result = np.c_[passenger_ids, predicted_labels]

# Write the two columns to CSV with a header row and without the DataFrame index.
submission_columns = ['PassengerId', 'Survived']
df_result = pd.DataFrame(result[:, :2], columns=submission_columns)
df_result.to_csv('../results/titanic_3-1.csv', index=False)

## Appendix: Installation

For Mac:

For Ubuntu: