# Titanic - Deep learning

2019-1216 aanpassingen voor tensorflow 2

In [1]:
# I case you are using a older version of Sklearn warnings will be generated
# Let's turn this off to keep the code clean

import warnings
warnings.filterwarnings("ignore")

In [2]:
# importeer pandas, 
# laadt de dataset 'train.csv' van folder 'datasets' in.
# print de eerste 5 rows van het ingelezen dataframe op het scherm

In [3]:
import pandas as pd
train_df = pd.read_csv('./datasets/train.csv')
train_df.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# X : heeft elke column/feature, behalve 'Survived'.
# y : heeft alleen de 'Surviced' kolom/target
# print de eerste 5 rows van X en y op het scherm

In [5]:
X = train_df[[column for column in train_df.columns if not column == 'Survived']]
y = train_df['Survived']

print(X.head(5))
print('-'*40)
print(y.head(5))

   PassengerId  Pclass                                               Name  \
0            1       3                            Braund, Mr. Owen Harris   
1            2       1  Cumings, Mrs. John Bradley (Florence Briggs Th...   
2            3       3                             Heikkinen, Miss. Laina   
3            4       1       Futrelle, Mrs. Jacques Heath (Lily May Peel)   
4            5       3                           Allen, Mr. William Henry   

      Sex   Age  SibSp  Parch            Ticket     Fare Cabin Embarked  
0    male  22.0      1      0         A/5 21171   7.2500   NaN        S  
1  female  38.0      1      0          PC 17599  71.2833   C85        C  
2  female  26.0      0      0  STON/O2. 3101282   7.9250   NaN        S  
3  female  35.0      1      0            113803  53.1000  C123        S  
4    male  35.0      0      0            373450   8.0500   NaN        S  
----------------------------------------
0    0
1    1
2    1
3    1
4    0
Name: Survived, d

In [6]:
# opschonen X: verwijder PassengerId, Name en Ticket
# print wederom de eerste 5 regels van X op het scherm

In [7]:
del X['PassengerId']
del X['Name']
del X['Ticket']

X.head(5)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,3,male,22.0,1,0,7.25,,S
1,1,female,38.0,1,0,71.2833,C85,C
2,3,female,26.0,0,0,7.925,,S
3,1,female,35.0,1,0,53.1,C123,S
4,3,male,35.0,0,0,8.05,,S


In [8]:
# we gaan verwijder 'lege' waarden.

# De sklearn module imputer vervangt lege waarden.
# We importeren ook numpy omdat lege waarden standaard worden weergegeven als een numpy.nan-object

In [9]:
import numpy as np
from sklearn.impute import SimpleImputer

In [10]:
# We vervangen alle lege waarden (numpy.nan objecten) in de leeftijdskolom door de mediaan

In [11]:
age_imputer = SimpleImputer(missing_values=np.nan, strategy='median')
X['Age'] = age_imputer.fit_transform(X[['Age']])

In [12]:
# We vervangen alle lege waarden (numpy.nan objecten) in de cabinekolom door waarde Unknown

In [13]:
cabin_imputer = SimpleImputer(missing_values=np.nan, fill_value='Unknown', strategy='constant')
X['Cabin'] = cabin_imputer.fit_transform(X[['Cabin']])

In [14]:
# We vervangen alle lege waarden (numpy.nan objecten) in de ingescheepte kolom door de meest voorkomende waarde

In [15]:
embarked_imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
X['Embarked'] = cabin_imputer.fit_transform(X[['Embarked']])

In [16]:
# print de eerste 5 rows van X op het scherm ter controlle.

In [17]:
X.head(5)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,3,male,22.0,1,0,7.25,Unknown,S
1,1,female,38.0,1,0,71.2833,C85,C
2,3,female,26.0,0,0,7.925,Unknown,S
3,1,female,35.0,1,0,53.1,C123,S
4,3,male,35.0,0,0,8.05,Unknown,S


In [18]:
# Laten we alle categorische waarden coderen.
# Zodat het machine learning model de categorische waarden begrijpt.
# We gebruiken onehot encoding:
# https://hackernoon.com/what-is-one-hot-encoding-why-and-when-do-you-have-to-use-it-e3c6186d008f


# custom function encode() dat het leven makkelijker maakt. 
# (Je kunt de syntax vergeten)

In [19]:
from sklearn.preprocessing import OneHotEncoder

In [20]:
def encode(dataframe, column):
    encoder = OneHotEncoder()
    encoded_array = encoder.fit_transform(dataframe[[column]]).toarray()
    
    for index in range(encoded_array.shape[1]):
        dataframe[column + str(index)] = encoded_array[:, index]
    
    del dataframe[column]    
    return dataframe

In [21]:
# encode() de categorie Cabin, Sex en Embarked
# print de eerste 5 regels van X op het scherm ter controlle

In [22]:
X2 = encode(X, 'Cabin')
X3 = encode(X, 'Sex')
X4 = encode(X, 'Embarked')

X.head(5)

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Cabin0,Cabin1,Cabin2,Cabin3,Cabin4,...,Cabin144,Cabin145,Cabin146,Cabin147,Sex0,Sex1,Embarked0,Embarked1,Embarked2,Embarked3
0,3,22.0,1,0,7.25,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
1,1,38.0,1,0,71.2833,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
2,3,26.0,0,0,7.925,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
3,1,35.0,1,0,53.1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,3,35.0,0,0,8.05,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0


In [23]:
# maak de train en test set aan

In [24]:
from sklearn.model_selection import train_test_split

In [25]:
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2)

In [26]:
# Tensorflow2 wertk met numpy-arrays en (nog) niet met pandas dataframs.
# converteer dataframes naar numpy-arrays

In [27]:
X_train= np.array(train_X)
y_train= np.array(train_y)
X_test= np.array(test_X)
y_test = np.array(test_y)

In [28]:
# Deeeeeep learning time!
# Import tensorflow module en print de versie (moet 2.0.0 zijn)

In [29]:
import tensorflow as tf
print("Tensorflow version: {}".format(tf.__version__))

Tensorflow version: 2.0.0


In [30]:
# Definieer deep learning model Sequential
# en specificeer de deep-learning layers

In [31]:
model = tf.keras.models.Sequential()

# Argument 1 is the amount if neurons.
# Activation: is the formula that gets applied at the end of a neuron
# Input_dim: amount of features. In this case the amount of columns
print(X.columns)

model.add(tf.keras.layers.Dense(32, activation='relu', input_dim=len(X.columns))) # Input layer

model.add(tf.keras.layers.Dense(30, activation='relu')) # Hidden layer
model.add(tf.keras.layers.Dense(30, activation='relu')) # Hidden layer

model.add(tf.keras.layers.Dense(1, activation='sigmoid')) # Output layer -> accuracy=.78
# model.add(tf.keras.layers.Dense(1, activation='softmax')) # Output layer -> accuracy=.39

Index(['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin0', 'Cabin1', 'Cabin2',
       'Cabin3', 'Cabin4',
       ...
       'Cabin144', 'Cabin145', 'Cabin146', 'Cabin147', 'Sex0', 'Sex1',
       'Embarked0', 'Embarked1', 'Embarked2', 'Embarked3'],
      dtype='object', length=159)


In [32]:
# Compileer de model-lagen

In [33]:
model.compile(
              optimizer='adam', # The optimize algorithm
              loss='binary_crossentropy', # Because of our predict a binary (1=survived, 0=died)
              metrics=['accuracy'] # Aiming for the best accuracy
        )

In [34]:
# Train het model
# Epochs is de hoeveel keer het algoritme zichzelf opnieuw zal trainen

In [35]:
model.fit(X_train, y_train, epochs=25)

Train on 712 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<tensorflow.python.keras.callbacks.History at 0x1a4b25af90>

In [36]:
# Geef de nauwkeurigheid van het getrainde model.

In [37]:
from sklearn.metrics import accuracy_score

In [38]:
y_pred = model.predict(X_test)
accuracy_score(y_test, y_pred.round())

0.770949720670391