In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score

"""
TODO: try StandardScaler as described in https://github.com/savarin/neural-networks/blob/master/1-1_Basic_NN-Titanic.ipynb
Usage: 
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
features = ['Class', 'Sex', 'Age', 'Fare']
X_train = scaler.fit_transform(df_train[features].values)
"""

"\nTODO: try StandardScaler as described in https://github.com/savarin/neural-networks/blob/master/1-1_Basic_NN-Titanic.ipynb\nUsage: \nfrom sklearn.preprocessing import StandardScaler\n\nscaler = StandardScaler()\nfeatures = ['Class', 'Sex', 'Age', 'Fare']\nX_train = scaler.fit_transform(df_train[features].values)\n"

In [37]:
"""
Load data
"""
# Read the Titanic train and test CSVs (paths are relative to the working directory).
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('data/titanic/train.csv', 'data/titanic/test.csv')
)
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [45]:
# Training rows whose recorded fare is exactly zero.
# (Alternative check kept for reference: df_train.isnull().sum())
df_train.loc[df_train['Fare'].eq(0.0)]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
179,180,0,3,"Leonard, Mr. Lionel",male,36.0,0,0,LINE,0.0,,S
263,264,0,1,"Harrison, Mr. William",male,40.0,0,0,112059,0.0,B94,S
271,272,1,3,"Tornquist, Mr. William Henry",male,25.0,0,0,LINE,0.0,,S
277,278,0,2,"Parkes, Mr. Francis ""Frank""",male,,0,0,239853,0.0,,S
302,303,0,3,"Johnson, Mr. William Cahoone Jr",male,19.0,0,0,LINE,0.0,,S
413,414,0,2,"Cunningham, Mr. Alfred Fleming",male,,0,0,239853,0.0,,S
466,467,0,2,"Campbell, Mr. William",male,,0,0,239853,0.0,,S
481,482,0,2,"Frost, Mr. Anthony Wood ""Archie""",male,,0,0,239854,0.0,,S
597,598,0,3,"Johnson, Mr. Alfred",male,49.0,0,0,LINE,0.0,,S
633,634,0,1,"Parr, Mr. William Henry Marsh",male,,0,0,112052,0.0,,S


In [3]:
"""
Missing values
"""
# Training frame: remove Name, Ticket and Cabin, then discard every row that
# still contains a missing value. errors='ignore' keeps the cell re-runnable:
# df_train is reassigned here, so on a second run the columns are already gone.
df_train = df_train.drop(['Name', 'Ticket', 'Cabin'], axis=1, errors='ignore').dropna()

# Test frame: rows are kept and gaps are forward-filled from the previous row.
# DataFrame.ffill() replaces fillna(method='ffill'), whose `method` argument is
# deprecated in recent pandas releases.
# NOTE: a missing value in the very first row has nothing to fill from and stays NaN.
df_test = df_test.ffill()

df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,male,22.0,1,0,7.25,S
1,2,1,1,female,38.0,1,0,71.2833,C
2,3,1,3,female,26.0,0,0,7.925,S
3,4,1,1,female,35.0,1,0,53.1,S
4,5,0,3,male,35.0,0,0,8.05,S


In [4]:
"""
Inspect
"""
# Non-null entries per column; printed explicitly because only the last
# expression of a cell is rendered automatically.
print(df_train.notna().sum())
# NOTE(review): this summary is computed but not displayed, since it is not
# the cell's last expression.
df_train.describe()
df_train

PassengerId    712
Survived       712
Pclass         712
Sex            712
Age            712
SibSp          712
Parch          712
Fare           712
Embarked       712
dtype: int64


Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,male,22.0,1,0,7.2500,S
1,2,1,1,female,38.0,1,0,71.2833,C
2,3,1,3,female,26.0,0,0,7.9250,S
3,4,1,1,female,35.0,1,0,53.1000,S
4,5,0,3,male,35.0,0,0,8.0500,S
6,7,0,1,male,54.0,0,0,51.8625,S
7,8,0,3,male,2.0,3,1,21.0750,S
8,9,1,3,female,27.0,0,2,11.1333,S
9,10,1,2,female,14.0,1,0,30.0708,C
10,11,1,3,female,4.0,1,1,16.7000,S


In [5]:
def to_categorical(y, num_classes=None):
    """One-hot encode a vector of integer class labels.

    # Arguments
        y: class labels; converted to an integer array and flattened, so any
            array-like shape is accepted. Labels are expected to lie in
            0 .. num_classes - 1.
        num_classes: width of the resulting matrix. When omitted (or falsy)
            it is inferred as the largest label plus one.
    # Returns
        A float array of shape (len(labels), num_classes) with a single 1
        per row, placed in the column given by that row's label.
    """
    labels = np.array(y, dtype='int').ravel()
    if not num_classes:
        # Infer the width from the data: labels are zero-based.
        num_classes = np.max(labels) + 1
    row_count = labels.shape[0]
    one_hot = np.zeros((row_count, num_classes))
    # Fancy indexing sets exactly one cell per row: (row i, column labels[i]).
    row_index = np.arange(row_count)
    one_hot[row_index, labels] = 1
    return one_hot

In [6]:
"""
Feature engineering
"""
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
#from keras.utils import to_categorical

le = LabelEncoder()
mms = MinMaxScaler()


def _one_hot_train_test(column):
    """One-hot encode `column` of df_train and df_test with one shared mapping.

    The label encoder is fitted on the training column only and then reused
    for the test column, and both matrices are given one column per training
    class. A column index therefore denotes the same category in both
    matrices, and both always have the same width.

    # Arguments
        column: name of a column present in both df_train and df_test.
    # Returns
        Tuple (train_matrix, test_matrix) of one-hot float arrays.
    # Raises
        ValueError: from LabelEncoder.transform, if the test column contains
            a label that never occurs in the training column.
    """
    le.fit(df_train[column])
    width = len(le.classes_)
    encoded_train = to_categorical(le.transform(df_train[column]), num_classes=width)
    encoded_test = to_categorical(le.transform(df_test[column]), num_classes=width)
    return encoded_train, encoded_test


# Embarked
embarked_train, embarked_test = _one_hot_train_test('Embarked')

# Fare (prepared here, but not included in the feature matrices below)
fare_train = df_train['Fare'].values.reshape(-1, 1)
fare_test = df_test['Fare'].values.reshape(-1, 1)

# Siblings
siblings_train = df_train['SibSp'].values.reshape(-1, 1)
siblings_test = df_test['SibSp'].values.reshape(-1, 1)

# Parents
parents_train = df_train['Parch'].values.reshape(-1, 1)
parents_test = df_test['Parch'].values.reshape(-1, 1)

# Sex
sex_train, sex_test = _one_hot_train_test('Sex')

# Pclass
pclass_train, pclass_test = _one_hot_train_test('Pclass')

# Age
age_train = df_train['Age'].values.reshape(-1, 1)
age_test  = df_test['Age'].values.reshape(-1, 1)

# X and Y: the first 75% of the rows (in file order) train the models, the
# remainder is held out for validation.
num = len(df_train)
split = int(num * .75)

x = np.hstack([pclass_train, sex_train, age_train, siblings_train, parents_train, embarked_train])
# NOTE: the scaler is fitted on all labelled rows, i.e. before the split below.
x = mms.fit_transform(x)
x_train, x_validate = x[:split], x[split:]

y = df_train['Survived'].values
y_train, y_validate = y[:split], y[split:]

# X test: same column order as x, scaled with the min/max learned from the
# labelled data. Re-fitting the scaler on the test matrix would map identical
# raw values to different scaled values than the ones the models were trained on.
x_test = np.hstack([pclass_test,  sex_test,  age_test, siblings_test, parents_test, embarked_test])
x_test = mms.transform(x_test)

pd.DataFrame(x_train).head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,0.0,0.0,1.0,0.0,1.0,0.271174,0.2,0.0,0.0,0.0,1.0
1,1.0,0.0,0.0,1.0,0.0,0.472229,0.2,0.0,1.0,0.0,0.0
2,0.0,0.0,1.0,1.0,0.0,0.321438,0.0,0.0,0.0,0.0,1.0
3,1.0,0.0,0.0,1.0,0.0,0.434531,0.2,0.0,0.0,0.0,1.0
4,0.0,0.0,1.0,0.0,1.0,0.434531,0.0,0.0,0.0,0.0,1.0


In [17]:
# Show only the first row of df_train.
df_train.iloc[:1]

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,male,22.0,1,0,7.25,S


In [7]:
"""
Logistic regression
"""
from sklearn.linear_model import LogisticRegression

# Default hyper-parameters; fit() returns the estimator itself, so it can be chained.
logreg = LogisticRegression().fit(x_train, y_train)

# F1 on the held-out validation rows.
f1_score(y_validate, logreg.predict(x_validate))


0.76335877862595414

In [12]:
"""
AdaBoost
"""
from sklearn.ensemble import AdaBoostClassifier

# 100 boosting rounds. The seed is pinned (as the Random Forest cell already
# does) so that repeated runs of the notebook fit the same ensemble; this is
# the model used for the submission predictions.
ada = AdaBoostClassifier(n_estimators=100, random_state=0)
ada.fit(x_train, y_train)

# F1 on the held-out validation rows.
f1_score(y_validate, ada.predict(x_validate))

0.81379310344827571

In [36]:
# Hand-built passenger in the same 11-column layout as x_train, fed to the
# AdaBoost model as a single-row matrix.
mig = np.array([[
    1., 0., 0.,  # Pclass one-hot: 1st, 2nd, 3rd class
    0., 1.,      # Sex one-hot: female, male (LabelEncoder sorts the labels)
    .7,          # age, on the min-max scaled range
    0.,          # number of siblings (SibSp)
    0.,          # number of children/parents (Parch)
    0., 1., 0.   # Embarked one-hot: port 1, port 2, port 3
]])
ada.predict(mig)

array([0])

In [26]:
"""
Support Vector Machine
"""
from sklearn.svm import SVC

# RBF-kernel support vector classifier, otherwise default settings.
svc = SVC(kernel='rbf').fit(x_train, y_train)

# F1 on the held-out validation rows.
f1_score(y_validate, svc.predict(x_validate))

0.71755725190839692

In [11]:
"""
Nearest Neighbors
"""
from sklearn.neighbors import KNeighborsClassifier

# 5 equally weighted neighbours, found with a ball tree under the Minkowski
# distance of order p=4.
knn = KNeighborsClassifier(
    n_neighbors=5,
    algorithm='ball_tree',
    p=4,
    weights='uniform',
).fit(x_train, y_train)
# F1 on the held-out validation rows.
f1_score(y_validate, knn.predict(x_validate))

0.76258992805755399

In [12]:
"""
Stochastic Gradient Descent
"""
from sklearn.linear_model import SGDClassifier

# SGDClassifier shuffles the training data between epochs by default, so
# without a fixed seed the fitted model (and the score below) changes from
# run to run. Pin the seed, as the Random Forest cell does.
sgd = SGDClassifier(random_state=0)
sgd.fit(x_train, y_train)
# F1 on the held-out validation rows.
f1_score(y_validate, sgd.predict(x_validate))

0.15789473684210528

In [13]:
"""
Random Forest
"""
from sklearn.ensemble import RandomForestClassifier

# 16 fully grown trees (no depth limit), with a fixed seed.
rf = RandomForestClassifier(
    n_estimators=16,
    max_depth=None,
    min_samples_split=2,
    random_state=0,
).fit(x_train, y_train)
# F1 on the held-out validation rows.
f1_score(y_validate, rf.predict(x_validate))

0.68531468531468531

In [14]:
"""
Multi-layer perceptron
"""
from sklearn.neural_network import MLPClassifier

# One hidden layer with 8 ReLU units, trained with L-BFGS.
# - hidden_layer_sizes is written as the 1-tuple (8,); the previous `(8)` is
#   just the integer 8 in parentheses.
# - random_state pins the weight initialisation so the score is repeatable.
# - NOTE(review): per the scikit-learn docs `learning_rate` only applies to
#   solver='sgd', so it should have no effect here; kept as-is.
mlp = MLPClassifier(
    solver='lbfgs',
    hidden_layer_sizes=(8,),
    activation='relu',
    learning_rate='constant',
    max_iter=10000,
    random_state=0,
)
mlp.fit(x_train, y_train)
# F1 on the held-out validation rows.
f1_score(y_validate, mlp.predict(x_validate))

0.77611940298507476

In [15]:
"""
Gaussian Process
"""
from sklearn.gaussian_process import GaussianProcessClassifier

# Gaussian process classifier with default settings.
gp = GaussianProcessClassifier().fit(x_train, y_train)
# F1 on the held-out validation rows.
f1_score(y_validate, gp.predict(x_validate))

0.71186440677966101

In [16]:
# AdaBoost predictions for the test matrix, thresholded at 0.5 and shaped as
# an integer column vector.
survived = (ada.predict(x_test) > 0.5).astype('int').reshape(-1, 1)
# Passenger ids of the test frame as a matching column vector.
passenger_ids = df_test['PassengerId'].values.reshape(-1, 1)

# Both shapes must agree row-wise before the columns are stacked.
print(survived.shape, passenger_ids.shape, sep='\n')

(418, 1)
(418, 1)


In [17]:
# Place the id and prediction columns side by side and write them out without
# the DataFrame index.
submission = pd.DataFrame(
    data=np.concatenate([passenger_ids, survived], axis=1),
    columns=['PassengerId', 'Survived'],
)
submission.to_csv('submissions/titanic.csv', index=False)
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
