In [1]:
import numpy as np
import pandas as pd
import re
import operator
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
import tflearn

In [2]:
train = pd.read_csv("input/train.csv", dtype={"Age": np.float64}, )
test = pd.read_csv("input/test.csv", dtype={"Age": np.float64}, )

target = train["Survived"].values
full = pd.concat([train, test])

In [3]:
full['surname'] = full["Name"].apply(lambda x: x.split(',')[0].lower())

full["Title"] = full["Name"].apply(lambda x: re.search(' ([A-Za-z]+)\.',x).group(1))
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Dr": 5, "Rev": 6, "Major": 7, "Col": 7, "Mlle": 2, "Mme": 3,"Don": 9,"Dona": 9, "Lady": 10, "Countess": 10, "Jonkheer": 10, "Sir": 9, "Capt": 7, "Ms": 2}
full["TitleCat"] = full.loc[:,'Title'].map(title_mapping)

full["FamilySize"] = full["SibSp"] + full["Parch"] + 1
full["FamilySize"] = pd.cut(full["FamilySize"], bins=[0,1,4,20], labels=[0,1,2])

full["NameLength"] = full["Name"].apply(lambda x: len(x))

full["Embarked"] = pd.Categorical(full.Embarked).codes

full["Fare"] = full["Fare"].fillna(8.05)

full = pd.concat([full,pd.get_dummies(full['Sex'])],axis=1)

full['CabinCat'] = pd.Categorical(full.Cabin.fillna('0').apply(lambda x: x[0])).codes

In [4]:
# function to get oven/odd/null from cabine 
def get_type_cabine(cabine):
    # Use a regular expression to search for a title. 
    cabine_search = re.search('\d+', cabine)
    # If the title exists, extract and return it.
    if cabine_search:
        num = cabine_search.group(0)
        if np.float64(num) % 2 == 0:
            return '2'
        else:
            return '1'
    return '0'
full["Cabin"] = full["Cabin"].fillna(" ")

full["CabinType"] = full["Cabin"].apply(get_type_cabine)

## CHILD/FEMALE ADULT/MALE ADULT

In [5]:
child_age = 18
def get_person(passenger):
    age, sex = passenger
    if (age < child_age):
        return 'child'
    elif (sex == 'female'):
        return 'female_adult'
    else:
        return 'male_adult'
full = pd.concat([full, pd.DataFrame(full[['Age', 'Sex']].apply(get_person, axis=1), columns=['person'])],axis=1)
full = pd.concat([full,pd.get_dummies(full['person'])],axis=1)

## FEATURES BASED ON TICKET

In [6]:
table_ticket = pd.DataFrame(full["Ticket"].value_counts())
table_ticket.rename(columns={'Ticket':'Ticket_Members'}, inplace=True)

table_ticket['Ticket_perishing_women'] = full.Ticket[(full.female_adult == 1.0) 
                                    & (full.Survived == 0.0) 
                                    & ((full.Parch > 0) | (full.SibSp > 0))].value_counts()
table_ticket['Ticket_perishing_women'] = table_ticket['Ticket_perishing_women'].fillna(0)
table_ticket['Ticket_perishing_women'][table_ticket['Ticket_perishing_women'] > 0] = 1.0 

table_ticket['Ticket_surviving_men'] = full.Ticket[(full.male_adult == 1.0) 
                                    & (full.Survived == 1.0) 
                                    & ((full.Parch > 0) | (full.SibSp > 0))].value_counts()
table_ticket['Ticket_surviving_men'] = table_ticket['Ticket_surviving_men'].fillna(0)
table_ticket['Ticket_surviving_men'][table_ticket['Ticket_surviving_men'] > 0] = 1.0 

table_ticket["Ticket_Id"]= pd.Categorical(table_ticket.index).codes
# compress under 3 members into one code.
table_ticket["Ticket_Id"][table_ticket["Ticket_Members"] < 3 ] = -1
table_ticket["Ticket_Members"] = pd.cut(table_ticket["Ticket_Members"], bins=[0,1,4,20], labels=[0,1,2])

full = pd.merge(full, table_ticket, left_on="Ticket",right_index=True,how='left', sort=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


## FEATURES BASED ON SURNAME

In [7]:
table_surname = pd.DataFrame(full["surname"].value_counts())
table_surname.rename(columns={'surname':'Surname_Members'}, inplace=True)

table_surname['Surname_perishing_women'] = full.surname[(full.female_adult == 1.0) 
                                    & (full.Survived == 0.0) 
                                    & ((full.Parch > 0) | (full.SibSp > 0))].value_counts()
table_surname['Surname_perishing_women'] = table_surname['Surname_perishing_women'].fillna(0)
table_surname['Surname_perishing_women'][table_surname['Surname_perishing_women'] > 0] = 1.0 

table_surname['Surname_surviving_men'] = full.surname[(full.male_adult == 1.0) 
                                    & (full.Survived == 1.0) 
                                    & ((full.Parch > 0) | (full.SibSp > 0))].value_counts()
table_surname['Surname_surviving_men'] = table_surname['Surname_surviving_men'].fillna(0)
table_surname['Surname_surviving_men'][table_surname['Surname_surviving_men'] > 0] = 1.0 

table_surname["Surname_Id"]= pd.Categorical(table_surname.index).codes
# compress under 3 members into one code.
table_surname["Surname_Id"][table_surname["Surname_Members"] < 3 ] = -1

table_surname["Surname_Members"] = pd.cut(table_surname["Surname_Members"], bins=[0,1,4,20], labels=[0,1,2])

full = pd.merge(full, table_surname, left_on="surname",right_index=True,how='left', sort=False)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


## AGE PROCESSING

In [8]:
classers = ['Fare','Parch','Pclass','SibSp','TitleCat', 
'CabinCat','female','male', 'Embarked', 'FamilySize', 'NameLength','Ticket_Members','Ticket_Id']
etr = ExtraTreesRegressor(n_estimators=200)
X_train = full[classers][full['Age'].notnull()]
Y_train = full['Age'][full['Age'].notnull()]
X_test = full[classers][full['Age'].isnull()]
etr.fit(X_train,np.ravel(Y_train))
age_preds = etr.predict(X_test)
full['Age'][full['Age'].isnull()] = age_preds

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [9]:
etr.score(X = X_train, y = np.ravel(Y_train))

0.9459053107016463

In [10]:
pd.DataFrame({'name': X_train.columns, 'value': etr.feature_importances_}).sort_values('value', ascending=False)

Unnamed: 0,name,value
2,Pclass,0.181594
10,NameLength,0.171506
4,TitleCat,0.148499
0,Fare,0.144043
1,Parch,0.078293
12,Ticket_Id,0.061693
11,Ticket_Members,0.040576
5,CabinCat,0.039799
8,Embarked,0.03518
3,SibSp,0.034801


## Features

In [11]:
features = ['female','male','Age','male_adult','female_adult', 'child','TitleCat', 'Pclass',
'Pclass','Ticket_Id','NameLength','CabinType','CabinCat', 'SibSp', 'Parch',
'Fare','Embarked','Surname_Members','Ticket_Members','FamilySize',
'Ticket_perishing_women','Ticket_surviving_men',
'Surname_perishing_women','Surname_surviving_men']
scaler = StandardScaler()
X_full = scaler.fit_transform(full[features])
X = X_full[0:891].copy()
test_passenger_id = full[891:]['PassengerId']
test = X_full[891:].copy()

## RNN

In [12]:
y = np.stack((1-target, target), axis=-1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [38]:
# Define the neural network
def build_model():
    # This resets all parameters and variables, leave this here
    tf.reset_default_graph()
    
    # Inputs
    net = tflearn.input_data([None, len(features)])

    # Hidden layer(s) - sigmoid ReLU
    net = tflearn.fully_connected(net, 12, activation='ReLU')
    net = tflearn.fully_connected(net, 6, activation='ReLU')
    
    # Output layer and training model
    net = tflearn.fully_connected(net, 2, activation='softmax')
    net = tflearn.regression(net, learning_rate=0.01, loss='categorical_crossentropy')
    
    model = tflearn.DNN(net, tensorboard_verbose=3)
    return model
model = build_model()

In [48]:
model.fit(X_train, y_train, validation_set=0.1, show_metric=True, batch_size=64, n_epoch=10)

Training Step: 359  | total loss: [1m[32m0.23973[0m[0m | time: 0.274s
| Adam | epoch: 030 | loss: 0.23973 - acc: 0.9029 -- iter: 576/640
Training Step: 360  | total loss: [1m[32m0.36210[0m[0m | time: 1.324s
| Adam | epoch: 030 | loss: 0.36210 - acc: 0.8642 | val_loss: 0.40570 - val_acc: 0.8472 -- iter: 640/640
--


In [49]:
model.evaluate(X_test, y_test)

[0.89944134178108348]

## Prediction

In [40]:
predictions = (np.array(model.predict(test))[:,1] >= 0.5).astype(int)
PassengerId = test_passenger_id.astype(int)
my_prediction = pd.DataFrame(predictions, PassengerId, columns = ["Survived"])

my_prediction.to_csv("my_prediction2.csv", index_label = ["PassengerId"])

In [47]:
old_prediction = pd.read_csv('best_prediction.csv', index_col='PassengerId')
old_prediction['proba'] = np.array(model.predict(test))[:,1]
old_prediction['diff'] = (old_prediction['Survived'] == my_prediction['Survived'])
display(old_prediction.groupby('diff').count())
old_prediction[old_prediction['diff'] == False]

Unnamed: 0_level_0,Survived,proba
diff,Unnamed: 1_level_1,Unnamed: 2_level_1
False,22,22
True,396,396


Unnamed: 0_level_0,Survived,proba,diff
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
910,0,0.53973,False
927,1,0.293803,False
928,0,0.516613,False
929,0,0.682745,False
964,0,0.516168,False
978,0,0.721622,False
979,0,0.605801,False
990,0,0.588924,False
1030,0,0.567831,False
1036,1,0.39186,False
