In [1054]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [1055]:
# load data
train_data = pd.read_csv(r"./data/train.csv")
test_data = pd.read_csv(r"./data/test.csv")

In [1056]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [1057]:
# Feature Engineering
from sklearn.preprocessing import Imputer

def nan_padding(data, columns):
    for column in columns:
        imputer=Imputer()
        data[column]=imputer.fit_transform(data[column].values.reshape(-1,1))
    return data


nan_columns = ["Age", "SibSp", "Parch"]

train_data = nan_padding(train_data, nan_columns)
test_data = nan_padding(test_data, nan_columns)

In [1058]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1.0,0.0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1.0,0.0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0.0,0.0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1.0,0.0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0.0,0.0,373450,8.05,,S


In [1059]:
train_data['Surname'] = train_data.apply(lambda row: row.Name.split(',')[0], axis = 1)
test_data['Surname'] = test_data.apply(lambda row: row.Name.split(',')[0], axis = 1)
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Surname
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1.0,0.0,A/5 21171,7.25,,S,Braund
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1.0,0.0,PC 17599,71.2833,C85,C,Cumings
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0.0,0.0,STON/O2. 3101282,7.925,,S,Heikkinen
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1.0,0.0,113803,53.1,C123,S,Futrelle
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0.0,0.0,373450,8.05,,S,Allen


In [1060]:
#save PassengerId for evaluation
test_passenger_id=test_data["PassengerId"]

In [1061]:
def drop_not_concerned(data, columns):
    return data.drop(columns, axis=1)

not_concerned_columns = ["PassengerId","Name", "Ticket", "Fare", "Cabin", "Embarked"]
train_data = drop_not_concerned(train_data, not_concerned_columns)
test_data = drop_not_concerned(test_data, not_concerned_columns)

In [1062]:
train_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Surname
0,0,3,male,22.0,1.0,0.0,Braund
1,1,1,female,38.0,1.0,0.0,Cumings
2,1,3,female,26.0,0.0,0.0,Heikkinen
3,1,1,female,35.0,1.0,0.0,Futrelle
4,0,3,male,35.0,0.0,0.0,Allen


In [1063]:
test_data.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Surname
0,3,male,34.5,0.0,0.0,Kelly
1,3,female,47.0,1.0,0.0,Wilkes
2,2,male,62.0,0.0,0.0,Myles
3,3,male,27.0,0.0,0.0,Wirz
4,3,female,22.0,1.0,1.0,Hirvonen


In [1064]:
def dummy_data(data, columns):
    for column in columns:
        data = pd.concat([data, pd.get_dummies(data[column], prefix=column)], axis=1)
        data = data.drop(column, axis=1)
    return data


dummy_columns = ["Pclass", "Surname"]
train_data=dummy_data(train_data, dummy_columns)
test_data=dummy_data(test_data, dummy_columns)

In [1065]:
test_data.head()

Unnamed: 0,Sex,Age,SibSp,Parch,Pclass_1,Pclass_2,Pclass_3,Surname_Abbott,Surname_Abelseth,Surname_Abrahamsson,...,Surname_Williams,Surname_Wilson,Surname_Wirz,Surname_Wittevrongel,Surname_Wright,Surname_Zakarian,Surname_de Brito,Surname_de Messemaeker,Surname_del Carlo,Surname_van Billiard
0,male,34.5,0.0,0.0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,female,47.0,1.0,0.0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,male,62.0,0.0,0.0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,male,27.0,0.0,0.0,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,female,22.0,1.0,1.0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [1066]:
from sklearn.preprocessing import LabelEncoder
def sex_to_int(data):
    le = LabelEncoder()
    le.fit(["male","female"])
    data["Sex"]=le.transform(data["Sex"]) 
    return data

train_data = sex_to_int(train_data)
test_data = sex_to_int(test_data)
train_data.head()

Unnamed: 0,Survived,Sex,Age,SibSp,Parch,Pclass_1,Pclass_2,Pclass_3,Surname_Abbing,Surname_Abbott,...,Surname_Yousseff,Surname_Yrois,Surname_Zabour,Surname_Zimmerman,Surname_de Messemaeker,Surname_de Mulder,Surname_de Pelsmaeker,Surname_del Carlo,Surname_van Billiard,Surname_van Melkebeke
0,0,1,22.0,1.0,0.0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,38.0,1.0,0.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,26.0,0.0,0.0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,35.0,1.0,0.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,35.0,0.0,0.0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [1067]:
from sklearn.preprocessing import LabelBinarizer
data_y = train_data["Survived"]
lb = LabelBinarizer()
data_y = lb.fit_transform(data_y)

In [1068]:
from sklearn.preprocessing import MinMaxScaler

def normalize_age(data):
    scaler = MinMaxScaler()
    data["Age"] = scaler.fit_transform(data["Age"].values.reshape(-1,1))
    return data
train_data = normalize_age(train_data)
test_data = normalize_age(test_data)
train_data.head()

Unnamed: 0,Survived,Sex,Age,SibSp,Parch,Pclass_1,Pclass_2,Pclass_3,Surname_Abbing,Surname_Abbott,...,Surname_Yousseff,Surname_Yrois,Surname_Zabour,Surname_Zimmerman,Surname_de Messemaeker,Surname_de Mulder,Surname_de Pelsmaeker,Surname_del Carlo,Surname_van Billiard,Surname_van Melkebeke
0,0,1,0.271174,1.0,0.0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0.472229,1.0,0.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0.321438,0.0,0.0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0.434531,1.0,0.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0.434531,0.0,0.0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [1069]:
data_x = train_data.drop(["Survived"], axis=1)

In [1070]:
def add_missing_dummy_columns( data, columns ):
    missing_cols = set( columns ) - set( data.columns )
    for c in missing_cols:
        data[c] = 0

In [1071]:
def fix_columns( data, columns ):  

    add_missing_dummy_columns( data, columns )

    # make sure we have all the columns we need
    assert( set( columns ) - set( data.columns ) == set())

    extra_cols = set( data.columns ) - set( columns )
    if extra_cols:
        print("extra columns:", extra_cols)

    data = data[ columns ]
    return data

In [1072]:
test_data = fix_columns(test_data, list(data_x))

extra columns: {'Surname_Sincock', 'Surname_Salomon', 'Surname_Chevre', "Surname_O'Keefe", 'Surname_Riihivouri', 'Surname_Collett', 'Surname_Rasmussen', 'Surname_Thomson', 'Surname_Conlon', 'Surname_Fillbrook', 'Surname_Rosenshine', 'Surname_Zakarian', 'Surname_Saether', 'Surname_Howard', 'Surname_Omont', 'Surname_Harbeck', 'Surname_Assaf', 'Surname_Chaudanson', 'Surname_Oxenham', 'Surname_Candee', 'Surname_Stokes', 'Surname_Portaluppi', 'Surname_Beauchamp', 'Surname_Myles', 'Surname_Roth', 'Surname_Dulles', 'Surname_Evans', 'Surname_Wirz', 'Surname_Borebank', 'Surname_Walcroft', 'Surname_Nieminen', 'Surname_Brady', 'Surname_Malachard', 'Surname_Maguire', 'Surname_McCrae', 'Surname_Baimbrigge', 'Surname_Hellstrom', 'Surname_Loring', 'Surname_Jefferys', 'Surname_Enander', 'Surname_Assaf Khalil', 'Surname_Andersen', 'Surname_Gilbert', 'Surname_Daniels', 'Surname_Colbert', 'Surname_Lundstrom', 'Surname_Wittevrongel', 'Surname_Earnshaw', 'Surname_Corbett', 'Surname_Guest', 'Surname_Wenzel'

In [1073]:
train_data = fix_columns(data_x, list(test_data))

In [1074]:
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(data_x, data_y, test_size=0.2, random_state=42)

print("train_x:{}".format(X_train.shape))
print("train_y:{}".format(y_train.shape))
print("train_y content:{}".format(y_train[:3]))

print("valid_x:{}".format(X_valid.shape))
print("valid_y:{}".format(y_valid.shape))
print("test_x:{}".format(test_data.shape))

train_x:(712, 674)
train_y:(712, 1)
train_y content:[[0]
 [0]
 [0]]
valid_x:(179, 674)
valid_y:(179, 1)
test_x:(418, 674)


In [1075]:
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.naive_bayes import GaussianNB
clf1 = LinearSVC(random_state=41)
clf2 = SGDClassifier(random_state=13)
clf3 = KNeighborsClassifier(n_neighbors=3)
clf4 = RandomForestClassifier(n_estimators = 100)
clf5 = GaussianNB()

eclf1 = VotingClassifier(estimators= \
                         [('lsvc', clf1),\
                          ('sgdc', clf2),\
                          ('knn', clf3),\
                          ('rf', clf4),\
                          ('gnb',clf5)])
eclf1.fit(X_train, y_train.ravel())

VotingClassifier(estimators=[('lsvc', LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=41, tol=0.0001,
     verbose=0)), ('sgdc', SGDClassifier(alpha=0.0001, average=False, class_weight...lse, random_state=None,
            verbose=0, warm_start=False)), ('gnb', GaussianNB(priors=None))],
         n_jobs=1, voting='hard', weights=None)

In [1076]:
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

# Take a poll of some classifiers
y_valid_pred = eclf1.predict(X_valid)

score_f1 = f1_score(y_valid_pred, y_valid.ravel())
print("F1 Accuracy: {0:.2f}%".format(score_f1 * 100.0))
score = eclf1.score(X_valid, y_valid)
print("Test Accuracy: {0:.2f}%".format(score * 100.0))
confusion_matrix(y_valid, y_valid_pred)

F1 Accuracy: 82.19%
Test Accuracy: 85.47%


array([[93, 12],
       [14, 60]])

In [1077]:
test_data.head()

Unnamed: 0,Sex,Age,SibSp,Parch,Pclass_1,Pclass_2,Pclass_3,Surname_Abbing,Surname_Abbott,Surname_Abelson,...,Surname_Yousseff,Surname_Yrois,Surname_Zabour,Surname_Zimmerman,Surname_de Messemaeker,Surname_de Mulder,Surname_de Pelsmaeker,Surname_del Carlo,Surname_van Billiard,Surname_van Melkebeke
0,1,0.452723,0.0,0.0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0.617566,1.0,0.0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0.815377,0.0,0.0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0.353818,0.0,0.0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0.287881,1.0,1.0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [1078]:
predictions = clf.predict(test_data)

final_results = pd.DataFrame(np.array(list(zip(test_passenger_id,predictions))), columns=["PassengerId", "Survived"])
final_results.to_csv(r"data\predictions.csv", index=False)