In [188]:
import pandas as pd

# Load the train and test splits; both paths are relative to the
# notebook's working directory.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('titanic/train.csv', 'titanic/test.csv')
)

In [231]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
import math


class ExtractCabinBlock(BaseEstimator, TransformerMixin):
    """Multi-hot encode the deck letter(s) found in a cabin column.

    Every output row has 11 integer indicator columns: positions 0-9 follow
    the letter -> index mapping in ``self.blocks`` and position 10 flags a
    missing cabin value.  A value that lists several cabins separated by
    whitespace (e.g. ``"C23 C25 C27"`` or ``"F G73"``) switches on the
    indicator of every deck letter that appears in it.

    Parameters
    ----------
    field_name : str
        Name of the column of ``X`` that holds the cabin strings.
    """

    def __init__(self, field_name):
        self.field_name = field_name
        # Deck letter -> output column index.
        self.blocks = {"A": 0,
                       "B": 1,
                       "C": 2,
                       "D": 3,
                       "E": 4,
                       "F": 5,
                       "G": 6,
                       "H": 7,
                       "K": 8,
                       "T": 9}

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Build the deck indicator matrix for ``X[self.field_name]``.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing the ``field_name`` column.
        y : ignored

        Returns
        -------
        numpy.ndarray
            Integer array of shape ``(len(X), 11)``.

        Raises
        ------
        KeyError
            If a cabin token starts with a letter missing from ``self.blocks``.
        """
        missing_col = len(self.blocks)  # index 10: "cabin unknown" flag
        # Work on the raw values so rows are visited by position.  Indexing the
        # Series with ``source[i]`` is a *label* lookup and breaks (or silently
        # picks the wrong row) for any frame whose index is not 0..n-1, such as
        # a filtered frame or a cross-validation fold.
        cabins = X[self.field_name].values
        block_col = np.zeros((len(cabins), missing_col + 1), dtype=int)

        for row, cabin in zip(block_col, cabins):
            # split() with no argument tolerates repeated/surrounding blanks,
            # which would otherwise produce empty tokens.
            tokens = [] if pd.isnull(cabin) else cabin.split()
            if not tokens:
                row[missing_col] = 1
                continue
            # Only the leading deck letter is encoded; the seat number that
            # follows it is not part of the output.
            for cabin_seat in tokens:
                row[self.blocks[cabin_seat[0]]] = 1

        return block_col


class FareImputer(BaseEstimator, TransformerMixin):
    """Replace missing or zero fares with the mean fare of the passenger class.

    ``fit`` learns one mean ``Fare`` per ``Pclass`` value from the frame it is
    given; ``transform`` substitutes that mean wherever ``Fare`` is NaN or
    exactly ``0.0``.

    Attributes
    ----------
    means : dict
        ``Pclass`` value -> mean ``Fare``, populated by ``fit``.
    """

    def __init__(self):
        self.means = {}

    def fit(self, X, y=None, **fit_params):
        """Learn the per-class mean fare from ``X``.

        The statistics come from the frame passed in (not from a notebook
        global), so the transformer behaves correctly inside cross-validation
        and on any other data set.  Grouping by the scalar key ``"Pclass"``
        keeps the dictionary keys plain class labels; a one-element list would
        yield tuple keys on recent pandas versions.  Zero fares are included in
        the mean and NaN fares are skipped, as pandas' ``mean`` does by default.
        """
        # Rebuild the mapping from scratch so a refit never keeps stale keys.
        self.means = X.groupby("Pclass")["Fare"].mean().to_dict()
        return self

    def transform(self, X):
        """Return the imputed fares as a float array of shape ``(len(X), 1)``.

        ``X`` itself is left untouched.

        Raises
        ------
        KeyError
            If a row that needs imputing has a ``Pclass`` not seen in ``fit``.
        """
        fare = X["Fare"].astype(float)
        needs_fill = fare.isna() | (fare == 0.0)

        if needs_fill.any():
            unseen = set(X.loc[needs_fill, "Pclass"].unique()) - set(self.means)
            if unseen:
                raise KeyError(
                    "no mean fare learned for Pclass value(s): %r" % sorted(unseen, key=repr)
                )
            # where() keeps existing fares and takes the class mean elsewhere,
            # without writing into the caller's DataFrame.
            fare = fare.where(~needs_fill, X["Pclass"].map(self.means))

        return fare.values.reshape(-1, 1)


class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pick columns out of a DataFrame and hand them on as a NumPy array.

    Parameters
    ----------
    attribute_names : list of str
        Columns to select from the incoming frame.
    fill_na : scalar, optional
        When given, missing values in the selected columns are replaced by
        this value before conversion.  ``None`` (the default) leaves missing
        values untouched.
    """

    def __init__(self, attribute_names, fill_na=None):
        self.attribute_names = attribute_names
        self.fill_na = fill_na

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X):
        """Return ``X[attribute_names]`` (optionally NaN-filled) as an array."""
        selected = X[self.attribute_names]
        # Compare against None explicitly: a truthiness test would silently
        # ignore legitimate fill values such as 0, "" or False.
        if self.fill_na is None:
            return selected.values
        return selected.fillna(self.fill_na).values


class LabelBinarizerX(BaseEstimator, TransformerMixin):
    """Pipeline-friendly wrapper around ``LabelBinarizer``.

    ``LabelBinarizer`` is written for target vectors, so this wrapper gives it
    the ``fit(X, y)`` / ``transform(X)`` signature that ``Pipeline`` expects
    from a feature transformer.

    Attributes
    ----------
    binarizer_ : LabelBinarizer
        The encoder fitted in ``fit``.
    """

    def fit(self, X, y=None, **fit_params):
        """Learn the label set from ``X`` so later transforms share one encoding."""
        self.binarizer_ = LabelBinarizer().fit(X)
        return self

    def transform(self, X):
        """Encode ``X`` with the labels learned during ``fit``.

        Re-fitting on whatever is passed to ``transform`` would let the test
        set define its own categories, so its columns could differ in number
        or meaning from the training matrix.
        """
        binarizer = getattr(self, "binarizer_", None)
        if binarizer is None:
            # Backward compatibility: an instance that was never fitted keeps
            # the historical fit-on-the-fly behaviour.
            binarizer = LabelBinarizer().fit(X)
        return binarizer.transform(X)


class AgeCategoricalEncoder(BaseEstimator, TransformerMixin):
    """Bucket ``Age`` into decades: ``floor(Age / 10)``."""

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X):
        """Return the decade index of ``Age`` as a float array of shape ``(len(X), 1)``.

        Missing ages stay NaN.  The result is computed on a temporary frame:
        writing it back into ``X["Age"]`` would change the caller's DataFrame,
        so a second ``transform``/``fit_transform`` on the same frame would
        divide the already-binned ages by 10 again.
        """
        return np.floor(X[["Age"]] / 10).values


class AdditionalFeatures(BaseEstimator, TransformerMixin):
    """Derive a single "travels alone" indicator from ``SibSp`` and ``Parch``."""

    def fit(self, X, y=None, **fit_params):
        """Stateless; returns ``self``."""
        return self

    def transform(self, X):
        """Return a ``(len(X), 1)`` float column.

        The value is 1.0 where ``SibSp + Parch`` equals zero and 0.0 otherwise.
        """
        relatives_aboard = X["SibSp"] + X["Parch"]
        alone_flag = np.where(relatives_aboard == 0, 1.0, 0.0)
        return alone_flag.reshape((-1, 1))


# e = FillEmbarkedNa("Embarked")
# b = LabelBinarizer()
# b.fit_transform(e.fit_transform(train_df))

In [232]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelBinarizer, Imputer

# One small pipeline per feature group.
# NOTE(review): `Imputer` is imported from `sklearn.preprocessing` above;
# scikit-learn 0.22+ removed it in favour of `sklearn.impute.SimpleImputer`.
# Confirm which scikit-learn version this notebook is pinned to.

# Numeric columns: median-impute, then standardise.
num_pipeline = Pipeline(steps=[
    ("selector", DataFrameSelector(["Pclass", "SibSp", "Parch"])),
    ("imputer", Imputer(strategy="median")),
    ("std_scaler", StandardScaler()),
])

# Fare: class-mean imputation, then standardise.
fare_pipeline = Pipeline(steps=[
    ("selector", FareImputer()),
    ("std_scaler", StandardScaler()),
])

# Embarked: missing values become "S" before binarising.
embarked_pipeline = Pipeline(steps=[
    ("selector", DataFrameSelector(["Embarked"], "S")),
    ("cat_encoder", LabelBinarizerX()),
])

# Sex: binarised as-is.
sex_pipeline = Pipeline(steps=[
    ("selector", DataFrameSelector(["Sex"])),
    ("cat_encoder", LabelBinarizerX()),
])

# Cabin: ExtractCabinBlock already emits indicator columns, so no encoder step.
cabin_pipeline = Pipeline(steps=[
    ("selector", ExtractCabinBlock("Cabin")),
])

# Age: bin, median-impute the missing bins, then standardise.
age_pipeline = Pipeline(steps=[
    ("bin", AgeCategoricalEncoder()),
    ("imputer", Imputer(strategy="median")),
    ("std_scaler", StandardScaler()),
])

# Extra engineered feature(s), standardised.
additional_pipeline = Pipeline(steps=[
    ("additional", AdditionalFeatures()),
    ("std_scaler", StandardScaler()),
])

In [248]:
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, Imputer, Binarizer

# Seed for the MLP (weight initialisation / batch shuffling) so that a
# Restart & Run All reproduces the same submission file.
SEED = 42

# Column-wise union of the per-feature pipelines built in the previous cell
# (`num_pipeline` included, so it is not redefined here).
# `additional_pipeline` is currently left out of the union.
final_transform = FeatureUnion(transformer_list=[
    ("embarked_pipeline", embarked_pipeline),
    ("num_pipeline", num_pipeline),
    ("sex_pipeline", sex_pipeline),
    ("cabin_pipeline", cabin_pipeline),
    ("fare_pipeline", fare_pipeline),
    ("age_pipeline", age_pipeline),
])

y_train = train_df.Survived.values

# Fit the transformers exactly once.  A second fit_transform on the same frame
# only repeats the work, and would re-apply any transformer that modifies
# `train_df` in place, leaving train and test features on different scales.
X_train = final_transform.fit_transform(train_df, y_train)

# Optional model comparison from earlier experiments; uncomment to run.
# print(cross_val_score(LogisticRegression(), X_train, y_train, cv=8, scoring="accuracy").mean())
# print(cross_val_score(MLPClassifier(hidden_layer_sizes=(128, 64), tol=0.0001, epsilon=1e-08, random_state=SEED), X_train, y_train, cv=12, scoring="accuracy").mean())
# print(cross_val_score(MLPClassifier(hidden_layer_sizes=(128, 32), tol=0.0001, epsilon=1e-08, random_state=SEED), X_train, y_train, cv=12, scoring="accuracy").mean())
# print(cross_val_score(MLPClassifier(hidden_layer_sizes=(128, 128, 64), tol=0.0001, epsilon=1e-08, random_state=SEED), X_train, y_train, cv=12, scoring="accuracy").mean())

predictor = MLPClassifier(
    hidden_layer_sizes=(128, 128, 64),
    tol=0.0001,
    epsilon=1e-08,
    random_state=SEED,
)
predictor.fit(X_train, y_train)

# Reuse the transformers fitted on the training data; never refit on test data.
X_test = final_transform.transform(test_df)
predicted = predictor.predict(X_test)

# Write the submission file (PassengerId, Survived).
test_df["Survived"] = predicted.tolist()
test_df[["PassengerId", "Survived"]].to_csv('Submission.csv', index=False)