In [64]:
import pandas as pd

train_df = pd.read_csv('titanic/train.csv')
test_df = pd.read_csv('titanic/test.csv')
train_df.head(n = 100)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.00,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.00,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.00,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.00,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.00,0,0,373450,8.0500,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.00,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.00,3,1,349909,21.0750,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.00,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.00,1,0,237736,30.0708,,C


In [5]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np


class ExtractCabinBlock(BaseEstimator, TransformerMixin):
    def __init__(self, field_name):
        self.field_name = field_name
        self.blocks = {"A": 0,
                       "B": 1,
                       "C": 2,
                       "D": 3,
                       "E": 4,
                       "F": 5,
                       "G": 6,
                       "H": 7,
                       "K": 8,
                       "T": 9}
        
    def fit(self, X, y=None, **fit_params):
        return self
    
    def transform(self, X, y=None):
        source = X[self.field_name]
        block_col = np.empty((len(source), 11), dtype=int)
        #seat_col = np.empty((len(source), 11), dtype=int)
        block_col[:, :] = 0
        #seat_col[:, :] = np.nan
        for i in range(0, len(source)):
            x = source[i]
            target = block_col[i]
            if pd.isnull(x):
                target[10] = 1
                continue
            splitted = x.split(" ")
            seats = np.empty((len(splitted), 2), dtype=object)
            for j in range(0, len(splitted)):
                cabin_seat = splitted[j]
                cabin_seat_number = cabin_seat[1:]
                if not cabin_seat_number:
                    cabin_seat_number = np.nan
                else:
                    cabin_seat_number = int(cabin_seat_number)
                seats[j] = [self.blocks[cabin_seat[0]], cabin_seat_number]
            for block in seats[:, 0]:
                target[block] = 1

        return block_col
        #return X[[self.field_name]].fillna("NaN").values


class FareImputer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.means = {}

    def fit(self, X, y=None, **fit_params):
        for key, class_group in train_df.groupby(["Pclass"]):
            self.means[key] = class_group.Fare.mean()
        return self

    def transform(self, X):
        for idx, row in X.iterrows():
            if row["Fare"] == 0.0:
                row["Fare"] = self.means[row["Pclass"]]
        return X[["Fare"]].values


class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names, fill_na=None):
        self.attribute_names = attribute_names
        self.fill_na = fill_na
    def fit(self, X, y=None, **fit_params):
        return self
    def transform(self, X):
        if not self.fill_na:
            return X[self.attribute_names].values
        return X[self.attribute_names].fillna(self.fill_na).values


class LabelBinarizerX(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None,**fit_params):
        return self
    def transform(self, X):
        return LabelBinarizer().fit(X).transform(X)


# e = FillEmbarkedNa("Embarked")
# b = LabelBinarizer()
# b.fit_transform(e.fit_transform(train_df))

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelBinarizer, Imputer

num_pipeline = Pipeline([
        ('selector', DataFrameSelector(["Pclass", "Age", "SibSp", "Parch"])),
        ('imputer', Imputer(strategy="median")),
        ('std_scaler', StandardScaler())
    ])

fare_pipeline = Pipeline([
        ('selector', FareImputer()),
        ('std_scaler', StandardScaler())
    ])

embarked_pipeline = Pipeline([
        ('selector', DataFrameSelector(["Embarked"], "NaN")),
        ('cat_encoder', LabelBinarizerX())
    ])

sex_pipeline = Pipeline([
        ('selector', DataFrameSelector(["Sex"])),
        ('cat_encoder', LabelBinarizerX())
    ])

cabin_pipeline = Pipeline([
        ('selector', ExtractCabinBlock("Cabin")),
        #('cat_encoder', LabelBinarizerX())
    ])

In [57]:
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, Imputer, Binarizer

num_pipeline = Pipeline([
        ('selector', DataFrameSelector(["Pclass", "c_1", "c_2", "c_3", "c_4", "c_5", "c_6", "c_7", "c_8"])),
        ('imputer', Imputer(strategy="median")),
        ('std_scaler', StandardScaler())
    ])

final_transform = FeatureUnion(transformer_list=[
        ("embarked_pipeline", embarked_pipeline),
        ("num_pipeline", num_pipeline),
        ("sex_pipeline", sex_pipeline),
        ("cabin_pipeline", cabin_pipeline),
        ("fare_pipeline", fare_pipeline)
    ])

y_train = train_df.Survived.values
# c, r = y_train.shape
# y_train = y_train.reshape(c,)

#final_pipeline = Pipeline([
#        ('transformer', final_transform),
#        ('predictor', LogisticRegression())
#    ])

#predictor = LogisticRegression()

train_df["is_child_1"] = (train_df["Age"] < 10)
train_df["is_male"] = (train_df["Age"] > 30) & (train_df["Sex"] == "male") & (train_df["Pclass"] > 1)

#scores = cross_val_score(final_pipeline, train_df, train_df.Survived, scoring="accuracy", cv=5)
#np.median(scores)
X_train = final_transform.fit_transform(train_df, train_df.Survived)
print(cross_val_score(MLPClassifier(hidden_layer_sizes=(20, 20)), X_train, y_train, cv=10, scoring="accuracy").mean())
#print(cross_val_score(LogisticRegression(), X_train, y_train, cv=8, scoring="accuracy").mean())

0.7734786062875949


In [49]:
for i, r in train_df.iterrows():
    print(r["Fare"] == np.nan)

False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
Fals