In [1]:
import pandas as pd

train_df = pd.read_csv('titanic/train.csv')
test_df = pd.read_csv('titanic/test.csv')
test_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [29]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np


class ExtractCabinBlock(BaseEstimator, TransformerMixin):
    def __init__(self, field_name):
        self.field_name = field_name
        self.blocks = {"A": 0,
                       "B": 1,
                       "C": 2,
                       "D": 3,
                       "E": 4,
                       "F": 5,
                       "G": 6,
                       "H": 7,
                       "K": 8,
                       "T": 9}
        
    def fit(self, X, y=None, **fit_params):
        return self
    
    def transform(self, X, y=None):
        source = X[self.field_name]
        block_col = np.empty((len(source), 11), dtype=int)
        #seat_col = np.empty((len(source), 11), dtype=int)
        block_col[:, :] = 0
        #seat_col[:, :] = np.nan
        for i in range(0, len(source)):
            x = source[i]
            target = block_col[i]
            if pd.isnull(x):
                target[10] = 1
                continue
            splitted = x.split(" ")
            seats = np.empty((len(splitted), 2), dtype=object)
            for j in range(0, len(splitted)):
                cabin_seat = splitted[j]
                cabin_seat_number = cabin_seat[1:]
                if not cabin_seat_number:
                    cabin_seat_number = np.nan
                else:
                    cabin_seat_number = int(cabin_seat_number)
                seats[j] = [self.blocks[cabin_seat[0]], cabin_seat_number]
            for block in seats[:, 0]:
                target[block] = 1

        return block_col
        #return X[[self.field_name]].fillna("NaN").values


class FareImputer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.means = {}

    def fit(self, X, y=None, **fit_params):
        for key, class_group in train_df.groupby(["Pclass"]):
            self.means[key] = class_group.Fare.mean()
        return self

    def transform(self, X):
        for idx, row in X.iterrows():
            print(row["Fare"] == np.nan)


class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names, fill_na=None):
        self.attribute_names = attribute_names
        self.fill_na = fill_na
    def fit(self, X, y=None, **fit_params):
        return self
    def transform(self, X):
        if not self.fill_na:
            return X[self.attribute_names].values
        return X[self.attribute_names].fillna(self.fill_na).values


class LabelBinarizerX(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None,**fit_params):
        return self
    def transform(self, X):
        return LabelBinarizer().fit(X).transform(X)


# e = FillEmbarkedNa("Embarked")
# b = LabelBinarizer()
# b.fit_transform(e.fit_transform(train_df))

In [30]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, LabelBinarizer, Imputer

num_pipeline = Pipeline([
        ('selector', DataFrameSelector(["Pclass", "Age", "SibSp", "Parch"])),
        ('imputer', Imputer(strategy="median")),
        ('std_scaler', StandardScaler())
    ])

fare_pipeline = Pipeline([
        ('selector', FareImputer()),
        ('std_scaler', StandardScaler())
    ])

embarked_pipeline = Pipeline([
        ('selector', DataFrameSelector(["Embarked"], "NaN")),
        ('cat_encoder', LabelBinarizerX())
    ])

sex_pipeline = Pipeline([
        ('selector', DataFrameSelector(["Sex"])),
        ('cat_encoder', LabelBinarizerX())
    ])

cabin_pipeline = Pipeline([
        ('selector', ExtractCabinBlock("Cabin")),
        #('cat_encoder', LabelBinarizerX())
    ])

final_transform = FeatureUnion(transformer_list=[
        # ("embarked_pipeline", embarked_pipeline),
        # ("num_pipeline", num_pipeline),
        # ("sex_pipeline", sex_pipeline),
        # ("cabin_pipeline", cabin_pipeline),
        ("fare_pipeline", fare_pipeline)
    ])

In [31]:
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import cross_val_score

y_train = train_df.Survived.values
# c, r = y_train.shape
# y_train = y_train.reshape(c,)

#final_pipeline = Pipeline([
#        ('transformer', final_transform),
#        ('predictor', LogisticRegression())
#    ])

#predictor = LogisticRegression()


#scores = cross_val_score(final_pipeline, train_df, train_df.Survived, scoring="accuracy", cv=5)
#np.median(scores)
X_train = final_transform.fit_transform(train_df, train_df.Survived)
print(cross_val_score(MLPClassifier(hidden_layer_sizes=(120,50,40)), X_train, y_train, cv=8, scoring="accuracy").mean())
print(cross_val_score(LogisticRegression(), X_train, y_train, cv=8, scoring="accuracy").mean())

False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
Fals

False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [35]:
for i, r in train_df.iterrows():
    print(r["Fare"] == np.nan)

False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
Fals

False
False
False
False
False
False
False
False
False
False
False
