In [405]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import label_binarize
from matplotlib import pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import MultiLabelBinarizer
import numpy.testing as test

%matplotlib inline


class DataFrameSelector(BaseEstimator, TransformerMixin) :    
    def __init__(self, columns):
        self.columns=columns
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.columns].values

In [423]:
class MultiFeatureEncoder (MultiLabelBinarizer):
    def fit(self, X, y=None):
        return super().fit(X)
    def transform(self, X):
        return X[self.columns].values
    def fit_transform(self, X, y=None):
        return super().fit_transform(X.astype(str))

In [201]:

class NameEncoder(BaseEstimator, TransformerMixin) :    
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        flop=['Master', 'Miss', 'Mrs.', 'Mr.']
        f_b = np.zeros(X.size, dtype=bool) # initialize as false
        for f in flop:
            f1 = (np.char.find(X[:,0].astype(str), f)!=-1) # which entries contain the f key
            X[f1]=flop.index(f) # replace with the actual index in the array string            
            f_b = f_b | f1 # retain the matched ones
        X[~f_b]=4 # undefined
        return X.astype(np.int)
    @staticmethod
    def test():
        nameEncoder = NameEncoder()
        res = nameEncoder.fit_transform(np.array([['Master Jimmy'], ['Mrs. Smith'], ['Mr. Smith']]))
        test.assert_array_equal(res, [[0],[2],[3]])


In [362]:
# age imputer expects an 2D array having on the first column
# the encoded name 0,1,2,3,4 and on the second column the age
class AgeImputer(BaseEstimator, TransformerMixin) :        
    # fit will calculate the averages per encoded name
    def fit(self, X, y=None):
        self.titleAgeDictionary = {}
        for key in np.arange(0, 5):
            try: # just to make sure for unit tests add a try/catch
                self.titleAgeDictionary[key] = np.nanmedian(X[X[:,0] == key][1])
            except:
                pass
        return self

    def transform(self, X, y=None):        
        for key in self.titleAgeDictionary.keys():
            X[(X[:,0] == key) & (np.isnan(X[:,1])),1]=self.titleAgeDictionary[key]
        return X

    def test(self):
        X=np.array([[1,12],
          [2,12],
          [1, np.NaN]])
        res = self.fit_transform(X)


[[ 1. 12.]
 [ 2. 12.]
 [ 1.  1.]]


In [490]:
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.pipeline import make_pipeline, make_union

# read the dataframe
df = pd.read_csv('train.csv')

def hasCabin(x):
    return np.apply_along_axis(lambda t : 0 if ( t != t) else 1 ,arr=x ,axis=1)


# encode the name, join the age, fine median of age per name title, 
# apply the median ages for missing age entries
nameAgePipeLine = Pipeline([('name_age_preparation', FeatureUnion(transformer_list=[
                        ('pipelineName', Pipeline([('selector', DataFrameSelector(['Name'])), # select name column 
                                               ('name_encoder', NameEncoder())])), #encode the name
                        ('ageSelector', DataFrameSelector(['Age'])) ])), # fillup age based on name title medians
             ('age_imputer', AgeImputer())])
#encode Sex, Embarked and Pclass
catEncoderPipeline = Pipeline([
            ('select', DataFrameSelector(["Sex","Embarked","Pclass"])),
            ('encodeCat', MultiFeatureEncoder() )])
classPipeline = Pipeline([
            ('select', DataFrameSelector(['Cabin'])),
            ('createCat', FunctionTransformer(hasCabin, validate=False)), # use validate = false otherwise will complaint about not being a float
            ('encode', MultiFeatureEncoder() )])

#put everything together 
f = lambda t : np.log(t, out=np.zeros_like(fare), where=(fare>0))
numFeatures = Pipeline([
    ('select', DataFrameSelector(['Fare'])),
    ('log', FunctionTransformer(f)),  
    ('scaler', StandardScaler())
])

numFeatures.fit_transform(df)

StandardScaler().fit_transform([[0, 11.1], [1, 10.9], [0, 13.2], [1,12.1], [1, 7.3]])


array([[-1.22474487,  0.09061831],
       [ 0.81649658, -0.0100687 ],
       [-1.22474487,  1.14783198],
       [ 0.81649658,  0.59405339],
       [ 0.81649658, -1.82243498]])

In [487]:
# 
fare = df['Fare'].dropna().values.reshape(-1,1)
f = lambda t : np.log2(t, out=np.zeros_like(fare), where=(fare>0))
fare= f(fare)
fare = df[['Fare']].values
fare



array([[  7.25  ],
       [ 71.2833],
       [  7.925 ],
       [ 53.1   ],
       [  8.05  ],
       [  8.4583],
       [ 51.8625],
       [ 21.075 ],
       [ 11.1333],
       [ 30.0708],
       [ 16.7   ],
       [ 26.55  ],
       [  8.05  ],
       [ 31.275 ],
       [  7.8542],
       [ 16.    ],
       [ 29.125 ],
       [ 13.    ],
       [ 18.    ],
       [  7.225 ],
       [ 26.    ],
       [ 13.    ],
       [  8.0292],
       [ 35.5   ],
       [ 21.075 ],
       [ 31.3875],
       [  7.225 ],
       [263.    ],
       [  7.8792],
       [  7.8958],
       [ 27.7208],
       [146.5208],
       [  7.75  ],
       [ 10.5   ],
       [ 82.1708],
       [ 52.    ],
       [  7.2292],
       [  8.05  ],
       [ 18.    ],
       [ 11.2417],
       [  9.475 ],
       [ 21.    ],
       [  7.8958],
       [ 41.5792],
       [  7.8792],
       [  8.05  ],
       [ 15.5   ],
       [  7.75  ],
       [ 21.6792],
       [ 17.8   ],
       [ 39.6875],
       [  7.8   ],
       [ 76.