In [68]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import label_binarize
from matplotlib import pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import MultiLabelBinarizer
import numpy.testing as test
import warnings 

warnings.filterwarnings("ignore", category=FutureWarning) 

%matplotlib inline


class DataFrameSelector(BaseEstimator, TransformerMixin) :    
    def __init__(self, columns):
        self.columns=columns
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.columns].values

In [67]:
class MultiFeatureEncoder (MultiLabelBinarizer):
    def fit(self, X, y=None):
        return super().fit(X)
    def transform(self, X):
        return X[self.columns].values
    def fit_transform(self, X, y=None):
        return super().fit_transform(X.astype(str))

In [66]:

class NameEncoder(BaseEstimator, TransformerMixin) :    
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        flop=['Master', 'Miss', 'Mrs.', 'Mr.']
        f_b = np.zeros(X.size, dtype=bool) # initialize as false
        for f in flop:
            f1 = (np.char.find(X[:,0].astype(str), f)!=-1) # which entries contain the f key
            X[f1]=flop.index(f) # replace with the actual index in the array string            
            f_b = f_b | f1 # retain the matched ones
        X[~f_b]=4 # undefined
        return X.astype(np.int)
    @staticmethod
    def test():
        nameEncoder = NameEncoder()
        res = nameEncoder.fit_transform(np.array([['Master Jimmy'], ['Mrs. Smith'], ['Mr. Smith']]))
        test.assert_array_equal(res, [[0],[2],[3]])


In [65]:
# age imputer expects an 2D array having on the first column
# the encoded name 0,1,2,3,4 and on the second column the age
class AgeImputer(BaseEstimator, TransformerMixin) :        
    # fit will calculate the averages per encoded name
    def fit(self, X, y=None):
        self.titleAgeDictionary = {}
        for key in np.arange(0, 5):
            try: # just to make sure for unit tests add a try/catch
                self.titleAgeDictionary[key] = np.nanmedian(X[X[:,0] == key][1])
            except:
                pass
        return self

    def transform(self, X, y=None):        
        for key in self.titleAgeDictionary.keys():
            X[(X[:,0] == key) & (np.isnan(X[:,1])),1]=self.titleAgeDictionary[key]
        return X

    def test(self):
        X=np.array([[1,12],
          [2,12],
          [1, np.NaN]])
        res = self.fit_transform(X)


In [92]:
from sklearn.preprocessing import FunctionTransformer, StandardScaler, Normalizer, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline, make_union
from sklearn.compose import ColumnTransformer, make_column_transformer

# read the dataframe
df = pd.read_csv('train.csv')

def hasCabin(x):
    return np.apply_along_axis(lambda t : 0 if ( t != t) else 1 ,arr=x ,axis=1)

pipeline = make_column_transformer(
    (['Name', 'Age'], make_pipeline(
                        make_union(
                            make_pipeline(DataFrameSelector(['Name']), NameEncoder()),
                            make_pipeline(DataFrameSelector(['Age']), StandardScaler()) ), # union
                       AgeImputer() ) # pipeline age
    ), #tuple
    (['Sex', 'Pclass'], OneHotEncoder(sparse=False)),
    (['Embarked'], make_pipeline( SimpleImputer( strategy='constant', fill_value='U' ), OneHotEncoder(sparse=False))), 
    (['Fare'], make_pipeline( SimpleImputer(strategy='mean'), FunctionTransformer( lambda x : np.log2(x, where=x>0)), StandardScaler() )),
    (['PassengerId', 'Survived'], 'drop'),
    (['SibSp', 'Parch'], 'passthrough')
    )

x = pipeline.fit_transform(df)
print(x.shape)
x
 

(891, 14)


array([[ 3.        , -0.53037664,  0.        , ..., -0.91071735,
         1.        ,  0.        ],
       [ 2.        ,  0.57183099,  1.        , ...,  1.36961644,
         1.        ,  0.        ],
       [ 1.        , -0.25482473,  1.        , ..., -0.82190397,
         0.        ,  0.        ],
       ...,
       [ 1.        , -0.38518011,  1.        , ...,  0.26041593,
         1.        ,  2.        ],
       [ 3.        , -0.25482473,  0.        , ...,  0.50616874,
         0.        ,  0.        ],
       [ 3.        ,  0.15850313,  0.        , ..., -0.8441814 ,
         0.        ,  0.        ]])