In [16]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import label_binarize
from matplotlib import pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import MultiLabelBinarizer
import numpy.testing as test
import warnings 

warnings.filterwarnings("ignore", category=FutureWarning) 

%matplotlib inline


class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that pulls a fixed set of columns out of its input.

    Parameters
    ----------
    columns : label or list of labels
        Used as-is to index the input (``X[columns]``) in ``transform``.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing is learned; the selector itself is returned."""
        return self

    def transform(self, X):
        """Return the ``.values`` of ``X[self.columns]``."""
        subset = X[self.columns]
        return subset.values

In [4]:
class MultiFeatureEncoder(MultiLabelBinarizer):
    """MultiLabelBinarizer that can be used as a feature transformer.

    Every entry point accepts the ``(X, y=None)`` signature used by pipeline
    steps and casts ``X`` to ``str`` before delegating to the parent class, so
    the classes learned by ``fit`` / ``fit_transform`` and the values seen by
    ``transform`` always share the same representation.
    NOTE(review): the cast presumably exists so that rows mixing strings and
    numbers can be handled together -- confirm.

    ``X`` must provide ``astype`` (e.g. an ndarray or a DataFrame).
    """

    def fit(self, X, y=None):
        """Learn the label set from ``X`` (cast to str); ``y`` is ignored."""
        return super().fit(X.astype(str))

    def transform(self, X):
        """Binarize ``X`` (cast to str) using the labels learned during fit."""
        return super().transform(X.astype(str))

    def fit_transform(self, X, y=None):
        """Learn the label set from ``X`` and binarize it in one step."""
        return super().fit_transform(X.astype(str))

In [174]:

class NameEncoder(BaseEstimator, TransformerMixin):
    """Encode a column of names as an integer code of the title they contain.

    The code is the position in ``TITLES`` of the first title found in the
    name (0 'Master', 1 'Miss', 2 'Mrs.', 3 'Mr.'); names containing none of
    them are encoded as ``UNKNOWN`` (4).
    """

    # Order matters: the position of a title is its code, and titles are
    # checked in this order (first match wins).
    TITLES = ('Master', 'Miss', 'Mrs.', 'Mr.')
    UNKNOWN = len(TITLES)

    def fit(self, X, y=None):
        """Nothing is learned; the encoder itself is returned."""
        return self

    def transform(self, X):
        """Return an ``(n_samples, 1)`` integer array of title codes.

        Parameters
        ----------
        X : DataFrame, Series, ndarray or sequence
            The names. For 2-D input only the first column is read.
            The input is never modified.
        """
        names = np.asarray(X)
        if names.ndim > 1:
            names = names[:, 0]
        names = names.astype(str)

        # Builtin ``int`` is used for the dtype; the ``np.int`` alias does not
        # exist in current NumPy releases.
        codes = np.full(names.shape[0], self.UNKNOWN, dtype=int)
        pending = np.ones(names.shape[0], dtype=bool)  # rows without a code yet
        for code, title in enumerate(self.TITLES):
            hit = pending & (np.char.find(names, title) != -1)
            codes[hit] = code
            pending &= ~hit
        return codes.reshape(-1, 1)

    @staticmethod
    def test():
        """Self-check on a tiny hand-written sample."""
        nameEncoder = NameEncoder()
        res = nameEncoder.fit_transform(np.array([['Master Jimmy'], ['Mrs. Smith'], ['Mr. Smith']]))
        np.testing.assert_array_equal(res, [[0], [2], [3]])


In [8]:
# AgeImputer expects a 2D array whose first column holds
# the encoded name (0, 1, 2, 3 or 4) and whose second column holds the age.
class AgeImputer(BaseEstimator, TransformerMixin):
    """Fill missing ages with the median age of rows sharing the same title code.

    Input is a 2-D numeric array: column 0 is the encoded name/title, column 1
    is the age (NaN where missing). Any further columns are left untouched.
    """

    def fit(self, X, y=None):
        """Compute the median known age for every title code present in ``X``.

        The result is stored in ``self.titleAgeDictionary`` (code -> median).
        Codes whose ages are all missing get no entry, so ``transform`` leaves
        their rows as they are.
        """
        X = np.asarray(X)
        codes = X[:, 0]
        self.titleAgeDictionary = {}
        for key in np.unique(codes):
            # Column 1 of the rows carrying this code (not row 1 of the subset).
            ages = X[codes == key, 1]
            known = ages[~np.isnan(ages)]
            if known.size:
                self.titleAgeDictionary[key] = np.median(known)
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with NaN ages replaced by the fitted medians."""
        X = np.array(X, copy=True)  # never modify the caller's array
        missing = np.isnan(X[:, 1])
        for key, median_age in self.titleAgeDictionary.items():
            X[missing & (X[:, 0] == key), 1] = median_age
        return X

    def test(self):
        """Self-check: the missing age of code 1 becomes the code-1 median."""
        X = np.array([[1, 12],
                      [2, 12],
                      [1, np.nan]])
        res = self.fit_transform(X)
        np.testing.assert_array_equal(res, [[1, 12], [2, 12], [1, 12]])
        assert np.isnan(X[2, 1])  # the input array was not modified


In [162]:
class IdentityTransformer(BaseEstimator, TransformerMixin):
    """Pass-through pipeline step.

    pandas objects are returned as their underlying ``.values`` array; any
    other input is returned unchanged.
    """

    def fit(self, X, y=None):
        """Nothing is learned; the transformer itself is returned."""
        return self

    def transform(self, X, y=None):
        """Return the data of ``X`` without altering it."""
        # ``iloc`` is used to recognise pandas containers.
        if hasattr(X, 'iloc'):
            return X.values
        return X

In [191]:
from sklearn.preprocessing import FunctionTransformer, StandardScaler, Normalizer, OneHotEncoder
from sklearn.pipeline import make_pipeline, make_union
from sklearn.compose import ColumnTransformer, make_column_transformer
import inspect

# Load 'train.csv' (path relative to the working directory) into a DataFrame.
# NOTE(review): presumably the Titanic training set, judging by the
# Name/Age/Sex/Embarked/Pclass columns used further down -- confirm.
df = pd.read_csv('train.csv')

def hasCabin(x):
    """Flag, row by row, whether a 2-D single-column array holds a value.

    Returns a 1-D array with 0 for rows whose entry is NaN and 1 otherwise.
    """
    def _row_flag(row):
        # NaN is the only value that compares unequal to itself.
        if row != row:
            return 0
        return 1

    return np.apply_along_axis(_row_flag, 1, x)


# --- Scratch experiments, all disabled ---
# Earlier attempts at applying NameEncoder / OneHotEncoder to single columns
# through make_column_transformer, make_pipeline and ColumnTransformer.
#t = make_column_transformer((['Name'], NameEncoder()))
# t = make_column_transformer((['Sex'], OneHotEncoder(sparse=False)))
# t = make_pipeline (DataFrameSelector(['Name']),  NameEncoder())

#x = t.fit_transform(df)
#x

# cd = ColumnTransformer([('test', NameEncoder(), ['Name'])] );

# x = cd.fit_transform(df)
# Probe: does a plain NumPy array expose the pandas `.iloc` accessor?
# The result of the hasattr call is the value displayed by this cell.
x = np.arange(10)
hasattr(x, 'iloc')

False

In [168]:

# Name/age pipeline:
#   1. encode the name as a title code,
#   2. put the age and the one-hot encoded Sex/Embarked/Pclass next to it,
#   3. find the median age per name title and apply it to missing age entries.

# column 0: title code derived from the Name column
titleCodeStep = make_pipeline(DataFrameSelector(['Name']), NameEncoder())
# column 1: the raw Age column
ageStep = DataFrameSelector(['Age'])
# remaining columns: one-hot encoded Sex, Embarked and Pclass
# NOTE(review): the (columns, transformer) tuple order and the `sparse`
# keyword follow an older scikit-learn API -- confirm against the installed version.
categoryStep = make_column_transformer((['Sex','Embarked','Pclass'], OneHotEncoder(sparse=False)))

# join the three feature groups side by side
joinedFeatures = make_union(titleCodeStep, ageStep, categoryStep)

# fill in the age where missing
nameAgePipeLine = make_pipeline(joinedFeatures, AgeImputer())

# Disabled alternative: encode Sex, Embarked and Pclass with MultiFeatureEncoder
# catEncoderPipeline = make_pipeline( DataFrameSelector(["Sex","Embarked","Pclass"]), MultiFeatureEncoder() )
# encode cabin 1,2,3

# Scratch: keep the ColumnTransformer source at hand for inspection
src = inspect.getsource(ColumnTransformer)
# print(src)

# Selecting with a list of labels keeps the result two-dimensional;
# the cell displays the resulting type.
d1 = df.loc[:, ['Name']]
type(d1)

pandas.core.frame.DataFrame