# transformers 

In [38]:
from sklearn.pipeline import Pipeline, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator
from sklearn.impute import SimpleImputer
import pandas as pd

### SimpleImputer (NB: it returns a NumPy array, not a DataFrame)

In [128]:
# Two small frames with missing values (None becomes NaN).
# They hold the same column names in a different order: df is
# (small, big) while df2 is (big, small).
df = pd.DataFrame(
    {
        "small": [1, None, 3],
        "big": [20, 30, None],
    }
)
df2 = pd.DataFrame(
    {
        "big": [7, None, 8],
        "small": [1, 2, None],
    }
)

In [129]:
# "mean" is SimpleImputer's default strategy; spelled out here so it is clear
# that missing values are replaced by the per-column mean.
imputer = SimpleImputer(strategy="mean")

In [130]:
# Learn one fill value per column from df and apply it in the same call.
# In the output below the NaN in 'small' becomes 2.0 and the NaN in 'big'
# becomes 25.0 (the column means); the result is a plain array.
imputer.fit_transform(df)

array([[ 1., 20.],
       [ 2., 30.],
       [ 3., 25.]])

In [132]:
imputer.transform(df2)
# Column order matters: the fitted fill values are applied by position, not by
# column name. df2 is ordered (big, small), so its missing 'big' value gets
# 2.0 (the mean learned for 'small') and its missing 'small' value gets 25.0
# (the mean learned for 'big').

array([[ 7.,  1.],
       [ 2.,  2.],
       [ 8., 25.]])

### custom transformer

In [133]:
class FeatureScale(BaseEstimator, TransformerMixin):
    """Stateless transformer that multiplies every value of X by a constant.

    Parameters
    ----------
    feature_scale : int or float, default=2
        Factor applied to the input in ``transform``.
    """

    def __init__(self, feature_scale=2):
        # Store the argument itself under the same name. It used to be
        # hard-coded to 2, so any other value was silently ignored and
        # get_params()/clone() reported the wrong setting.
        self.feature_scale = feature_scale
        # Deliberate side effect for the demo: shows when an instance is built.
        print(f'feature_scale set to {self.feature_scale}')

    def fit(self, X, y=None):
        """Learn nothing (the scaling factor is fixed) and return ``self``."""
        return self

    def transform(self, X, y=None):
        """Return ``X * feature_scale``.

        For pandas / NumPy inputs the multiplication allocates a new object,
        so the caller's data is left unmodified without an explicit copy.
        Missing values stay missing.
        """
        return X * self.feature_scale

In [134]:
# Built with the default feature_scale; the constructor prints the value it
# stored (see the output below).
feature_scaler = FeatureScale()

feature_scale set to 2


In [135]:
# Rebind df to a single-column frame that still contains one missing value.
df = pd.DataFrame(
    {
        "small": [1, None, 3],
    }
)

In [136]:
# Scaling alone does not impute: the missing value is still missing in the
# output below, while 1 and 3 are doubled. The input DataFrame comes back as a
# DataFrame.
feature_scaler.fit_transform(df)

Unnamed: 0,small
0,2.0
1,
2,6.0


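### In this example we see that the transformers inside a ColumnTransformer do not act sequentially: each one produces its own output column(s) from the original input, and those outputs are placed side by side in the order the transformers are listed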

In [137]:
# Both entries select column 0 by position, so each transformer is handed the
# same original column independently of the other.
column_transformer = ColumnTransformer(
    transformers=[
        ("imputer", SimpleImputer(), [0]),
        ("feature_scaler", FeatureScale(2), [0]),
    ]
)

feature_scale set to 2


In [138]:
# Output column 0 is the imputed column, output column 1 is the scaled column.
# The scaled column still contains NaN because FeatureScale received the
# original data, not the imputer's result.
# NOTE(review): the constructor message is printed again during this call,
# presumably because ColumnTransformer clones its transformers when fitting;
# confirm against the scikit-learn docs.
column_transformer.fit_transform(df)

feature_scale set to 2


array([[ 1.,  2.],
       [ 2., nan],
       [ 3.,  6.]])

### use a Pipeline to apply the steps sequentially

In [139]:
# The old `from sklearn.preprocessing import Imputer` line is intentionally
# gone: `Imputer` was removed in scikit-learn 0.22 (SimpleImputer replaces it),
# so the import fails on current releases, and this cell never used the name.
#
# A Pipeline feeds the output of each step into the next one, so the scaler
# receives the already-imputed data.
pipeline = Pipeline([
    ('imputer', SimpleImputer()),
    ('feature_scaler', FeatureScale(2)),
])

feature_scale set to 2


In [140]:
# The steps run in order: the missing value is first filled with the column
# mean (2.0), then every value is doubled, so the output below has no NaN.
pipeline.fit_transform(df)

array([[2.],
       [4.],
       [6.]])

### OneHotEncoder

In [169]:
# Toy data with one string-valued and one boolean categorical column.
animals = pd.DataFrame(
    {
        "animal": ["dog", "dog", "cat", "pig"],
        "trained": [True, True, False, False],
    }
)

In [170]:
# Display the raw frame before encoding.
animals

Unnamed: 0,animal,trained
0,dog,True
1,dog,True
2,cat,False
3,pig,False


In [171]:
from sklearn.preprocessing import OneHotEncoder

In [175]:
# drop='first' omits the first category of every input column, so a column
# with k categories yields k - 1 indicator columns.
one_hot_encoder = OneHotEncoder(drop='first')
transformed_animals = one_hot_encoder.fit_transform(animals)

# `get_feature_names` was removed in scikit-learn 1.2. Prefer its replacement
# `get_feature_names_out` (available since 1.0) and fall back to the old method
# on earlier releases. The newer method derives the prefixes from the DataFrame
# column names (e.g. `animal_dog`) instead of positional ones (`x0_dog`).
if hasattr(one_hot_encoder, 'get_feature_names_out'):
    feature_names = one_hot_encoder.get_feature_names_out()
else:
    feature_names = one_hot_encoder.get_feature_names()

# The encoder output is converted with .toarray() so it can be shown as a
# labelled DataFrame.
pd.DataFrame(
    data=transformed_animals.toarray(),
    columns=feature_names)

Unnamed: 0,x0_dog,x0_pig,x1_True
0,1.0,0.0,1.0
1,1.0,0.0,1.0
2,0.0,0.0,0.0
3,0.0,1.0,0.0


In [164]:
# Categories learned for each input column, in input-column order. Comparing
# with the encoded columns shown earlier, the first entry of each array
# ('cat' and False) is the one left out by drop='first'.
one_hot_encoder.categories_

[array(['cat', 'dog', 'pig'], dtype=object), array([False,  True])]