# Re-create your own One_Hot_Encoder 

## Load data

In [8]:
import pandas as pd
import seaborn as sns

 # visualizing pipelines in HTML
from sklearn import set_config; set_config(display='diagram')


In [15]:
data = sns.load_dataset('titanic').sample(frac=1)

In [19]:
train_frac = 0.7
n_train = round(len(data)*train_frac)
n_test = len(data) - n_train

data_train = data.iloc[:n_train,:]
data_test = data.iloc[n_train:,:]

X_train = data_train.drop(columns=['survived', 'alive','who','adult_male'])
y_train = data_train['survived']

X_test = data_test.drop(columns=['survived','alive','who','adult_male'])
y_test = data_test['survived']

X_train

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
11,1,female,58.0,0,0,26.5500,S,First,C,Southampton,True
60,3,male,22.0,0,0,7.2292,C,Third,,Cherbourg,True
646,3,male,19.0,0,0,7.8958,S,Third,,Southampton,True
13,3,male,39.0,1,5,31.2750,S,Third,,Southampton,False
567,3,female,29.0,0,4,21.0750,S,Third,,Southampton,False
...,...,...,...,...,...,...,...,...,...,...,...
413,2,male,,0,0,0.0000,S,Second,,Southampton,True
881,3,male,33.0,0,0,7.8958,S,Third,,Southampton,True
766,1,male,,0,0,39.6000,C,First,,Cherbourg,True
781,1,female,17.0,1,0,57.0000,S,First,B,Southampton,False


## A first preprocessing pipeline

👉 Create a basic pipeline one-hot-encoding categorical features

In [16]:
# Differentiate between categorical and numerical
cat = ['pclass', 'sex', 'embarked', 'class', 'deck', 'embark_town', 'alone']
num = ['age', 'fare','sibsp', 'parch']




In [17]:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler

from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer



num_scaler = make_pipeline( 
                            SimpleImputer(strategy = 'mean'), 
                            StandardScaler()
                            )

cat_scaler = make_pipeline(
                            SimpleImputer(strategy = "most_frequent"),
                            OneHotEncoder(handle_unknown = 'ignore')
                            )


    
prep= make_column_transformer(
    (num_scaler, num),
     (cat_scaler, cat)   
        )
prep

In [20]:
prep.fit_transform(X_train)

array([[ 2.12555959, -0.10573931, -0.489315  , ...,  1.        ,
         0.        ,  1.        ],
       [-0.59895243, -0.50004831, -0.489315  , ...,  0.        ,
         0.        ,  1.        ],
       [-0.8259951 , -0.48644398, -0.489315  , ...,  1.        ,
         0.        ,  1.        ],
       ...,
       [ 0.        ,  0.16059191, -0.489315  , ...,  0.        ,
         0.        ,  1.        ],
       [-0.97735688,  0.51570022,  0.42212548, ...,  1.        ,
         1.        ,  0.        ],
       [-0.52327154, -0.3822748 , -0.489315  , ...,  1.        ,
         0.        ,  1.        ]])

## Custom OHEncoder to keep track of column names?

In [22]:
# By default OneHot works with numpy and loses track of column names
ohe = OneHotEncoder(sparse=False)
ohe.fit_transform(X_train[['sex']])

array([[1., 0.],
       [0., 1.],
       [0., 1.],
       ...,
       [0., 1.],
       [1., 0.],
       [0., 1.]])

In [50]:
# Hopefully we can access the one-hot-encoded names as follows
ohe.get_feature_names_out()

array(['sex_female', 'sex_male'], dtype=object)

👉 Try to create your own OneHotEncoder so that it preserves the column names when piping

In [77]:
from sklearn.preprocessing import OneHotEncoder


class MyCustomOHE(OneHotEncoder): #Inherit from OneHotEncoder
    
#     # Define the function
    def transform(self, *args, **kwargs):
        array = super().transform(*args, **kwargs)
        df = pd.DataFrame(array,columns = self.get_feature_names_out())
        return df
    
    def fit_transform(self, *args, **kwargs):
        array = super().fit_transform(*args, **kwargs)
        df = pd.DataFrame(array,columns = self.get_feature_names_out())
        return df
        

In [78]:
c = MyCustomOHE(sparse = False)
c.fit(X_train[['sex']])
c.transform(X_train[['sex']])
c.fit_transform(X_train[['sex']]).head()

Unnamed: 0,sex_female,sex_male
0,1.0,0.0
1,0.0,1.0
2,0.0,1.0
3,0.0,1.0
4,1.0,0.0


### Test it within a Pipeline and a ColumnTransformer

In [79]:
# For the Pipeline, it's working as expected and returns columns' names
pipeline = make_pipeline(MyCustomOHE(sparse=False))

pd.DataFrame(pipeline.fit_transform(X_train[['sex']])).head()

Unnamed: 0,sex_female,sex_male
0,1.0,0.0
1,0.0,1.0
2,0.0,1.0
3,0.0,1.0
4,1.0,0.0


**⚠️ But then again we lose column names when passing that into a ColumnTransformer!**

In [80]:
# Test within a ColumnTransformer
preprocessor = make_column_transformer(
    (MyCustomOHE(sparse=False), ['sex'])
)

pd.DataFrame(preprocessor.fit_transform(X_train)).head()

Unnamed: 0,0,1
0,1.0,0.0
1,0.0,1.0
2,0.0,1.0
3,0.0,1.0
4,1.0,0.0


**🤯🤯🤯 We also have to recode the ColumnTransformer ourself!**  

In [81]:
# from sklearn.compose import ColumnTransformer

# class MyCustomColumnTransformer(ColumnTransformer):
#     def transform(self, *args, **kwargs):
#         return pd.DataFrame(super().transform(*args, **kwargs), columns=self.get_feature_names_out())
#     def fit_transform(self, *args, **kwargs):
#         return pd.DataFrame(super().fit_transform(*args, **kwargs), columns=self.get_feature_names_out())

In [85]:
from sklearn.compose import ColumnTransformer

class MyCustomColumnTransformer(ColumnTransformer):
   # Define the function
    def transform(self, *args, **kwargs):
        array = super().transform(*args, **kwargs)
        df = pd.DataFrame(array,columns = self.get_feature_names_out())
        return df
    
    def fit_transform(self, *args, **kwargs):
        array = super().fit_transform(*args, **kwargs)
        df = pd.DataFrame(array,columns = self.get_feature_names_out())
        return df

In [86]:
num_scaler = make_pipeline( 
                            SimpleImputer(strategy = 'mean'), 
                            StandardScaler()
                            )

cat_scaler = make_pipeline(
                            SimpleImputer(strategy = "most_frequent"),
                            MyCustomOHE(sparse = False,handle_unknown = 'ignore')
                            )


    
prep= MyCustomColumnTransformer([
    ('num', num_scaler, num),
     ('cat', cat_scaler, cat)   
])
prep

In [87]:
preprocessor = MyCustomColumnTransformer([
    ('custom_ohe', MyCustomOHE(sparse=False), ['sex'])
])
preprocessor.fit(X_train)
preprocessor.transform(X_train)
preprocessor.fit_transform(X_train).head()

Unnamed: 0,custom_ohe__sex_female,custom_ohe__sex_male
0,1.0,0.0
1,0.0,1.0
2,0.0,1.0
3,0.0,1.0
4,1.0,0.0


🏁 In conclusion, it's rather difficult to keep column names and dataframes when dealing with pipelines in Sklearn.