# Re-create your own One_Hot_Encoder 

## Load data

In [10]:
import pandas as pd
import seaborn as sns
# visualizing pipelines in HTML
from sklearn import set_config; set_config(display='diagram')

In [3]:
data = sns.load_dataset('titanic').sample(frac=1)

In [4]:
train_frac = 0.7
n_train = round(len(data)*train_frac)
n_test = len(data) - n_train

data_train = data.iloc[:n_train,:]
data_test = data.iloc[n_train:,:]

X_train = data_train.drop(columns=['survived', 'alive','who','adult_male'])
y_train = data_train['survived']

X_test = data_test.drop(columns=['survived','alive','who','adult_male'])
y_test = data_test['survived']

X_train

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
118,1,male,24.0,0,1,247.5208,C,First,B,Cherbourg,False
682,3,male,20.0,0,0,9.2250,S,Third,,Southampton,True
828,3,male,,0,0,7.7500,Q,Third,,Queenstown,True
584,3,male,,0,0,8.7125,C,Third,,Cherbourg,True
548,3,male,33.0,1,1,20.5250,S,Third,,Southampton,False
...,...,...,...,...,...,...,...,...,...,...,...
657,3,female,32.0,1,1,15.5000,Q,Third,,Queenstown,False
706,2,female,45.0,0,0,13.5000,S,Second,,Southampton,True
741,1,male,36.0,1,0,78.8500,S,First,C,Southampton,False
372,3,male,19.0,0,0,8.0500,S,Third,,Southampton,True


In [5]:
y_train

118    0
682    0
828    1
584    0
548    0
      ..
657    0
706    1
741    0
372    0
272    1
Name: survived, Length: 624, dtype: int64

## A first pipe

In [8]:
# Differentiate between categorical and numerical
num = ['age', 'sibsp', 'parch', 'fare']
cat = ['pclass', 'sex', 'embarked','deck', 'embark_town', 'alone']

In [6]:
X_train

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
118,1,male,24.0,0,1,247.5208,C,First,B,Cherbourg,False
682,3,male,20.0,0,0,9.2250,S,Third,,Southampton,True
828,3,male,,0,0,7.7500,Q,Third,,Queenstown,True
584,3,male,,0,0,8.7125,C,Third,,Cherbourg,True
548,3,male,33.0,1,1,20.5250,S,Third,,Southampton,False
...,...,...,...,...,...,...,...,...,...,...,...
657,3,female,32.0,1,1,15.5000,Q,Third,,Queenstown,False
706,2,female,45.0,0,0,13.5000,S,Second,,Southampton,True
741,1,male,36.0,1,0,78.8500,S,First,C,Southampton,False
372,3,male,19.0,0,0,8.0500,S,Third,,Southampton,True


👉 Create a pre-processing pipeline

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder



num_transformer = Pipeline([
                    ('imputer', SimpleImputer()),
                    ('scaler', MinMaxScaler())
                    ])

cat_transformer = Pipeline([
                ('imputer', SimpleImputer(strategy = 'most_frequent')),
                ('OHE', OneHotEncoder(handle_unknown = "ignore", sparse = False))
                 ])


prep = make_column_transformer(
            (num_transformer, num),
            (cat_transformer, cat)
)


prep

## Custom OHEncoder to keep track of column names?

In [12]:
# By default OneHot works with numpy and loses track of column names
ohe = OneHotEncoder(sparse=False)
ohe.fit_transform(X_train[['sex']])

array([[0., 1.],
       [0., 1.],
       [0., 1.],
       ...,
       [0., 1.],
       [0., 1.],
       [1., 0.]])

In [14]:
# Hopefully we can access the one-hot-encoded names as follows
ohe.get_feature_names_out()

array(['sex_female', 'sex_male'], dtype=object)

In [25]:
pd.DataFrame(ohe.fit_transform(X_train[['sex']]), columns = ohe.get_feature_names_out())

Unnamed: 0,sex_female,sex_male
0,0.0,1.0
1,0.0,1.0
2,0.0,1.0
3,0.0,1.0
4,0.0,1.0
...,...,...
619,1.0,0.0
620,1.0,0.0
621,0.0,1.0
622,0.0,1.0


👉 Try to create your own OneHotEncoder so that it preserves the column names when piping

In [28]:
# Custom OHE
from sklearn.preprocessing import OneHotEncoder

class MyCustomOHE(OneHotEncoder): #Creating a copy of OHE to be modified
    
    #transform
    def transform(self,*args, **kwargs):
        array = super().transform(*args, **kwargs)
        df = pd.DataFrame(array, columns = self.get_feature_names_out())
        return df
    
    #fit_transform
    def fit_transform(self,*args, **kwargs):
        array = super().fit_transform(*args, **kwargs)
        df = pd.DataFrame(array, columns = self.get_feature_names_out())
        return df



### Test it within a Pipeline and a ColumnTransformer

In [27]:
# Test within a Pipeline
ohe.fit_transform(X_train[['sex']])


array([[0., 1.],
       [0., 1.],
       [0., 1.],
       ...,
       [0., 1.],
       [0., 1.],
       [1., 0.]])

In [30]:
cohe = MyCustomOHE(sparse=False)
cohe.fit_transform(X_train[['sex']])

Unnamed: 0,sex_female,sex_male
0,0.0,1.0
1,0.0,1.0
2,0.0,1.0
3,0.0,1.0
4,0.0,1.0
...,...,...
619,1.0,0.0
620,1.0,0.0
621,0.0,1.0
622,0.0,1.0


**⚠️ But then again we lose column names when passing that into a ColumnTransformer!**

In [39]:
# Test within a Pipeline
import pandas as pd
from sklearn.pipeline import make_pipeline
pipeline = make_column_transformer(
    (MyCustomOHE(sparse=False), ['sex']))
    
pd.DataFrame(pipeline.fit_transform(X_train)).head()





Unnamed: 0,0,1
0,0.0,1.0
1,0.0,1.0
2,0.0,1.0
3,0.0,1.0
4,0.0,1.0


**🤯🤯🤯 We also have to recode the ColumnTransformer ourself!**  

In [42]:
# Create a custom ColumnTransformer class to keep track of column names
from sklearn.compose import ColumnTransformer

class MyCustomColumnTransformer(ColumnTransformer):
    
    def transform(self, *args, **kwargs):
        array = super().transform(*args, **kwargs)
        df = pd.DataFrame(array,columns = self.get_feature_names_out())
        return df
            
    def fit_transform(self, *args, **kwargs):
        array = super().fit_transform(*args, **kwargs)
        df = pd.DataFrame(array,columns = self.get_feature_names_out())
        return df

In [43]:
preprocessor = MyCustomColumnTransformer([
    ('custom_ohe', MyCustomOHE(sparse=False), ['sex'])
])

In [44]:
preprocessor.fit_transform(X_train).head()

Unnamed: 0,custom_ohe__sex_female,custom_ohe__sex_male
0,0.0,1.0
1,0.0,1.0
2,0.0,1.0
3,0.0,1.0
4,0.0,1.0


🏁 In conclusion, it's rather difficult to keep column names and dataframes when dealing with pipelines in Sklearn.