# Re-create your own _One Hot Encoder_ 

In [None]:
import pandas as pd
import seaborn as sns

## (1) The Titanic Dataset

In [None]:
# Load the Titanic dataset and shuffle its rows.
# frac=1 keeps 100% of the rows; choose 0.5 to keep only a random 50% of them.
# random_state pins the shuffle so that Restart & Run All always yields the same rows.

data = sns.load_dataset('titanic').sample(frac=1, random_state=42)
data.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
812,0,2,male,35.0,0,0,10.5,S,Second,man,True,,Southampton,no,True
618,1,2,female,4.0,2,1,39.0,S,Second,child,False,F,Southampton,yes,False
36,1,3,male,,0,0,7.2292,C,Third,man,True,,Cherbourg,yes,True
512,1,1,male,36.0,0,0,26.2875,S,First,man,True,E,Southampton,yes,True
378,0,3,male,20.0,0,0,4.0125,C,Third,man,True,,Cherbourg,no,True


In [None]:
from sklearn.model_selection import train_test_split

# Features: every column except the target `survived`, the `alive` column
# (which appears to mirror `survived` as yes/no in the preview above) and `who` / `adult_male`.
X = data.drop(columns=['survived', 'alive', 'who', 'adult_male'])
y = data['survived']

# Hold out 30% of the rows for testing; random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
# Display the training features (pandas elides the middle rows of a long frame)
X_train

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
12,3,male,20.0,0,0,8.0500,S,Third,,Southampton,True
295,1,male,,0,0,27.7208,C,First,,Cherbourg,True
28,3,female,,0,0,7.8792,Q,Third,,Queenstown,True
160,3,male,44.0,0,1,16.1000,S,Third,,Southampton,False
737,1,male,35.0,0,0,512.3292,C,First,B,Cherbourg,True
...,...,...,...,...,...,...,...,...,...,...,...
426,2,female,28.0,1,0,26.0000,S,Second,,Southampton,False
696,3,male,44.0,0,0,8.0500,S,Third,,Southampton,True
883,2,male,28.0,0,0,10.5000,S,Second,,Southampton,True
471,3,male,38.0,0,0,8.6625,S,Third,,Southampton,True


## (2) A first pipeline

❓ Create a basic Pipeline which ***encodes categorical features*** and ***scales numerical features*** ❓

💡 Use [`make_pipeline`](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.make_pipeline.html) and [`make_column_transformer`](https://scikit-learn.org/stable/modules/generated/sklearn.compose.make_column_transformer.html)

In [None]:
# Columns sent through the numerical branch of the preprocessing pipeline
num_features = [
    'age',
    'fare',
    'sibsp',
    'parch',
]

# Columns sent through the categorical branch of the preprocessing pipeline
cat_features = [
    'pclass',
    'sex',
    'embarked',
    'class',
    'embark_town',
    'alone',
]

In [None]:
from sklearn import set_config
set_config(display="diagram")

from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer
from sklearn.pipeline import Pipeline

from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Numerical branch: fill missing values with the column median, then standardise.
num_transformer = make_pipeline(SimpleImputer(strategy='median'), StandardScaler())

# Categorical branch: replace missing values with the literal 'missing', then one-hot encode.
# handle_unknown='ignore' lets transform() accept categories that were not seen during fit.
cat_transformer = make_pipeline(
    SimpleImputer(strategy='constant', fill_value='missing'),
    OneHotEncoder(handle_unknown='ignore'),
)

# Route each list of columns to its own branch.
preprocessor = make_column_transformer(
    (num_transformer, num_features),
    (cat_transformer, cat_features),
)

# Last expression of the cell: displays the (still unfitted) transformer
preprocessor

In [None]:
# Fit the preprocessor on the training set and wrap the transformed output in a DataFrame for display.
# No column names are passed, so the columns are labelled with positional integers.
pd.DataFrame(preprocessor.fit_transform(X_train))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,-0.686488,-0.472524,-0.471332,-0.494199,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
1,-0.069593,-0.104147,-0.471332,-0.494199,1.0,0.0,0.0,0.0,1.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,-0.069593,-0.475723,-0.471332,-0.494199,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
3,1.164199,-0.321771,-0.471332,0.742291,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
4,0.470191,8.971173,-0.471332,-0.494199,1.0,0.0,0.0,0.0,1.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
618,-0.069593,-0.136372,0.405205,-0.494199,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
619,1.164199,-0.472524,-0.471332,-0.494199,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
620,-0.069593,-0.426643,-0.471332,-0.494199,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
621,0.701527,-0.461054,-0.471332,-0.494199,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0


<details>
    <summary>👩🏻‍🏫 <i>Pipeline</i> vs. <i>make_pipeline</i></summary>

* When you create a Pipeline with `Pipeline()`, you have to:
    - specify all the ***sequential steps of the pipeline*** in a list
    - each step is a tuple with:
        - "name_of_the_step"
        - the Scikit-Learn transformer/estimator object used for that step
    
```python
Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
```
  
* When you create a Pipeline with `make_pipeline()`,
    - you don't have to give a name to each step
    - you can simply chain all the steps together by passing the Scikit-Learn transformers/estimators directly
    - the names of the steps are automatically generated by `make_pipeline`
    
```python
make_pipeline(
    SimpleImputer(strategy='median'),
    StandardScaler()
)
```
    
</details>

<details>
    <summary>👩🏻‍🏫 <i>ColumnTransformer</i> vs. <i>make_column_transformer</i></summary>

* When you create a ColumnTransformer with `ColumnTransformer()`, you have to:
    - specify all the ***parallel steps of the columns' transformer*** in a list
    - each step is a tuple with:
        - "name_of_the_transformer"
        - the transformer
        - the columns which will be impacted by the transformer
    
```python
ColumnTransformer([
    ('num_transformer', num_transformer, num_features),
    ('cat_transformer', cat_transformer, cat_features)
])
```
  
* When you create a ColumnTransformer with `make_column_transformer()`,
    - you don't have to give a name to each parallel step
    - each step is a tuple with:
        - the transformer
        - the columns which will be impacted by the transformer
    
```python
make_column_transformer(
    (num_transformer, num_features),
    (cat_transformer, cat_features)
)
```
    
</details>

❓ Chain this preprocessing pipeline with a classifier and optimize it ❓

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from scipy import stats

# Full model: the preprocessing step followed by an L1-penalised logistic regression.
final_pipe = make_pipeline(
    preprocessor,
    LogisticRegression(penalty='l1', solver='liblinear')
)

# If you are not sure about which params of your pipeline to optimize...
# ... remember that you can list them with final_pipe.get_params()

# Randomized search over the logistic regression's C, sampled log-uniformly between 0.01 and 100.
# random_state pins the sampled candidates so that the search is reproducible.
search = RandomizedSearchCV(final_pipe,
                            param_distributions={
                                'logisticregression__C': stats.loguniform(0.01, 100),
                            },
                            cv=3,
                            scoring="accuracy",
                            n_iter=20,
                            n_jobs=-1,
                            random_state=42)

# Trailing semicolon suppresses the repr of the fitted search object
search.fit(X_train, y_train);

❓ What are the best params and the best score ❓

In [None]:
# Parameter setting that achieved the best cross-validated score during the search
search.best_params_

{'logisticregression__C': 0.5383318781153917}

In [None]:
# Cross-validated score of the best candidate (the search was configured with scoring="accuracy")
search.best_score_

0.7896615260745695

## (3) How could we design a Custom Encoder to keep track of the columns' names?

In [None]:
# By default, OneHotEncoder works with Numpy and loses track of columns' names...
# sparse=False requests a dense array rather than a sparse matrix.
# NOTE(review): newer scikit-learn releases rename `sparse` to `sparse_output` — confirm against the installed version.
ohe = OneHotEncoder(sparse=False)
ohe.fit_transform(X_train[['sex']])

array([[0., 1.],
       [0., 1.],
       [1., 0.],
       ...,
       [0., 1.],
       [0., 1.],
       [0., 1.]])

In [None]:
# ... however, the fitted encoder still exposes the one-hot-encoded column names:
ohe.get_feature_names_out()

array(['sex_female', 'sex_male'], dtype=object)

❓ Try to create your own OneHotEncoder so that it preserves the columns names ❓

### 👩🏻‍🏫 Solution 👩🏻‍🏫

#### Wise option: upgrade the initial OneHotEncoder

In [None]:
from sklearn.preprocessing import OneHotEncoder


class CustomOHE1(OneHotEncoder):
    """OneHotEncoder whose ``transform`` returns a DataFrame with named columns.

    Only ``transform`` is overridden; every constructor argument and the
    fitting logic are inherited unchanged from ``OneHotEncoder``.
    Intended for dense output (this notebook instantiates it with ``sparse=False``).
    """

    def transform(self, *args, **kwargs):
        """Encode the input and label the resulting columns.

        All arguments are forwarded untouched to ``OneHotEncoder.transform``.

        Returns
        -------
        pandas.DataFrame
            The encoded values, with columns named after
            ``self.get_feature_names_out()``. No index is passed to the
            DataFrame constructor.
        """
        encoded = super().transform(*args, **kwargs)
        feature_names = self.get_feature_names_out()
        return pd.DataFrame(encoded, columns=feature_names)

In [None]:
# Instantiate the custom encoder with dense output
c = CustomOHE1(sparse=False)
# Call fit and transform separately (their return values are not displayed) ...
c.fit(X_train[['sex']])
c.transform(X_train[['sex']])
# ... then in a single call; only this last expression is displayed
c.fit_transform(X_train[['sex']]).head()

Unnamed: 0,sex_female,sex_male
0,0.0,1.0
1,0.0,1.0
2,1.0,0.0
3,0.0,1.0
4,0.0,1.0


#### Harder option: recode the OneHotEncoder using _get_dummies_ from _pandas_

In [None]:
from sklearn.base import TransformerMixin, BaseEstimator


class CustomOHE2(TransformerMixin, BaseEstimator):
    """One-hot encoder built on ``pandas.get_dummies`` that keeps column names.

    ``fit`` records the dummy columns produced by the training data;
    ``transform`` re-creates the dummies and aligns them on those columns.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Record the dummy columns produced by ``X``.

        Parameters
        ----------
        X : data accepted by ``pandas.get_dummies`` (a DataFrame in this notebook)
        y : ignored
            Accepted so that the encoder can be fitted by callers that also
            pass the target, e.g. ``fit_transform(X, y)`` or a supervised pipeline.

        Returns
        -------
        self
        """
        X_dummified = pd.get_dummies(X)
        self.columns = X_dummified.columns
        return self

    def transform(self, X):
        """One-hot encode ``X`` using only the columns recorded by ``fit``.

        Returns
        -------
        pandas.DataFrame
            Dummy columns in the order recorded by ``fit``.
        """
        X_dummified = pd.get_dummies(X)
        # Only keep columns that were computed in the fit() method:
        # - dummy columns for categories never seen during fit are dropped
        # - recorded columns that are absent from X are added and filled with 0
        X_dummified_reindexed = X_dummified.reindex(columns=self.columns, fill_value=0)
        return X_dummified_reindexed

In [None]:
# fit followed by transform (result not displayed) ...
CustomOHE2().fit(X_train[['sex']]).transform(X_train[['sex']])
# ... and fit_transform on a fresh instance; only this last expression is displayed
CustomOHE2().fit_transform(X_train[['sex']]).head()

Unnamed: 0,sex_female,sex_male
12,0,1
295,0,1
28,1,0
160,0,1
737,0,1


#### Test it within a Pipeline and a ColumnTransformer

In [None]:
# For the Pipeline, it's working as expected and returns columns' names
# (a one-step pipeline wrapping the custom encoder)
pipeline = make_pipeline(CustomOHE1(sparse=False))

pd.DataFrame(pipeline.fit_transform(X_train[['sex']])).head()

Unnamed: 0,sex_female,sex_male
0,0.0,1.0
1,0.0,1.0
2,1.0,0.0
3,0.0,1.0
4,0.0,1.0


**⚠️ But then again, we lose columns' names when passing `CustomOHE1` into a ColumnTransformer!** 👇

In [None]:
# NOTE: this rebinds the name `preprocessor` that was used for the first pipeline above.
# Only the 'sex' column is routed to the custom encoder.
preprocessor = make_column_transformer(
    (CustomOHE1(sparse=False), ['sex'])
)

pd.DataFrame(preprocessor.fit_transform(X_train)).head()

Unnamed: 0,0,1
0,0.0,1.0
1,0.0,1.0
2,1.0,0.0
3,0.0,1.0
4,0.0,1.0


**🤯🤯🤯 We also have to recode the ColumnTransformer ourselves!**  
Good news: with this custom ColumnTransformer, even the default `OneHotEncoder` passes its column names through!

In [None]:
from sklearn.compose import ColumnTransformer

class CustomColumnTransformer(ColumnTransformer):
    """ColumnTransformer whose outputs are DataFrames with named columns.

    Both ``transform`` and ``fit_transform`` delegate to ``ColumnTransformer``
    and wrap the result in a DataFrame whose columns come from
    ``self.get_feature_names_out()``.
    """

    def transform(self, *args, **kwargs):
        """Transform the input and return the result as a labelled DataFrame.

        All arguments are forwarded untouched to ``ColumnTransformer.transform``.
        """
        transformed = super().transform(*args, **kwargs)
        feature_names = self.get_feature_names_out()
        return pd.DataFrame(transformed, columns=feature_names)

    def fit_transform(self, *args, **kwargs):
        """Fit, transform, and return the result as a labelled DataFrame.

        All arguments are forwarded untouched to ``ColumnTransformer.fit_transform``.
        """
        transformed = super().fit_transform(*args, **kwargs)
        feature_names = self.get_feature_names_out()
        return pd.DataFrame(transformed, columns=feature_names)

In [None]:
# Encode the same 'sex' column twice — once with the stock encoder, once with the custom one —
# so that both sets of output columns can be compared side by side.
# NOTE: this rebinds the name `preprocessor` once more.
preprocessor = CustomColumnTransformer([
    ('ohe', OneHotEncoder(sparse=False), ['sex']),
    ('custom_ohe', CustomOHE1(sparse=False), ['sex'])
])
# fit and transform called separately (results not displayed) ...
preprocessor.fit(X_train)
preprocessor.transform(X_train)
# ... then fit_transform; only this last expression is displayed
preprocessor.fit_transform(X_train).head()

Unnamed: 0,ohe__sex_female,ohe__sex_male,custom_ohe__sex_female,custom_ohe__sex_male
0,0.0,1.0,0.0,1.0
1,0.0,1.0,0.0,1.0
2,1.0,0.0,1.0,0.0
3,0.0,1.0,0.0,1.0
4,0.0,1.0,0.0,1.0


🏁 If you want to build a very advanced pipeline, feel free to explore the Optional Challenge dealing with the `cars dataset`!

💾 Don't forget to git add/commit/push your notebook.

👏 Congratulations, you are now a master at Pipeline and ColumnTransformer.