In [None]:
from typing import NamedTuple, List, Dict

import pandas as pd
import numpy as np

In [None]:
class _PreprocessorFields(NamedTuple):
    """Field layout of ``Preprocessor``.

    Kept as a separate base because a ``typing.NamedTuple`` class body may not
    define ``__new__``, and giving every instance its own containers needs one.
    """
    to_scale: List[str]
    to_onehot: List[str]
    scalers: Dict
    encoders: Dict


class Preprocessor(_PreprocessorFields):
    """Standard-scale and one-hot encode DataFrame columns in place.

    Fields (positional order and keyword names are part of the interface):
        to_scale: labels of the columns to standardize as ``(x - mean) / std``.
        to_onehot: labels of the columns to expand into one 0/1 column per
            category; the source column is removed afterwards.
        scalers: ``label -> (mean, std)`` pairs, filled in when fitting.
        encoders: ``label -> np.unique(column values)``, filled in when fitting.

    Any field that is omitted (or passed as ``None``) gets a fresh, empty
    container, so two instances never share fitted state. Containers passed
    explicitly are stored as given, not copied.
    """

    __slots__ = ()  # remain a plain tuple: no per-instance __dict__

    def __new__(cls, to_scale=None, to_onehot=None, scalers=None, encoders=None):
        return super().__new__(
            cls,
            [] if to_scale is None else to_scale,
            [] if to_onehot is None else to_onehot,
            {} if scalers is None else scalers,
            {} if encoders is None else encoders,
        )

    def __call__(self, data: pd.DataFrame, *args, fit=False, transform=True, **kwargs):
        """Optionally fit on ``data`` and/or transform ``data`` in place.

        Args:
            data: frame holding every column named in ``to_scale`` and
                ``to_onehot``. It is modified in place.
            fit: when true, (re)compute the statistics and category lists
                from ``data`` before transforming.
            transform: when true, rewrite the columns of ``data`` using the
                fitted state.
            *args, **kwargs: accepted and ignored.

        Returns:
            None.

        Raises:
            KeyError: if ``transform`` is requested for a label that has not
                been fitted, or a configured column is missing from ``data``.
        """
        self._scale(data, fit, transform)
        self._encode(data, fit, transform)

    def _scale(self, data: pd.DataFrame, fit: bool, transform: bool):
        """Fit and/or apply ``(x - mean) / std`` to each ``to_scale`` column."""
        if fit:
            for label in self.to_scale:
                values = data[label].values
                self.scalers[label] = (values.mean(), values.std())

        if transform:
            for label in self.to_scale:
                mean, std = self.scalers[label]
                # A constant column has std == 0; dividing by 1 instead maps it
                # to zeros rather than NaN/inf.
                divisor = std if std != 0 else 1.0
                # Plain column assignment replaces the column (and its dtype).
                # ``data.loc[:, label] = ...`` would try to write floats into
                # the existing (possibly integer) array, which pandas deprecates.
                data[label] = (data[label] - mean) / divisor

    def _encode(self, data: pd.DataFrame, fit: bool, transform: bool):
        """Fit and/or apply one-hot encoding to each ``to_onehot`` column."""
        if fit:
            for label in self.to_onehot:
                self.encoders[label] = np.unique(data[label].values)

        if transform:
            for label in self.to_onehot:
                column = data[label]
                for category in self.encoders[label]:
                    # Values not seen while fitting get 0 in every indicator.
                    data[f'{label}_{category}'] = (column == category).astype(np.uint8)
                del data[label]

In [None]:
def make_data(size: float=1e5, seed=None) -> pd.DataFrame:
    """Build a random demo frame with columns ``a``, ``b`` and ``c``.

    Args:
        size: number of rows; truncated with ``int`` so values such as
            ``1e5`` are accepted.
        seed: optional seed for reproducible output. When ``None`` (the
            default) the global ``np.random`` state is used, exactly as
            before this parameter existed.

    Returns:
        A DataFrame where ``a`` holds integers drawn from 1..9, ``b`` holds
        letters drawn from ``'abcd'`` and ``c`` letters drawn from ``'wxyz'``.
    """
    n_rows = int(size)
    # RandomState offers the same randint/choice API as the module-level
    # functions, so both paths draw the columns the same way and in the same order.
    rng = np.random if seed is None else np.random.RandomState(seed)
    return pd.DataFrame(
        {
            'a': rng.randint(low=1, high=10, size=n_rows),
            'b': rng.choice(list('abcd'), size=n_rows),
            'c': rng.choice(list('wxyz'), size=n_rows),
        },
        columns=list('abc'),
    )

In [None]:
# Synthetic data: 800k rows to fit on, 200k rows to transform only.
train = make_data(8e5)
test = make_data(2e5)

In [None]:
# First rows of both raw frames, train first and then test.
print(train.head(), test.head(), sep='\n')

In [None]:
# Scale column 'a'; one-hot encode columns 'b' and 'c'.
preprocess = Preprocessor(to_scale=['a'], to_onehot=['b', 'c'])

In [None]:
%%time
# Fit the scalers/encoders on `train`, then transform `train` in place
# (`transform` defaults to True); the call itself returns None.
preprocess(train, fit=True)

In [None]:
%%time
# Transform `test` in place using the state fitted above (`fit` defaults to False).
preprocess(test)

In [None]:
# First rows of both frames as they stand now, train first and then test.
print(train.head(), test.head(), sep='\n')

In [None]:
# Display the fitted label -> unique-categories mapping.
preprocess.encoders

In [None]:
# Display the fitted label -> (mean, std) mapping.
preprocess.scalers