In [None]:
from typing import NamedTuple
from typing import List, Dict

import pandas as pd
import numpy as np
import numba
import json

In [None]:
@numba.vectorize('f8(f8, f8, f8)')
def fast_scale(x, mean, std):
    """Standardise a value: subtract ``mean``, then divide by ``std``.

    Declared through ``numba.vectorize`` with an all-float64 signature, so
    it is applied element-wise when ``x`` is an array.
    """
    centered = x - mean
    return centered / std

@numba.vectorize('u1(u1, u1)')
def fast_encode(x, category):
    """Flag whether ``x`` equals ``category``.

    Declared through ``numba.vectorize`` with a uint8 signature, so the
    comparison result is returned as an unsigned 8-bit value.
    """
    matches = x == category
    return matches

In [None]:
class _PreProcessorFields(NamedTuple):
    # Tuple layout backing PreProcessor. It lives in its own class because a
    # typing.NamedTuple class body is not allowed to define __new__, and
    # PreProcessor needs __new__ to create fresh default containers.
    to_scale: List[str]
    to_onehot: List[str]
    scalers: Dict
    encoders: Dict


class PreProcessor(_PreProcessorFields):
    """Standard-scale and one-hot encode DataFrame columns in place.

    The object is a named tuple with four fields:

    * ``to_scale``  -- labels of the columns to standardise.
    * ``to_onehot`` -- labels of the columns to one-hot encode.
    * ``scalers``   -- ``{label: (mean, std)}``, filled in while fitting.
    * ``encoders``  -- ``{label: {str(category): code}}``, filled in while
      fitting.

    Every field defaults to a new empty container, so fitting one instance
    never changes the state seen by another instance. Containers passed in
    explicitly are stored as given (not copied).
    """

    __slots__ = ()

    def __new__(cls, to_scale=None, to_onehot=None, scalers=None, encoders=None):
        # ``None`` stands for "use a fresh empty container"; literal [] / {}
        # defaults would be single objects shared by every instance.
        return super().__new__(
            cls,
            [] if to_scale is None else to_scale,
            [] if to_onehot is None else to_onehot,
            {} if scalers is None else scalers,
            {} if encoders is None else encoders,
        )

    def __call__(self, data: pd.DataFrame, *args, fit=False, transform=True, **kwargs):
        """Fit and/or transform ``data``; the frame is modified in place.

        Args:
            data: frame holding every column named in ``to_scale`` and
                ``to_onehot``.
            fit: when true, (re)compute and store the statistics and
                category tables from ``data``.
            transform: when true, rewrite the columns of ``data`` using the
                stored statistics and category tables.
            *args, **kwargs: accepted and ignored.

        Returns:
            None.
        """
        for label in self.to_scale:
            self._scale(data, label, fit, transform)
        for label in self.to_onehot:
            self._encode(data, label, fit, transform)

    def _scale(self, data: pd.DataFrame, label: str, fit: bool, transform: bool):
        """Fit and/or apply ``(x - mean) / std`` for the column ``label``.

        Raises:
            KeyError: when transforming a column that has no stored
                statistics.
        """
        if fit:
            values = data[label].values
            # Plain floats keep the state JSON-serialisable for any numeric
            # dtype of the column.
            mean, std = float(values.mean()), float(values.std())
            if std == 0.0:
                # Constant column: divide by 1 so the result is all zeros
                # rather than a division by zero.
                std = 1.0
            self.scalers[label] = (mean, std)

        if transform:
            mean, std = self.scalers[label]
            # Replace the whole column instead of writing through
            # ``.loc[:, label]``: the scaled values are floats, and setting
            # them into an existing integer column is deprecated in pandas.
            data[label] = fast_scale(data[label].values, mean, std)

    def _encode(self, data: pd.DataFrame, label: str, fit: bool, transform: bool):
        """Fit and/or apply one-hot encoding for the column ``label``.

        Transforming adds one ``f'{label}_{category}'`` column per fitted
        category and then removes the original column.

        Raises:
            ValueError: when fitting a column with more than 256 distinct
                values, or when transforming values that were not present
                at fit time.
            KeyError: when transforming a column that has no stored
                category table.
        """
        if fit:
            cats = np.unique(data[label].values)
            max_categories = np.iinfo(np.uint8).max + 1
            if len(cats) > max_categories:
                # Codes are cast to uint8 before encoding; larger codes
                # would not fit.
                raise ValueError(
                    f'column {label!r} has {len(cats)} categories; at most '
                    f'{max_categories} are supported because codes are stored as uint8'
                )
            # Keys are strings so the table survives a JSON round trip.
            self.encoders[label] = {str(cat): code for code, cat in enumerate(cats)}

        if transform:
            lookup = self.encoders[label]
            as_text = data[label].astype(str)
            codes = as_text.map(lookup)
            unseen = codes.isna()
            if unseen.any():
                # Fail with the offending values instead of an opaque
                # NaN-to-integer cast error.
                unknown = sorted(set(as_text[unseen.values]))
                raise ValueError(
                    f'column {label!r} contains categories not seen during fit: {unknown}'
                )
            code_values = codes.astype(np.uint8).values
            for category, code in lookup.items():
                data[f'{label}_{category}'] = fast_encode(code_values, code)
            del data[label]

    def to_json(self) -> str:
        """Serialise all four fields to a JSON string."""
        return json.dumps(self._asdict())

    @classmethod
    def from_json(cls, jstr: str) -> 'PreProcessor':
        """Rebuild an instance from a string produced by ``to_json``."""
        fields = json.loads(jstr)
        scalers = fields.get('scalers')
        if scalers:
            # JSON has no tuple type; restore the (mean, std) tuples so a
            # round-tripped instance compares equal to the original.
            fields['scalers'] = {name: tuple(stats) for name, stats in scalers.items()}
        return cls(**fields)

In [None]:
def make_data(size: float=1e5, seed=None) -> pd.DataFrame:
    """Build a random demo frame with columns ``a``, ``b``, ``c`` and ``d``.

    Args:
        size: number of rows; converted with ``int()`` so values such as
            ``1e5`` are accepted.
        seed: optional seed. When given, values are drawn from a dedicated
            ``np.random.RandomState(seed)`` so the frame is reproducible.
            When ``None`` (the default) NumPy's global generator is used.

    Returns:
        A DataFrame where ``a`` holds integers from 1 to 9, ``b`` letters
        from ``'abcd'``, ``c`` letters from ``'wxyz'`` and ``d`` values from
        ``[1, 4, 5]``.
    """
    n_rows = int(size)
    rng = np.random if seed is None else np.random.RandomState(seed)
    columns = {
        'a': rng.randint(low=1, high=10, size=n_rows),
        'b': rng.choice(list('abcd'), size=n_rows),
        'c': rng.choice(list('wxyz'), size=n_rows),
        'd': rng.choice([1, 4, 5], size=n_rows),
    }
    return pd.DataFrame(columns, columns=list('abcd'))

In [None]:
# Seed NumPy's global generator so the synthetic frames, and every output
# derived from them, are identical after Restart Kernel -> Run All.
np.random.seed(0)
train, test = make_data(8e5), make_data(2e5)

In [None]:
# Show the first row of each generated frame.
print(train.head(1), test.head(1), sep='\n')

In [None]:
# Column 'a' is standardised; 'b', 'c' and 'd' are one-hot encoded.
preprocess = PreProcessor(
    to_scale=['a'],
    to_onehot=['b', 'c', 'd'],
)

In [None]:
%%time
# Fit on the training frame and transform it; `train` is modified in place.
preprocess(train, fit=True, transform=True)

In [None]:
# Fitted state: per-column (mean, std) pairs, then the category -> code tables.
print(preprocess.scalers, preprocess.encoders, sep='\n')

In [None]:
# Round-trip the fitted preprocessor through JSON: `jstr` holds the serialised
# state and `preprocess2` is a new instance rebuilt from that string alone.
jstr = preprocess.to_json()
preprocess2 = PreProcessor.from_json(jstr)

In [None]:
%%time
# Transform only: apply the state restored from JSON; `test` is modified in place.
preprocess2(test, fit=False, transform=True)

In [None]:
# First row of each frame after the preprocessing cells above have run.
print(train.head(1), test.head(1), sep='\n')

In [None]:
# Column dtypes of `train` at this point in the notebook (the fit/transform
# cell above modifies the frame in place).
train.dtypes

In [None]:
# sanity check
# from sklearn.preprocessing import StandardScaler

# data = make_data(1e2)

# scaler = StandardScaler()
# scaler.fit(data[['a']])  # StandardScaler expects 2-D input, not a 1-D Series

# preprocess = PreProcessor(to_scale=list('a'), to_onehot=list('bc'))
# preprocess(data, fit=True, transform=False)

# print(scaler.mean_, scaler.scale_)
# print(preprocess.scalers)

In [None]:
# Compare the fitted instance with the one rebuilt from JSON.
print(preprocess, preprocess2, sep='\n')

In [None]:
# Show the JSON string produced by preprocess.to_json().
print(jstr)