In [None]:
import pandas as pd
import polars as pl
import numpy as np
from sklearn.datasets import fetch_openml
from category_encoders import TargetEncoder

# Subset of the dataset's columns that this notebook keeps.
display_cols = [
    "Id",
    "MSSubClass",
    "MSZoning",
    "LotFrontage",
    "YearBuilt",
    "Heating",
    "CentralAir",
]

# Dataset

In [None]:
# Fetch the OpenML "house_prices" dataset as pandas objects and keep only the demo columns.
data = fetch_openml(name="house_prices", as_frame=True, parser="auto")
df_pd = data.data[display_cols].copy()

# Second copy of MSZoning under another name, so the encoders see two identical columns.
df_pd["MSZoning2"] = df_pd["MSZoning"]

# Binary label: 1 when the dataset's target value exceeds the threshold, else 0
# (the target is presumably the sale price -- confirm against the OpenML description).
PRICE_THRESHOLD = 200_000
# Vectorized comparison instead of a Python-level loop. `.to_numpy()` keeps the
# assignment positional (as the original list assignment was) and "int64" pins
# the dtype regardless of platform.
df_pd["target"] = (data.target > PRICE_THRESHOLD).astype("int64").to_numpy()

# The raw Bunch is no longer needed; release it.
del data

In [None]:
# Polars counterpart of the pandas frame; this is the input for the dsds encoders.
df: pl.DataFrame = pl.from_pandas(df_pd)

In [None]:
# Preview the first five rows of the Polars frame.
df.head(5)

In [None]:
# Categorical columns that every encoder in this notebook is applied to.
to_be_encoded = [
    "MSZoning",
    "CentralAir",
    "Heating",
    "MSZoning2",
]

# Target Encoder

In [None]:
# Preview the pandas frame rather than dumping the whole table into the output.
df_pd.head()

In [None]:
# Reference result: category_encoders' TargetEncoder fitted on the pandas frame.
# `enc` is reused further down to validate the dsds implementation.
enc = TargetEncoder(
    cols=to_be_encoded,
    min_samples_leaf=20,
    smoothing=10,
).fit(df_pd[to_be_encoded], df_pd["target"])

# Show the last ten encoded rows.
enc.transform(df_pd[to_be_encoded]).tail(10)

In [None]:
import sys
from dsds.encoders import smooth_target_encode # Currently this only works for binary target

In [None]:
# dsds smoothed target encoding on the Polars frame, using the same
# hyper-parameters as the category_encoders reference.
df_transf = smooth_target_encode(
    df,
    cols=to_be_encoded,
    target="target",
    smoothing=10,
    min_samples_leaf=20,
)

In [None]:
# Preview the first five rows of the dsds-encoded frame.
df_transf.head(5)

In [None]:
# Cross-check: evaluates to True when the dsds-encoded columns are identical to
# what category_encoders' TargetEncoder produces for the same input.
dsds_encoded = df_transf[to_be_encoded]
reference_encoded = pl.from_pandas(enc.transform(df_pd[to_be_encoded]))
# NOTE(review): newer Polars releases rename `frame_equal` to `equals` --
# confirm against the Polars version this notebook is pinned to.
dsds_encoded.frame_equal(reference_encoded)

# WOE Encoder

In [None]:
from dsds.encoders import woe_cat_encode
from category_encoders import WOEEncoder

In [None]:
# dsds weight-of-evidence encoding on the Polars frame; preview the result.
woe_dsds = woe_cat_encode(df, "target", to_be_encoded)
woe_dsds.head()

In [None]:
# Reference result: category_encoders' WOEEncoder on the pandas frame.
woe = WOEEncoder(cols=to_be_encoded)
# Show only the head so the output lines up with the dsds preview and the whole
# table is not dumped into the notebook.
woe.fit_transform(X=df_pd, y=df_pd["target"]).head()

# Time Comparison

The benchmarks here may not be representative. On Linux, dsds wins by a large margin in every case, regardless of the size of the dataframe. On Windows, dsds is a bit slower on smaller datasets,
but it beats the other implementations on bigger datasets. This is potentially caused by a Polars issue on Windows; we are investigating.

In [None]:
%%timeit
woe = WOEEncoder(cols=to_be_encoded)
woe.fit_transform(X=df_pd, y=df_pd["target"])

In [None]:
%%timeit 
# dsds WOE encoding on the small Polars frame.
# Take this timing with a grain of salt: it was measured on Windows.
woe_cat_encode(df, "target", to_be_encoded)

In [None]:
%%timeit 

enc = TargetEncoder(cols=to_be_encoded, min_samples_leaf=20, smoothing=10)\
    .fit(df_pd[to_be_encoded], df_pd["target"])

enc.transform(df_pd[to_be_encoded])

In [None]:
%%timeit 
smooth_target_encode(df, cols=to_be_encoded
                     , target="target"
                     , smoothing=10
                     , min_samples_leaf=20
                     , check_binary = False) # Take this with grain of salt, as this is done on Windows

In [None]:
# Let's repeat the test on a bigger, more realistic dataset size.

In [None]:
# Tile the small frame to obtain a larger benchmark input.
REPLICATION_FACTOR = 50
# pd.concat builds a new frame, so an explicit .copy() of the input is not
# needed. ignore_index=True gives the result a fresh 0..n-1 index instead of
# repeating every original row label REPLICATION_FACTOR times.
df_pd_bigger = pd.concat([df_pd] * REPLICATION_FACTOR, ignore_index=True)
df_pd_bigger.shape

In [None]:
# Polars version of the enlarged frame, used by the dsds benchmarks below.
df_pl: pl.DataFrame = pl.from_pandas(df_pd_bigger)
df_pl.shape

In [None]:
%%timeit 

enc = TargetEncoder(cols=to_be_encoded, min_samples_leaf=20, smoothing=10)\
    .fit(df_pd_bigger[to_be_encoded], df_pd_bigger["target"])

enc.transform(df_pd_bigger[to_be_encoded])

In [None]:
%%timeit 
# dsds WOE encoding on the enlarged Polars frame (df_pl).
woe_cat_encode(df_pl, "target", to_be_encoded)

In [None]:
%%timeit 

enc = TargetEncoder(cols=to_be_encoded, min_samples_leaf=20, smoothing=10)\
    .fit(df_pd_bigger[to_be_encoded], df_pd_bigger["target"])

enc.transform(df_pd_bigger[to_be_encoded])[to_be_encoded]

In [None]:
%%timeit 
smooth_target_encode(df_pl, cols=to_be_encoded
                     , target="target"
                     , smoothing=10
                     , min_samples_leaf=20
                     , check_binary = False)

In [None]:
# On bigger datasets, dsds is much faster.
# I don't know why, but on Windows the performance scales strangely.
# This does not happen when I test on Linux.