In [1]:
import pandas as pd
import polars as pl
import numpy as np
from sklearn.datasets import fetch_openml
from category_encoders import TargetEncoder

# Subset of dataset columns that is selected from the raw data.
display_cols = [
    "Id",
    "MSSubClass",
    "MSZoning",
    "LotFrontage",
    "YearBuilt",
    "Heating",
    "CentralAir",
]

# Dataset

In [2]:
# Load the house-prices dataset from OpenML as pandas objects.
data = fetch_openml(name="house_prices", as_frame=True, parser="auto")
df_pd = data.data[display_cols].copy()
# Duplicate a categorical column so the encoders are also run on two identical inputs.
df_pd["MSZoning2"] = df_pd["MSZoning"]
# Binary target: 1 when the sale price exceeds 200000, else 0.
# Vectorized comparison instead of a Python-level loop; the dtype is pinned to
# int64 so it does not depend on the platform's default integer width.
df_pd["target"] = (data.target > 200000).astype("int64")
# The raw OpenML bunch is no longer needed once the columns are extracted.
del data

In [3]:
# Polars copy of the pandas frame; this is the input passed to the dsds encoders.
df: pl.DataFrame = pl.from_pandas(df_pd)

In [4]:
# Preview the first rows of the Polars frame and its column dtypes.
df.head()

Id,MSSubClass,MSZoning,LotFrontage,YearBuilt,Heating,CentralAir,MSZoning2,target
i64,i64,str,f64,i64,str,str,str,i64
1,60,"""RL""",65.0,2003,"""GasA""","""Y""","""RL""",1
2,20,"""RL""",80.0,1976,"""GasA""","""Y""","""RL""",0
3,60,"""RL""",68.0,2001,"""GasA""","""Y""","""RL""",1
4,70,"""RL""",60.0,1915,"""GasA""","""Y""","""RL""",0
5,60,"""RL""",84.0,2000,"""GasA""","""Y""","""RL""",1


In [5]:
# Categorical columns that each encoder in this notebook is asked to transform.
to_be_encoded = ["MSZoning", "CentralAir", "Heating", "MSZoning2"]

# Target Encoder

In [6]:
# Show the pandas frame that the category_encoders baselines operate on.
df_pd

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,YearBuilt,Heating,CentralAir,MSZoning2,target
0,1,60,RL,65.0,2003,GasA,Y,RL,1
1,2,20,RL,80.0,1976,GasA,Y,RL,0
2,3,60,RL,68.0,2001,GasA,Y,RL,1
3,4,70,RL,60.0,1915,GasA,Y,RL,0
4,5,60,RL,84.0,2000,GasA,Y,RL,1
...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,1999,GasA,Y,RL,0
1456,1457,20,RL,85.0,1978,GasA,Y,RL,1
1457,1458,70,RL,66.0,1941,GasA,Y,RL,1
1458,1459,20,RL,68.0,1950,GasA,Y,RL,0


In [7]:
# Reference implementation: category_encoders' TargetEncoder fitted on the
# categorical columns against the binary target.
enc = TargetEncoder(cols=to_be_encoded, min_samples_leaf=20, smoothing=10)
enc.fit(df_pd[to_be_encoded], df_pd["target"])

# Last 10 encoded rows, for a visual comparison with the dsds output.
enc.transform(df_pd[to_be_encoded]).tail(10)

Unnamed: 0,MSZoning,CentralAir,Heating,MSZoning2
1450,0.333623,0.30989,0.294818,0.333623
1451,0.333623,0.30989,0.294818,0.333623
1452,0.045872,0.30989,0.294818,0.045872
1453,0.333623,0.30989,0.294818,0.333623
1454,0.505328,0.30989,0.294818,0.505328
1455,0.333623,0.30989,0.294818,0.333623
1456,0.333623,0.30989,0.294818,0.333623
1457,0.333623,0.30989,0.294818,0.333623
1458,0.333623,0.30989,0.294818,0.333623
1459,0.333623,0.30989,0.294818,0.333623


In [8]:
import sys
from dsds.encoders import smooth_target_encode # Currently this only works for binary target

In [9]:
# dsds target encoding of the Polars frame, using smoothing=10 and min_samples_leaf=20.
df_transf = smooth_target_encode(
    df,
    cols=to_be_encoded,
    target="target",
    smoothing=10,
    min_samples_leaf=20,
)

In [10]:
# Preview the encoded frame returned by smooth_target_encode.
df_transf.head()

Id,MSSubClass,MSZoning,LotFrontage,YearBuilt,Heating,CentralAir,MSZoning2,target
i64,i64,f64,f64,i64,f64,f64,f64,i64
1,60,0.333623,65.0,2003,0.294818,0.30989,0.333623,1
2,20,0.333623,80.0,1976,0.294818,0.30989,0.333623,0
3,60,0.333623,68.0,2001,0.294818,0.30989,0.333623,1
4,70,0.333623,60.0,1915,0.294818,0.30989,0.333623,0
5,60,0.333623,84.0,2000,0.294818,0.30989,0.333623,1


In [11]:
# Compare the dsds result with the category_encoders output on the encoded columns.
reference_encoding = pl.from_pandas(enc.transform(df_pd[to_be_encoded]))
df_transf[to_be_encoded].frame_equal(reference_encoding)

True

# WOE Encoder

In [12]:
from dsds.encoders import woe_cat_encode
from category_encoders import WOEEncoder

In [13]:
# WOE-encode the categorical columns of the Polars frame with dsds and preview the first rows.
woe_cat_encode(df, "target", to_be_encoded).head()

Id,MSSubClass,MSZoning,LotFrontage,YearBuilt,Heating,CentralAir,MSZoning2,target
i64,i64,f64,f64,i64,f64,f64,f64,i64
1,60,0.190153,65.0,2003,0.009982,0.081367,0.190153,1
2,20,0.190153,80.0,1976,0.009982,0.081367,0.190153,0
3,60,0.190153,68.0,2001,0.009982,0.081367,0.190153,1
4,70,0.190153,60.0,1915,0.009982,0.081367,0.190153,0
5,60,0.190153,84.0,2000,0.009982,0.081367,0.190153,1


In [14]:
# Reference result from category_encoders' WOEEncoder.
# Only the columns in `to_be_encoded` are listed in `cols`; the whole pandas
# frame (which also contains the `target` column) is passed as X.
woe = WOEEncoder(cols=to_be_encoded)
woe.fit_transform(X=df_pd, y=df_pd["target"])

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,YearBuilt,Heating,CentralAir,MSZoning2,target
0,1,60,0.190153,65.0,2003,0.009982,0.081367,0.190153,1
1,2,20,0.190153,80.0,1976,0.009982,0.081367,0.190153,0
2,3,60,0.190153,68.0,2001,0.009982,0.081367,0.190153,1
3,4,70,0.190153,60.0,1915,0.009982,0.081367,0.190153,0
4,5,60,0.190153,84.0,2000,0.009982,0.081367,0.190153,1
...,...,...,...,...,...,...,...,...,...
1455,1456,60,0.190153,62.0,1999,0.009982,0.081367,0.190153,0
1456,1457,20,0.190153,85.0,1978,0.009982,0.081367,0.190153,1
1457,1458,70,0.190153,66.0,1941,0.009982,0.081367,0.190153,1
1458,1459,20,0.190153,68.0,1950,0.009982,0.081367,0.190153,0


# Time Comparison

The benchmarks here may not be representative. On Linux, dsds wins by a large margin in all cases, regardless of the size of the dataframe. On Windows, dsds is a bit slower on smaller datasets
but beats the other implementations on bigger datasets. This is potentially caused by a Polars issue on Windows. We are investigating this.

In [15]:
%%timeit
# Baseline timing: construct and fit_transform category_encoders' WOEEncoder on the pandas frame.
woe = WOEEncoder(cols=to_be_encoded)
woe.fit_transform(X=df_pd, y=df_pd["target"])

26.5 ms ± 801 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [16]:
%%timeit 
woe_cat_encode(df, "target", to_be_encoded) # Take this with a grain of salt, as this was measured on Windows

72.5 ms ± 2.05 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [17]:
%%timeit 

# Baseline timing: fit and transform with category_encoders' TargetEncoder on the pandas frame.
enc = TargetEncoder(cols=to_be_encoded, min_samples_leaf=20, smoothing=10)
enc.fit(df_pd[to_be_encoded], df_pd["target"])

enc.transform(df_pd[to_be_encoded])

24.5 ms ± 605 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [18]:
%%timeit 
# Take this with a grain of salt, as this was measured on Windows.
smooth_target_encode(
    df,
    cols=to_be_encoded,
    target="target",
    smoothing=10,
    min_samples_leaf=20,
    check_binary=False,
)

72.1 ms ± 4.49 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [19]:
# Let's test it on a bigger, more realistic dataset size

In [20]:
# Stack the frame 50 times to benchmark on a larger row count.
# `pd.concat` already returns a new frame, so no explicit `.copy()` is needed.
# `ignore_index=True` gives the result a fresh unique index instead of repeating
# every original row label 50 times.
df_pd_bigger = pd.concat([df_pd] * 50, ignore_index=True)
df_pd_bigger.shape

(73000, 9)

In [21]:
# Polars version of the enlarged pandas frame; its shape is shown as a sanity check.
df_pl = pl.from_pandas(df_pd_bigger)
df_pl.shape 

(73000, 9)

In [22]:
%%timeit 

# Times category_encoders' TargetEncoder (fit + transform) on the enlarged pandas frame.
# NOTE(review): the dsds cell that follows times woe_cat_encode, and TargetEncoder
# is timed again further down. Presumably a WOEEncoder benchmark on df_pd_bigger
# was intended here; confirm, then re-run so the recorded timing matches the code.
enc = TargetEncoder(cols=to_be_encoded, min_samples_leaf=20, smoothing=10)\
    .fit(df_pd_bigger[to_be_encoded], df_pd_bigger["target"])

enc.transform(df_pd_bigger[to_be_encoded])

207 ms ± 3.09 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [23]:
%%timeit 
# Times dsds' woe_cat_encode on the enlarged Polars frame.
woe_cat_encode(df_pl, "target", to_be_encoded)

79.2 ms ± 2.21 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [24]:
%%timeit 

# Times category_encoders' TargetEncoder (fit + transform) on the enlarged pandas frame.
enc = TargetEncoder(cols=to_be_encoded, min_samples_leaf=20, smoothing=10)
enc.fit(df_pd_bigger[to_be_encoded], df_pd_bigger["target"])

enc.transform(df_pd_bigger[to_be_encoded])[to_be_encoded]

203 ms ± 9.57 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [25]:
%%timeit 
# Times dsds' smooth_target_encode on the enlarged Polars frame.
smooth_target_encode(
    df_pl,
    cols=to_be_encoded,
    target="target",
    smoothing=10,
    min_samples_leaf=20,
    check_binary=False,
)

83.3 ms ± 2.21 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [26]:
# On bigger datasets, this is much faster.
# I don't know why, but on Windows the performance scales strangely...
# woe_cat_encode went from 72.5 ms to 79.2 ms when df became 50 times bigger...
# This doesn't happen when I test on Linux.