# Install and Import

In [None]:
pip install category_encoders

In [None]:
import warnings
import hashlib
import numpy as np
import pandas as pd
import category_encoders
import statsmodels.api as sm
import statsmodels.formula.api as smf
import patsy

warnings.filterwarnings("ignore")

# Unsupervised

In [None]:
x = pd.Series(['2_Bachelors', '1_High-School', '4_PhD', '3_Masters'], name = 'x')
y = pd.Series([65000, 54000, 80000, 72000], name = 'y')

## 1. Ordinal Encoding

In [None]:
# Rank the distinct levels alphabetically (1-based) and look every value up in that table.
# Series.map is used instead of Series.replace: replace only returns integers here because
# pandas silently downcasts the object-dtype result, and that downcasting is deprecated.
ordinal_encoding = x.map({level: rank for rank, level in enumerate(sorted(set(x)), start = 1)})

In [None]:
ordinal_encoding.name = 'OrdinalEncoding'
show = pd.concat([x, ordinal_encoding], axis = 1)
show

## 2. Count Encoding

In [None]:
# Encode each level by the number of times it occurs in x.
# Series.map looks the counts up directly and returns an integer Series, without relying
# on the deprecated silent downcasting that Series.replace needs for the same result.
count_encoding = x.map(x.value_counts())

In [None]:
count_encoding.name = 'CountEncoding'
show = pd.concat([x, count_encoding], axis = 1)
show

## 3. One-Hot Encoding

In [None]:
# Each observation receives one row of the integer identity matrix:
# the row index is the observation's ordinal code minus one.
one_hot_encoding = ordinal_encoding.apply(
    lambda code: pd.Series(np.diag(np.ones(len(set(x)))).astype(int)[code - 1])
)

In [None]:
# Diagnostic cell: report total, used and available system memory, each converted
# from bytes to GiB (division by 1024 ** 3).
from psutil import virtual_memory

mem = virtual_memory()

mem.total / 1024 ** 3, mem.used / 1024 ** 3, mem.available / 1024 ** 3

In [None]:
one_hot_encoding.columns = sorted(set(x))
show = pd.concat([x, ordinal_encoding, one_hot_encoding], axis = 1)
show.columns = [['x', 'OrdinalEncoding'] + ['OneHotEncoding'] * len(set(x)), [''] * 2 + list(one_hot_encoding.columns)]
show

In [None]:
pd.concat([x, y], axis = 1)

In [None]:
sm.OLS(y, pd.concat([pd.Series(1, index = x.index, name = 'intercept'), one_hot_encoding], axis = 1)).fit().params

## 4. Sum Encoding

In [None]:
# Drop the indicator column of the last level. Rows whose remaining indicators do not
# sum to 1 (i.e. rows of the dropped level) get -1 in place of every 0.
sum_encoding = one_hot_encoding.iloc[:, :-1].apply(
    lambda indicators: indicators.replace(0, -1) if indicators.sum() != 1 else indicators,
    axis = 1
)

In [None]:
sum_encoding

In [None]:
# Label the sum-encoding columns with the levels they stand for (the last level has no
# column of its own). The original assigned `.columns` on the ordinal_encoding Series,
# which only attached a stray attribute to that Series and labelled nothing.
sum_encoding.columns = sorted(set(x))[:-1]
show = pd.concat([x, ordinal_encoding, sum_encoding], axis = 1)
# Two-level header: encoder name on top, level name underneath.
show.columns = [['x', 'OrdinalEncoding'] + ['SumEncoding'] * (len(set(x)) - 1), [''] * 2 + sorted(set(x))[:-1]]
show

In [None]:
pd.concat([x, y], axis = 1)

In [None]:
sm.OLS(
    endog = y, 
    exog = pd.concat([pd.Series(1, index = x.index, name = 'intercept'), sum_encoding], axis = 1)
).fit().params

In [None]:
y.mean()

a -> 10.5 - 5.5 = 5

b -> 10.5 - 0.5 = 10

c -> 10.5 - 3.5 = 7

d -> 10.5 - (-5.5 -.5 -3.5) = 20

## 5. Backward-Difference

In [None]:
# For ordinal code k out of n levels the row is
#   1/n, 2/n, ..., (k-1)/n   followed by   -(n-k)/n, ..., -2/n, -1/n
# which gives n - 1 columns in total.
backward_difference_encoding = ordinal_encoding.apply(
    lambda code, n = len(set(x)): pd.Series(
        [step / n for step in range(1, code)]
        + [-step / n for step in range(n - code, 0, -1)]
    )
)

In [None]:
assert (backward_difference_encoding == category_encoders.BackwardDifferenceEncoder().fit_transform(X = x).drop('intercept', axis = 1).values).all().all()

In [None]:
backward_difference_encoding.columns = sorted(set(x))[1:]
show = pd.concat([x, ordinal_encoding, backward_difference_encoding], axis = 1)
show.columns = [['x', 'OrdinalEncoding'] + ['BackwardDifferenceEncoding'] * len(sorted(set(x))[1:]), [''] * 2 + sorted(set(x))[1:]]
show

In [None]:
sm.OLS(
    endog = y, 
    exog = pd.concat([pd.Series(1, index = x.index, name = 'intercept'), backward_difference_encoding], axis = 1)
).fit().params

b -> 10 - 5 = 5

c -> 7 - 10 = -3

d -> 20 - 7 = 13

## 6. Helmert Encoding

In [None]:
# Helmert contrast row for ordinal code k out of n levels:
#   (k - 2) zeros, then the value k - 1 (only when k > 1), then -1 for the remaining n - k columns.
# The resulting frame is then divided by the Series 2, 3, ..., n, which pandas aligns
# with the column labels 0 .. n-2, so column j is divided by j + 2.
helmert_encoding = ordinal_encoding.apply(
    lambda oe: pd.Series([0] * (oe - 2) + ([oe - 1] if oe > 1 else []) + [-1] * (len(set(x)) - oe))
).div(pd.Series(range(2,len(set(x)) + 1)))
# https://stats.idre.ucla.edu/r/library/r-library-contrast-coding-systems-for-categorical-variables/

In [None]:
assert (helmert_encoding == category_encoders.HelmertEncoder().fit_transform(X = x).drop('intercept', axis = 1).values / np.arange(2, len(set(x)) + 1)).all().all()

In [None]:
helmert_encoding.columns = sorted(set(x))[1:]
show = pd.concat([x, ordinal_encoding, helmert_encoding], axis = 1)
show.columns = [['x', 'OrdinalEncoding'] + ['HelmertEncoding'] * helmert_encoding.shape[1], [''] * 2 + sorted(set(x))[1:]]
show

In [None]:
sm.OLS(
    endog = y, 
    exog = pd.concat([pd.Series(1, index = x.index, name = 'intercept'), helmert_encoding], axis = 1)
).fit().params.round(2)

b -> 10 - 5 = 5

c -> 7 - np.mean([5, 10]) = -0.5

d -> 20 - np.mean([5, 10, 7]) = 12.66667

## 7. Polynomial Encoding

In [None]:
def do_polynomial_encoding(order):
    """Return the polynomial-contrast row for one ordinal level.

    The construction follows patsy's polynomial contrast
    (https://github.com/pydata/patsy/blob/master/patsy/contrasts.py).

    ``order`` is the 1-based ordinal code of the level. The number of levels
    is read from the module-level series ``x``. The result is a 1-D array
    holding the entries of columns 1..n-1 (column 0 is dropped).
    """
    level_count = len(set(x))
    degrees = np.arange(level_count)
    # Scores 0..n-1 as floats, shifted so that their mean is zero.
    centred = degrees.astype(float)
    centred = centred - centred.mean()
    # Column j of this matrix holds the centred scores raised to the power j.
    powers = centred.reshape((-1, 1)) ** degrees.reshape((1, -1))
    q, r = np.linalg.qr(powers)
    # Multiply each column of Q by the sign of the matching diagonal entry of R.
    q = q * np.sign(np.diag(r))
    # Divide by the square roots of the row-wise sums of squares (broadcast over columns).
    q = q / np.sqrt(np.sum(q ** 2, axis=1))
    # Skip column 0 and pick the row belonging to this level.
    return q[order - 1, 1:]

polynomial_encoding = ordinal_encoding.apply(lambda oe: pd.Series(do_polynomial_encoding(oe)))

In [None]:
assert (polynomial_encoding == category_encoders.PolynomialEncoder().fit_transform(X = x).drop('intercept', axis = 1).values).all().all()

In [None]:
polynomial_encoding.columns = ['degree' + str(i) for i in range(1, polynomial_encoding.shape[1] + 1)]
show = pd.concat([x, ordinal_encoding, polynomial_encoding], axis = 1)
show.columns = [['x', 'OrdinalEncoding'] + ['PolynomialEncoding'] * polynomial_encoding.shape[1], 
                [''] * 2 + list(polynomial_encoding.columns)]
show

In [None]:
sm.OLS(
    endog = y, 
    exog = pd.concat([pd.Series(1, index = x.index, name = 'intercept'), polynomial_encoding], axis = 1)
).fit().params.round(2)

## 8. Binary Encoding

In [None]:
# Binary string of every ordinal code, left-padded with zeros to the bit length
# of the number of distinct levels.
binary_base = ordinal_encoding.apply(lambda code: bin(code)[2:].zfill(len(set(x)).bit_length()))
# Split the string into one integer column per binary digit.
binary_encoding = binary_base.apply(lambda bits: pd.Series(list(bits))).astype(int)

In [None]:
assert (binary_encoding == category_encoders.BinaryEncoder().fit_transform(X = x).values).all().all()

In [None]:
binary_encoding.columns = ['dim' + str(i) for i in range(binary_encoding.shape[1], 0, -1)]
show = pd.concat([x, ordinal_encoding, binary_base, binary_encoding], axis = 1)
show.columns = [
    ['x', 'OrdinalEncoding', 'binary_base'] + ['BinaryEncoding'] * binary_encoding.shape[1], 
    [''] * 3 + list(binary_encoding.columns)
]
show

## 9. Base N Encoding

In [None]:
def int2base(n, base):
    """Write the positive integer ``n`` in the given ``base`` as a digit string.

    Every remainder is rendered with ``str``, so a digit is a single
    character only for bases up to 10.
    """
    assert n > 0
    pieces = []
    remaining = n
    while remaining:
        remaining, digit = divmod(remaining, base)
        pieces.append(str(int(digit)))
    # Remainders are produced least-significant first: join, then reverse the string.
    return ''.join(pieces)[::-1]

base = 3

base_n = ordinal_encoding.apply(lambda oe: int2base(n = oe, base = base))
base_n_encoding = base_n.apply(lambda bn: pd.Series(list(bn.zfill(base_n.apply(len).max())))).astype(int)

In [None]:
assert (base_n_encoding == category_encoders.BaseNEncoder(base = base).fit_transform(X = x).iloc[:,1:].values).all().all()

In [None]:
base_n_encoding.columns = ['dim' + str(i) for i in range(base_n_encoding.shape[1], 0, -1)]
show = pd.concat([x, ordinal_encoding, base_n, base_n_encoding], axis = 1)
show.columns = [
    ['x', 'OrdinalEncoding', 'base_{}'.format(base)] + ['BaseNEncoding'] * base_n_encoding.shape[1], 
    [''] * 3 + list(base_n_encoding.columns)
]
show

## 10. Hashing Encoding

In [None]:
def do_hash(string, output_dimension):
    """Hash ``string`` with SHA-256 and reduce it modulo ``output_dimension``.

    Returns a 3-tuple: the hexadecimal digest, the digest read as a base-16
    integer, and that integer modulo ``output_dimension``.
    """
    digest_hex = hashlib.new('sha256', bytes(string, 'utf-8')).hexdigest()
    digest_int = int(digest_hex, 16)
    return digest_hex, digest_int, digest_int % output_dimension

output_dimension = 10

hashing = x.apply(
    lambda string: pd.Series(do_hash(string, output_dimension), 
        index = ['x_hashed', 'x_hashed_int', 'x_hashed_int_remainder']))

hashing_encoding = hashing['x_hashed_int_remainder'].apply(lambda rem: pd.Series(np.diag(np.ones(output_dimension))[rem]))

In [None]:
assert (hashing_encoding == category_encoders.HashingEncoder(hash_method = 'sha256', n_components = output_dimension).fit_transform(X = x).values).all().all()

In [None]:
hashing_encoding.columns = ['dim' + str(i) for i in range(hashing_encoding.shape[1], 0, -1)]
show = pd.concat([x, hashing, hashing_encoding], axis = 1)
show.columns = [
    ['x', 'x_hashed', 'x_hashed_int', 'x_hashed_int_remainder'] + ['HashingEncoding'] * hashing_encoding.shape[1], 
    [''] * 4 + list(hashing_encoding.columns)
]
show

In [None]:
sm.OLS(
    endog = y, 
    exog = pd.concat([pd.Series(1, index = x.index, name = 'intercept'), hashing_encoding], axis = 1)
).fit().params.round(2)

# Supervised

In [None]:
x = pd.Series(['a', 'a', 'b', 'b', 'b', 'b'])
y = pd.Series([ 1, 2, 3, 4, 5, 6])

## 11. Target Encoding

In [None]:
# Per-row statistics of the row's level, looked up with Series.map.
# (Series.replace only yields numeric results here through silent object downcasting,
# which pandas has deprecated; map returns numeric Series directly.)
count_encoding = x.map(y.groupby(x).count())
# Overall mean of y, repeated for every row.
y_grand_mean = x.apply(lambda l: y.mean())
y_level_mean = x.map(y.groupby(x).mean())

In [None]:
# Blend each level mean with the grand mean, once per smoothing value.
target_encoding = dict()

for smoothing in [0, 1, 10]:
    # Sigmoid weight placed on the level mean, driven by (count - 1) / smoothing.
    # NOTE(review): smoothing = 0 divides by zero; with the counts above 1 used in this
    # example that presumably gives -inf inside exp() and therefore a weight of 1 — confirm.
    weight = 1 / (1 + np.exp(-(count_encoding - 1) / smoothing))
    target_encoding[smoothing] = y_level_mean * weight + y_grand_mean * (1 - weight)

In [None]:
# Check every hand-computed encoding against category_encoders.TargetEncoder.
# The loop variable is deliberately not called `sm`: at top level that would rebind the
# statsmodels alias imported as `sm` and break any later `sm.OLS(...)` call.
for smoothing, te in target_encoding.items():
    assert (te == category_encoders.TargetEncoder(smoothing = smoothing).fit_transform(X = x, y = y).iloc[:, 0]).all()

In [None]:
show = pd.concat([x, y, y_level_mean, y_grand_mean] + [target_encoding[i] for i in target_encoding.keys()], axis = 1)
show.columns = [
    ['x', 'y', 'y_level_mean', 'y_grand_mean'] + ['TargetEncoding'] * len(target_encoding), 
    [''] * 4 + ['smoothing={}'.format(sm) for sm in target_encoding.keys()]
]
show

## 12. MEstimate Encoding

In [None]:
# Per-row statistics of the row's level, looked up with Series.map.
# (Series.replace only yields numeric results here through silent object downcasting,
# which pandas has deprecated; map returns numeric Series directly.)
count_encoding = x.map(y.groupby(x).count())
# Overall mean of y, repeated for every row.
y_grand_mean = x.apply(lambda l: y.mean())
y_level_mean = x.map(y.groupby(x).mean())

In [None]:
m_estimate_encoding = dict()

for m in [0, 1, 10]:
    m_estimate_encoding[m] = (y_level_mean * count_encoding + y_grand_mean * m) / (count_encoding + m)

In [None]:
for m, te in m_estimate_encoding.items():
    assert (te == category_encoders.MEstimateEncoder(m = m).fit_transform(X = x, y = y).iloc[:, 0]).all()

In [None]:
show = pd.concat([x, y, count_encoding, y_level_mean, y_grand_mean] + [m_estimate_encoding[i] for i in m_estimate_encoding.keys()], axis = 1)
show.columns = [
    ['x', 'y', 'CountEncoding', 'y_level_mean', 'y_grand_mean'] + ['MEstimateEncoding'] * len(m_estimate_encoding), 
    [''] * 5 + ['m={}'.format(m) for m in m_estimate_encoding.keys()]
]
show

## 13. James-Stein Encoding

In [None]:
x = pd.Series(['a', 'a', 'b', 'b', 'c', 'c', 'd', 'd'], name = 'x')
y = pd.Series([ 1, 2, 3, 4, 5, 6, 7, 15], name = 'y')

In [None]:
# Per-row mean and variance of the row's level. Series.map is used instead of
# Series.replace, which needs deprecated silent downcasting to return numeric data.
y_level_mean = x.map(y.groupby(x).mean())
# Levels whose variance is NaN get a variance of 0.
y_level_var = x.map(y.groupby(x).var()).fillna(0)
# Shrinkage weight towards the overall mean, clipped to the interval [0, 1].
weight = (y_level_var / (y.var() + y_level_var) * (len(set(x)) - 3) / (len(set(x)) - 1)).clip(lower=0, upper=1)
james_stein_encoding = y_level_mean * (1 - weight) + y.mean() * weight

In [None]:
assert (james_stein_encoding == category_encoders.JamesSteinEncoder().fit_transform(X = x, y = y).iloc[:, 0]).all()

In [None]:
show = pd.concat([x, y, y_level_mean, y_level_var, pd.Series(y.mean(), index = x.index), pd.Series(y.var(), index = x.index), weight, james_stein_encoding], axis = 1)
show.columns = ['x', 'y', 'y_level_mean', 'y_level_var', 'y.mean()', 'y.var()', 'weight', 'JamesSteinEncoding']
show

## 14. GLMM Encoding

In [None]:
# Random-intercept mixed model: y ~ 1 with one group per level of x.
model = smf.mixedlm(formula = 'y ~ 1', data = y.to_frame(), groups = x).fit()
intercept = pd.Series(model.params['Intercept'], index = x.index)
# Each entry of model.random_effects holds a single value per group; take its first
# element explicitly (calling float() on a one-element Series is deprecated) and look
# the values up with Series.map rather than Series.replace, which depends on deprecated
# silent object downcasting.
random_effect = x.map({k: float(np.asarray(v)[0]) for k, v in model.random_effects.items()})
glmm_encoding = intercept + random_effect

In [None]:
# The original compared an undefined name `random_effects` (NameError); `random_effect`
# is the series computed in this notebook.
# NOTE(review): confirm that GLMMEncoder returns the random effects without the intercept;
# if it includes the intercept, compare against `glmm_encoding` instead.
assert (random_effect == category_encoders.GLMMEncoder().fit_transform(X = x, y = y).iloc[:, 0]).all()

In [None]:
show = pd.concat([x, y, intercept, random_effect, glmm_encoding], axis = 1)
show.columns = ['x', 'y', 'intercept', 'random_effect', 'GLMMEncoding']
show

## 15. WOE Encoding

In [None]:
x = pd.Series(['a','a','b','b','b','b'], name = 'x')
y = pd.Series([0,1,0,0,0,1], name = 'y')

In [None]:
# Number of ones / zeros of the target within each row's level. Series.map keeps these
# numeric; Series.replace would leave an object-dtype result (on which np.log below fails)
# unless pandas applies its deprecated silent downcasting.
y_level_ones = x.map((y == 1).groupby(x).sum())
y_level_zeros = x.map((y == 0).groupby(x).sum())
# Overall number of ones / zeros.
y_ones = (y == 1).sum()
y_zeros = (y == 0).sum()
# Share of all ones (resp. zeros) that falls into the row's level.
nominator = y_level_ones / y_ones
denominator = y_level_zeros / y_zeros
woe_encoder = np.log(nominator / denominator)

In [None]:
assert (woe_encoder == category_encoders.WOEEncoder(regularization = 0).fit_transform(X = x, y = y).iloc[:, 0]).all()

In [None]:
show = pd.concat([x, y, y_level_ones, y_level_zeros, pd.Series(y_ones, index = x.index), pd.Series(y_zeros, index = x.index), nominator, denominator, woe_encoder], axis = 1)
show.columns = ['x', 'y', 'y_level_ones', 'y_level_zeros', 'y_ones', 'y_zeros','nominator', 'denominator', 'WOEEncoding']
show

## 16. Leave One Out Encoding

In [None]:
x = pd.Series(['a','a','b','b','b','b'], name = 'x')
y = pd.Series([1,2,3,4,5,6], name = 'y')

In [None]:
# For every row, list the targets of the rows that share its level, with the row's own
# index label dropped; the encoding is the mean of that list.
y_level_except_self = x.to_frame().apply(lambda row: y[x == row['x']].drop(row.name).to_list(), axis = 1)
leave_one_out_encoding = y_level_except_self.apply(np.mean)

In [None]:
assert (leave_one_out_encoding == category_encoders.LeaveOneOutEncoder().fit_transform(X = x, y = y).iloc[:, 0]).all()

In [None]:
show = pd.concat([x, y, y_level_except_self, leave_one_out_encoding], axis = 1)
show.columns = ['x', 'y', 'y_level_except_self', 'LeaveOneOutEncoding']
show['LeaveOneOutEncoding'] = show['LeaveOneOutEncoding'].round(2)
show

## 17. CatBoost Encoding

In [None]:
x = pd.Series(['a','a','b','b','b','b'], name = 'x')
y = pd.Series([1,2,3,4,5,6], name = 'y')

In [None]:
# `a` weights the overall target mean in the ratio computed below.
a = 1
# For every row, list the targets of rows with the same level and a smaller index label.
y_level_before_self = x.to_frame().apply(lambda row: y[(x == row['x']) & (y.index < row.name)].to_list(), axis = 1)
# (sum of those earlier targets + overall mean * a) / (number of earlier targets + a)
catboost_encoding = y_level_before_self.apply(lambda l: (sum(l) + y.mean() * a) / (len(l) + a))

In [None]:
assert (catboost_encoding == category_encoders.CatBoostEncoder().fit_transform(X = x, y = y).iloc[:, 0]).all()

In [None]:
show = pd.concat([x, y, pd.Series(y.mean(), index = x.index), y_level_before_self, catboost_encoding], axis = 1)
show.columns = ['x', 'y', 'y_mean', 'y_level_before_self', 'CatBoostEncoding']
show

# End

# Unsupervised:

- Backward Difference Contrast [2][3]
- BaseN [6]
- Binary [5]
- Count [10]
- Hashing [1]
- Helmert Contrast [2][3]
- Ordinal [2][3]
- One-Hot [2][3]
- Polynomial Contrast [2][3]
- Sum Contrast [2][3]

# Supervised:

- CatBoost [11]
- Generalized Linear Mixed Model [12]
- James-Stein Estimator [9]
- LeaveOneOut [4]
- M-estimator [7]
- Target Encoding [7]
- Weight of Evidence [8]

In [None]:
encoder_names = [class_ for class_ in dir(category_encoders) if class_[0].isupper()]

In [None]:
encoder_names = [class_ for class_ in dir(category_encoders) if class_[0].isupper()]
for en in encoder_names:
    print(en)

In [None]:
# Classify every capitalised name exported by category_encoders:
#   type             - 'unsupervised' if it fits without a target, otherwise 'supervised'
#   output_dimension - 'single' if it produces one column, otherwise 'multiple'
#   mapping          - 'unique' if every level of x gets its own distinct encoding
encoder_names = [class_ for class_ in dir(category_encoders) if class_[0].isupper()]
df = pd.DataFrame(index = encoder_names)

x = pd.Series(['a','b','b','c','c','c'])
y = pd.Series([1,2,3,4,5,6])
y_bin = pd.Series([0,0,1,0,1,1])

for encoder_name in encoder_names:
    print(encoder_name)
    # Look the class up by attribute access instead of exec-ing generated source code.
    enc = getattr(category_encoders, encoder_name)()
    try:
        x_enc = enc.fit_transform(x)
        df.loc[encoder_name, 'type'] = 'unsupervised'
    except Exception:
        # `except Exception` rather than a bare `except:` so that KeyboardInterrupt and
        # SystemExit are not swallowed by the fallback chain.
        try:
            x_enc = enc.fit_transform(x, y)
            df.loc[encoder_name, 'type'] = 'supervised'
        except Exception:
            # Last resort: retry with the binary target.
            x_enc = enc.fit_transform(x, y_bin)
            df.loc[encoder_name, 'type'] = 'supervised'

    df.loc[encoder_name, 'output_dimension'] = 'single' if x_enc.shape[1] == 1 else 'multiple'
    df.loc[encoder_name, 'mapping'] = 'unique' if (~pd.concat([x, x_enc], axis = 1).duplicated()).sum() == len(set(x)) else 'not unique'

In [None]:
df.to_excel('encoders.xlsx')

In [None]:
pd.concat([x, x_enc], axis = 1)

In [None]:
# Fit every encoder exported by category_encoders on a toy example and print whether the
# output has a single column, followed by the encoded frame itself.
encoder_names = [class_ for class_ in dir(category_encoders) if class_[0].isupper()]

for encoder_name in encoder_names:
    print(encoder_name)
    # print(getattr(category_encoders, encoder_name).__doc__)  # uncomment to show the docstring
    try:
        # Look the class up by attribute access instead of exec-ing generated source code.
        enc = getattr(category_encoders, encoder_name)()
        x = pd.Series(['a','b','b','c','c','c'])
        y = pd.Series([1,2,3,4,5,6])
        x_enc = enc.fit_transform(x, y = y)
        print(x_enc.shape[1] == 1)
        print(x_enc)
    except Exception as e:
        # Report the failure and carry on with the next encoder.
        print('Exception:', e)
    print()

In [None]:
# Fit every encoder exported by category_encoders on a toy example and print the result.
encoder_names = [class_ for class_ in dir(category_encoders) if class_[0].isupper()]

for encoder_name in encoder_names:
    print(encoder_name)
    try:
        # Look the class up by attribute access instead of exec-ing generated source code.
        enc = getattr(category_encoders, encoder_name)()
        x = pd.Series(['a','b','b','c','c','c'])
        y = pd.Series([1,2,3,4,5,6])
        x_enc = enc.fit_transform(x, y = y)
        print(x_enc)
    except Exception as e:
        # Report the failure and carry on with the next encoder.
        print('Exception:', e)
    print()

In [None]:
x = pd.Series(['a','a','b','b','c','c','c','c'])
# Only the `category_encoders` module itself is imported in this notebook, so the class
# has to be reached through it; the bare name raised a NameError.
x_enc = category_encoders.BackwardDifferenceEncoder().fit_transform(x)
x_enc

In [None]:
x_enc.sum(axis = 1)

In [None]:
series = pd.Series(['a','b','b','c','c','c'])
y = pd.Series([1,2,3,4,5,6])


category = pd.Categorical(series)

# Work on a copy of the integer codes; entries equal to -1 are moved to one extra
# trailing code, and NaN is appended to the categories for that code.
codes = category.codes.copy()
codes[codes == -1] = len(category.categories)
categories = np.append(category.categories, np.nan)

# Lookup table: integer code -> category label.
return_map = pd.Series(dict(enumerate(categories)))

# Sum and count of y per code, then relabel the codes with their category names.
result = y.groupby(codes).agg(['sum', 'count'])
return_ = result.rename(return_map)