## experiments with category encoders
Source:
https://www.kaggle.com/discdiver/category-encoders-examples

In [7]:
import numpy as np
import pandas as pd              
import category_encoders as ce   
from sklearn.preprocessing import LabelEncoder

In [8]:
pd.options.display.float_format = '{:.2f}'.format # to make legible

In [17]:
# make some data
df = pd.DataFrame({'color':["black", "red", "white", "white", "black", "blue"], 'outcome':[1, 2, 0, 0, 0, 1], 'type':['fruit', 'nut', 'veg', 'meat', 'egg', 'grass'], 'rating':[1.0,2.0,3.0, 4.0,4.0,5.0]})
print(df)

   color  outcome  rating   type
0  black        1    1.00  fruit
1    red        2    2.00    nut
2  white        0    3.00    veg
3  white        0    4.00   meat
4  black        0    4.00    egg
5   blue        1    5.00  grass


In [18]:
# set up X and y
X = df.drop('outcome', axis = 1)
y = df.drop('color', axis = 1)

## sklearn encoders

In [15]:
print(X) 

le = LabelEncoder()
encoded = le.fit_transform(np.ravel(X))    # warning thrown without np.ravel

print("\n The result of transforming X with LabelEncoder:")
print(encoded)
print(type(encoded))

   color   type
0  black  fruit
1    red    nut
2  white    veg
3  white   meat
4  black    egg
5   blue  grass

 The result of transforming X with LabelEncoder:
[0 3 7 6 9 8 9 5 0 2 1 4]
<class 'numpy.ndarray'>


## ordinal encoder

In [22]:
ce_ord = ce.OrdinalEncoder(mapping=[{
            "col":"color",    
            "mapping": [
                ('black',0), 
                ('red',1), 
                ('blue',2), 
                ('white',4)        
            ]},{'col':'rating', 'mapping':[(1.0,1), (2.0, 2), (3.0,3),(4.0,4), (5.0,5)]}
        ])
ce_ord.fit(X)
ce_ord.transform(X)

Unnamed: 0,color,rating,type
0,0,1,fruit
1,1,2,nut
2,4,3,veg
3,4,4,meat
4,0,4,egg
5,2,5,grass


In [23]:
X.rating.astype(int)

0    1
1    2
2    3
3    4
4    4
5    5
Name: rating, dtype: int32

## One hot encoder

In [15]:
ce_one_hot = ce.OneHotEncoder(cols = ['color'])
ce_one_hot.fit_transform(X, y)

Unnamed: 0,color_1,color_2,color_3,color_4,color_-1
0,1,0,0,0,0
1,0,1,0,0,0
2,0,0,1,0,0
3,0,0,1,0,0
4,1,0,0,0,0
5,0,0,0,1,0


## Binary encoder

In [18]:
ce_binary = ce.BinaryEncoder(cols = ['color'])
print(ce_binary.fit_transform(X, y))
print(X)

   color_0  color_1  color_2
0        0        0        1
1        0        1        0
2        0        1        1
3        0        1        1
4        0        0        1
5        1        0        0
   color
0  black
1    red
2  white
3  white
4  black
5   blue


In [27]:
df = pd.DataFrame({'gender':["m", "f", "m", "f", "m", "f", 'f', 'f', 'f']})
ce_binary = ce.LeaveOneOutEncoder(cols = ['gender'])
print(ce_binary.fit_transform(df))

ValueError: The length of X is 9 but length of y is 0.

In [23]:
# if we have 11 different colors? we should get 5 columns
df = pd.DataFrame({'color':["black", "red", "white", "white", "black", "blue", 'gray', 'green', 'gray', 'cyan', 'pink', 'maroon', 'dark-gray', 'dark-green'], 'outcome':[1, 2, 0, 0, 0, 1, 2, 1,2,3,1,2,1,2]})
print(df['color'].value_counts())
# set up X and y
X = df.drop('outcome', axis = 1)
y = df.drop('color', axis = 1)
print(ce_binary.fit_transform(X, y))
print(X)

white         2
black         2
gray          2
pink          1
green         1
dark-green    1
dark-gray     1
red           1
cyan          1
maroon        1
blue          1
Name: color, dtype: int64
    color_0  color_1  color_2  color_3  color_4
0         0        0        0        0        1
1         0        0        0        1        0
2         0        0        0        1        1
3         0        0        0        1        1
4         0        0        0        0        1
5         0        0        1        0        0
6         0        0        1        0        1
7         0        0        1        1        0
8         0        0        1        0        1
9         0        0        1        1        1
10        0        1        0        0        0
11        0        1        0        0        1
12        0        1        0        1        0
13        0        1        0        1        1
         color
0        black
1          red
2        white
3        white
4  

In [41]:
#base=1 is same as one-hot, base=2 is same as binary, 
#I dont think other bases are useful as we are trying to convert nominal data
ce_basen = ce.BaseNEncoder(cols = ['color'], base=3)
pd.concat([X, ce_basen.fit_transform(X, y)], axis=1)

Unnamed: 0,color,color_0,color_1,color_2,color_3
0,black,0,0,0,1
1,red,0,0,0,2
2,white,0,0,1,0
3,white,0,0,1,0
4,black,0,0,0,1
5,blue,0,0,1,1
6,gray,0,0,1,2
7,green,0,0,2,0
8,gray,0,0,1,2
9,cyan,0,0,2,1


In [39]:
#we can see collision with blue, cyan and pink
ce_hash = ce.HashingEncoder(cols = ['color'], n_components=4)
pd.concat([X,ce_hash.fit_transform(X, y)], axis=1)

Unnamed: 0,color,col_0,col_1,col_2,col_3
0,black,0,1,0,0
1,red,1,0,0,0
2,white,1,0,0,0
3,white,1,0,0,0
4,black,0,1,0,0
5,blue,0,0,0,1
6,gray,0,1,0,0
7,green,1,0,0,0
8,gray,0,1,0,0
9,cyan,0,0,0,1


## article also explains about contrast encoders, but I think they are not widely used
https://www.kaggle.com/discdiver/category-encoders-examples

## Target Encoding

In [31]:
# if we have 11 different colors? we should get 5 columns
df = pd.DataFrame({'color':["black", "red", "white", "white", "black", "blue", 'gray', 'green', 'gray', 'cyan', 'pink', 'maroon', 'dark-gray', 'dark-green'], 'outcome':[1, 0, 0, 0, 0, 1, 0, 1,1,0,1,0,1,1]})
te = ce.TargetEncoder(cols=['color'])
# set up X and y
#X = df.drop('outcome', axis = 1)
#y = df.drop('color', axis = 1)
print(te.fit_transform(df, df.outcome))
print(df.color)

    color  outcome
0    0.50        1
1    0.50        0
2    0.13        0
3    0.13        0
4    0.50        0
5    0.50        1
6    0.50        0
7    0.50        1
8    0.50        1
9    0.50        0
10   0.50        1
11   0.50        0
12   0.50        1
13   0.50        1
0          black
1            red
2          white
3          white
4          black
5           blue
6           gray
7          green
8           gray
9           cyan
10          pink
11        maroon
12     dark-gray
13    dark-green
Name: color, dtype: object


In [34]:
class XYZ:
    
    s_instances = 0
    def __init__(self):
        self.x = 10
        XYZ.s_instances = XYZ.s_instances + 1
        
    @staticmethod
    def square(x):
        s_instances = 0
    
    @classmethod
    def get_instances(cls):
        return cls.s_instances

In [35]:
xyz = XYZ()

UnboundLocalError: local variable 's_instances' referenced before assignment

In [36]:
df

Unnamed: 0,color,outcome
0,black,1
1,red,0
2,white,0
3,white,0
4,black,0
5,blue,1
6,gray,0
7,green,1
8,gray,1
9,cyan,0


In [38]:
df['color_mean'] = df['outcome'].groupby(df['color']).transform('mean')

In [39]:
df

Unnamed: 0,color,outcome,color_mean
0,black,1,0.5
1,red,0,0.0
2,white,0,0.0
3,white,0,0.0
4,black,0,0.5
5,blue,1,1.0
6,gray,0,0.5
7,green,1,1.0
8,gray,1,0.5
9,cyan,0,0.0
