# Imports Always First

In [103]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, LabelBinarizer
from collections import defaultdict
import warnings
import numpy as np

# Creating Test Dataset

In [2]:
# Toy dataset: 'col1' holds string categories, 'col2' holds integers.
df_encode1 = pd.DataFrame(
    data=dict(
        col1=['this', 'will', 'be', 'encoded'],
        col2=[1, 2, 3, 4],
    )
)

In [3]:
# Keep an independent copy of the untouched frame under a second name,
# then show the original.
df_encode2 = df_encode1.copy(deep=True)
df_encode1

Unnamed: 0,col1,col2
0,this,1
1,will,2
2,be,3
3,encoded,4


---
# Encoding Using get_dummies

In [4]:
# Dummy-encode the frame; drop_first=True omits the first indicator column
# of each encoded variable (here col1_be), leaving k-1 columns for k levels.
pd.get_dummies(data=df_encode1, drop_first=True)

Unnamed: 0,col2,col1_encoded,col1_this,col1_will
0,1,0,1,0
1,2,0,0,1
2,3,0,0,0
3,4,1,0,0


---
# Encoding Using LabelEncoder and OneHotEncoder

In [5]:
# Create one label encoder and one one-hot encoder, both with default settings.
lenc, oenc = LabelEncoder(), OneHotEncoder()

In [6]:
# Label-encode the categorical column: col1 is overwritten in place with the
# integer codes produced by lenc.
df_encode1['col1'] = lenc.fit_transform(df_encode1['col1'])

In [7]:
# Show the frame again: col1 now holds the integer codes written by the
# label-encoding cell, col2 is unchanged.
df_encode1

Unnamed: 0,col1,col2
0,2,1
1,3,2
2,0,3
3,1,4


In [8]:
# One-hot encode the whole frame: every column, including the numeric col2,
# is expanded into indicator columns. The encoder returns a sparse matrix,
# which is converted to a dense array for display.
onehot_sparse = oenc.fit_transform(df_encode1)
onehot_sparse.toarray()

array([[0., 0., 1., 0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0., 1., 0., 0.],
       [1., 0., 0., 0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0., 0., 0., 1.]])

---
# Multi-column Encoding

## Label Encoding

In [9]:
# Extend the saved copy of the original frame with a second string column.
col3_frame = pd.DataFrame({'col3': ['Another', 'encoding', 'attempt', 'here']})
df_multiencode = pd.concat([df_encode2, col3_frame], axis='columns')
display(df_multiencode)

Unnamed: 0,col1,col2,col3
0,this,1,Another
1,will,2,encoding
2,be,3,attempt
3,encoded,4,here


In [90]:
# One LabelEncoder per column, created on first access and kept in
# inverse_dict so the encoding can be reversed later.
inverse_dict = defaultdict(LabelEncoder)

# for loop version: fit each column's own encoder and collect its integer codes.
encoded_columns = {}
for column in df_multiencode.columns:
    encoded_columns[column] = inverse_dict[column].fit_transform(df_multiencode[column])

# Build the frame once and keep the source index, so the result lines up with
# df_multiencode exactly like the DataFrame.apply variant does.
df_encoded = pd.DataFrame(encoded_columns, index=df_multiencode.index)

display(df_encoded)

Unnamed: 0,col1,col2,col3
0,2,0,0
1,3,1,2
2,0,2,1
3,1,3,3


In [91]:
# one line version: DataFrame.apply hands each column to the lambda as a
# Series, whose .name selects that column's encoder in inverse_dict.
# apply returns a new frame, so df_multiencode is not copied beforehand.
df_encoded = df_multiencode.apply(
    lambda column_values: inverse_dict[column_values.name].fit_transform(column_values)
)
display(df_encoded)

Unnamed: 0,col1,col2,col3
0,2,0,0
1,3,1,2
2,0,2,1
3,1,3,3


### Label Encoding Inverse

In [94]:
# Silence warnings. Note that this filter is global: it stays active for
# every cell executed afterwards in the same kernel session.
warnings.filterwarnings('ignore')

# Map the integer codes of every column back to the original labels, using
# the encoder stored for that column in inverse_dict.
df_rencoded = pd.DataFrame({
    name: inverse_dict[name].inverse_transform(df_encoded[name])
    for name in df_encoded.columns
})
df_rencoded

Unnamed: 0,col1,col2,col3
0,this,1,Another
1,will,2,encoding
2,be,3,attempt
3,encoded,4,here


## One Hot Encoding

In [72]:
# One OneHotEncoder per column, created on first access.
one_hot_dict = defaultdict(OneHotEncoder)

# Encode each column separately and collect the resulting indicator frames.
# They are joined once after the loop instead of re-concatenating the growing
# result on every iteration.
onehot_frames = []
for column in df_encoded.columns:
    # The encoder expects 2-D input, so the column is reshaped to (n_rows, 1).
    one_hot_target = df_encoded[column].values.reshape(-1, 1)
    X = one_hot_dict[column].fit_transform(one_hot_target).toarray()
    # Indicator columns are named <column>_<position>; format() also copes
    # with column labels that are not strings.
    one_hot_column = ['{}_{}'.format(column, position) for position in range(X.shape[1])]
    onehot_frames.append(pd.DataFrame(X, columns=one_hot_column))

# pd.concat raises on an empty list, so fall back to an empty frame when
# df_encoded has no columns.
df_onehot = pd.concat(onehot_frames, axis=1) if onehot_frames else pd.DataFrame()

df_onehot

Unnamed: 0,col1_0,col1_1,col1_2,col1_3,col2_0,col2_1,col2_2,col2_3,col3_0,col3_1,col3_2,col3_3
0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


### One Hot Inverse Deprecated

---
# Label Binarizer Encoder

In [129]:
# One LabelBinarizer per column, created on first access and kept in
# bina_dict so the fitted binarizers can be inspected afterwards.
bina_dict = defaultdict(LabelBinarizer)

# Binarize each column of the decoded frame separately and collect the
# resulting indicator frames; they are joined once after the loop instead of
# re-concatenating the growing result on every iteration.
bina_frames = []
for column in df_rencoded.columns:
    X = bina_dict[column].fit_transform(df_rencoded[column].values)
    # Indicator columns are named <column>_<position>; format() also copes
    # with column labels that are not strings.
    bina_column = ['{}_{}'.format(column, position) for position in range(X.shape[1])]
    bina_frames.append(pd.DataFrame(X, columns=bina_column))

# pd.concat raises on an empty list, so fall back to an empty frame when
# df_rencoded has no columns.
df_bina = pd.concat(bina_frames, axis=1) if bina_frames else pd.DataFrame()

df_bina

Unnamed: 0,col1_0,col1_1,col1_2,col1_3,col2_0,col2_1,col2_2,col2_3,col3_0,col3_1,col3_2,col3_3
0,0,0,1,0,1,0,0,0,1,0,0,0
1,0,0,0,1,0,1,0,0,0,0,1,0
2,1,0,0,0,0,0,1,0,0,1,0,0
3,0,1,0,0,0,0,0,1,0,0,0,1


In [130]:
# Look up the LabelBinarizer stored under 'col1' in bina_dict.
x = bina_dict['col1']

In [131]:
# Show the labels held by the col1 binarizer.
x.classes_

array(['be', 'encoded', 'this', 'will'], dtype='<U7')

# Bonus

In [128]:
# custom helper function to encode your categorical data
def cust_label_binarizer():
    """Placeholder for a custom categorical-encoding helper.

    Not implemented yet: takes no arguments and returns None.
    """

# Sources:

1. http://www.insightsbot.com/blog/McTKK/python-one-hot-encoding-with-scikit-learn