# Different ways of encoding categorical data

In [31]:
from sklearn import preprocessing
import numpy as np

## LabelEncoder

### Transforming categorical data into discrete numbers

In [8]:
# Create the encoder and learn its label set in a single chained call
# (fit hands back the encoder itself, so `le` ends up bound to the fitted
# object). The repeated "paris" shows that duplicate labels are accepted.
le = preprocessing.LabelEncoder().fit(
    ["paris", "paris", "tokyo", "amsterdam"]
)
le

LabelEncoder()

In [9]:
# Show the labels learned during fit as a plain Python list. Duplicates from
# the fit input appear once, and each label's position in this list is the
# integer code it is mapped to.
list(le.classes_)

['amsterdam', 'paris', 'tokyo']

In [10]:
# Encode labels as integer codes; each code is the label's index in le.classes_.
le.transform(["tokyo", "tokyo", "paris"])

array([2, 2, 1], dtype=int64)

In [11]:
# Decode integer codes back into their labels; list() turns the result into a
# plain Python list for display.
list(le.inverse_transform([2, 2, 1]))

['tokyo', 'tokyo', 'paris']

In [12]:
# Same decoding as the previous cell but without list(): the result is shown
# as a NumPy array of strings.
le.inverse_transform([2,2,1])

array(['tokyo', 'tokyo', 'paris'], 
      dtype='<U9')

In [13]:
# A second, independent encoder, built and fitted on a list of names in one
# chained call (fit returns the encoder, so the binding is the fitted object).
labelEncoder = preprocessing.LabelEncoder().fit(
    ['Prasanna', 'Rashmi', 'Lalitha', 'Subramanyam']
)
labelEncoder

LabelEncoder()

In [14]:
# Learned labels as the raw NumPy array. Note they are stored in sorted order,
# not in the order they were passed to fit.
labelEncoder.classes_

array(['Lalitha', 'Prasanna', 'Rashmi', 'Subramanyam'], 
      dtype='<U11')

In [15]:
# Encode the same four names that were used for fitting; each resulting code
# is that name's index in labelEncoder.classes_.
labelEncoder.transform(['Prasanna', 'Rashmi', 'Lalitha', 'Subramanyam'])

array([1, 2, 0, 3], dtype=int64)

In [17]:
# fit_transform learns the labels and encodes the input in one call.
# NOTE: this refits the existing encoder, so the name labels learned earlier
# are replaced by these weekday labels (see classes_ in the next cell).
labelEncoder.fit_transform(['Monday', 'Tuesday', 'Wednesday', 'Thursday'])

array([0, 2, 3, 1], dtype=int64)

In [18]:
# After the fit_transform call above, classes_ holds the (sorted) weekday
# labels rather than the names the encoder was originally fitted on.
labelEncoder.classes_

array(['Monday', 'Thursday', 'Tuesday', 'Wednesday'], 
      dtype='<U9')

# One Hot Encoding

In [34]:
np.array([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]])

# When we pass such a multidimensional array into the fit function, it is interpreted as if it were a data frame:
# each "column" is treated as a separate categorical feature.
# The first "column" contains [0,1,0,1] -> two different categories
# The second "column" contains [0,1,2,0] -> three different categories
# The third "column" contains [3,0,1,2] -> four different categories

# Later, when we call the "transform" method, it expects a value for each "column" which will then be encoded into the
# binary "one hot encoding" format

array([[0, 0, 3],
       [1, 1, 0],
       [0, 2, 1],
       [1, 0, 2]])

In [35]:
# Build the one-hot encoder and fit it on the 4x3 example matrix in one
# chained call; every column is learned as its own categorical feature.
enc = preprocessing.OneHotEncoder().fit(
    [
        [0, 0, 3],
        [1, 1, 0],
        [0, 2, 1],
        [1, 0, 2],
    ]
)
enc

OneHotEncoder(categorical_features='all', dtype=<class 'float'>,
       handle_unknown='error', n_values='auto', sparse=True)

In [36]:
# One-hot encode a single row. The encoder produces a sparse result, so
# .toarray() converts it to a dense NumPy array for display. The output has
# 9 columns: 2 + 3 + 4 categories across the three input columns.
enc.transform([[0, 1, 3]]).toarray()

array([[ 1.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  1.]])

In [38]:
# Encode two rows at once; each input row yields one 9-wide one-hot row.
enc.transform([[0, 0, 0], [1, 1, 1]]).toarray()

array([[ 1.,  0.,  1.,  0.,  0.,  1.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  1.,  0.,  0.,  1.,  0.,  0.]])

In [39]:
# ^ Column layout of the encoded output above, where colX_Y is 1 when input column X has the value Y:
# [col0_0, col0_1, col1_0, col1_1, col1_2, col2_0, col2_1, col2_2, col2_3]