# One-hot Encoding


## In NumPy
Step 1. Convert the text to lower case \
Step 2. Tokenize the text \
Step 3. Get the unique words \
Step 4. Sort the word list \
Step 5. Get the integer position of each word in the sorted list \
Step 6. Create a vector for each word by marking its position as 1 and the rest as 0 \
Step 7. Create a matrix from the resulting vectors.

In [None]:
import numpy as np
# Steps 1-2: lower-case the sentence and split it on whitespace into tokens.
docs = "Should I learn NLP".lower().split()
# Steps 3-4: the vocabulary is the sorted list of the unique tokens.
doc1 = sorted(set(docs))
print("\nvalues: ", doc1)

# Step 5: build the word -> position table once instead of converting the
# vocabulary to a NumPy array and scanning it for every token. The lookup
# yields plain Python ints, so the printed list reads [3, 0, 1, 2] rather
# than a list of NumPy scalar reprs.
word_to_index = {word: position for position, word in enumerate(doc1)}
integer_encoded = [word_to_index[token] for token in docs]
print("\ninteger encoded: ", integer_encoded)

def get_vec(len_doc, word, vocab=None):
    """Return the one-hot vector of ``word`` as a list of ``len_doc`` ints.

    Args:
        len_doc: Length of the returned vector (the vocabulary size).
        word: Token to encode; it must occur in the vocabulary.
        vocab: Sequence of vocabulary words whose order defines the vector
            positions. When omitted, the module-level ``doc1`` is used.

    Returns:
        A list of zeros with a single 1 at the position of ``word`` in the
        vocabulary.

    Raises:
        IndexError: If ``word`` is not in the vocabulary, or if its position
            does not fit in a vector of length ``len_doc``.
    """
    if vocab is None:
        # Backward-compatible default: fall back to the global vocabulary.
        vocab = doc1
    positions = np.where(np.array(vocab) == word)[0]
    if positions.size == 0:
        raise IndexError("{!r} is not in the vocabulary".format(word))
    one_hot = [0] * len_doc
    one_hot[positions[0]] = 1
    return one_hot


def get_matrix(doc1, tokens=None):
    """Stack the one-hot vectors of a token sequence into a NumPy array.

    Args:
        doc1: Vocabulary words; their order defines the matrix columns.
        tokens: Words to encode, one matrix row per word. When omitted, the
            module-level ``docs`` is used.

    Returns:
        ``np.ndarray`` built from one one-hot row per token, each row being
        ``len(doc1)`` long.

    Raises:
        IndexError: If a token is not in ``doc1``.
    """
    if tokens is None:
        # Backward-compatible default: encode the global token list.
        tokens = docs
    len_doc = len(doc1)
    # Look each token up in the vocabulary that was passed in, so the row
    # width and the column positions always come from the same word list.
    return np.asarray([get_vec(len_doc, token, vocab=doc1) for token in tokens])

# Show the one-hot matrix built from the vocabulary in doc1.
print("\nMATRIX:")
one_hot_matrix = get_matrix(doc1)
print(one_hot_matrix)


values:  ['i', 'learn', 'nlp', 'should']

integer encoded:  [3, 0, 1, 2]

MATRIX:
[[0 0 0 1]
 [1 0 0 0]
 [0 1 0 0]
 [0 0 1 0]]


## In scikit-learn

Step 1. Convert the text to lower case \
Step 2. Tokenize the text into words \
Step 3. Get each word's integer value, i.e. its position, by using **LabelEncoder()** \
Step 4. Get the one-hot encoding of each word from its label-encoded value by using **OneHotEncoder()**

In [None]:
from numpy import array
from numpy import argmax
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
# define example
# data = ['cold', 'cold', 'warm', 'cold', 'hot', 'hot', 'warm', 'cold', 'warm', 'hot']


# Steps 1-2: lower-case both sentences and split them into word tokens.
doc1 = "Should I learn NLP".lower()
doc2 = "You must learn NLP".lower()
doc1 = doc1.split()
doc2 = doc2.split()
doc1_array = array(doc1)
doc2_array = array(doc2)
# Concatenate the token lists without de-duplicating, so every token
# occurrence (including repeated words) gets its own encoded row.
doc3 = doc1 + doc2
data = list(doc3)


values = array(data)
print(values)
# Step 3: integer encode - each distinct word is mapped to an integer label.
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(values)
print(integer_encoded)

# Step 4: binary (one-hot) encode the integer labels.
# The `sparse=False` keyword was renamed to `sparse_output` in scikit-learn
# 1.2 and removed in 1.4, so neither spelling works on every release.
# Using the default (sparse) encoder and densifying the result with
# `.toarray()` gives the same dense array on old and new versions.
onehot_encoder = OneHotEncoder()
# OneHotEncoder expects a 2-D input: one row per sample, one column here.
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded).toarray()
print(onehot_encoded)


# invert first example: the column holding the 1 is the integer label,
# which the label encoder maps back to the original word.
inverted = label_encoder.inverse_transform([argmax(onehot_encoded[0, :])])
print(inverted)

['should' 'i' 'learn' 'nlp' 'you' 'must' 'learn' 'nlp']
[4 0 1 3 5 2 1 3]
[[0. 0. 0. 0. 1. 0.]
 [1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 1.]
 [0. 0. 1. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0.]]
['should']


## In Keras

Step 1. Convert the text to lower case \
Step 2. Tokenize the text into words \
Step 3. Get each word's integer value, i.e. its position, by using **LabelEncoder()** \
Step 4. Get the one-hot encoding of each word from its label-encoded value by using **to_categorical()**

In [None]:
from keras.preprocessing.text import Tokenizer
from numpy import array
from numpy import argmax
from tensorflow.keras.utils import to_categorical


# Lower-case the sentence first, then split it on whitespace into tokens.
doc = "Should I learn NLP".lower()
doc = doc.split()

def using_Tokenizer(doc):
    """Encode ``doc`` with a Keras ``Tokenizer`` fitted on ``doc`` itself.

    Args:
        doc: The texts to fit on and encode (the caller in this file passes
            a list of single-word strings).

    Returns:
        The value of ``Tokenizer.texts_to_matrix(doc, mode='count')``.
        NOTE(review): the recorded output has one more column than there are
        distinct words, with column 0 all zeros - presumably index 0 is
        reserved by the tokenizer; confirm against the Keras docs.
    """
    tokenizer = Tokenizer()
    # The vocabulary is learned from the very texts that get encoded.
    tokenizer.fit_on_texts(doc)
    # One row per input text, holding per-word counts.
    return tokenizer.texts_to_matrix(doc, mode='count')

def using_to_categorical(doc):
    """One-hot encode ``doc`` via ``LabelEncoder`` and ``to_categorical``.

    Args:
        doc: The word tokens to encode.

    Returns:
        The value of ``to_categorical`` applied to the integer labels that a
        freshly fitted ``LabelEncoder`` assigns to ``doc``.
    """
    # A local encoder is fitted here; it is separate from any module-level
    # ``label_encoder``.
    encoder = LabelEncoder()
    integer_labels = array(encoder.fit_transform(doc))

    # Expand every integer label into its one-hot row.
    return to_categorical(integer_labels)

def invert_encoding(row_num):
    """Decode one row of the module-level one-hot matrix back to its label.

    Reads the globals ``onehot_encoded`` and ``label_encoder``; neither is
    created by this function, so both must already exist when it is called.

    Args:
        row_num: Row index into ``onehot_encoded``.

    Returns:
        The result of ``label_encoder.inverse_transform`` for the index of
        the largest entry in that row.
    """
    row = onehot_encoded[row_num, :]
    # The position of the largest entry is the integer label of the row.
    hot_index = argmax(row)
    return label_encoder.inverse_transform([hot_index])
    
# Run both Keras-based encoders on the same tokens, each under its own
# header and followed by a blank line.
for title, encode in (
    ("===using Keras Tokenizer for OneHotEncoding===", using_Tokenizer),
    ("===using Keras to_categorical for OneHotEncoding===", using_to_categorical),
):
    print(title)
    print(encode(doc))
    print()
# Decode the first row of the one-hot matrix back to its word.
print(invert_encoding(0))

===using Keras Tokenizer for OneHotEncoding===
[[0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1.]]

===using Keras to_categorical for OneHotEncoding===
[[0. 0. 0. 1.]
 [1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 1. 0.]]

['should']
