In [8]:
import numpy as np
from keras.datasets import mnist
from keras.preprocessing.text import Tokenizer

In [9]:
# Load the MNIST train and test splits, each unpacked as a (data, labels) pair.
# Of these, only `test_lab` is used by the visible cells below, as a source of
# sample integer class labels for the label-encoding section.
(train_data,train_lab),(test_data,test_lab) = mnist.load_data()

# **One-hot encoding of text (bag of words)**

In [10]:
# Each "sequence" here is just one sentence, but it could equally be an entire document.
sequences  = ['The cat sat on the mat.', 'The dog ate my homework', 'i love food', 'i hate instagram']

# Create a tokenizer whose vocabulary is capped by num_words=1000 (most common
# words first). Words it has not seen when fitted are represented by the
# placeholder token 'oov' instead of being dropped.
tokenizer = Tokenizer(num_words=1000, oov_token='oov')
# Learn the word -> integer index mapping from the sample sentences.
tokenizer.fit_on_texts(sequences)

In [11]:
# The fitted vocabulary as a word -> integer index mapping. In the printed
# result the 'oov' placeholder holds index 1 and the words seen most often in
# `sequences` ('the', then 'i') receive the next lowest indices.
word_index = tokenizer.word_index
print(word_index)

{'oov': 1, 'the': 2, 'i': 3, 'cat': 4, 'sat': 5, 'on': 6, 'mat': 7, 'dog': 8, 'ate': 9, 'my': 10, 'homework': 11, 'love': 12, 'food': 13, 'hate': 14, 'instagram': 15}


In [12]:
# Turn strings into lists of integer indices. Words absent from the fitted
# vocabulary (e.g. 'am', 'passionate') come back as 1, the index of 'oov'.
int_sequences = tokenizer.texts_to_sequences(['i am passionate abt my work','''don't mess with my dog'''])
int_sequences

[[3, 1, 1, 1, 10, 1], [1, 1, 1, 10, 8]]

In [13]:
# Look up which integer index the out-of-vocabulary placeholder was assigned.
tokenizer.word_index.get('oov')

1

**Converting lists of integer indices to a one-hot encoded array**

**Method 1: `Tokenizer.texts_to_matrix`**

In [14]:
# Encode every text as one fixed-length binary row: column j is 1 when the word
# with index j occurs in that text (a multi-hot / bag-of-words encoding).
encoded_array = tokenizer.texts_to_matrix([
    'i am passionate abt my work',
    'i do not hate instagram',
    'the cat sat on the mat',
])
print(encoded_array)

# The tokenizer was built with num_words = 1000, so every row is 1000-dimensional.
print('\n size: ', encoded_array.shape[1])

[[0. 1. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]]

 size:  1000


**Method 2: manual NumPy indexing**

In [15]:
# Manual equivalent of texts_to_matrix: one row per sequence, one column per
# vocabulary slot. The width is read from the tokenizer instead of repeating the
# literal 1000, so it cannot drift from the num_words the tokenizer was built
# with. (This requires num_words to be set; it is 1000 here.)
data = np.zeros((len(int_sequences), tokenizer.num_words))

for i, sen in enumerate(int_sequences):
  # Integer-array indexing sets every column listed in `sen` at once. A word id
  # that repeats is simply written again, so entries are 0/1 flags, not counts.
  data[i, sen] = 1

data

array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]])

# **Encoding Labels**

**Encoding integer labels**

In [16]:
from keras.utils import to_categorical

In [17]:
# One-hot encode the first 100 test labels. num_classes is passed explicitly:
# left to its default, to_categorical infers the width as max(label) + 1, which
# would silently shrink if the slice happened not to contain the highest digit.
encoded_labels = to_categorical(test_lab[:100], num_classes=10)
print('Encoded Labels: ',encoded_labels[:3])
# Each row is one label expanded to a vector with one slot per class.
print('\n shape: ',encoded_labels[5].shape)

Encoded Labels:  [[0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]]

 shape:  (10,)


**Encoding categorical labels**

In [18]:
from sklearn.preprocessing import LabelEncoder

In [19]:
# Sample department labels, described as consecutive (label, repeat count) runs
# and expanded in order into a single flat list of 61 strings.
test_lab2 = [
    label
    for label, repeats in (
        ('HR', 6),
        ('Design', 19),
        ('HR', 12),
        ('Management', 14),
        ('Design', 5),
        ('HR', 5),
    )
    for _ in range(repeats)
]

In [23]:
# Map each distinct label string to an integer id. As the output below shows,
# the ids follow sorted label order: Design -> 0, HR -> 1, Management -> 2.
le = LabelEncoder()
int_labels = le.fit_transform(test_lab2)
int_labels

array([1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1])

**At this point we have two options for one-hot encoding the integer labels**

**Method 1: Keras `to_categorical`**

In [32]:
from keras.utils import to_categorical
# Expand the integer ids into one-hot rows. The ids here are 0, 1 and 2, so each
# row has three columns; the last 15 rows are displayed as a sample.
ohe_cat_en_labels = to_categorical(int_labels)
ohe_cat_en_labels[-15:]

array([[0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.]], dtype=float32)

**Method 2: scikit-learn `OneHotEncoder`**

In [29]:
from sklearn.preprocessing import OneHotEncoder

In [38]:
# One-hot encode the integer ids with scikit-learn. The ids are reshaped into a
# single column because the encoder is given one feature with many samples.
ohe = OneHotEncoder()
# fit_transform yields a sparse matrix here; .toarray() converts it directly to
# a dense ndarray, avoiding the np.matrix intermediate that .todense() produces.
ohe_cat_en_labels2 = ohe.fit_transform(int_labels.reshape(-1, 1)).toarray()
ohe_cat_en_labels2[-15:]

array([[0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.]])

**Encoding Multi-labels**

In [39]:
# Multi-label samples: each row lists every department tag attached to one item.
# The four identical ['Apparel', 'Design'] rows are generated rather than typed
# out; the generator builds a fresh list each time, so the rows are independent.
m_lab = [
    ['Engineering', 'HR'],
    *(['Apparel', 'Design'] for _ in range(4)),
    ['Design', 'IT', 'HR'],
    ['Design', 'IT'],
    ['Engineering', 'IT'],
]

In [40]:
from sklearn.preprocessing import MultiLabelBinarizer
# Binarize the multi-label rows: one column per distinct tag, with a 1 wherever
# the row carries that tag. Judging by the output, columns follow sorted tag
# order (Apparel, Design, Engineering, HR, IT).
mlb = MultiLabelBinarizer()

en_multi_label = mlb.fit_transform(m_lab)
# Show the encoded matrix together with its (n_samples, n_tags) shape.
en_multi_label,en_multi_label.shape

(array([[0, 0, 1, 1, 0],
        [1, 1, 0, 0, 0],
        [1, 1, 0, 0, 0],
        [1, 1, 0, 0, 0],
        [1, 1, 0, 0, 0],
        [0, 1, 0, 1, 1],
        [0, 1, 0, 0, 1],
        [0, 0, 1, 0, 1]]), (8, 5))