# One Hot Encoding

In [None]:
!pip install -q tensorflow-gpu==2.0.0-rc1

In [4]:
# Toy corpus: six three-word sentences, plus a seventh that joins the first
# two with a space so that one sample is longer than the rest.
samples = [
    "철수은 축구를 좋아한다",
    "영희은 축구를 싫어한다",
    "철수은 도서관을 좋아한다",
    "영희은 도서관을 싫어한다",
    "철수은 공부를 좋아한다",
    "영희은 공부를 싫어한다",
    "철수은 축구를 좋아한다 영희은 축구를 싫어한다",
]

### keras Preprocessing API 사용

In [5]:
from tensorflow.keras import preprocessing

# Fit a Keras tokenizer on the corpus so it builds its word vocabulary.
tokenizer = preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(samples)

# Vocabulary learned during fitting: word -> integer index.
word_to_index = tokenizer.word_index

# Each sentence rewritten as the list of its words' integer indices.
sequences = tokenizer.texts_to_sequences(samples)

In [6]:
# Show the integer-encoded sentences and the word -> index vocabulary,
# one per line.
print(
    f"token_sentences : {sequences}",
    f"word_to_index : {word_to_index}",
    sep="\n",
)

token_sentences : [[1, 2, 3], [4, 2, 5], [1, 6, 3], [4, 6, 5], [1, 7, 3], [4, 7, 5], [1, 2, 3, 4, 2, 5]]
word_to_index : {'철수은': 1, '축구를': 2, '좋아한다': 3, '영희은': 4, '싫어한다': 5, '도서관을': 6, '공부를': 7}


In [7]:
# Invert the vocabulary so an integer index can be mapped back to its word.
index_to_word = {index: word for word, index in word_to_index.items()}
print(index_to_word)

{1: '철수은', 2: '축구를', 3: '좋아한다', 4: '영희은', 5: '싫어한다', 6: '도서관을', 7: '공부를'}


In [8]:
# Width of every one-hot vector: one slot per vocabulary word plus slot 0.
# The indices in word_to_index start at 1, so slot 0 is never set.
vector_size = len(word_to_index) + 1

# For each sentence, expand every word index into a vector that is 1 at that
# index and 0 everywhere else.
onehot_encodeds = [
    [
        [int(position == word_index) for position in range(vector_size)]
        for word_index in sentence
    ]
    for sentence in sequences
]

# Inspect the encoding of the first sentence only.
print(onehot_encodeds[0])

[[0, 1, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0, 0]]


### tensorflow onehot encode 만들기

In [12]:
import tensorflow as tf

# Depth of the one-hot axis: vocabulary size plus one, so that the largest
# word index is still a valid position.
depth = len(word_to_index) + 1
separator = '\n' + '=' * 30 + '\n'

# Encode each sentence with tf.one_hot and print it next to its source indices.
for sequence in sequences:
    onehot_encoded = tf.one_hot(indices=sequence, depth=depth)
    print(
        f'origin sequence: {sequence}',
        f'one-hot encoded sequence\n{onehot_encoded}',
        separator,
        sep='\n',
    )

origin sequence: [1, 2, 3]
one-hot encoded sequence
[[0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0.]]


origin sequence: [4, 2, 5]
one-hot encoded sequence
[[0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0.]]


origin sequence: [1, 6, 3]
one-hot encoded sequence
[[0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0.]]


origin sequence: [4, 6, 5]
one-hot encoded sequence
[[0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0.]]


origin sequence: [1, 7, 3]
one-hot encoded sequence
[[0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 1. 0. 0. 0. 0.]]


origin sequence: [4, 7, 5]
one-hot encoded sequence
[[0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 1. 0. 0.]]


origin sequence: [1, 2, 3, 4, 2, 5]
one-hot encoded sequence
[[0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0