### 단어로 감성 분류하기

In [1]:
import tensorflow as tf
from tensorflow.keras import layers
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import clear_output

import time

### 데이터셋 구성
- 각 단어에 대한 정답을 부정(0), 긍정(1)로 정의해서 데이터셋을 구성


In [2]:
x_train_words = ['good', 'bad', 'amazing', 'so good', 'bull shit',
                 'awesome', 'how dare', 'very much', 'nice', 'god damn it',
                 'very very very happy', 'what the fuck']
y_train = np.array([1, 0, 1, 1, 0,
                    1, 0, 1, 1, 0,
                    1, 0], dtype=np.int32)

In [3]:
# negative sample
index = 1
print("word: {}\nlabel: {}".format(x_train_words[index], y_train[index]))

word: bad
label: 0


In [4]:
# positive sample
index = 0
print("word: {}\nlabel: {}".format(x_train_words[index], y_train[index]))

word: good
label: 1


### 텍스트데이터 처리를 위한 Tokenizer사용

In [5]:
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

In [6]:
tokenizer = Tokenizer(char_level=True)

In [7]:
tokenizer.fit_on_texts(x_train_words)

In [8]:
num_chars = len(tokenizer.word_index) + 1
print("number of characters: {}".format(num_chars))

number of characters: 25


In [9]:
tokenizer.word_index

{' ': 1,
 'a': 4,
 'b': 19,
 'c': 18,
 'd': 5,
 'e': 2,
 'f': 23,
 'g': 9,
 'h': 6,
 'i': 11,
 'k': 24,
 'l': 20,
 'm': 10,
 'n': 14,
 'o': 3,
 'p': 21,
 'r': 7,
 's': 15,
 't': 12,
 'u': 16,
 'v': 13,
 'w': 17,
 'y': 8,
 'z': 22}

In [10]:
x_train_tokens = tokenizer.texts_to_sequences(x_train_words)

In [11]:
index = 2
print("text: {}".format(x_train_words[index]))
print("token: {}".format(x_train_tokens[index]))

text: amazing
token: [4, 10, 4, 22, 11, 14, 9]


In [12]:
x_train_seq_length = np.array([len(tokens) for tokens in x_train_tokens], dtype=np.int32)
num_seq_length = x_train_seq_length

In [13]:
max_seq_length = np.max(num_seq_length)
print(max_seq_length)

20


In [14]:
pad = 'pre'
# pad = 'post'

In [15]:
x_train_pad = pad_sequences(sequences=x_train_tokens, maxlen=max_seq_length,
                            padding=pad, truncating=pad)

In [16]:
index = 7
print("text : {}\n".format(x_train_words[index]))
print("token : {}\n".format(x_train_tokens[index]))
print("pad: {}".format(x_train_pad[index]))

text : very much

token : [13, 2, 7, 8, 1, 10, 16, 18, 6]

pad: [ 0  0  0  0  0  0  0  0  0  0  0 13  2  7  8  1 10 16 18  6]


In [17]:
idx = tokenizer.word_index
inverse_map = dict(zip(idx.values(), idx.keys()))
print(inverse_map)

{1: ' ', 2: 'e', 3: 'o', 4: 'a', 5: 'd', 6: 'h', 7: 'r', 8: 'y', 9: 'g', 10: 'm', 11: 'i', 12: 't', 13: 'v', 14: 'n', 15: 's', 16: 'u', 17: 'w', 18: 'c', 19: 'b', 20: 'l', 21: 'p', 22: 'z', 23: 'f', 24: 'k'}


In [18]:
def tokens_to_string(tokens):
    words = [inverse_map[token] for token in tokens if token != 0]

    text = "".join(words)

    return text

In [19]:
index = 10
print("original text : \n{}\n".format(x_train_words[index]))
print("tokens: \n{}\n".format(x_train_tokens[index]))
print("tokens to string: \n{}".format(tokens_to_string(x_train_tokens[index])))

original text : 
very very very happy

tokens: 
[13, 2, 7, 8, 1, 13, 2, 7, 8, 1, 13, 2, 7, 8, 1, 6, 4, 21, 21, 8]

tokens to string: 
very very very happy


In [20]:
batch_size = 4
max_epochs = 50
num_units = 16
num_classes = 2 
initializer_scale = 0.1
learning_rate = 1e-3

In [21]:
# tf.data로 data pipline 생성
train_dataset = tf.data.Dataset.from_tensor_slices((x_train_pad, x_train_seq_length, y_train))
train_dataset = train_dataset.shuffle(buffer_size = 100)
train_dataset = train_dataset.repeat()
train_dataset = train_dataset.batch(batch_size = batch_size)
print(train_dataset)

<BatchDataset shapes: ((None, 20), (None,), (None,)), types: (tf.int32, tf.int32, tf.int32)>


In [24]:
model = tf.keras.Sequential([
            layers.Embedding(num_chars, num_chars, embeddings_initializer='identity', trainable=False),
            layers.SimpleRNN(units=num_units),
            layers.Dense(units=num_classes, activation='sigmoid')
])

In [25]:
optimizer = tf.keras.optimizers.Adam(learning_rate)
loss_obj = tf.keras.losses.BinaryCrossentropy(from_logits=False)
mean_loss = tf.keras.metrics.Mean("loss")
loss_history = []

### tf.GradientTape을 이용한 학습 진행

In [27]:
total_steps = int(len(x_train_words)/ batch_size * max_epochs)

for (step, (seq_pad, seq_length, labels)) in enumerate(train_dataset.take(total_steps)):
    start_time = time.time()
    with tf.GradientTape() as tape:
        logits = model(seq_pad)
        loss_value = loss_obj(tf.one_hot(labels, depth=num_classes), logits)

    mean_loss(loss_value)
    loss_history.append((mean_loss.result().numpy()))
    grads = tape.gradient(loss_value, model.variables)
    optimizer.apply_gradients(zip(grads, model.variables))

    if step % 3 == 0:
        clear_output(wait=True)
        duration = time.time() - start_time
        examples_per_sec = batch_size/float(duration)
        epochs = batch_size * step / float(len(x_train_words))
        print("epochs : {:.2f}, step : {}, loss: {:g}, ({:.2f} examples/sec; {: .3f} sec/batch".format(epochs+1, step, loss_value, examples_per_sec, duration))
print("training done!!!!")

epochs : 50.00, step : 147, loss: 0.0615634, (135.28 examples/sec;  0.030 sec/batch
training done!!!!


In [None]:
loss_history = np.array(loss_history)
plt.plot(loss_history, label='train')

### 모델 평가

In [28]:
train_dataset_eval = tf.data.Dataset.from_tensor_slices((x_train_pad, x_train_seq_length, y_train))
train_dataset_eval = train_dataset_eval.batch(batch_size=len(x_train_pad))

In [29]:
loss_object = tf.keras.losses.CategoricalCrossentropy()
acc_object = tf.keras.metrics.CategoricalAccuracy()
val_acc_object = tf.keras.metrics.CategoricalAccuracy()

In [30]:
val_mean_loss = tf.keras.metrics.Mean("loss")
val_mean_accuracy = tf.keras.metrics.Mean("accuracy")

In [31]:
for (step, (seq_pad, seq_length, labels)) in enumerate(train_dataset.take(1)):
    predictions = model(seq_pad, training=False)
    val_loss_value = loss_object(tf.one_hot(labels, depth=num_classes), predictions)
    val_acc_value = val_acc_object(tf.one_hot(labels, depth=num_classes), predictions)

    val_mean_loss(val_loss_value)
    val_mean_accuracy(val_acc_value)

    print("valid loss : {: .4g}, valid accuracy : {: .4g}%".format(val_mean_loss.result(),
                                                                   val_mean_accuracy.result() * 100))

valid loss :  0.004676, valid accuracy :  100%


In [32]:
for (step, (seq_pad, seq_length, labels)) in enumerate(train_dataset_eval.take(1)):
    logits = model(seq_pad)
    predictions = tf.cast(tf.argmax(logits, 1), tf.int32)

In [33]:
predictions

<tf.Tensor: shape=(12,), dtype=int32, numpy=array([1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0], dtype=int32)>

In [34]:
for x , y in zip(seq_pad, predictions):
    if y.numpy() == 1:
        print("{}: positive".format(tokens_to_string(x.numpy())))
    else:
        print("{}: negative".format(tokens_to_string(x.numpy())))

good: positive
bad: negative
amazing: positive
so good: positive
bull shit: negative
awesome: positive
how dare: negative
very much: positive
nice: positive
god damn it: negative
very very very happy: positive
what the fuck: negative
