In [1]:
import pandas as pd
df = pd.read_csv('./data/imdb.csv')

In [2]:
df.head()

Unnamed: 0,review,sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [3]:
import tensorflow as tf

In [4]:
tk=tf.keras.preprocessing.text.Tokenizer(num_words=2000, oov_token='<unk>')

In [5]:
tk.fit_on_texts(df['review']) # 빈칸 단위로 잘라서 실행한다

In [8]:
tk.word_index['good']

31

In [9]:
tk.index_word[31]

'good'

In [10]:
import joblib
joblib.dump(tk, 'tokenizer.pkl')

['tokenizer.pkl']

In [11]:
seqs=tk.texts_to_sequences(df['review'])

In [12]:
seqs[0]

[4, 27, 27, 27, 287, 407, 1217, 13, 36, 4, 1218, 1219, 408, 142]

In [13]:
tk.index_word[27]

'very'

In [14]:
data = []
for seq in seqs:
    for i in range(0, len(seq) - 4):
        data.append((seq[i:i+4], seq[i+4]))

In [15]:
import random

In [17]:
random.shuffle(data)

In [18]:
data[0]

([118, 2, 412, 5], 2)

In [19]:
import numpy as np

xs = np.array([x for x, y in data])
ys = np.array([y for x, y in data])

In [20]:
xs

array([[ 118,    2,  412,    5],
       [ 494,    4,  232,  171],
       [1132,  709,    3,    1],
       ...,
       [ 291,  143,  743,  525],
       [ 147,    7,   16,  280],
       [  27,    1,    3,   15]])

In [19]:
joblib.dump((xs, ys), 'lm-data.pkl')

['lm-data.pkl']

## 언어 모형

In [21]:
NUM_WORD = tk.num_words + 1

In [21]:
emb1 = tf.keras.layers.Embedding(
    input_dim=NUM_WORD,
    output_dim=8,
)

언어 모형을 만든다.

In [22]:
lm = tf.keras.Sequential([
    emb1,
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(8, activation='relu'),
    tf.keras.layers.Dense(NUM_WORD)
])

모형 요약을 확인한다

In [23]:
lm.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 8)           16008     
_________________________________________________________________
global_average_pooling1d (Gl (None, 8)                 0         
_________________________________________________________________
dense (Dense)                (None, 8)                 72        
_________________________________________________________________
dense_1 (Dense)              (None, 2001)              18009     
Total params: 34,089
Trainable params: 34,089
Non-trainable params: 0
_________________________________________________________________


모형을 학습시킨다.

In [24]:
lm.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer='adam',
    metrics=['accuracy']
)

In [25]:
lm.fit(xs, ys, epochs=1)



<tensorflow.python.keras.callbacks.History at 0x1af6540ddc0>

모형을 저장한다.

In [26]:
lm.save('lm.krs')

Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
INFO:tensorflow:Assets written to: lm.krs\assets


## 단어 임베딩

단어 임베딩을 확인한다.

In [27]:
e = emb1.embeddings.numpy()
e.shape

(2001, 8)

단어 임베딩은 임베딩 레이어의 가중치와 동일하다.

In [28]:
import numpy as np

w = emb1.get_weights()[0]
np.array_equal(e, w)

True

임베딩을 저장한다.

In [29]:
np.savez('word-emb.npz', emb=e)

GlobalAveragePooling1D는 1번 인덱스를 기준으로 평균을 구한다. 예를 들기 위해 다음과 같은 행렬이 있다고 하자

In [30]:
x = np.array([[[1, 2, 3], [3, 6, 9]]], dtype='float32')
x

array([[[1., 2., 3.],
        [3., 6., 9.]]], dtype=float32)

In [31]:
x.shape

(1, 2, 3)

이 행렬을 GlobalAveragePooling1D 레이어에 통과시키면 다음과 같이 된다

In [32]:
avg = tf.keras.layers.GlobalAveragePooling1D()

In [33]:
y = avg(x).numpy()
y

array([[2., 4., 6.]], dtype=float32)

In [34]:
y.shape

(1, 3)

다음에 나올 단어의 확률 예측

In [35]:
x = xs[0:1]
y = ys[0]

x의 4단어를 확인한다.

In [36]:
[tk.index_word[i] for i in x[0]]

['of', 'the', 'most', '<unk>']

모형에 넣는다.

In [37]:
import numpy as np
logit = lm.predict(x.astype('float32'))
logit

array([[-5.128454 ,  4.9709554,  4.1648808, ..., -4.9584007, -5.1158504,
        -5.11511  ]], dtype=float32)

소프트맥스 함수를 적용하여 확률로 바꾼다.

In [38]:
p = tf.nn.softmax(logit).numpy()
p

array([[8.0910477e-06, 1.9684412e-01, 8.7912105e-02, ..., 9.5908717e-06,
        8.1936714e-06, 8.1997368e-06]], dtype=float32)

여기에서 실제로 나온 단어를 확인한다.

In [39]:
tk.index_word[y]

'characters'

해당 단어의 확률을 본다.

In [40]:
p[0, y]

0.002009325

확률이 가장 높은 단어를 알아본다.

In [41]:
i = p.argmax()
i

1

In [42]:
p[0, i]

0.19684412

In [43]:
tk.index_word[i]

'<unk>'

시퀀스마다 길이가 모두 다르므로 앞에 0을 채워(padding) 길이를 맞춰준다.

In [44]:
pads = tf.keras.preprocessing.sequence.pad_sequences(seqs)

단어 임베딩 불러오기

학습된 단어 임베딩을 불러온다.

In [45]:
import numpy as np

z = np.load('word-emb.npz')
e = z['emb']

감성 분석

감성 분석 모형에 들어갈 임베딩 레이어를 만든다. 언어 모형에서 학습된 가중치로 초기화한다.

In [46]:
emb2 = tf.keras.layers.Embedding(
    input_dim=tk.num_words + 1,
    output_dim=8,
    embeddings_initializer=tf.keras.initializers.Constant(e)
)

감성 분석 모형을 만든다.

In [47]:
model = tf.keras.Sequential([
    emb2,
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(8, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

모형 요약을 확인한다.

In [48]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 8)           16008     
_________________________________________________________________
global_average_pooling1d_2 ( (None, 8)                 0         
_________________________________________________________________
dense_2 (Dense)              (None, 8)                 72        
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 9         
Total params: 16,089
Trainable params: 16,089
Non-trainable params: 0
_________________________________________________________________


모형을 설정한다.

In [49]:
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

학습시킨다.

In [50]:
y = df['sentiment'].values

In [51]:
model.fit(pads, y)



<tensorflow.python.keras.callbacks.History at 0x1af66a0af10>