In [5]:
import re
import sys
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

In [6]:
print (f"tensorflow version : {tf.__version__}")

tensorflow version : 2.5.0


In [10]:
sentences = list()
buff_sentence = list()

with open('train.txt', mode='r') as f:
    """
    train.txt
    

    """

    data = f.readlines()
    for sentence in data:
        if len(sentence) == 0 or sentence.startswith('-DOCSTART-') or sentence [0] == "\n":
            if len(buff_sentence) > 0:
                sentences.append(buff_sentence)
                buff_sentence = list()
            continue

        splits = sentence.split(' ')
        splits[-1] = re.sub(r"\n", '', splits[-1]) # 줄바꿈 제거
        word = splits[0].lower() # 단어를 소문자화
        buff_sentence.append([word, splits[-1]])

In [12]:
print (sentences[0])

[['eu', 'B-ORG'], ['rejects', 'O'], ['german', 'B-MISC'], ['call', 'O'], ['to', 'O'], ['boycott', 'O'], ['british', 'B-MISC'], ['lamb', 'O'], ['.', 'O']]


In [14]:
sentences_info = list() # 단어 변수
tags_info = list() # 태그 변수

In [16]:
for sent in sentences:
    word, tag = zip(*sent) # zip 함수를 사용해서 word, tag를 분리
    sentences_info.append(list(word)) # 각 분리된 정보에서 word만 따로 저장
    tags_info.append(list(tag)) # 각 분리된 정보서 tag만 따로 저장

In [17]:
print(f"첫 번째 문장: {sentences_info[0]}")
print(f"첫 번째 태그 정보: {tags_info[0]}")

첫 번째 문장: ['eu', 'rejects', 'german', 'call', 'to', 'boycott', 'british', 'lamb', '.']
첫 번째 태그 정보: ['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']


In [18]:
print (f"열 한 번째 문장: {sentences_info[10]}")
print (f"열 한 번째 문장: {tags_info[10]}")

열 한 번째 문장: ['spanish', 'farm', 'minister', 'loyola', 'de', 'palacio', 'had', 'earlier', 'accused', 'fischler', 'at', 'an', 'eu', 'farm', 'ministers', "'", 'meeting', 'of', 'causing', 'unjustified', 'alarm', 'through', '"', 'dangerous', 'generalisation', '.', '"']
열 한 번째 문장: ['B-MISC', 'O', 'O', 'B-PER', 'I-PER', 'I-PER', 'O', 'O', 'O', 'B-PER', 'O', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [21]:
vocab_size = 5000
src_tokenizer = Tokenizer(num_words=vocab_size, oov_token='OOV')
src_tokenizer.fit_on_texts(sentences_info)
tar_tokenizer = Tokenizer()
tar_tokenizer.fit_on_texts(tags_info)

In [22]:
x_train = src_tokenizer.texts_to_sequences(sentences_info)
y_train = tar_tokenizer.texts_to_sequences(tags_info)

In [23]:
print (sentences_info[0])
print (tags_info[0])

['eu', 'rejects', 'german', 'call', 'to', 'boycott', 'british', 'lamb', '.']
['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']


In [24]:
print (x_train[0])
print (y_train[0])

[989, 1, 205, 629, 7, 3939, 216, 1, 3]
[4, 1, 7, 1, 1, 1, 7, 1, 1]


In [25]:
index2word = src_tokenizer.index_word
index2tag = tar_tokenizer.index_word

decoded = list()

decoded = [index2word[index] for index in x_train[0]]

print (f"기존 문장: {sentences_info[0]}")
print (f"Vocabulary에 없어 OOV 처리된 단어: {decoded}")

기존 문장: ['eu', 'rejects', 'german', 'call', 'to', 'boycott', 'british', 'lamb', '.']
Vocabulary에 없어 OOV 처리된 단어: ['eu', 'OOV', 'german', 'call', 'to', 'boycott', 'british', 'OOV', '.']


In [26]:
max_len = 80
x_train_padded = pad_sequences(x_train, padding='post', maxlen=max_len)
y_train_padded = pad_sequences(y_train, padding='post', maxlen=max_len)

print (x_train_padded[0])
print (y_train_padded[0])

[ 989    1  205  629    7 3939  216    1    3    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0]
[4 1 7 1 1 1 7 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0]


In [32]:
x_train, x_test, y_train, y_test = train_test_split(x_train_padded, y_train_padded, test_size=.2, random_state=555)

print (len(x_train))
print (len(y_train))

11232
11232


In [33]:
tag_size = len(tar_tokenizer.word_index) + 1
y_train = to_categorical(y_train, num_classes=tag_size)
y_test = to_categorical(y_test, num_classes=tag_size)

In [34]:
print (f"size of training sample sentence: {x_train.shape}")
print (f"size of training sample label: {y_train.shape}")
print (f"size of test sample sentence: {x_test.shape}")
print (f"size of test sample label: {y_test.shape}")

size of training sample sentence: (11232, 80)
size of training sample label: (11232, 80, 10)
size of test sample sentence: (2809, 80)
size of test sample label: (2809, 80, 10)


In [35]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, Bidirectional, TimeDistributed
from tensorflow.keras.optimizers import Adam

In [36]:
model = Sequential()

model.add(Embedding(input_dim=vocab_size, output_dim=128, input_length=max_len, mask_zero=True))
model.add(Bidirectional(LSTM(256, return_sequences=True)))
model.add(TimeDistributed(Dense(tag_size, activation='softmax')))

In [37]:
model.compile(loss='categorical_crossentropy', optimizer=Adam(1e-3), metrics=['accuracy'])

In [47]:
history = model.fit(x_train, y_train, batch_size=128, epochs=10, validation_data=(x_test, y_test))


model.save("bid_lstm_on_ner")
model.save_weights("cpkt")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10




INFO:tensorflow:Assets written to: bid_lstm_on_ner\assets


INFO:tensorflow:Assets written to: bid_lstm_on_ner\assets


In [48]:
print (f'train loss: {history.history["loss"]}')
print (f'train accuracy: {history.history["loss"]}')
print (f'val loss: {history.history["loss"]}')
print (f'val accuracy: {history.history["loss"]}')

train loss: [0.08566053956747055, 0.05719640105962753, 0.03935857489705086, 0.029298366978764534, 0.023310862481594086, 0.019540460780262947, 0.016719359904527664, 0.014471057802438736, 0.012598689645528793, 0.01121416687965393]
train accuracy: [0.08566053956747055, 0.05719640105962753, 0.03935857489705086, 0.029298366978764534, 0.023310862481594086, 0.019540460780262947, 0.016719359904527664, 0.014471057802438736, 0.012598689645528793, 0.01121416687965393]
val loss: [0.08566053956747055, 0.05719640105962753, 0.03935857489705086, 0.029298366978764534, 0.023310862481594086, 0.019540460780262947, 0.016719359904527664, 0.014471057802438736, 0.012598689645528793, 0.01121416687965393]
val accuracy: [0.08566053956747055, 0.05719640105962753, 0.03935857489705086, 0.029298366978764534, 0.023310862481594086, 0.019540460780262947, 0.016719359904527664, 0.014471057802438736, 0.012598689645528793, 0.01121416687965393]


In [49]:
print (f"\n test accuracy: %.4f " % (model.evaluate(x_test, y_test)[1]))


 test accuracy: 0.9614 
