In [3]:
import numpy as np

from tensorflow.keras.layers import Embedding, Flatten, Dense, LSTM, SimpleRNN
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

In [18]:
# Partial excerpt from Arirang News articles.
# This single multi-line string is the whole corpus: the tokenizer is fitted on it
# and the next-word training pairs are derived from it. The text is kept verbatim
# (including irregular spacing) because the learned vocabulary depends on it.
text_data = """Tackling inflation and expanding employment are on the government's priority list next year.
This is according to Finance Minister Choo Kyung-ho  on this Wednesday.
He also spoke of a new growth strategy to be shared within the year to ensure economic expansion amid the prolonged presence of risk factors here and elsewhere.
The new strategy will seek to upgrade existing plans on agriculture manufacturing and IT as well as  take pro-active steps to boost exports and investment.
Starting with the latest from the Qatar 2022 World Cup Argentina is through to the World Cup final after beating Croatia 3-nil.
The win means the South American side reaches its sixth World Cup final.
Argentina opened the scoring with a penalty by captain Lionel Messi in the 34th minute. Then five minutes later Julian Alvarez doubled Argentina's lead before scoring the team's third goal in the second half.
The win also means that Messi has a chance of claiming the record for the most appearances at the World Cup.
He is currently tied at 25 appearances with Lothar Matth us of Germany.
Defending champion France takes on underdog Morocco in the other semi-final on Thursday at 4AM Korea time.
"""

In [19]:
# Learn the word -> integer-id vocabulary from the article text.
# The Tokenizer API works on a list of documents; here there is exactly one.
corpus = [text_data]
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

# Convert the text to its integer sequence. One document in means one
# sequence out, so unpack that single result.
(encoded,) = tokenizer.texts_to_sequences(corpus)
print(encoded)

[27, 28, 4, 29, 30, 31, 3, 1, 32, 33, 34, 35, 14, 15, 8, 36, 2, 37, 38, 39, 40, 41, 3, 15, 42, 16, 17, 43, 5, 9, 18, 44, 19, 2, 45, 46, 47, 1, 14, 2, 48, 49, 50, 51, 1, 52, 53, 5, 54, 55, 56, 4, 57, 1, 18, 19, 58, 59, 2, 60, 61, 62, 3, 63, 64, 4, 65, 20, 66, 20, 67, 68, 69, 70, 2, 71, 72, 4, 73, 74, 10, 1, 75, 76, 1, 77, 78, 6, 7, 21, 8, 79, 2, 1, 6, 7, 11, 80, 81, 82, 83, 84, 1, 22, 23, 1, 85, 86, 87, 88, 89, 90, 6, 7, 11, 21, 91, 1, 24, 10, 9, 92, 93, 94, 95, 25, 12, 1, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 24, 1, 108, 109, 110, 12, 1, 111, 112, 1, 22, 17, 23, 113, 25, 114, 9, 115, 5, 116, 1, 117, 118, 1, 119, 26, 13, 1, 6, 7, 16, 8, 120, 121, 13, 122, 26, 10, 123, 124, 125, 5, 126, 127, 128, 129, 130, 3, 131, 132, 12, 1, 133, 134, 11, 3, 135, 13, 136, 137, 138]


In [20]:
# Inspect the learned word -> id mapping and derive the vocabulary size.
print(tokenizer.word_index)

# Word ids start at 1, so one extra slot is added for index 0; this value is
# later used as both the embedding input dimension and the softmax width.
vocab_size = 1 + len(tokenizer.word_index)
print('vocab_size', vocab_size)

{'the': 1, 'to': 2, 'on': 3, 'and': 4, 'of': 5, 'world': 6, 'cup': 7, 'is': 8, 'a': 9, 'with': 10, 'final': 11, 'in': 12, 'at': 13, 'year': 14, 'this': 15, 'he': 16, 'also': 17, 'new': 18, 'strategy': 19, 'as': 20, 'argentina': 21, 'win': 22, 'means': 23, 'scoring': 24, 'messi': 25, 'appearances': 26, 'tackling': 27, 'inflation': 28, 'expanding': 29, 'employment': 30, 'are': 31, "government's": 32, 'priority': 33, 'list': 34, 'next': 35, 'according': 36, 'finance': 37, 'minister': 38, 'choo': 39, 'kyung': 40, 'ho': 41, 'wednesday': 42, 'spoke': 43, 'growth': 44, 'be': 45, 'shared': 46, 'within': 47, 'ensure': 48, 'economic': 49, 'expansion': 50, 'amid': 51, 'prolonged': 52, 'presence': 53, 'risk': 54, 'factors': 55, 'here': 56, 'elsewhere': 57, 'will': 58, 'seek': 59, 'upgrade': 60, 'existing': 61, 'plans': 62, 'agriculture': 63, 'manufacturing': 64, 'it': 65, 'well': 66, 'take': 67, 'pro': 68, 'active': 69, 'steps': 70, 'boost': 71, 'exports': 72, 'investment': 73, 'starting': 74, 'la

In [22]:
# Slide a two-token window over the encoded corpus so that every entry is
# [current word id, next word id]; neighbouring windows overlap by one token.
# (The variable keeps its original spelling because the following cell reads it.)
seqences = [encoded[start:start + 2] for start in range(len(encoded) - 1)]
print(seqences)
print('length', len(seqences))

[[27, 28], [28, 4], [4, 29], [29, 30], [30, 31], [31, 3], [3, 1], [1, 32], [32, 33], [33, 34], [34, 35], [35, 14], [14, 15], [15, 8], [8, 36], [36, 2], [2, 37], [37, 38], [38, 39], [39, 40], [40, 41], [41, 3], [3, 15], [15, 42], [42, 16], [16, 17], [17, 43], [43, 5], [5, 9], [9, 18], [18, 44], [44, 19], [19, 2], [2, 45], [45, 46], [46, 47], [47, 1], [1, 14], [14, 2], [2, 48], [48, 49], [49, 50], [50, 51], [51, 1], [1, 52], [52, 53], [53, 5], [5, 54], [54, 55], [55, 56], [56, 4], [4, 57], [57, 1], [1, 18], [18, 19], [19, 58], [58, 59], [59, 2], [2, 60], [60, 61], [61, 62], [62, 3], [3, 63], [63, 64], [64, 4], [4, 65], [65, 20], [20, 66], [66, 20], [20, 67], [67, 68], [68, 69], [69, 70], [70, 2], [2, 71], [71, 72], [72, 4], [4, 73], [73, 74], [74, 10], [10, 1], [1, 75], [75, 76], [76, 1], [1, 77], [77, 78], [78, 6], [6, 7], [7, 21], [21, 8], [8, 79], [79, 2], [2, 1], [1, 6], [6, 7], [7, 11], [11, 80], [80, 81], [81, 82], [82, 83], [83, 84], [84, 1], [1, 22], [22, 23], [23, 1], [1, 85], [

In [23]:
# Stack the [current, next] pairs into a 2-column array so the columns can be
# split into model input and target.
seqences = np.array(seqences)

# Column 0: the word the model sees. Column 1: the word it should predict.
X = seqences[:, 0]
y = seqences[:, 1]

print("X: ", X)
print("y: ", y)

X:  [ 27  28   4  29  30  31   3   1  32  33  34  35  14  15   8  36   2  37
  38  39  40  41   3  15  42  16  17  43   5   9  18  44  19   2  45  46
  47   1  14   2  48  49  50  51   1  52  53   5  54  55  56   4  57   1
  18  19  58  59   2  60  61  62   3  63  64   4  65  20  66  20  67  68
  69  70   2  71  72   4  73  74  10   1  75  76   1  77  78   6   7  21
   8  79   2   1   6   7  11  80  81  82  83  84   1  22  23   1  85  86
  87  88  89  90   6   7  11  21  91   1  24  10   9  92  93  94  95  25
  12   1  96  97  98  99 100 101 102 103 104 105 106 107  24   1 108 109
 110  12   1 111 112   1  22  17  23 113  25 114   9 115   5 116   1 117
 118   1 119  26  13   1   6   7  16   8 120 121  13 122  26  10 123 124
 125   5 126 127 128 129 130   3 131 132  12   1 133 134  11   3 135  13
 136 137]
y:  [ 28   4  29  30  31   3   1  32  33  34  35  14  15   8  36   2  37  38
  39  40  41   3  15  42  16  17  43   5   9  18  44  19   2  45  46  47
   1  14   2  48  49  50  51   1 

In [24]:
# Next-word model: one word id in, a probability for every vocabulary id out.
model = Sequential([
    # Map each of the vocab_size ids to a 10-dimensional vector; every sample
    # is a single token (input_length=1).
    Embedding(vocab_size, 10, input_length=1),
    # Recurrent layer with 50 units that summarises the (length-1) sequence.
    LSTM(50),
    # One softmax output per vocabulary id.
    Dense(vocab_size, activation='softmax'),
])
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 1, 10)             1390      
                                                                 
 lstm_1 (LSTM)               (None, 50)                12200     
                                                                 
 dense_1 (Dense)             (None, 139)               7089      
                                                                 
Total params: 20,679
Trainable params: 20,679
Non-trainable params: 0
_________________________________________________________________


In [25]:
# The targets in y are integer word ids (not one-hot vectors), which is what
# the sparse variant of categorical cross-entropy expects.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [26]:
# Train on the (current word -> next word) pairs. verbose=0 silences the
# per-epoch log; the History object returned by fit() is the cell's output.
model.fit(
    x=X,
    y=y,
    epochs=500,
    verbose=0,
)

<keras.callbacks.History at 0x11ba9b1d7c0>

In [27]:
# Seed word whose successor the model should predict.
test_text = 'the'

# The tokenizer was created without an OOV token, so words it never saw during
# fitting are dropped by texts_to_sequences. An unknown seed word would then
# produce an empty sequence and only fail later, inside model.predict, with an
# obscure shape error -- fail here with a clear message instead.
token_ids = tokenizer.texts_to_sequences([test_text])[0]
if not token_ids:
    raise ValueError(
        f"test_text {test_text!r} contains no word known to the tokenizer"
    )

# Wrap in an outer list to add the batch dimension expected by model.predict.
# NOTE: this rebinds `encoded`, which earlier held the encoded training corpus;
# re-run the tokenizing cell before rebuilding the training pairs.
encoded = np.array([token_ids])
print('encoded', encoded)

encoded [[1]]


In [28]:
# Run the trained model on the encoded seed word. Despite the variable name,
# the result is not one-hot: it is the softmax layer's probability for every
# vocabulary id, one row per input sample.
onehot_output = model.predict(x=encoded)
print('onehot_output=', onehot_output)

onehot_output= [[2.90034950e-06 5.27322129e-07 1.97993405e-03 5.63995926e-08
  1.07224296e-04 8.64109606e-05 1.17147572e-01 2.74854625e-04
  1.02754657e-05 1.36677514e-09 2.11810657e-05 7.25426617e-06
  1.98646943e-04 3.01227155e-05 6.88957497e-02 1.50546883e-04
  6.01633246e-06 3.58130683e-06 5.22880629e-02 3.19267315e-06
  1.21034172e-05 9.63237653e-07 1.09946214e-01 1.06669659e-05
  4.90535200e-02 3.94957548e-04 9.05690811e-08 2.89589411e-06
  2.07443358e-04 2.44648618e-06 8.51610480e-07 7.66913004e-08
  5.78046292e-02 3.18735864e-08 1.60197249e-06 1.18955199e-06
  5.03035153e-07 8.88712464e-07 2.98333089e-05 1.48662264e-04
  5.42948328e-05 6.23341941e-04 8.73820227e-07 9.19081776e-06
  3.83676934e-06 2.36187452e-06 4.85169257e-05 7.55766962e-07
  1.78451864e-06 1.07800530e-04 8.05608852e-06 1.96170173e-08
  5.67196794e-02 5.90435895e-11 3.73047593e-09 2.90986482e-05
  1.44695041e-05 1.98165026e-06 3.68993133e-05 1.11574057e-08
  1.63965888e-06 2.05894303e-05 2.17108376e-09 1.989730

In [29]:
# Pick the most probable word id for the (single) input sample.
# np.argmax without an axis flattens the whole batch, so its result is a valid
# word id only when there is exactly one row; indexing row 0 first keeps the
# result a word id regardless of how many samples were predicted.
output = np.argmax(onehot_output[0])
print('output=', output)

output= 6


In [30]:
# Translate the predicted id back into its word. The tokenizer already keeps
# the reverse (id -> word) mapping in index_word, so a direct lookup replaces
# scanning every word_index entry.
predicted_word = tokenizer.index_word.get(int(output))
if predicted_word is None:
    # Ids start at 1, so e.g. id 0 has no word; report it rather than printing
    # nothing at all.
    print(test_text, "=>", f"<no word for index {output}>")
else:
    print(test_text, "=>", predicted_word)

the => world
