# LSTM을 이용하여 텍스트 생성하기

In [3]:
import pandas as pd
import numpy as np

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [4]:
# Load the article metadata from a CSV file in the current working directory
# and preview the first rows.
df = pd.read_csv('ArticlesApril2018.csv')
df.head()

Unnamed: 0,articleID,articleWordCount,byline,documentType,headline,keywords,multimedia,newDesk,printPage,pubDate,sectionName,snippet,source,typeOfMaterial,webURL
0,5adf6684068401528a2aa69b,781,By JOHN BRANCH,article,Former N.F.L. Cheerleaders’ Settlement Offer: ...,"['Workplace Hazards and Violations', 'Football...",68,Sports,0,2018-04-24 17:16:49,Pro Football,"“I understand that they could meet with us, pa...",The New York Times,News,https://www.nytimes.com/2018/04/24/sports/foot...
1,5adf653f068401528a2aa697,656,By LISA FRIEDMAN,article,E.P.A. to Unveil a New Rule. Its Effect: Less ...,"['Environmental Protection Agency', 'Pruitt, S...",68,Climate,0,2018-04-24 17:11:21,Unknown,The agency plans to publish a new regulation T...,The New York Times,News,https://www.nytimes.com/2018/04/24/climate/epa...
2,5adf4626068401528a2aa628,2427,By PETE WELLS,article,"The New Noma, Explained","['Restaurants', 'Noma (Copenhagen, Restaurant)...",66,Dining,0,2018-04-24 14:58:44,Unknown,What’s it like to eat at the second incarnatio...,The New York Times,News,https://www.nytimes.com/2018/04/24/dining/noma...
3,5adf40d2068401528a2aa619,626,By JULIE HIRSCHFELD DAVIS and PETER BAKER,article,Unknown,"['Macron, Emmanuel (1977- )', 'Trump, Donald J...",68,Washington,0,2018-04-24 14:35:57,Europe,President Trump welcomed President Emmanuel Ma...,The New York Times,News,https://www.nytimes.com/2018/04/24/world/europ...
4,5adf3d64068401528a2aa60f,815,By IAN AUSTEN and DAN BILEFSKY,article,Unknown,"['Toronto, Ontario, Attack (April, 2018)', 'Mu...",68,Foreign,0,2018-04-24 14:21:21,Canada,"Alek Minassian, 25, a resident of Toronto’s Ri...",The New York Times,News,https://www.nytimes.com/2018/04/24/world/canad...


In [5]:
# Count the missing (null) values in the 'headline' column.
print(df['headline'].isnull().sum())

0


In [6]:
# Store the headline values in a plain Python list.
hl = df['headline'].tolist()

# Peek at the first five headlines.
print(hl[:5])

['Former N.F.L. Cheerleaders’ Settlement Offer: $1 and a Meeting With Goodell', 'E.P.A. to Unveil a New Rule. Its Effect: Less Science in Policymaking.', 'The New Noma, Explained', 'Unknown', 'Unknown']


In [7]:
# Print the current number of samples.
print(len(hl))

1324


In [8]:
# Tally how often each distinct headline occurs; this surfaces placeholder
# values such as 'Unknown' that repeat many times.
df['headline'].value_counts()

headline
Unknown                                                                        110
Variety: Acrostic                                                                3
Former N.F.L. Cheerleaders’ Settlement Offer: $1 and a Meeting With Goodell      1
As Facebook Loses Luster, Tech Stocks Await Fallout                              1
This Many                                                                        1
                                                                              ... 
Did Outsiders Make 911 Calls? A Fear Born of Brooklyn Gentrification             1
Childhood Fears No Parent Can Allay                                              1
For Bannon, Tariffs Are Test of Trump’s Beliefs                                  1
The Failures of Anti-Trumpism                                                    1
There Is Nothin’ Like a Tune                                                     1
Name: count, Length: 1213, dtype: int64

In [15]:
# Drop the samples whose headline is the placeholder 'Unknown': walk every
# element of hl and keep only the ones that are not 'Unknown'.
filtered_headline = [headline for headline in hl if headline != 'Unknown']

# Number of samples left after removing the noise.
print(len(filtered_headline))

filtered_headline

1214


['Former N.F.L. Cheerleaders’ Settlement Offer: $1 and a Meeting With Goodell',
 'E.P.A. to Unveil a New Rule. Its Effect: Less Science in Policymaking.',
 'The New Noma, Explained',
 'How a Bag of Texas Dirt  Became a Times Tradition',
 'Is School a Place for Self-Expression?',
 'Commuter Reprogramming',
 'Ford Changed Leaders, Looking for a Lift. It’s Still Looking.',
 'Romney Failed to Win at Utah Convention, But Few Believe He’s Doomed',
 'Chain Reaction',
 'He Forced the Vatican to Investigate Sex Abuse. Now He’s Meeting With Pope Francis.',
 'In Berlin, artists find a home',
 'The Right Stuff',
 'Jimmy Carter Knows What North Korea Wants',
 'The Truth Is Out There',
 'New Jersey Ruling Could Reignite Battle Over Church-State Separation',
 'Procrastinating',
 'Word + Quiz: dilatory',
 'My Life-Threatening Bout With E. Coli Food Poisoning',
 'Choosing Brexit, a Town Yearned for Its Seafaring Past, and Muddied Its Future',
 'A Quote Disproved',
 'Hot Stuff Turns Cold',
 'At the Top,

In [16]:
# 구두점(!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~)불러오는 코드
from string import punctuation

# Text-cleaning helper
def repreprocessing(raw_sentence):
    """Return ``raw_sentence`` as lowercase ASCII text without punctuation.

    Args:
        raw_sentence: The headline string to clean.

    Returns:
        The cleaned string. Characters that cannot be represented in ASCII
        are dropped, every character listed in ``string.punctuation`` is
        removed, and the remainder is lowercased. Whitespace is kept as is.
    """
    # Round-tripping through UTF-8 bytes and decoding as ASCII with
    # errors='ignore' discards every non-ASCII character.
    ascii_only = raw_sentence.encode("utf8").decode("ascii", "ignore")
    # Delete all punctuation characters in a single pass, then lowercase.
    without_punctuation = ascii_only.translate(str.maketrans("", "", punctuation))
    return without_punctuation.lower()


# Run the cleaning step on every element of filtered_headline, producing a
# new list of preprocessed headlines.
preprocessed_headline = [repreprocessing(headline) for headline in filtered_headline]

# Preview the first five cleaned headlines.
preprocessed_headline[:5]


['former nfl cheerleaders settlement offer 1 and a meeting with goodell',
 'epa to unveil a new rule its effect less science in policymaking',
 'the new noma explained',
 'how a bag of texas dirt  became a times tradition',
 'is school a place for selfexpression']

In [17]:
# Tokenize the cleaned headlines and compute the vocabulary size (vocab_size).
tokenizer = Tokenizer()
tokenizer.fit_on_texts(preprocessed_headline)
# +1 because the Keras Tokenizer never assigns index 0 to a word (0 is the
# value used for padding later on), so valid indices run from 0 up to
# len(word_index) inclusive.
vocab_size = len(tokenizer.word_index) + 1

print(vocab_size)

3494


In [18]:
# Break every headline into training samples: a headline encoded as
# [w1, w2, ..., wk] contributes the prefixes [w1, w2], [w1, w2, w3], ...,
# [w1, ..., wk].
sequences = []

for token_ids in tokenizer.texts_to_sequences(preprocessed_headline):
    sequences.extend(token_ids[:end] for end in range(2, len(token_ids) + 1))
sequences[:15]

[[99, 269],
 [99, 269, 371],
 [99, 269, 371, 1115],
 [99, 269, 371, 1115, 582],
 [99, 269, 371, 1115, 582, 52],
 [99, 269, 371, 1115, 582, 52, 7],
 [99, 269, 371, 1115, 582, 52, 7, 2],
 [99, 269, 371, 1115, 582, 52, 7, 2, 372],
 [99, 269, 371, 1115, 582, 52, 7, 2, 372, 10],
 [99, 269, 371, 1115, 582, 52, 7, 2, 372, 10, 1116],
 [100, 3],
 [100, 3, 1117],
 [100, 3, 1117, 2],
 [100, 3, 1117, 2, 14],
 [100, 3, 1117, 2, 14, 583]]

In [20]:
# Find the length of the longest sample (sequence).
lens = [len(sequence) for sequence in sequences]
max_len = max(lens)

max_len

24

In [21]:
# Pad every sequence to max_len; with the default settings the zeros are
# added at the front of each sequence.
sequences = pad_sequences(sequences, maxlen=max_len)
sequences[:3]

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          99,  269],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   99,
         269,  371],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,   99,  269,
         371, 1115]])

In [22]:
# Convert the padded sequences to a NumPy array so they can be sliced by column.
sequences = np.array(sequences)

# Inputs (X): every token except the last one.
# Targets (y): the last token of each row.
X, y = sequences[:, :-1], sequences[:, -1]


In [23]:
# Show the first three rows of the input data (X).
X[:3]

array([[  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,  99],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,  99, 269],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,  99, 269, 371]])

In [24]:
# Show the first three targets (y).
y[:3]

array([ 269,  371, 1115])

In [25]:
# One-hot encode the targets (y) over vocab_size classes.
y = to_categorical(y, num_classes=vocab_size)

In [26]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, LSTM

In [27]:
# Build the model: embedding -> LSTM -> softmax over the whole vocabulary.
model = Sequential([
    Embedding(vocab_size, 64),
    LSTM(128),
    Dense(vocab_size, activation='softmax'),
])

# Compile the model for multi-class prediction against the one-hot targets.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [28]:
# Train the model on (X, y) for 100 epochs.
model.fit(X, y, epochs=100)

Epoch 1/100
[1m244/244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 12ms/step - accuracy: 0.0230 - loss: 7.7204
Epoch 2/100
[1m244/244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 11ms/step - accuracy: 0.0265 - loss: 7.0726
Epoch 3/100
[1m244/244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 11ms/step - accuracy: 0.0368 - loss: 6.8531
Epoch 4/100
[1m244/244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 11ms/step - accuracy: 0.0502 - loss: 6.7042
Epoch 5/100
[1m244/244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 11ms/step - accuracy: 0.0532 - loss: 6.4645
Epoch 6/100
[1m244/244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 11ms/step - accuracy: 0.0548 - loss: 6.2455
Epoch 7/100
[1m244/244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 11ms/step - accuracy: 0.0546 - loss: 6.0582
Epoch 8/100
[1m244/244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 11ms/step - accuracy: 0.0663 - loss: 5.8277
Epoch 9/100
[1m244/244

<keras.src.callbacks.history.History at 0x1d1cbbe0850>

In [29]:
# Sentence generator (same procedure as the plain RNN version)
def sentence_generation(model, tokenizer, current_word, n):
    """Extend a seed text with up to ``n`` words predicted by ``model``.

    On each step the text produced so far is encoded with ``tokenizer``,
    left-padded to ``max_len - 1`` tokens, and fed to ``model``; the
    highest-scoring vocabulary index is mapped back to its word and appended.

    Args:
        model: Trained model; ``model.predict`` must return one row of scores
            per input row, indexed by vocabulary index.
        tokenizer: Fitted Keras ``Tokenizer`` (provides ``texts_to_sequences``
            and the ``index_word`` mapping).
        current_word: Seed text to start from.
        n: Maximum number of words to generate.

    Returns:
        The seed text followed by the generated words, separated by single
        spaces. Fewer than ``n`` words are appended if the model predicts an
        index that has no word (for example the padding index 0).

    Note:
        Uses the module-level ``max_len`` and ``pad_sequences``.
    """
    init_word = current_word
    generated_words = []

    # Keep appending one predicted word per iteration.
    for _ in range(n):
        encoded = tokenizer.texts_to_sequences([current_word])[0]
        encoded = pad_sequences([encoded], maxlen=max_len-1, padding='pre')

        # Predict the next-word scores and take the best index as a plain int
        # (argmax over axis 1 yields a length-1 array for the single input row).
        result = model.predict(encoded, verbose=0)
        predicted_index = int(np.argmax(result, axis=1)[0])

        # Direct index -> word lookup instead of scanning word_index.
        word = tokenizer.index_word.get(predicted_index)
        if word is None:
            # No vocabulary entry for this index (e.g. padding index 0).
            # Stop instead of appending an unrelated word.
            break

        # The text so far plus the predicted word becomes the next input.
        current_word = current_word + ' ' + word
        generated_words.append(word)

    return ' '.join([init_word, *generated_words])

In [30]:
# Generate 10 words following the seed text 'i want'.
print(sentence_generation(model, tokenizer, 'i want', 10))

i want to be rich and im not sorry at home at


In [31]:
# Generate 10 words following the seed text 'how'.
print(sentence_generation(model, tokenizer, 'how', 10))

how to make a crossword puzzle fire to the police for
