# Text Generation using LSTM

In [3]:
import pandas as pd
import numpy as np
from string import punctuation
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [5]:
df = pd.read_csv('ArticlesApril2018.csv')

In [6]:
df.head()

Unnamed: 0,articleID,articleWordCount,byline,documentType,headline,keywords,multimedia,newDesk,printPage,pubDate,sectionName,snippet,source,typeOfMaterial,webURL
0,5adf6684068401528a2aa69b,781,By JOHN BRANCH,article,Former N.F.L. Cheerleaders’ Settlement Offer: ...,"['Workplace Hazards and Violations', 'Football...",68,Sports,0,2018-04-24 17:16:49,Pro Football,"“I understand that they could meet with us, pa...",The New York Times,News,https://www.nytimes.com/2018/04/24/sports/foot...
1,5adf653f068401528a2aa697,656,By LISA FRIEDMAN,article,E.P.A. to Unveil a New Rule. Its Effect: Less ...,"['Environmental Protection Agency', 'Pruitt, S...",68,Climate,0,2018-04-24 17:11:21,Unknown,The agency plans to publish a new regulation T...,The New York Times,News,https://www.nytimes.com/2018/04/24/climate/epa...
2,5adf4626068401528a2aa628,2427,By PETE WELLS,article,"The New Noma, Explained","['Restaurants', 'Noma (Copenhagen, Restaurant)...",66,Dining,0,2018-04-24 14:58:44,Unknown,What’s it like to eat at the second incarnatio...,The New York Times,News,https://www.nytimes.com/2018/04/24/dining/noma...
3,5adf40d2068401528a2aa619,626,By JULIE HIRSCHFELD DAVIS and PETER BAKER,article,Unknown,"['Macron, Emmanuel (1977- )', 'Trump, Donald J...",68,Washington,0,2018-04-24 14:35:57,Europe,President Trump welcomed President Emmanuel Ma...,The New York Times,News,https://www.nytimes.com/2018/04/24/world/europ...
4,5adf3d64068401528a2aa60f,815,By IAN AUSTEN and DAN BILEFSKY,article,Unknown,"['Toronto, Ontario, Attack (April, 2018)', 'Mu...",68,Foreign,0,2018-04-24 14:21:21,Canada,"Alek Minassian, 25, a resident of Toronto’s Ri...",The New York Times,News,https://www.nytimes.com/2018/04/24/world/canad...


In [7]:
df.shape, df.columns

((1324, 15),
 Index(['articleID', 'articleWordCount', 'byline', 'documentType', 'headline',
        'keywords', 'multimedia', 'newDesk', 'printPage', 'pubDate',
        'sectionName', 'snippet', 'source', 'typeOfMaterial', 'webURL'],
       dtype='object'))

In [14]:
df['headline'].isnull().values.any()

False

In [19]:
headline = []
headline.extend(list(df.headline.values))
headline = [n for n in headline if n!= 'Unknown']

In [20]:
headline[:5]

['Former N.F.L. Cheerleaders’ Settlement Offer: $1 and a Meeting With Goodell',
 'E.P.A. to Unveil a New Rule. Its Effect: Less Science in Policymaking.',
 'The New Noma, Explained',
 'How a Bag of Texas Dirt  Became a Times Tradition',
 'Is School a Place for Self-Expression?']

In [21]:
def repreprocessing(s):
    s = s.encode('utf8').decode('ascii','ignore')
    return ''.join(c for c in s if c not in punctuation).lower()

text = [repreprocessing(x) for x in headline]
text[:5]

['former nfl cheerleaders settlement offer 1 and a meeting with goodell',
 'epa to unveil a new rule its effect less science in policymaking',
 'the new noma explained',
 'how a bag of texas dirt  became a times tradition',
 'is school a place for selfexpression']

### Tokenize

In [22]:
t = Tokenizer()
t.fit_on_texts(text)
vocab_size = len(t.word_index) + 1
vocab_size

3494

In [23]:
sequences = list()

for line in text:
    encoded = t.texts_to_sequences([line])[0]
    for i in range(1,len(encoded)):
        sequence = encoded[:i+1]
        sequences.append(sequence)

In [24]:
sequences[:10]

[[99, 269],
 [99, 269, 371],
 [99, 269, 371, 1115],
 [99, 269, 371, 1115, 582],
 [99, 269, 371, 1115, 582, 52],
 [99, 269, 371, 1115, 582, 52, 7],
 [99, 269, 371, 1115, 582, 52, 7, 2],
 [99, 269, 371, 1115, 582, 52, 7, 2, 372],
 [99, 269, 371, 1115, 582, 52, 7, 2, 372, 10],
 [99, 269, 371, 1115, 582, 52, 7, 2, 372, 10, 1116]]

In [27]:
t.word_index.items()

dict_items([('the', 1), ('a', 2), ('to', 3), ('of', 4), ('in', 5), ('for', 6), ('and', 7), ('is', 8), ('on', 9), ('with', 10), ('trump', 11), ('as', 12), ('at', 13), ('new', 14), ('how', 15), ('from', 16), ('it', 17), ('an', 18), ('that', 19), ('be', 20), ('season', 21), ('us', 22), ('you', 23), ('its', 24), ('what', 25), ('episode', 26), ('can', 27), ('your', 28), ('not', 29), ('he', 30), ('now', 31), ('his', 32), ('are', 33), ('teaching', 34), ('war', 35), ('out', 36), ('no', 37), ('was', 38), ('by', 39), ('trumps', 40), ('has', 41), ('over', 42), ('may', 43), ('into', 44), ('why', 45), ('more', 46), ('we', 47), ('who', 48), ('about', 49), ('recap', 50), ('activities', 51), ('1', 52), ('just', 53), ('do', 54), ('women', 55), ('when', 56), ('syria', 57), ('trade', 58), ('i', 59), ('2', 60), ('or', 61), ('will', 62), ('this', 63), ('have', 64), ('president', 65), ('but', 66), ('home', 67), ('up', 68), ('long', 69), ('one', 70), ('off', 71), ('facebook', 72), ('house', 73), ('gop', 74),

In [29]:
index_to_word = {}
for key,value in t.word_index.items():
    index_to_word[value] = key

print('빈도수 상위 100등 단어 : {}'.format(index_to_word[100]))

빈도수 상위 100등 단어 : epa


In [30]:
max_len = max(len(l) for l in sequences)
max_len

24

In [31]:
sequences = pad_sequences(sequences, maxlen=max_len, padding='pre')

In [32]:
sequences[:3]

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          99,  269],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   99,
         269,  371],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,   99,  269,
         371, 1115]])

In [33]:
sequences = np.array(sequences)
X = sequences[:,:-1]
y = sequences[:,-1]

In [34]:
print(y[:3])

[ 269  371 1115]


In [36]:
y = to_categorical(y, num_classes=vocab_size)

In [37]:
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

### Modeling

In [38]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,Dense,LSTM

In [40]:
model = Sequential()
model.add(Embedding(vocab_size,10,input_length=max_len-1))
model.add(LSTM(128))
model.add(Dense(vocab_size, activation='softmax'))

In [41]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 23, 10)            34940     
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               71168     
_________________________________________________________________
dense_1 (Dense)              (None, 3494)              450726    
Total params: 556,834
Trainable params: 556,834
Non-trainable params: 0
_________________________________________________________________


In [42]:
model.compile(loss='categorical_crossentropy', optimizer='adam',metrics=['accuracy'])
model.fit(X,y,epochs=200,verbose=2)

Train on 7803 samples
Epoch 1/200
7803/7803 - 13s - loss: 7.6463 - accuracy: 0.0299
Epoch 2/200
7803/7803 - 10s - loss: 7.1249 - accuracy: 0.0300
Epoch 3/200
7803/7803 - 10s - loss: 6.9879 - accuracy: 0.0323
Epoch 4/200
7803/7803 - 10s - loss: 6.8682 - accuracy: 0.0396
Epoch 5/200
7803/7803 - 10s - loss: 6.7236 - accuracy: 0.0436
Epoch 6/200
7803/7803 - 9s - loss: 6.5640 - accuracy: 0.0459
Epoch 7/200
7803/7803 - 10s - loss: 6.3952 - accuracy: 0.0492
Epoch 8/200
7803/7803 - 9s - loss: 6.2135 - accuracy: 0.0550
Epoch 9/200
7803/7803 - 10s - loss: 6.0382 - accuracy: 0.0541
Epoch 10/200
7803/7803 - 10s - loss: 5.8628 - accuracy: 0.0625
Epoch 11/200
7803/7803 - 10s - loss: 5.6938 - accuracy: 0.0663
Epoch 12/200
7803/7803 - 9s - loss: 5.5328 - accuracy: 0.0693
Epoch 13/200
7803/7803 - 9s - loss: 5.3734 - accuracy: 0.0761
Epoch 14/200
7803/7803 - 10s - loss: 5.2220 - accuracy: 0.0833
Epoch 15/200
7803/7803 - 9s - loss: 5.0759 - accuracy: 0.0882
Epoch 16/200
7803/7803 - 9s - loss: 4.9370 - ac

Epoch 131/200
7803/7803 - 10s - loss: 0.3681 - accuracy: 0.9134
Epoch 132/200
7803/7803 - 10s - loss: 0.3630 - accuracy: 0.9144
Epoch 133/200
7803/7803 - 10s - loss: 0.3615 - accuracy: 0.9144
Epoch 134/200
7803/7803 - 10s - loss: 0.3538 - accuracy: 0.9146
Epoch 135/200
7803/7803 - 10s - loss: 0.3514 - accuracy: 0.9144
Epoch 136/200
7803/7803 - 10s - loss: 0.3476 - accuracy: 0.9150
Epoch 137/200
7803/7803 - 11s - loss: 0.3434 - accuracy: 0.9173
Epoch 138/200
7803/7803 - 12s - loss: 0.3413 - accuracy: 0.9143
Epoch 139/200
7803/7803 - 10s - loss: 0.3382 - accuracy: 0.9154
Epoch 140/200
7803/7803 - 13s - loss: 0.3335 - accuracy: 0.9146
Epoch 141/200
7803/7803 - 11s - loss: 0.3297 - accuracy: 0.9150
Epoch 142/200
7803/7803 - 10s - loss: 0.3275 - accuracy: 0.9171
Epoch 143/200
7803/7803 - 11s - loss: 0.3229 - accuracy: 0.9152
Epoch 144/200
7803/7803 - 10s - loss: 0.3226 - accuracy: 0.9139
Epoch 145/200
7803/7803 - 11s - loss: 0.3179 - accuracy: 0.9157
Epoch 146/200
7803/7803 - 11s - loss: 0.

<tensorflow.python.keras.callbacks.History at 0x191b7a15208>

In [47]:
def sentence_generation(model, t, current_word, n):
    init_word = current_word
    sentence = ''
    for _ in range(n):
        encoded = t.texts_to_sequences([current_word])[0]
        encoded = pad_sequences([encoded], maxlen=23, padding='pre')
        result = model.predict_classes(encoded, verbose=0)
        for word, index in t.word_index.items():
            if index == result:
                break
        current_word = current_word + ' ' + word
        sentence = sentence + ' ' + word
    
    sentence = init_word + sentence
    return sentence

In [48]:
sentence_generation(model, t, 'i', 10)

'i want to be rich and im not sorry attack when'

In [49]:
sentence_generation(model, t, 'how', 10)

'how to make facebook more accountable attracts talk can a pulitzer'