In [1]:
# keras module for building LSTM 
# https://www.kaggle.com/shivamb/beginners-guide-to-text-generation-using-lstms
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils as ku 

# set seeds for reproducibility
from tensorflow import set_random_seed
from numpy.random import seed
set_random_seed(2)
seed(1)

import pandas as pd
import numpy as np
import string, os 

import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)

Using TensorFlow backend.


In [3]:
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
curr_dir = '/Users/seeni-2328/Documents/Seeni/Datasets/kaggle/nyt-comments/'
all_headlines = []
# Load headlines from the first directory entry whose name contains 'Articles';
# the loop stops after that single file.
for filename in os.listdir(curr_dir):
    if 'Articles' not in filename:
        continue
    article_df = pd.read_csv(curr_dir + filename)
    all_headlines.extend(list(article_df.headline.values))
    break

# Count before filtering, then drop the placeholder "Unknown" headlines.
print(len(all_headlines))
all_headlines = [headline for headline in all_headlines if headline != "Unknown"]
len(all_headlines)

885


829

In [16]:
# Peek at the first raw (uncleaned) headline.
all_headlines[0]


In [23]:

def clean_text(txt):
    """Normalise a headline for tokenisation.

    Steps, in order: remove every character listed in ``string.punctuation``,
    lowercase the result, then drop any remaining non-ASCII characters
    (e.g. typographic apostrophes) by round-tripping through UTF-8 bytes and
    decoding as ASCII with errors ignored.

    Returns the cleaned string.
    """
    strip_punctuation = str.maketrans("", "", string.punctuation)
    lowered = txt.translate(strip_punctuation).lower()
    ascii_only = lowered.encode("utf8").decode("ascii", "ignore")
    return ascii_only
# Clean every headline, then show the first ten cleaned entries.
corpus = [clean_text(x) for x in all_headlines]
corpus[:10]

['nfl vs politics has been battle all season long',
 'voice vice veracity',
 'a standups downward slide',
 'new york today a groundhog has her day',
 'a swimmers communion with the ocean',
 'trail activity',
 'super bowl',
 'trumps mexican shakedown',
 'pences presidential pet',
 'fruit of a poison tree']

In [24]:
# Raw headlines for the same ten entries, for comparison with the cleaned corpus.
all_headlines[:10]

['N.F.L. vs. Politics Has Been Battle All Season Long',
 'Voice. Vice. Veracity.',
 'A Stand-Up’s Downward Slide',
 'New York Today: A Groundhog Has Her Day',
 'A Swimmer’s Communion With the Ocean',
 'Trail Activity',
 'Super Bowl',
 'Trump’s Mexican Shakedown',
 'Pence’s Presidential Pet',
 'Fruit of a Poison Tree']

In [92]:
# Module-level tokenizer: fitted inside get_sequence_of_tokens and read again
# by the text-generation functions defined later in this notebook.
tokenizer = Tokenizer()

def get_sequence_of_tokens(corpus):
    """Fit the module-level ``tokenizer`` on ``corpus`` and build n-gram prefixes.

    For each line, every prefix of its token-id list with length >= 2 becomes
    one training sequence (so a line of k tokens yields k - 1 sequences and a
    single-token line yields none).

    Args:
        corpus: iterable of cleaned text lines (it is traversed twice, so pass
            a list rather than a one-shot generator).

    Returns:
        (input_sequences, total_words) where ``input_sequences`` is a list of
        lists of token ids and ``total_words`` is ``len(word_index) + 1``.

    Side effects:
        Mutates the global ``tokenizer`` and prints ``total_words``.
    """
    ## tokenization
    tokenizer.fit_on_texts(corpus)
    # +1 leaves room for id 0, which shows up as the padding value in the
    # padded sequences printed further down.
    total_words = len(tokenizer.word_index) + 1
    print(total_words)

    ## convert data to sequence of tokens
    input_sequences = []
    for token_list in tokenizer.texts_to_sequences(corpus):
        for i in range(1, len(token_list)):
            input_sequences.append(token_list[:i+1])
    return input_sequences, total_words

# Build the n-gram training sequences; total_words is kept as a global because
# generate_padded_sequences and create_model use it later.
inp_sequences, total_words = get_sequence_of_tokens(corpus)
print(total_words)
print(len(inp_sequences))
inp_sequences[:10]

2288
2288
4544


[[660, 117],
 [660, 117, 72],
 [660, 117, 72, 73],
 [660, 117, 72, 73, 661],
 [660, 117, 72, 73, 661, 662],
 [660, 117, 72, 73, 661, 662, 63],
 [660, 117, 72, 73, 661, 662, 63, 29],
 [660, 117, 72, 73, 661, 662, 63, 29, 210],
 [211, 663],
 [211, 663, 664]]

In [133]:
def generate_padded_sequences(input_sequences):
    """Pad the n-gram sequences and split them into predictors and labels.

    Every sequence is left-padded ('pre') to the length of the longest one.
    The last column becomes the label and is one-hot encoded with
    ``num_classes=total_words``; all preceding columns are the predictors.

    Note: ``total_words`` is read from the notebook's global scope, not passed in.

    Returns:
        (predictors, label, max_sequence_len)
    """
    max_sequence_len = max(len(seq) for seq in input_sequences)
    padded = np.array(
        pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
    )
    print(padded[:10])

    # Everything but the final token is the input; the final token is the target.
    predictors = padded[:, :-1]
    label = ku.to_categorical(padded[:, -1], num_classes=total_words)
    return predictors, label, max_sequence_len

# Produce the training arrays and report their shapes.
predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences)
print(predictors.shape)
print(label.shape)


[[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 660 117]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0 660 117  72]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0 660 117  72  73]
 [  0   0   0   0   0   0   0   0   0   0   0   0 660 117  72  73 661]
 [  0   0   0   0   0   0   0   0   0   0   0 660 117  72  73 661 662]
 [  0   0   0   0   0   0   0   0   0   0 660 117  72  73 661 662  63]
 [  0   0   0   0   0   0   0   0   0 660 117  72  73 661 662  63  29]
 [  0   0   0   0   0   0   0   0 660 117  72  73 661 662  63  29 210]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 211 663]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0 211 663 664]]
(4544, 16)
(4544, 2288)


In [81]:
# Vocabulary size returned by get_sequence_of_tokens (len(word_index) + 1).
total_words

2288

In [70]:
# Length of the longest n-gram sequence; predictors have one column fewer
# because the last token of each sequence is split off as the label.
max_sequence_len

17

In [103]:
def create_model(max_sequence_len, total_words):
    """Build and compile the next-word prediction network.

    Architecture, in order: Embedding(total_words -> 10 dims) over inputs of
    length ``max_sequence_len - 1``, LSTM(100), Dropout(0.1), and a softmax
    Dense layer with ``total_words`` outputs. Compiled with categorical
    cross-entropy loss and the Adam optimizer.

    Returns the compiled ``Sequential`` model.
    """
    # One token of every padded sequence is the label, so inputs are one shorter.
    input_len = max_sequence_len - 1

    layer_stack = [
        # Input embedding layer
        Embedding(total_words, 10, input_length=input_len),  # 2288, 10, 16
        # Hidden layer: LSTM followed by dropout
        LSTM(100),
        Dropout(0.1),
        # Output layer: distribution over the vocabulary
        Dense(total_words, activation='softmax'),
    ]

    model = Sequential()
    for layer in layer_stack:
        model.add(layer)

    model.compile(loss='categorical_crossentropy', optimizer='adam')
    return model

# Instantiate the network and print its layer/parameter summary.
model = create_model(max_sequence_len, total_words)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     (None, 16, 10)            22880     
_________________________________________________________________
lstm_10 (LSTM)               (None, 100)               44400     
_________________________________________________________________
dropout_10 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_10 (Dense)             (None, 2288)              231088    
Total params: 298,368
Trainable params: 298,368
Non-trainable params: 0
_________________________________________________________________


In [122]:
# Train on the full predictor/label arrays for 200 epochs; no validation data
# or callbacks are passed (the imported EarlyStopping is not used here).
# NOTE(review): verbose=5 is outside the usual 0/1/2 levels -- confirm intent.
model.fit(predictors, label, epochs=200, verbose=5)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

<keras.callbacks.History at 0x12c7a3a58>

In [112]:
def generate_text(seed_text, next_words, model, max_sequence_len):
    """Generate ``next_words`` words after ``seed_text``, one word at a time.

    Each step tokenises the text generated so far with the module-level
    ``tokenizer``, left-pads it to ``max_sequence_len - 1``, asks ``model`` for
    the most likely next class, and appends the corresponding word.

    Args:
        seed_text: starting text.
        next_words: number of words to generate.
        model: trained model exposing ``predict_classes``.
        max_sequence_len: sequence length used when the training data was padded.

    Returns:
        ``"<seed_text> : <generated words>"`` in title case.
    """
    # Build the id -> word lookup once instead of scanning word_index for
    # every generated word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    final_result = seed_text + " :"
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted = model.predict_classes(token_list, verbose=0)

        # The batch holds a single sequence, so take its one class id as a
        # plain int; ids with no word (e.g. padding id 0) yield "".
        predicted_index = int(np.ravel(predicted)[0])
        output_word = index_to_word.get(predicted_index, "")

        seed_text += " "+output_word
        final_result += " "+output_word
    return final_result.title()

In [108]:
def generate_text2(seed_text, next_words, model, max_sequence_len):
    """Generate ``next_words`` words after ``seed_text`` and return the full text.

    Each step tokenises the text generated so far with the module-level
    ``tokenizer``, left-pads it to ``max_sequence_len - 1``, asks ``model`` for
    the most likely next class, and appends the corresponding word.

    Args:
        seed_text: starting text.
        next_words: number of words to generate.
        model: trained model exposing ``predict_classes``.
        max_sequence_len: sequence length used when the training data was padded.

    Returns:
        The seed text followed by the generated words, in title case.
    """
    # Build the id -> word lookup once instead of scanning word_index for
    # every generated word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted = model.predict_classes(token_list, verbose=0)

        # The batch holds a single sequence, so take its one class id as a
        # plain int; ids with no word (e.g. padding id 0) yield "".
        predicted_index = int(np.ravel(predicted)[0])
        output_word = index_to_word.get(predicted_index, "")

        seed_text += " "+output_word
    return seed_text.title()

In [121]:
# Sample generations from a handful of seed phrases.
print (generate_text("united states", 5, model, max_sequence_len))
print (generate_text("preident trump", 4, model, max_sequence_len))
print (generate_text("india and china", 4, model, max_sequence_len))
print (generate_text("How", 6, model, max_sequence_len))

United States : Race To Cement Their Priorities
Preident Trump : Vs Press Crazy Stupid
India And China : On Familiar Streets A
How : To A Literal Reading His Democrats


In [131]:
# Same kind of sample generations, labelled as coming from the 200-epoch model.
print('On 200 epochs')
print (generate_text("united states", 5, model, max_sequence_len))
print (generate_text("preident trump", 5, model, max_sequence_len))
print (generate_text("india and china", 4, model, max_sequence_len))
print (generate_text("How", 6, model, max_sequence_len))
print (generate_text("Dummy", 7, model, max_sequence_len))

On 200 epochs
United States : Race To Cement Their Priorities
Preident Trump : Is Perverse Fight In Verse
India And China : On Bold Cooking Rules
How : To Be Mindful While Eating Chocolate
Dummy : The Walking Dead Season 7 Episode 10


In [139]:
# Longer generation: 20 words, which exceeds the padded input length of
# max_sequence_len - 1.
print (generate_text("finding an expansive", 20, model, max_sequence_len))

Finding An Expansive : Therapist Fake His Online Reviews In A Begins For Shrugs Years 6 Plan In The Second Economy Premiere First Spouse
