In [1]:
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils as ku 
from tensorflow.random import set_seed
from numpy.random import seed
set_seed(2)
seed(1)
import pandas as pd
import numpy as np
import string, os 
import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
cd = '../input/nyt-comments/'
all_headlines = []
for filename in os.listdir(cd):
    if 'Articles' in filename:
        art_df = pd.read_csv(cd + filename)
        all_headlines.extend(list(art_df.headline.values))
        break

all_headlines = [h for h in all_headlines if h != "Unknown"]
len(all_headlines)

829

In [3]:
def clean_text(txt):
    txt = "".join(v for v in txt if v not in string.punctuation).lower()
    txt = txt.encode("utf8").decode("ascii",'ignore')
    return txt 
corpus = [clean_text(x) for x in all_headlines]
corpus[:20]

['nfl vs politics has been battle all season long',
 'voice vice veracity',
 'a standups downward slide',
 'new york today a groundhog has her day',
 'a swimmers communion with the ocean',
 'trail activity',
 'super bowl',
 'trumps mexican shakedown',
 'pences presidential pet',
 'fruit of a poison tree',
 'the peculiar populism of donald trump',
 'questions for on alaskas coldest days a village draws close for warmth',
 'the new kids',
 'what my chinese mother made',
 'do you think teenagers can make a difference in the world',
 'president pledges to let politics return to pulpits',
 'the police killed my unarmed son in 2012 im still waiting for justice',
 'video of sheep slaughtering ignites a dispute',
 'this will change your mind',
 'busy start for a president and that was in 1933']

In [4]:
tokenizer = Tokenizer()
def get_sequence_of_tokens(corpus):
    tokenizer.fit_on_texts(corpus)
    total_words = len(tokenizer.word_index)+1
    input_sequences = []
    for line in corpus:
        token_list = tokenizer.texts_to_sequences([line])[0]
        for i in range(1,len(token_list)):
            n_gram_sequence = token_list[:i+1]
            input_sequences.append(n_gram_sequence)
    return input_sequences,total_words
inp_sequences,total_words = get_sequence_of_tokens(corpus)
inp_sequences[:20]

[[660, 117],
 [660, 117, 72],
 [660, 117, 72, 73],
 [660, 117, 72, 73, 661],
 [660, 117, 72, 73, 661, 662],
 [660, 117, 72, 73, 661, 662, 63],
 [660, 117, 72, 73, 661, 662, 63, 29],
 [660, 117, 72, 73, 661, 662, 63, 29, 210],
 [211, 663],
 [211, 663, 664],
 [2, 665],
 [2, 665, 666],
 [2, 665, 666, 345],
 [11, 27],
 [11, 27, 28],
 [11, 27, 28, 2],
 [11, 27, 28, 2, 667],
 [11, 27, 28, 2, 667, 73],
 [11, 27, 28, 2, 667, 73, 153],
 [11, 27, 28, 2, 667, 73, 153, 90]]

In [5]:
def generate_padded_sequences(input_sequences):
    max_sequence_len = max([len(x) for x in input_sequences])
    input_sequences = np.array(pad_sequences(input_sequences,maxlen=max_sequence_len,padding='pre'))
    predictors,label = input_sequences[:,:-1],input_sequences[:,-1]
    label = ku.to_categorical(label,num_classes=total_words)
    return predictors,label,max_sequence_len
predictors,label,max_sequence_len = generate_padded_sequences(inp_sequences)

In [6]:
def create_model(max_sequence_len,total_words):
    input_len = max_sequence_len - 1
    model = Sequential()
    model.add(Embedding(total_words,10,input_length=input_len))
    model.add(LSTM(1000))
    model.add(Dropout(0.1))
    model.add(Dense(total_words,activation='softmax'))
    model.compile(loss='categorical_crossentropy',optimizer='adam')
    return model
model = create_model(max_sequence_len,total_words)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 16, 10)            22880     
_________________________________________________________________
lstm (LSTM)                  (None, 1000)              4044000   
_________________________________________________________________
dropout (Dropout)            (None, 1000)              0         
_________________________________________________________________
dense (Dense)                (None, 2288)              2290288   
Total params: 6,357,168
Trainable params: 6,357,168
Non-trainable params: 0
_________________________________________________________________


In [7]:
model.fit(predictors,label,epochs=1000,verbose=5)

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
E

<tensorflow.python.keras.callbacks.History at 0x7f159e64bc50>

In [8]:
def generate_text(seed_text,next_words,model,max_sequence_len):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list],maxlen=max_sequence_len-1,padding='pre')
        predicted = model.predict_classes(token_list,verbose=0)
        output_word = ""
        for word,index in tokenizer.word_index.items():
            if index==predicted:
                output_word = word
                break
        seed_text += " "+ output_word
    return seed_text.title()

In [9]:
print(generate_text("scientists",20,model,max_sequence_len))

Scientists Do You Feel Your School And Teachers Welcome Both Conservative And Liberal Points Of View Of Rises Myself It You
