In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.models import Sequential, load_model

import datetime
from keras.callbacks import TensorBoard
from keras.callbacks import ModelCheckpoint

In [2]:
# Load Data
# The first CSV column is used as the row index; the poem text is read from the
# "Content" column in a later cell.
data = pd.read_csv('kaggle_poem_dataset.csv',index_col=0)

In [3]:
train = False # Set to True to (re)train the model; when False the training cells are skipped and the saved checkpoint is used

In [4]:
# Take just the poem texts and remove exact duplicates.
# The result is assigned instead of using `inplace=True`: that keeps the cell
# idempotent on re-run and avoids mutating a Series that was taken out of `data`
# (which can trigger pandas' SettingWithCopy warning).
poems = data["Content"].drop_duplicates()

In [5]:
# Concatenate a fixed window of poems into one newline-separated string.

# How many poems to select
poemCount = 50

# Position (not index label) of the first selected poem in the de-duplicated Series
poemStart = 3431

# Take `poemCount` poems by position. The window end is derived from poemCount so
# the count and the slice cannot drift apart; with the values above this is the
# same 3431:3481 window.
nPoems = poems.iloc[poemStart:poemStart + poemCount]

# Strip non-breaking spaces (U+00A0) and terminate every poem with a newline.
# A single join replaces repeated string `+=`, and the loop variable no longer
# shadows the built-in `str`.
concatPoems = ''.join(poem.replace('\xa0', '') + '\n' for poem in nPoems)

In [6]:
# Build the vocabulary from the poems.

# The corpus is the lower-cased text split into individual lines
corpus = concatPoems.lower().split("\n")

# Fit a word-level tokenizer on every line
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

# One extra slot on top of the fitted vocabulary (id 0 shows up as the padding
# value in the padded sequences)
total_words = 1 + len(tokenizer.word_index)
print('Total number of words in corpus:',total_words)

Total number of words in corpus: 3526


In [7]:
# Build the n-gram training sequences: for every corpus line, emit each prefix
# of its token ids that is at least two tokens long.
sequences = [
    line_tokens[:end]
    for line_tokens in (tokenizer.texts_to_sequences([text])[0] for text in corpus)
    for end in range(2, len(line_tokens) + 1)
]

# Length of the longest n-gram (0 if no sequence was produced)
max_sequence_len = max(map(len, sequences), default=0)

# Pad all sequences to that common length
sequences = pad_sequences(sequences, maxlen=max_sequence_len)

In [8]:
# Inspect the padded n-gram matrix (one row per n-gram, zeros are padding)
print(sequences)

[[   0    0    0 ...    0 1133   80]
 [   0    0    0 ... 1133   80  292]
 [   0    0    0 ...   80  292    1]
 ...
 [   0    0    0 ...  830    3    1]
 [   0    0    0 ...    3    1  115]
 [   0    0    0 ...    1  115  101]]


In [9]:
# create X and y
# X: every column except the last one, i.e. the context tokens that precede the
# word to be predicted
X = sequences[:, :-1]
print(X)

[[   0    0    0 ...    0    0 1133]
 [   0    0    0 ...    0 1133   80]
 [   0    0    0 ... 1133   80  292]
 ...
 [   0    0    0 ...    4  830    3]
 [   0    0    0 ...  830    3    1]
 [   0    0    0 ...    3    1  115]]


In [10]:
# y: the last column of every padded sequence, i.e. the token id to predict
y = sequences[:,-1]
print(y)

[ 80 292   1 ...   1 115 101]


In [11]:
# One hot encoding for y: each target id becomes a vector with total_words
# entries, matching the softmax output size and the categorical_crossentropy loss
# used by the model
y = to_categorical(y, num_classes=total_words)

In [12]:
def create_best_model(vocab_size=None, input_length=None):
    """Build and compile the next-word prediction network.

    Layers, in order: Embedding (100 dims) -> Dropout(0.2) ->
    Bidirectional LSTM(200, returning sequences) -> Dropout(0.35) ->
    Bidirectional LSTM(128) -> Dense softmax over the vocabulary.
    The model is compiled with categorical cross-entropy, the Adam optimizer
    and an accuracy metric, and its summary is printed.

    Args:
        vocab_size: Number of token ids (embedding input size and softmax
            width). Defaults to the notebook-level ``total_words``.
        input_length: Number of context tokens per sample. Defaults to
            ``max_sequence_len - 1`` (the width of ``X``).

    Returns:
        The compiled ``Sequential`` model.
    """
    # Fall back to the notebook globals so existing `create_best_model()` calls
    # behave exactly as before.
    if vocab_size is None:
        vocab_size = total_words
    if input_length is None:
        input_length = max_sequence_len - 1

    model = Sequential()

    model.add(Embedding(input_dim=vocab_size, output_dim=100, input_length=input_length))
    model.add(Dropout(0.2))
    model.add(Bidirectional(LSTM(200,return_sequences=True)))
    model.add(Dropout(0.35))
    model.add(Bidirectional(LSTM(128)))
    model.add(Dense(vocab_size,activation='softmax'))

    model.compile(loss='categorical_crossentropy',optimizer = 'adam',metrics = ['accuracy'])

    # summary() prints the table itself and returns None; wrapping it in print()
    # would add a stray "None" line to the output.
    model.summary()

    return model

In [13]:
# Load the TensorBoard notebook extension
%load_ext tensorboard

In [14]:
if train:
    # Fresh, compiled network
    model = create_best_model()

    # TensorBoard: every run logs into its own timestamped directory
    log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    tensorboard = TensorBoard(
        log_dir=log_dir,
        histogram_freq=1,
        write_graph=True,
        write_images=True,
        update_freq='epoch',
    )

    # Checkpoint: save to disk only when the training loss reaches a new minimum
    chekpoint_path = "weights.best.hdf5"
    checkpoint = ModelCheckpoint(
        chekpoint_path,
        monitor='loss',
        mode='min',
        save_best_only=True,
        verbose=1,
    )

    callbacks = [tensorboard, checkpoint]

    # Fit on the full n-gram set; the returned history is read by the next cell
    history = model.fit(X, y, epochs=150, batch_size=32, callbacks=callbacks)

In [16]:
if train:
    # Per-epoch curves recorded by model.fit
    accuracy, loss = (history.history[key] for key in ('accuracy', 'loss'))

    # Report the best value reached across all epochs
    print("Accuracy:", max(accuracy))
    print("Loss:", min(loss))

Accuracy: 0.8485321402549744
Loss: 0.5662724375724792


In [22]:
# Restore the model from the checkpoint file and compile it again with the same
# loss and optimizer that were used for training.
chekpoint_path = "weights.best.hdf5"
model = load_model(filepath=chekpoint_path)
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [23]:
# Testing the model: greedily extend the seed text one word at a time
test_text = "Help me"
next_words = 22

# create_best_model builds the network with input_length = max_sequence_len - 1,
# so pad to that length instead of a hard-coded number.
input_len = max_sequence_len - 1

for _ in range(next_words):
    token = tokenizer.texts_to_sequences([test_text])
    new_pad = pad_sequences(token, maxlen=input_len)
    predicted = model.predict(new_pad, verbose=0)

    # The batch has a single row: reduce the argmax to a plain int so the lookup
    # does not rely on comparing an int with a one-element array.
    predicted_index = int(np.argmax(predicted, axis=1)[0])

    # Direct id -> word lookup instead of scanning word_index on every step.
    # An id without a word (e.g. the padding id 0) ends generation rather than
    # re-appending a stale word or raising NameError.
    output_word = tokenizer.index_word.get(predicted_index)
    if output_word is None:
        break
    test_text += " " + output_word

In [24]:
# Show the seed text followed by the generated continuation
print(test_text)

Help me come up with a strategy to get through this nasty pond the big lounge room not night a full moon covers it


In [25]:
# Testing the model: greedily extend the seed text one word at a time
test_text = "Water"
next_words = 34

# create_best_model builds the network with input_length = max_sequence_len - 1,
# so pad to that length instead of a hard-coded number.
input_len = max_sequence_len - 1

for _ in range(next_words):
    token = tokenizer.texts_to_sequences([test_text])
    new_pad = pad_sequences(token, maxlen=input_len)
    predicted = model.predict(new_pad, verbose=0)

    # The batch has a single row: reduce the argmax to a plain int so the lookup
    # does not rely on comparing an int with a one-element array.
    predicted_index = int(np.argmax(predicted, axis=1)[0])

    # Direct id -> word lookup instead of scanning word_index on every step.
    # An id without a word (e.g. the padding id 0) ends generation rather than
    # re-appending a stale word or raising NameError.
    output_word = tokenizer.index_word.get(predicted_index)
    if output_word is None:
        break
    test_text += " " + output_word

In [26]:
# Show the seed text followed by the generated continuation
print(test_text)

Water surrounds all shapes that enter up with the frozen seed from dazzled snows your daughter man taken to the last of the sea — voice away other than long long wrong shield in dust


In [27]:
# Testing the model: greedily extend the seed text one word at a time
test_text = "The snowy mountains painted the sky"
next_words = 25

# create_best_model builds the network with input_length = max_sequence_len - 1,
# so pad to that length instead of a hard-coded number.
input_len = max_sequence_len - 1

for _ in range(next_words):
    token = tokenizer.texts_to_sequences([test_text])
    new_pad = pad_sequences(token, maxlen=input_len)
    predicted = model.predict(new_pad, verbose=0)

    # The batch has a single row: reduce the argmax to a plain int so the lookup
    # does not rely on comparing an int with a one-element array.
    predicted_index = int(np.argmax(predicted, axis=1)[0])

    # Direct id -> word lookup instead of scanning word_index on every step.
    # An id without a word (e.g. the padding id 0) ends generation rather than
    # re-appending a stale word or raising NameError.
    output_word = tokenizer.index_word.get(predicted_index)
    if output_word is None:
        break
    test_text += " " + output_word

In [28]:
# Show the seed text followed by the generated continuation
print(test_text) 

The snowy mountains painted the sky lacy with lightning and the beyond sailboats taken away away out of belief eating the masters of bluff not a toy the drone could have
