In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.models import Sequential

import datetime
from keras.callbacks import TensorBoard
from keras.callbacks import ModelCheckpoint

from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV

In [3]:
# Read the poem dataset; the CSV's first column is used as the row index
data = pd.read_csv("kaggle_poem_dataset.csv", index_col=0)

In [4]:
# Preview the loaded DataFrame (the cell's last expression is rendered as its output)
data

Unnamed: 0,Author,Title,Poetry Foundation ID,Content
0,Wendy Videlock,!,55489,"Dear Writers, I’m compiling the first in what ..."
1,Hailey Leithauser,0,41729,"Philosophic\nin its complex, ovoid emptiness,\..."
2,Jody Gladding,1-800-FEAR,57135,We'd like to talk with you about fear t...
3,Joseph Brodsky,1 January 1965,56736,The Wise Men will unlearn your name.\nAbove yo...
4,Ted Berrigan,3 Pages,51624,For Jack Collom\n10 Things I do Every Day\n\np...
...,...,...,...,...
15647,Hannah Gamble,Your Invitation to a Modest Breakfast,56059,"It’s too cold to smoke outside, but if you com..."
15648,Eleni Sikelianos,Your Kingdom\n \n \n \n Launch Audio in a N...,145220,if you like let the body feel\nall its own evo...
15649,Susan Elizabeth Howe,“Your Luck Is About To Change”,41696,(A fortune cookie)\nOminous inscrutable Chines...
15650,Andrew Shields,Your Mileage May Vary,90177,1\nOur last night in the house was not our las...


In [5]:
# Summarise the columns: non-null counts and dtypes
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15652 entries, 0 to 15651
Data columns (total 4 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Author                15652 non-null  object
 1   Title                 15652 non-null  object
 2   Poetry Foundation ID  15652 non-null  int64 
 3   Content               15652 non-null  object
dtypes: int64(1), object(3)
memory usage: 611.4+ KB


In [6]:
# Work with the poem texts only
poems = data["Content"]

# List every row whose text occurs more than once (all copies, not only the repeats)
data[poems.duplicated(keep=False)]

Unnamed: 0,Author,Title,Poetry Foundation ID,Content
560,Wallace Stevens,Anecdote of the Jar,51648,"I placed a jar in Tennessee,\nAnd round it was..."
561,Wallace Stevens,Anecdote of the Jar,14575,"I placed a jar in Tennessee,\nAnd round it was..."
1467,W. S. Di Piero,Big City Speech,55621,Use me\nAbuse me\nTurn wheels of ﬁre\non manho...
1468,W. S. Di Piero,Big City Speech,52599,Use me\nAbuse me\nTurn wheels of ﬁre\non manho...
1883,Ben Lerner,[By any measure . . .],53660,"By any measure, it was endless\nwinter. Emulsi..."
1884,Ben Lerner,[By any measure],53702,"By any measure, it was endless\nwinter. Emulsi..."
2723,Anonymous,The Cuckoo Song,51454,"Sing, cuccu, nu. Sing, cuccu.\nSing, cuccu. Si..."
5351,David Ferry,The Guest Ellen at the Supper for Street People,54841,The unclean spirits cry out in the body\nOr mi...
5352,David Ferry,The Guest Ellen at the Supper for Street People,43395,The unclean spirits cry out in the body\nOr mi...
5741,Warsan Shire,The House,90733,i


In [7]:
# Remove duplicate poems, keeping the first occurrence of each text.
# Rebind the result instead of using inplace=True: `poems` was selected from
# `data`, and mutating such a selection in place is what triggers pandas'
# SettingWithCopy behaviour; the rebinding form is also safe to re-run.
poems = poems.drop_duplicates()

In [8]:
# Number of poems remaining after the duplicate filter
print(poems.size)

15638


In [9]:
# Remove the first row (index label 0), which is an introduction rather than a poem.
# Dropping by label with errors="ignore" keeps the cell idempotent: the previous
# positional form (`iloc[1:]`) removed one more poem on every re-run.
poems = poems.drop(index=0, errors="ignore")

In [10]:
# Inspect the Series after the first row has been removed
print(poems)

1        Philosophic\nin its complex, ovoid emptiness,\...
2        We'd  like  to  talk  with  you  about  fear t...
3        The Wise Men will unlearn your name.\nAbove yo...
4        For Jack Collom\n10 Things I do Every Day\n\np...
5        WINTER\nMore time is spent at the window.\n\nS...
                               ...                        
15647    It’s too cold to smoke outside, but if you com...
15648    if you like let the body feel\nall its own evo...
15649    (A fortune cookie)\nOminous inscrutable Chines...
15650    1\nOur last night in the house was not our las...
15651    If your house\nis a dress\nit’ll fit like\nLos...
Name: Content, Length: 15637, dtype: object


In [11]:
# How many poems to select
poemCount = 50

# Take n random poems; a fixed random_state makes the sample (and therefore
# the corpus and everything derived from it) reproducible across runs
nPoems = poems.sample(poemCount, random_state=42)

# Concatenate poems to one string: strip non-breaking spaces (U+00A0) from each
# poem and terminate every poem with a newline. A generator + join avoids both
# repeated string concatenation and a loop variable named `str`, which used to
# shadow the builtin for the rest of the kernel session.
concatPoems = ''.join(poem.replace('\xa0', '') + '\n' for poem in nPoems)

In [12]:
# Character count of the concatenated corpus string
len(concatPoems)

56059

In [13]:
# The corpus is the lower-cased text split into individual lines
corpus = concatPoems.lower().split("\n")

# Fit a tokenizer on those lines to build the word -> index vocabulary
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

# Vocabulary size plus one extra slot (index 0 is what the padding step fills in)
total_words = 1 + len(tokenizer.word_index)
print('Total number of words in corpus:',total_words)

Total number of words in corpus: 3191


In [14]:
# Expand each corpus line into its token prefixes of length 2 .. len(line)
sequences = []
for line in corpus:
    token_list = tokenizer.texts_to_sequences([line])[0]
    sequences.extend(token_list[:end] for end in range(2, len(token_list) + 1))

# Longest prefix length (0 when no sequence was produced)
max_sequence_len = max((len(seq) for seq in sequences), default=0)

# Pad every prefix to that common length
sequences = pad_sequences(sequences, maxlen=max_sequence_len)

In [15]:
# Model inputs X: every column of the padded sequences except the last one
X = sequences[:, :-1]
print(X)

[[  0   0   0 ...   0   0   6]
 [  0   0   0 ...   0   6  41]
 [  0   0   0 ...   6  41  38]
 ...
 [  0   0   0 ...   2  91 707]
 [  0   0   0 ...  91 707   8]
 [  0   0   0 ... 707   8 305]]


In [16]:
# Targets y: the last column of each padded sequence, i.e. the token that follows the X prefix
y = sequences[:,-1]
print(y)

[ 41  38  70 ...   8 305 253]


In [17]:
# One hot encoding for y.
# Encode straight from the last column of `sequences` instead of from `y`
# itself: the old `y = to_categorical(y, ...)` re-encoded the already one-hot
# array whenever the cell was run a second time.
y = to_categorical(sequences[:, -1], num_classes=total_words)

In [18]:
# Defining the model.
def create_model(nodes1, nodes2, dropout1, dropout2):
    """Build and compile the next-word prediction network.

    Layers, in order: Embedding -> Dropout -> Bidirectional LSTM (returning the
    full sequence) -> Dropout -> Bidirectional LSTM -> softmax Dense.

    Reads the notebook globals ``total_words`` (embedding input size and number
    of output classes) and ``max_sequence_len`` (the input length is one less).

    Args:
        nodes1: units of the first LSTM layer.
        nodes2: units of the second LSTM layer.
        dropout1: rate of the Dropout layer placed after the embedding.
        dropout2: rate of the Dropout layer placed after the first
            Bidirectional LSTM.

    Returns:
        The compiled Sequential model (categorical cross-entropy loss, Adam
        optimizer, accuracy metric).
    """
    model = Sequential()

    model.add(Embedding(input_dim=total_words, output_dim=100, input_length=max_sequence_len-1))
    model.add(Dropout(dropout1))
    # return_sequences=True so the second recurrent layer receives the whole sequence
    model.add(Bidirectional(LSTM(nodes1,return_sequences=True)))
    model.add(Dropout(dropout2))
    model.add(Bidirectional(LSTM(nodes2)))
    model.add(Dense(total_words,activation='softmax'))

    model.compile(loss='categorical_crossentropy',optimizer = 'adam',metrics = ['accuracy'])
    # summary() prints the table itself and returns None, so wrapping it in
    # print() only added a stray "None" line to the output.
    model.summary()

    return model

In [None]:
# Search stage 1: vary the width of the first LSTM layer, everything else fixed
nodes1 = [256, 230, 200]
nodes2 = [128]
dropout1 = [0.2]
dropout2 = [0.3]
batch_size = [32]

param_grid = {
    "nodes1": nodes1,
    "nodes2": nodes2,
    "dropout1": dropout1,
    "dropout2": dropout2,
    "batch_size": batch_size,
}

# Keras wrapper so scikit-learn can drive create_model
model = KerasClassifier(build_fn=create_model)

# NOTE(review): the single CV split uses slice(None) as both the train and the
# test selector, so every candidate is fitted and scored on the same full data.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    n_jobs=-1,
    cv=[(slice(None), slice(None))],
)
grid_result = grid_search.fit(X, y, epochs=50)

In [22]:
# Report the winning candidate of the nodes1 search
print("Best accuracy:", grid_result.best_score_)
print("Best params:", grid_result.best_params_)

# Keep the best nodes1 wrapped in a list so it can be used directly as a grid entry
bestNodes1 = [grid_result.best_params_.get('nodes1')]

Best accuracy: 0.0335712693631649
Best params: {'batch_size': 32, 'dropout1': 0.2, 'dropout2': 0.3, 'nodes1': 200, 'nodes2': 128}


In [None]:
# Search stage 2: nodes1 fixed to its best value, vary the second LSTM layer's width
nodes1 = bestNodes1
nodes2 = [128, 100, 90]
dropout1 = [0.2]
dropout2 = [0.3]
batch_size = [32]

param_grid = {
    "nodes1": nodes1,
    "nodes2": nodes2,
    "dropout1": dropout1,
    "dropout2": dropout2,
    "batch_size": batch_size,
}

model = KerasClassifier(build_fn=create_model)

# NOTE(review): train and test selectors are both slice(None), i.e. the score
# is computed on the same data the candidate was fitted on.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    n_jobs=-1,
    cv=[(slice(None), slice(None))],
)
grid_result = grid_search.fit(X, y, epochs=50)

In [27]:
# Report the winning candidate of the nodes2 search
print("Best accuracy:", grid_result.best_score_)
print("Best params:", grid_result.best_params_)

# Keep the best nodes2 wrapped in a list so it can be used directly as a grid entry
bestNodes2 = [grid_result.best_params_.get('nodes2')]

Best accuracy: 0.13484273850917816
Best params: {'batch_size': 32, 'dropout1': 0.2, 'dropout2': 0.3, 'nodes1': 200, 'nodes2': 128}


In [None]:
# Search stage 3: layer widths fixed to their best values, vary the first dropout rate
nodes1 = bestNodes1
nodes2 = bestNodes2
dropout1 = [0.2, 0.25, 0.3]
dropout2 = [0.3]
batch_size = [32]

param_grid = {
    "nodes1": nodes1,
    "nodes2": nodes2,
    "dropout1": dropout1,
    "dropout2": dropout2,
    "batch_size": batch_size,
}

model = KerasClassifier(build_fn=create_model)

# NOTE(review): train and test selectors are both slice(None), i.e. the score
# is computed on the same data the candidate was fitted on.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    n_jobs=-1,
    cv=[(slice(None), slice(None))],
)
grid_result = grid_search.fit(X, y, epochs=50)

In [29]:
# Report the winning candidate of the dropout1 search
print("Best accuracy:", grid_result.best_score_)
print("Best params:", grid_result.best_params_)

# Keep the best dropout1 wrapped in a list so it can be used directly as a grid entry
bestDropout1 = [grid_result.best_params_.get('dropout1')]

Best accuracy: 0.11967432498931885
Best params: {'batch_size': 32, 'dropout1': 0.2, 'dropout2': 0.3, 'nodes1': 200, 'nodes2': 128}


In [None]:
# Search stage 4: widths and first dropout fixed, vary the second dropout rate
nodes1 = bestNodes1
nodes2 = bestNodes2
dropout1 = bestDropout1
dropout2 = [0.3, 0.35, 0.37]
batch_size = [32]

param_grid = {
    "nodes1": nodes1,
    "nodes2": nodes2,
    "dropout1": dropout1,
    "dropout2": dropout2,
    "batch_size": batch_size,
}

model = KerasClassifier(build_fn=create_model)

# NOTE(review): train and test selectors are both slice(None), i.e. the score
# is computed on the same data the candidate was fitted on.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    n_jobs=-1,
    cv=[(slice(None), slice(None))],
)
grid_result = grid_search.fit(X, y, epochs=50)

In [31]:
# Report the winning candidate of the dropout2 search
print("Best accuracy:", grid_result.best_score_)
print("Best params:", grid_result.best_params_)

# Keep the best dropout2 wrapped in a list so it can be used directly as a grid entry
bestDropout2 = [grid_result.best_params_.get('dropout2')]

Best accuracy: 0.13082757592201233
Best params: {'batch_size': 32, 'dropout1': 0.2, 'dropout2': 0.35, 'nodes1': 200, 'nodes2': 128}


In [None]:
# Search stage 5: architecture and dropout rates fixed, vary the batch size
nodes1 = bestNodes1
nodes2 = bestNodes2
dropout1 = bestDropout1
dropout2 = bestDropout2
batch_size = [32, 64, 16]

param_grid = {
    "nodes1": nodes1,
    "nodes2": nodes2,
    "dropout1": dropout1,
    "dropout2": dropout2,
    "batch_size": batch_size,
}

model = KerasClassifier(build_fn=create_model)

# NOTE(review): train and test selectors are both slice(None), i.e. the score
# is computed on the same data the candidate was fitted on.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    n_jobs=-1,
    cv=[(slice(None), slice(None))],
)
grid_result = grid_search.fit(X, y, epochs=50)

In [33]:
# Report the winning candidate of the batch-size search
print("Best accuracy:", grid_result.best_score_)
print("Best params:", grid_result.best_params_)

# Keep the best batch size wrapped in a list, matching the other best* values
bestBatch = [grid_result.best_params_.get('batch_size')]

Best accuracy: 0.12859691679477692
Best params: {'batch_size': 32, 'dropout1': 0.2, 'dropout2': 0.35, 'nodes1': 200, 'nodes2': 128}
