Objective:
My objective for this machine learning project is to create a Recurrent Neural Network (RNN) that can generate (somewhat) legible poetry based on a collection of poems. To do so, I will clean the data and cut groups of words into sets of inputs for the network. The input size, i.e. the number of words per input, is yet to be determined: it must be large enough to capture important features such as rhyme schemes and context for the LSTM cell to remember, but not so large that training becomes too slow.

Instead of feeding in actual words, I will encode each word as an identifier; the unique set of these identifiers forms the vocabulary for our supervised learning. The target vector, our Y vector, will be a one-hot encoding over that vocabulary of the word that was actually present, which the LSTM cell will attempt to predict at each time step.

link to data : https://www.kaggle.com/ishnoor/poetry-analysis-with-machine-learning

In [None]:
import pandas as pd
import numpy as np

# Load the poem collection from "all.csv" in the working directory
# (presumably the Kaggle dataset linked above — confirm).
poems = pd.read_csv("all.csv")

In [None]:
# Character count of every poem, computed in one vectorized pass.
# Writing element by element through poems['length'][i] is chained assignment:
# pandas warns about it and, with copy-on-write enabled, the writes never reach
# the frame.  Rows without text get length 0 instead of raising.
poems['length'] = poems['content'].str.len().fillna(0).astype(int)

Clean data by deleting null entries etc.

In [None]:
# Order poems from shortest to longest so the extreme lengths sit at the two ends.
poems = poems.sort_values(by='length')
# Trim the tails: the 14 shortest and the 5 longest entries.
poems = poems.iloc[14:len(poems) - 5]

# Drop rows whose text contains publication/collection boilerplate instead of a poem.
# regex=False matches each marker literally; na=True treats rows without text as
# matches, so they are dropped exactly as the former `== False` comparison dropped them.
for marker in ('Published', 'from Selected Poems', 'Collected Poems'):
    poems = poems[~poems['content'].str.contains(marker, regex=False, na=True)]
    print(len(poems))

# Drop rows where the "poem" is just an introduction: the author's name occurs in
# the upper-cased text, or the poem's title occurs within its first 40 characters.
# The row labels are collected first and removed with a single drop() rather than
# rebuilding the frame once per offending row.
intro_rows = [
    ind
    for ind, row in poems.iterrows()
    if row['author'] in row['content'].upper()
    or str(row['poem name']) in row['content'][:40]
]
poems = poems.drop(intro_rows)
print(len(poems))

In [None]:
# Keep only the text of poems longer than 100 and shorter than 1000 characters,
# renumbered from 0 so they can be addressed by position.
in_range = (poems['length'] > 100) & (poems['length'] < 1000)
poem = poems.loc[in_range, 'content'].reset_index(drop=True)
X = poem  # second name for the same Series
num_poems = len(poem)

In [None]:
# Number of poems kept by the length filter.
print(num_poems)

Create vocab size and word dictionary

In [None]:
import re

# Join all poems into one string, each followed by a single space.  str.join builds
# the text in one pass instead of re-copying a growing string on every `+=`.
temp = ''.join(text + ' ' for text in poem)

# Lower-case, then tokenize: a token is either a run of word characters or one
# punctuation character.  From here on `poem` is a flat list of tokens.
poem = re.findall(r'[\w]+|[\'!"#$%&()*+,-./:;<=>?@[\]^_`{|}~]', temp.lower())

# Vocabulary = the distinct tokens (set order is arbitrary).
words = list(set(poem))
vocab_size = len(words)


In [None]:
# Summary statistics of the selected poems.
print(X.describe())

In [None]:
# Show the first selected poem.
X[0]

In [None]:
# Flatten Windows line breaks: every "\r\n" pair becomes a single space.
# Written back through X[:] so the existing Series is updated in place.
X[:] = X.str.replace("\r\n", " ", regex=False)

In [None]:
from keras.preprocessing.text import  Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [None]:
# Tokenizer limited to `vocab_size` words.
# NOTE(review): vocab_size comes from the regex tokenization above, which also counts
# punctuation marks as tokens — confirm it is the intended cap for this tokenizer.
tokenizer = Tokenizer( num_words=vocab_size)

In [None]:
# Build the tokenizer's vocabulary from the cleaned poems.
tokenizer.fit_on_texts(X)

In [None]:
# Encode every poem as a sequence of word indices and bring each sequence to a
# fixed length of 1000 steps.
text = pad_sequences(tokenizer.texts_to_sequences(X), maxlen=1000)

In [None]:
# Mapping from word to its integer index, as assigned by the tokenizer.
word_dict = tokenizer.word_index

In [None]:
# Number of distinct words known to the tokenizer.
# NOTE(review): Keras documents word_index numbering as starting at 1 (0 is
# reserved), so the largest index would equal maxwords — confirm that anything
# sized with maxwords leaves room for it.
maxwords = len(word_dict)

In [None]:
# Print every word with its occurrence count, then the number of distinct words.
word_counts = tokenizer.word_counts
for word, occurrences in word_counts.items():
    print(word, occurrences)

count = len(word_counts)
print(count)

In [None]:
# One 50-dimensional row per word in word_dict, initialised to zeros.
embedding_matrix = np.zeros((maxwords,50))

In [None]:
# Fill embedding_matrix with the pre-trained GloVe vector of every known word.
# Row (index - 1) holds the vector for tokenizer index `index`; words that do not
# occur in the file keep their zero row.
# The encoding is given explicitly so decoding does not depend on the platform
# default, and each vector is copied as one row instead of value by value.
embedding_dim = embedding_matrix.shape[1]
with open('glove.6B.50d.txt', encoding='utf-8') as f:
    for line in f:
        parts = line.split()
        if not parts:
            continue  # tolerate blank lines
        indx = word_dict.get(parts[0])
        if indx is not None:
            embedding_matrix[indx - 1] = np.asarray(
                parts[1:embedding_dim + 1], dtype=float)
    

In [None]:
# Inspect the last 50 rows of the embedding matrix.
embedding_matrix[-50:]

RNN Model:

In [None]:
from keras.models import Sequential, Model
from keras.layers import Embedding, LSTM, Dropout, TimeDistributed, Dense, Activation, Input
from keras.optimizers import Adam


# Sequence length used for the model inputs; same value as the maxlen=1000 used
# when padding the encoded poems.
num_steps = 1000
# Width of the LSTM layers in the model builders.
hidden_size = 350
# When True, the model builders add a Dropout(0.5) layer after the LSTM.
use_dropout = False




# Optimizer shared by the discriminator and the combined model.
# NOTE(review): the positional arguments are presumably learning rate 0.0002 and
# beta_1 0.5 — confirm against the installed Keras version.
optimizer = Adam(0.0002, 0.5)

In [None]:
def generator():
    """Build the poem generator network.

    Layers: frozen pre-trained embedding -> LSTM returning one output per step ->
    optional Dropout(0.5) -> per-step Dense(maxwords) with softmax.

    The embedding is seeded from ``embedding_matrix``, so its width is the width
    of that matrix rather than ``hidden_size``.

    Returns:
        A Keras ``Model`` that takes an input of shape ``(num_steps,)`` and
        returns the per-step softmax output of the network.
    """
    # Local import: only needed to seed the embedding weights.
    from keras.initializers import Constant

    # Assigning to `layer.weight` merely creates an unused attribute, so the
    # pre-trained vectors have to be supplied as the layer's initial weights.
    # embedding_matrix row k holds the vector for word index k + 1 and index 0 is
    # the padding value, so a zero row is prepended to make row i match index i.
    pretrained = np.vstack(
        [np.zeros((1, embedding_matrix.shape[1])), embedding_matrix])

    model = Sequential()
    model.add(Embedding(pretrained.shape[0], pretrained.shape[1],
                        embeddings_initializer=Constant(pretrained),
                        input_length=num_steps,
                        trainable=False))
    model.add(LSTM(hidden_size, return_sequences=True))
    if use_dropout:
        model.add(Dropout(0.5))
    model.add(TimeDistributed(Dense(maxwords)))
    model.add(Activation('softmax'))

    # Wrap the stack in a functional Model with an explicit input tensor.
    noise = Input(shape=(num_steps,))
    gen_poem = model(noise)

    return Model(noise, gen_poem)

In [None]:

def discriminator():
    """Build the poem discriminator network.

    Layers: frozen pre-trained embedding -> LSTM returning only its final output ->
    optional Dropout(0.5) -> Dense(1) with sigmoid.

    The embedding is seeded from ``embedding_matrix``, so its width is the width
    of that matrix rather than ``hidden_size``.

    Returns:
        A Keras ``Model`` that takes an input of shape ``(num_steps,)`` and
        returns a single sigmoid score per sample.
    """
    # Local import: only needed to seed the embedding weights.
    from keras.initializers import Constant

    # Assigning to `layer.weight` merely creates an unused attribute, so the
    # pre-trained vectors have to be supplied as the layer's initial weights.
    # embedding_matrix row k holds the vector for word index k + 1 and index 0 is
    # the padding value, so a zero row is prepended to make row i match index i.
    pretrained = np.vstack(
        [np.zeros((1, embedding_matrix.shape[1])), embedding_matrix])

    model = Sequential()
    model.add(Embedding(pretrained.shape[0], pretrained.shape[1],
                        embeddings_initializer=Constant(pretrained),
                        input_length=num_steps,
                        trainable=False))
    model.add(LSTM(hidden_size, return_sequences=False))
    if use_dropout:
        model.add(Dropout(0.5))
    model.add(Dense(1))
    model.add(Activation('sigmoid'))

    # Wrap the stack in a functional Model with an explicit input tensor.
    i_poem = Input(shape=(num_steps,))
    validity = model(i_poem)

    return Model(i_poem, validity)

In [None]:
# Build and compile the discriminator
# NOTE(review): this rebinds the name `discriminator` from the builder function to
# the model it returns, so the cell cannot be run a second time without
# re-defining the function first.
discriminator = discriminator()
discriminator.compile(loss='binary_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy'])

# Build the generator (the name `generator` is likewise rebound to the model)
generator = generator()

# The generator takes noise as input and generates poem
z = Input(shape=(num_steps,))
g_poem = generator(z)

# For the combined model we will only train the generator
# (the flag is set after the discriminator was compiled above and before
# `combined` is compiled below)
discriminator.trainable = False

# The discriminator takes generated poems as input and determines validity
# NOTE(review): the generator ends in a per-step softmax, while the discriminator's
# input is declared as Input(shape=(num_steps,)) feeding an Embedding — the shapes
# look incompatible; confirm this call builds.
validity = discriminator(g_poem)

# The combined model  (stacked generator and discriminator)
# Trains the generator to fool the discriminator
combined = Model(z, validity)
combined.compile(loss='binary_crossentropy', optimizer=optimizer)

Train Model