In [1]:
try:
    import keras
except:
    !pip install keras

Collecting keras
  Using cached Keras-2.4.3-py2.py3-none-any.whl (36 kB)
Installing collected packages: keras
Successfully installed keras-2.4.3


In [9]:
from pathlib import Path
import tensorflow as tf
tf_session = tf.compat.v1.Session()
from tensorflow.compat.v1.keras import backend as K
K.set_session(tf_session)

from keras.callbacks import ModelCheckpoint,  CSVLogger
from keras.layers import Add, Dense, Input, LSTM
from keras.models import Model
from keras.preprocessing.text import Tokenizer
from keras.utils import np_utils

import numpy as np
import pandas as pd
import joblib
!pip install np_utils
# Local library with model definitions for training and generating
from models import Generator, create_training_model

Collecting np_utils
  Using cached np_utils-0.5.12.1.tar.gz (61 kB)
Building wheels for collected packages: np-utils
  Building wheel for np-utils (setup.py) ... [?25ldone
[?25h  Created wheel for np-utils: filename=np_utils-0.5.12.1-py3-none-any.whl size=57125 sha256=74ed76b6a000a1f56a4fa697ab90793ae943ee5beb90fe317d9346b4e2370271
  Stored in directory: /Users/svetachurina122/Library/Caches/pip/wheels/58/98/54/2896a40fd91932a8a2568e688f87231f7da2eaad330254335a
Successfully built np-utils
Installing collected packages: np-utils
Successfully installed np-utils-0.5.12.1


# Load Input

In [10]:
# Settings

# Percent of samples to use for training, might be necessary if you're running out of memory
sample_size = 1

# The latent dimension of the LSTM
latent_dim = 2048

# Number of epochs to train for
epochs = 20

root_path = Path('../../..')
input_path = root_path / 'input'
poem_path = input_path / 'poems'
haiku_path = poem_path / 'haikus.csv'

name = 'all_data_test_2'
output_dir = Path('output_%s' % name)
output_dir.mkdir()

In [11]:
df = pd.read_csv(str(haiku_path))
df = df.sample(frac=sample_size)
df

Unnamed: 0,0,1,2,source,0_syllables,1_syllables,2_syllables
41255,Number one reason,I do stuff alone I'm not,gone bullshit myself,twaiku,5,7,5
49825,If you're not holding,five things you're not living up,to your potential,twaiku,5,7,5
85426,I don't collect memes,Does that make me out of sync,with society,twaiku,5,7,5
104025,I be looking at,money I saved like I might,as well spend this shit,twaiku,5,7,5
68243,kevin gates say he,a regular now he don't,make music no more,twaiku,5,7,5
...,...,...,...,...,...,...,...
2292,Cloudy sky,the sun a bright red,first cup of coffee,tempslibres,3,5,5
94501,Crescendo and Peak,Spooky Scary Skeletons,To each their own type,twaiku,5,7,5
136245,My class still haven't,done our senior pranks and,this our last week,twaiku,5,67,45
127048,How are you going,to laugh at yourself but not,even move your face,twaiku,5,7,5


# Format Input for Training

In [12]:
# Duplicate lines with ambiguous syllable counts
# (syllable counts where there is a comma because
# multiple pronounciations are acceptable)

lines = set([0, 1, 2])

for i in range(3):
    lines.remove(i)
    df = df[[
        '0', '1', '2',
        #'1_syllables', '2_syllables'
    ] + ['%s_syllables' % j for j in lines]].join(
        df['%s_syllables' % i].str.split(
            ',', expand=True
        ).stack(-1).reset_index(
            level=1, drop=True
        ).rename('%s_syllables' % i)
    ).drop_duplicates()
    lines.add(i)

df

Unnamed: 0,0,1,2,0_syllables,1_syllables,2_syllables
0,Memorial Day --,a shadow for each,white cross,5,5,2
1,spring rain -,as the doctor speaks,i think of lilacs,2,5,5
1,spring rain -,as the doctor speaks,i think of lilacs,3,5,5
2,spring moonset --,a rice ball for,breakfast,3,4,2
2,spring moonset --,a rice ball for,breakfast,4,4,2
...,...,...,...,...,...,...
143132,I'm not asking did,you say it nor clarify,what you said neither,5,7,5
143133,You are truly a,moron or a liar I'm,inclined to think both,5,7,5
143134,Ain't no selfie on,this earth that's gonna make me,like Theresa May,5,7,5
143135,is doing a great,job turning Independents,into Democrats,5,7,5


In [13]:
# Drop samples that are longer that the 99th percentile of length

max_line_length = int(max([df['%s' % i].str.len().quantile(.99) for i in range(3)]))
df = df[
    (df['0'].str.len() <= max_line_length) & 
    (df['1'].str.len() <= max_line_length) & 
    (df['2'].str.len() <= max_line_length)
].copy()
df

Unnamed: 0,0,1,2,0_syllables,1_syllables,2_syllables
0,Memorial Day --,a shadow for each,white cross,5,5,2
1,spring rain -,as the doctor speaks,i think of lilacs,2,5,5
1,spring rain -,as the doctor speaks,i think of lilacs,3,5,5
2,spring moonset --,a rice ball for,breakfast,3,4,2
2,spring moonset --,a rice ball for,breakfast,4,4,2
...,...,...,...,...,...,...
143132,I'm not asking did,you say it nor clarify,what you said neither,5,7,5
143133,You are truly a,moron or a liar I'm,inclined to think both,5,7,5
143134,Ain't no selfie on,this earth that's gonna make me,like Theresa May,5,7,5
143135,is doing a great,job turning Independents,into Democrats,5,7,5


In [14]:
# Pad the lines to the max line length with new lines
for i in range(3):
    # For input, duplicate the first character
    # TODO - Why?
    df['%s_in' % i] = (df[str(i)].str[0] + df[str(i)]).str.pad(max_line_length+2, 'right', '\n')
    
    # 
    #df['%s_out' % i] = df[str(i)].str.pad(max_line_len, 'right', '\n') + ('\n' if i == 2 else df[str(i+1)].str[0])
    
    # TODO - trying to add the next line's first character before the line breaks
    if i == 2: # If it's the last line
        df['%s_out' % i] = df[str(i)].str.pad(max_line_length+2, 'right', '\n')
    else: 
        # If it's the first or second line, add the first character of the next line to the end of this line.
        # This helps with training so that the next RNN has a better chance of getting the first character right.
        df['%s_out' % i] = (df[str(i)] + '\n' + df[str(i+1)].str[0]).str.pad(max_line_length+2, 'right', '\n')
    
max_line_length += 2

df

Unnamed: 0,0,1,2,0_syllables,1_syllables,2_syllables,0_in,0_out,1_in,1_out,2_in,2_out
0,Memorial Day --,a shadow for each,white cross,5,5,2,MMemorial Day --\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,Memorial Day --\na\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,aa shadow for each\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,a shadow for each\nw\n\n\n\n\n\n\n\n\n\n\n\n\n...,wwhite cross\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,white cross\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\...
1,spring rain -,as the doctor speaks,i think of lilacs,2,5,5,sspring rain -\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,spring rain -\na\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,aas the doctor speaks\n\n\n\n\n\n\n\n\n\n\n\n\...,as the doctor speaks\ni\n\n\n\n\n\n\n\n\n\n\n\...,ii think of lilacs\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,i think of lilacs\n\n\n\n\n\n\n\n\n\n\n\n\n\n\...
1,spring rain -,as the doctor speaks,i think of lilacs,3,5,5,sspring rain -\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,spring rain -\na\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,aas the doctor speaks\n\n\n\n\n\n\n\n\n\n\n\n\...,as the doctor speaks\ni\n\n\n\n\n\n\n\n\n\n\n\...,ii think of lilacs\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,i think of lilacs\n\n\n\n\n\n\n\n\n\n\n\n\n\n\...
2,spring moonset --,a rice ball for,breakfast,3,4,2,sspring moonset --\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,spring moonset --\na\n\n\n\n\n\n\n\n\n\n\n\n\n...,aa rice ball for\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,a rice ball for\nb\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,bbreakfast\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,breakfast\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\...
2,spring moonset --,a rice ball for,breakfast,4,4,2,sspring moonset --\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,spring moonset --\na\n\n\n\n\n\n\n\n\n\n\n\n\n...,aa rice ball for\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,a rice ball for\nb\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,bbreakfast\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,breakfast\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\...
...,...,...,...,...,...,...,...,...,...,...,...,...
143132,I'm not asking did,you say it nor clarify,what you said neither,5,7,5,II'm not asking did\n\n\n\n\n\n\n\n\n\n\n\n\n\...,I'm not asking did\n \n\n\n\n\n\n\n\n\n\n\n\n\...,you say it nor clarify\n\n\n\n\n\n\n\n\n\n\n...,you say it nor clarify\nw\n\n\n\n\n\n\n\n\n\n...,wwhat you said neither\n\n\n\n\n\n\n\n\n\n\n\n...,what you said neither\n\n\n\n\n\n\n\n\n\n\n\n\...
143133,You are truly a,moron or a liar I'm,inclined to think both,5,7,5,YYou are truly a\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,You are truly a\n \n\n\n\n\n\n\n\n\n\n\n\n\n\n...,moron or a liar I'm\n\n\n\n\n\n\n\n\n\n\n\n\...,moron or a liar I'm\ni\n\n\n\n\n\n\n\n\n\n\n\...,iinclined to think both\n\n\n\n\n\n\n\n\n\n\n\...,inclined to think both\n\n\n\n\n\n\n\n\n\n\n\n...
143134,Ain't no selfie on,this earth that's gonna make me,like Theresa May,5,7,5,AAin't no selfie on\n\n\n\n\n\n\n\n\n\n\n\n\n\...,Ain't no selfie on\n \n\n\n\n\n\n\n\n\n\n\n\n\...,this earth that's gonna make me\n\n\n\n\n\n\...,this earth that's gonna make me\nl\n\n\n\n\n\...,llike Theresa May\n\n\n\n\n\n\n\n\n\n\n\n\n\n\...,like Theresa May\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...
143135,is doing a great,job turning Independents,into Democrats,5,7,5,iis doing a great\n\n\n\n\n\n\n\n\n\n\n\n\n\n\...,is doing a great\n \n\n\n\n\n\n\n\n\n\n\n\n\n\...,job turning Independents\n\n\n\n\n\n\n\n\n\n...,job turning Independents\ni\n\n\n\n\n\n\n\n\n...,iinto Democrats\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\...,into Democrats\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...


In [15]:
inputs = df[['0_in', '1_in', '2_in']].values

tokenizer = Tokenizer(filters='', char_level=True)
tokenizer.fit_on_texts(inputs.flatten())
n_tokens = len(tokenizer.word_counts) + 1

# X is the input for each line in sequences of one-hot-encoded values
X = np_utils.to_categorical([
    tokenizer.texts_to_sequences(inputs[:,i]) for i in range(3)
], num_classes=n_tokens)

outputs = df[['0_out', '1_out', '2_out']].values

# Y is the output for each line in sequences of one-hot-encoded values
Y = np_utils.to_categorical([
    tokenizer.texts_to_sequences(outputs[:,i]) for i in range(3)
], num_classes=n_tokens)

# X_syllables is the count of syllables for each line
X_syllables = df[['0_syllables', '1_syllables', '2_syllables']].values

In [16]:
joblib.dump([latent_dim, n_tokens, max_line_length, tokenizer], str(output_dir / 'metadata.pkl'))

['output_all_data_test_2/metadata.pkl']

[['5' '5' '2']
 ['2' '5' '5']
 ['3' '5' '5']
 ...
 ['5' '7' '5']
 ['5' '7' '5']
 ['5' '7' '5']]


# Training Model

In [17]:
training_model, lstm, lines, inputs, outputs = create_training_model(latent_dim, n_tokens)

filepath = str(output_dir / ("%s-{epoch:02d}-{loss:.2f}-{val_loss:.2f}.hdf5" % latent_dim))
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')

csv_logger = CSVLogger(str(output_dir / 'training_log.csv'), append=True, separator=',')

callbacks_list = [checkpoint, csv_logger]

training_model.fit([
    X[0], X_syllables[:,0], 
    X[1], X_syllables[:,1], 
    X[2], X_syllables[:,2]
], [Y[0], Y[1], Y[2]], batch_size=64, epochs=epochs, validation_split=.1, callbacks=callbacks_list)

Epoch 1/20


UnimplementedError:  Cast string to float is not supported
	 [[node model/Cast (defined at <ipython-input-17-95930e40b45f>:14) ]] [Op:__inference_train_function_8928]

Function call stack:
train_function


# Test Model

In [None]:
generator = Generator(lstm, lines, tf_session, tokenizer, n_tokens, max_line_length)

In [None]:
generator.generate_haiku()