#### Data modeling, analysis (Death Cab for Cutie)
#### Word level modeling using same seed "with eyes like the summer \n all beauty and truth"
#### 175 epochs, printouts of progress after every 35 epochs

In [1]:
# Imports
# Basics
import sys
from __future__ import print_function, division
import pandas as pd 
import numpy as np
import random
from itertools import cycle
from matplotlib import pyplot as plt
%matplotlib inline

# keras
np.random.seed(13)
from keras.models import Sequential
from keras.layers import Dense, Embedding, Activation, LSTM, Bidirectional, Dropout
# from IPython.display import SVG
from keras.utils import np_utils
from keras.utils.np_utils import to_categorical
# from keras.utils.data_utils import get_file
# from keras.utils.visualize_util import model_to_dot, plot
from keras.preprocessing.text import Tokenizer
# from keras.utils.vis_utils import model_to_dot #, plot
# from keras.datasets import imdb, reuters
from keras.preprocessing import sequence
from keras.optimizers import SGD, RMSprop, Nadam
# from keras.preprocessing.sequence import skipgrams

Using TensorFlow backend.


In [1]:
# Each txt file represents one song. This cell only lists the song files; they are opened and
# preprocessed in the next cell. The dataset that I used can be found here:
# https://www.kaggle.com/artimous/every-song-you-have-heard-almost
# `!ls` is IPython shell capture: `files` receives the directory listing as a list of lines.
files = !ls /home/ubuntu/dcfc

In [3]:
# Separate the punctuation so each mark is treated as its own "word"; the marks are
# not removed at the tokenizer step (the Tokenizer is created with filters='').
punc = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\''
# EOS = '__fin___' # future directions: generate text until reach End_Of_Song symbol
texts = []
for file in files:
    # Use a context manager so every file handle is closed as soon as it has been read,
    # instead of leaking one open handle per song.
    with open('/home/ubuntu/dcfc/' + file) as song_file:
        text = song_file.read().lower()
    # Pad every punctuation character (including newline and tab) with spaces so that
    # whitespace splitting yields it as a standalone token. str.replace is a no-op when
    # the character is absent, so no membership pre-check is needed.
    for p in punc:
        text = text.replace(p, ' ' + p + ' ')
    texts.append(text)

In [4]:
# Spot-check one preprocessed song; in the run shown it contains the seed lyric used for generation later on
texts[72]

"i once knew a girl \n in the years of my youth \n with eyes like the summer \n all beauty and truth \n  \n in the mourning i fled \n left a note and it read \n someday you will be loved \n  \n i cannot pretend that i felt any regret \n cause each broken heart will eventually mend \n as the blood runs red down the needle and thread \n someday you will be loved \n  \n you ' ll be loved ,  you ' ll be loved \n like you never have known \n and the memories of me will seem more like bad dreams \n just a series of blurs like i never occurred \n someday you will be loved \n  \n you may feel alone \n when you ' re falling asleep \n every time tears roll down your cheek \n but i know your heart belongs \n to someone you ' ve yet to meet \n someday you will be loved \n  \n you ' ll be loved ,  you ' ll be loved \n like you never have known \n and the memories of me will seem more like bad dreams \n just a series of blurs like i never occurred \n someday you will be loved \n  \n you ' ll be love

In [5]:
# Tokenize. filters='' turns off the Tokenizer's character filtering so the punctuation
# separated out earlier is kept as tokens.
# fit_on_texts builds the vocabulary in place (see tokenizer.word_index); the list-of-lists
# integer encoding is produced later by texts_to_sequences.
tokenizer = Tokenizer(filters='')
tokenizer.fit_on_texts(texts) 

In [6]:
# Number of songs (one preprocessed text per file) in the corpus
len(texts)

126

In [7]:
# Sanity check: the seed phrase encodes to exactly 10 token ids, the same length as the
# maxlen window defined in the next cell.
tokenizer.texts_to_sequences(['with eyes like the summer \n all beauty and truth'])

[[32, 129, 38, 2, 168, 1, 18, 951, 4, 221]]

In [8]:
# Encode every song as a list of integer word ids
sequences = tokenizer.texts_to_sequences(texts)

# Lookup tables: word -> id, and its inverse id -> word
vocab_indices = tokenizer.word_index
indices_vocab = {index: word for word, index in vocab_indices.items()}
# One extra slot because word_index ids start at 1
vocabsize = len(vocab_indices) + 1

print('corpus length:', len(texts))
print('total words:', vocabsize-1)

# Sliding-window settings: each training sample is `maxlen` consecutive token ids,
# and consecutive windows start `step` tokens apart.
maxlen = 10
step = 1

# Define a generator function in order to use fit_generator method
def generator_text(texts, batchsize=100, shuffle=True):
    """Yield one (X, y) training batch per song from the module-level ``sequences``.

    Each song is cut into sliding windows of ``maxlen`` token ids, with window
    starts ``step`` tokens apart. The target for a window is the token id that
    immediately follows it, one-hot encoded over ``vocabsize`` classes.

    NOTE: ``texts``, ``batchsize`` and ``shuffle`` are accepted so existing
    callers keep working, but they are not used: songs are read from the
    global ``sequences`` in order, and each yielded batch contains every
    window of a single song.

    Yields:
        tuple: ``(X, y)`` where ``X`` is an integer array of shape
        ``(n_windows, maxlen)`` and ``y`` is the one-hot array returned by
        ``to_categorical`` for the ``n_windows`` next-token targets.
    """
    for song in sequences:
        window_starts = range(0, len(song) - maxlen, step)
        # A song with at most `maxlen` tokens has no complete window plus
        # target. Skip it rather than yield an empty, wrongly-shaped batch.
        if len(window_starts) == 0:
            continue
        X = [song[i:i + maxlen] for i in window_starts]
        y = [song[i + maxlen] for i in window_starts]
        yield np.array(X), to_categorical(np.array(y), nb_classes=vocabsize)


# Assemble the network: embedding -> bidirectional LSTM -> dropout -> softmax over the vocabulary
print('Build model...')
model = Sequential([
    Embedding(vocabsize, 100, input_length=maxlen),
    Bidirectional(LSTM(128)),
    Dropout(0.3),
    Dense(vocabsize),
    Activation('softmax'),
])

# Next-word prediction is a multi-class problem, hence categorical cross-entropy
optimizer = Nadam(lr=0.002)
model.compile(optimizer=optimizer, loss='categorical_crossentropy')


def sample(preds, temperature=1.0):
    """Draw one index from a probability vector after temperature rescaling.

    The probabilities are converted to log space, divided by ``temperature``,
    renormalised with a softmax, and a single index is drawn from the result.
    Temperatures below 1 sharpen the distribution and temperatures above 1
    flatten it.

    Args:
        preds: array-like of probabilities (e.g. one row of softmax output).
        temperature: non-zero scaling factor applied to the log-probabilities.

    Returns:
        The sampled index, as returned by ``np.argmax``.
    """
    preds = np.asarray(preds).astype('float64')
    # A probability of exactly 0 is a legitimate -inf logit (it ends up with
    # probability 0 again), so silence the divide-by-zero warning from log.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature
    # Softmax is invariant to a constant shift. Subtracting the max keeps the
    # largest exponent at 0, so exp() cannot underflow every entry to 0 at low
    # temperatures (which would give 0/0 = NaN and make multinomial raise).
    logits = logits - np.max(logits)
    exp_logits = np.exp(logits)
    probs = exp_logits / np.sum(exp_logits)
    probas = np.random.multinomial(1, probs, 1)
    return np.argmax(probas)

corpus length: 126
total words: 2786
Build model...


In [None]:
batchsize = 123 # use factors(n) to determine decent batchsize, max ~100

# One pass over the generator to count the training windows; this count is
# handed to fit_generator as the number of samples per epoch.
totalrecs = 0
for X, _ in generator_text(texts, batchsize=1, shuffle=False):
    totalrecs += len(X)

# fit_generator needs an endless stream, so loop over the per-song batches forever
gen2 = cycle(generator_text(texts, batchsize=batchsize, shuffle=True))

# Fixed seed phrase, so output is comparable across iterations and diversities
sentence = 'with eyes like the summer \n all beauty and truth'

# train the model, output generated text after each iteration
for iteration in range(1, 5+1):
    print()
    print('-' * 50)
    print('Iteration', iteration)
    model.fit_generator(gen2, totalrecs, nb_epoch=35) # keras 2.0 kwargs = steps_per_epoch and epochs
                                                      # Ubuntu Deep Learning instance on AWS running on keras 1.0 as of
                                                      # Sept 8th, 2017

    for diversity in [0.2, 0.5, 1.0, 1.2]:
        print()
        print('----- diversity:', diversity)

        # Re-encode the seed for every diversity: x_ grows as words are generated
        x_ = tokenizer.texts_to_sequences([sentence])[0]
        generated = sentence
        print('----- Generating with seed: "' + sentence + '"')
        sys.stdout.write(generated)

        for i in range(81):
            # Feed the most recent `maxlen` token ids as a batch of one
            x = np.array(x_[-maxlen:])
            x = np.expand_dims(x, 0)
            preds = model.predict(x, verbose=0)[0]
            next_index = sample(preds, diversity)
            # The softmax has a slot for id 0, but word ids start at 1, so id 0
            # has no vocabulary entry. Skip that draw instead of raising KeyError.
            if next_index not in indices_vocab:
                continue
            next_word = indices_vocab[next_index]
            x_.append(next_index)
            # If next_word is punctuation, skip adding space. Not perfect, doesn't handle punc that prepends word
            separator = '' if next_word in punc else ' '
            # Keep `generated` identical to what is written to stdout
            generated += separator + next_word
            sys.stdout.write(separator + next_word)
            sys.stdout.flush()
        print()


--------------------------------------------------
Iteration 1
Epoch 1/35
Epoch 2/35
Epoch 3/35
Epoch 4/35
Epoch 5/35
Epoch 6/35
Epoch 7/35
Epoch 8/35
Epoch 9/35
Epoch 10/35
Epoch 11/35
Epoch 12/35
Epoch 13/35
Epoch 14/35
Epoch 15/35
Epoch 16/35
Epoch 17/35
Epoch 18/35
Epoch 19/35
Epoch 20/35
Epoch 21/35
Epoch 22/35
Epoch 23/35
Epoch 24/35
Epoch 25/35
Epoch 26/35
Epoch 27/35
Epoch 28/35
Epoch 29/35
Epoch 30/35
Epoch 31/35
Epoch 32/35
Epoch 33/35
Epoch 34/35
Epoch 35/35

----- diversity: 0.2
----- Generating with seed: "with eyes like the summer 
 all beauty and truth"
with eyes like the summer 
 all beauty and truth
 but oh your friends' s nothing
 there was nothing past all
 there was little little we could not be afraid of.

 and darling, you' re so far for this.
 with you finally the end
 but what was going it' s true
 that there are holes left in both of our shoes

 if the takes them to know
' cause you' re so far

----- diversity: 0.5
----- Generating with seed: "with eyes like t

#### Note: the model filename below contains a typo. It says 125 epochs, but the model was trained for 135 epochs.

In [11]:
# Save the trained model to disk. NOTE: per the markdown note above, the "125_epochs"
# in this filename is a typo.
model.save('090817_bidirectional_dcfc_125_epochs')

In [12]:
# Reload the saved model from disk into a separate variable (model1).
# NOTE(review): this import would conventionally live in the imports cell at the top.
from keras.models import load_model

model1 = load_model('090817_bidirectional_dcfc_125_epochs')

In [13]:
# Show the layers, output shapes and parameter counts of the reloaded model
model1.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_1 (Embedding)          (None, 10, 100)       278700      embedding_input_2[0][0]          
____________________________________________________________________________________________________
bidirectional_1 (Bidirectional)  (None, 256)           234496      embedding_1[0][0]                
____________________________________________________________________________________________________
dropout_1 (Dropout)              (None, 256)           0           bidirectional_1[0][0]            
____________________________________________________________________________________________________
dense_1 (Dense)                  (None, 2787)          716259      dropout_1[0][0]                  
___________________________________________________________________________________________

#### This architecture does VERY well at predicting the words that go with the song. Can it be used to generate songs "in the style of" an artist, i.e., is it good enough to be given a seed_sentence that is NOT part of the original corpus?