<a href="https://colab.research.google.com/github/southjohn64/exercises/blob/main/lstm_songs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [26]:
from gensim.models import Word2Vec
import numpy as np
import io
from keras.callbacks import LambdaCallback, ModelCheckpoint, EarlyStopping
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, LSTM, Bidirectional, Embedding

In [10]:
corpus= '/content/song.txt'
with io.open(corpus, encoding='utf-8') as f:
  text = f.read().lower().replace('\n', ' \n ')
print('Corpus length in characters:', len(text))


text_in_words = [w for w in text.split(' ') if w.strip() != '' or w == '\n']
print('Corpus length in words:', len(text_in_words))

# remove punctioation ??  ,

Corpus length in characters: 2055
Corpus length in words: 429


In [13]:
text_in_words[1:5]

['on', 'the', 'beat,', 'boy']

In [17]:
# Calculate word frequency
MIN_WORD_FREQUENCY = 5
word_freq = {}
for word in text_in_words:
    word_freq[word] = word_freq.get(word, 0) + 1

ignored_words = set()
for k, v in word_freq.items():
    if word_freq[k] < MIN_WORD_FREQUENCY:
        ignored_words.add(k)

words = set(text_in_words)
print('Unique words before ignoring:', len(words))
print('Ignoring words with frequency <', MIN_WORD_FREQUENCY)
words = sorted(set(words) - ignored_words)
print('Unique words after ignoring:', len(words))

word_indices = dict((c, i) for i, c in enumerate(words))
indices_word = dict((i, c) for i, c in enumerate(words))

Unique words before ignoring: 146
Ignoring words with frequency < 5
Unique words after ignoring: 23


In [19]:
# cut the text in semi-redundant sequences of SEQUENCE_LEN words
SEQUENCE_LEN = 5
STEP = 1
sentences = []
next_words = []
ignored = 0
for i in range(0, len(text_in_words) - SEQUENCE_LEN, STEP):
    # Only add sequences where no word is in ignored_words
    if len(set(text_in_words[i: i+SEQUENCE_LEN+1]).intersection(ignored_words)) == 0:
        sentences.append(text_in_words[i: i + SEQUENCE_LEN])
        next_words.append(text_in_words[i + SEQUENCE_LEN])
    else:
        ignored = ignored+1
print('Ignored sequences:', ignored)
print('Remaining sequences:', len(sentences))

Ignored sequences: 348
Remaining sequences: 76


In [23]:
def shuffle_and_split_training_set(sentences_original, next_original, percentage_test=10):
    # shuffle at unison
    print('Shuffling sentences')

    tmp_sentences = []
    tmp_next_word = []
    for i in np.random.permutation(len(sentences_original)):
        tmp_sentences.append(sentences_original[i])
        tmp_next_word.append(next_original[i])

    cut_index = int(len(sentences_original) * (1.-(percentage_test/100.)))
    x_train, x_test = tmp_sentences[:cut_index], tmp_sentences[cut_index:]
    y_train, y_test = tmp_next_word[:cut_index], tmp_next_word[cut_index:]

    print("Size of training set = %d" % len(x_train))
    print("Size of test set = %d" % len(y_test))
    return (x_train, y_train), (x_test, y_test)
(sentences, next_words), (sentences_test, next_words_test)= shuffle_and_split_training_set(sentences, next_words)

Shuffling sentences
Size of training set = 66
Size of test set = 8


In [57]:
def get_model(input_dim, dropout=0.2):
    print('Build model...')
    model = Sequential(name='seq_word2vec_lyrics')
    model.add(Embedding(input_dim=input_dim, output_dim=1024))
    model.add(Bidirectional(LSTM(128)))
    if dropout > 0:
        model.add(Dropout(dropout))
    model.add(Dense(len(words)))
    model.add(Activation('softmax'))
    return model


In [29]:
def generator(sentence_list, next_word_list, batch_size):
    index = 0
    while True:
        x = np.zeros((batch_size, SEQUENCE_LEN, len(words)), dtype=np.bool)
        y = np.zeros((batch_size, len(words)), dtype=np.bool)
        for i in range(batch_size):
            for t, w in enumerate(sentence_list[index]):
                x[i, t, word_indices[w]] = 1
            y[i, word_indices[next_word_list[index]]] = 1

            index = index + 1
            if index == len(sentence_list):
                index = 0
        yield x, y

In [31]:
def on_epoch_end(epoch, logs):
    # Function invoked at end of each epoch. Prints generated text.
    examples_file.write('\n----- Generating text after Epoch: %d\n' % epoch)

    # Randomly pick a seed sequence
    seed_index = np.random.randint(len(sentences+sentences_test))
    seed = (sentences+sentences_test)[seed_index]

    for diversity in [0.3, 0.4, 0.5, 0.6, 0.7]:
        sentence = seed
        examples_file.write('----- Diversity:' + str(diversity) + '\n')
        examples_file.write('----- Generating with seed:\n"' + ' '.join(sentence) + '"\n')
        examples_file.write(' '.join(sentence))

        for i in range(50):
            x_pred = np.zeros((1, SEQUENCE_LEN))
            for t, word in enumerate(sentence):
                x_pred[0, t] = word_indices[word]

            preds = model.predict(x_pred, verbose=0)[0]
            next_index = sample(preds, diversity)
            next_word = indices_word[next_index]

            sentence = sentence[1:]
            sentence.append(next_word)

            examples_file.write(" "+next_word)
        examples_file.write('\n')
    examples_file.write('='*80 + '\n')
    examples_file.flush()

In [32]:
file_path = "./checkpoints/LSTM_LYRICS-epoch{epoch:03d}-words%d-sequence%d-minfreq%d-loss{loss:.4f}-acc{acc:.4f}-val_loss{val_loss:.4f}-val_acc{val_acc:.4f}" % (
    len(words),
    SEQUENCE_LEN,
    MIN_WORD_FREQUENCY
)
checkpoint = ModelCheckpoint(file_path, monitor='val_acc', save_best_only=True)
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)
early_stopping = EarlyStopping(monitor='val_acc', patience=5)
callbacks_list = [checkpoint, print_callback, early_stopping]

In [35]:
model = Word2Vec(sentences, min_count=1,size=300)

In [55]:
train_next_word_vw = [model.wv[next_word] for next_word in next_words]

In [53]:
train_sentences_vw = []
for sent in sentences:
  sent_wv = []
  for word in sent:
    sent_wv.append(model.wv[word])
  train_sentences_vw.append(sent_wv)
len(train_sentences_vw)


66

In [61]:
model = get_model(len(train_sentences_vw))


Build model...
Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 1024)        67584     
_________________________________________________________________
bidirectional_4 (Bidirection (None, 256)               1180672   
_________________________________________________________________
dropout_3 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 23)                5911      
_________________________________________________________________
activation_3 (Activation)    (None, 23)                0         
Total params: 1,254,167
Trainable params: 1,254,167
Non-trainable params: 0
_________________________________________________________________


In [66]:
train_sentences_vw_np = np.asarray(train_sentences_vw)
train_next_word_vw_np = np.asarray(train_next_word_vw)

In [67]:
train_next_word_vw_np.shape

(66, 300)

In [68]:
model.compile(loss='categorical_crossentropy', optimizer='RMSprop', metrics=['accuracy'])
model.summary()
model.fit(train_sentences_vw_np, train_next_word_vw_np)

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 1024)        67584     
_________________________________________________________________
bidirectional_4 (Bidirection (None, 256)               1180672   
_________________________________________________________________
dropout_3 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 23)                5911      
_________________________________________________________________
activation_3 (Activation)    (None, 23)                0         
Total params: 1,254,167
Trainable params: 1,254,167
Non-trainable params: 0
_________________________________________________________________


ValueError: ignored