In [1]:
# from google.colab import drive
# drive.mount('/content/drive')
# %cd /content/drive/MyDrive/SMU_MITB_NLP/Group Project/NLP-Lyric-Generator/src/bin

In [1]:
### Standard Imports
import numpy as np
import re
import sys
import os
from collections import Counter

import tensorflow as tf
from tensorflow.keras import layers

In [2]:
### Custom Imports
sys.path.append('../')
import lib.utilities as utils
import lib.autoencoder_utilities as ae_utils

In [3]:
### Text Parameters
# Special tokens handed to the tokenizer and to text generation further down.
start_token, end_token = '<cls>', '<eos>'
pad_token, unk_token = '<pad>', '<unk>'
newline_token = '<new>'

### General Parameters
# Seed forwarded to dataset shuffling and to text generation.
random_seed = 2022
# Output directory for checkpoints and generated text.
model_folder = '../../models/autoencoder/lstm/v1'

### Model Parameters
# Data / batching
val_split = 0.2     # fraction of examples held out for validation
window_len = 15     # tokens per input window
batch_size = 64
# Architecture (dec_dim is the LSTM width used by the model cell)
enc_dim = dec_dim = 256
dropout = 0.05
recurrent_dropout = 0.05
# Optimisation
learn_rate = 0.001
epochs = 5

In [4]:
# Ensure the output directory exists before anything is written to it;
# an already-existing folder is not treated as an error.
os.makedirs(name=model_folder, exist_ok=True)

In [5]:
### Load Data
# Load the raw corpus through the project helper. The structure of the returned
# object is defined in lib.utilities and is not visible from this notebook
# (presumably the song-lyric corpus - confirm against utils.load_corpus).
corpus = utils.load_corpus()

In [6]:
### Pre-Processing Text
# Tokenise the corpus with the special tokens configured above. The helper
# returns six values; the index/vocab lookup tables and the per-song token
# indices are the ones consumed by later cells.
(words, word_count, index_to_vocab,
 vocab_to_index, songs, songs_token_ind) = utils.tokenize_corpus(
    corpus,
    window_length = window_len,
    start_token = start_token,
    end_token = end_token,
    pad_token = pad_token,
    unk_token = unk_token,
    newline_token = newline_token,
)

# Vocabulary size is taken as the number of entries in word_count.
vocab_size = len(word_count)

In [7]:
# Build fixed-length training windows from the tokenised songs.
# Judging by the sample printed in the next cell, x_decoder[i] holds `window_len`
# consecutive tokens, x_encoder[i] is the same window shifted one token ahead,
# and y[i] is the token that follows x_decoder[i] - confirm against
# ae_utils.construct_seq_data.
x_encoder, x_decoder, y = ae_utils.construct_seq_data(songs_token_ind, window_len)

In [8]:
# Spot-check one training example by decoding its token ids back to words.
# The index is drawn from a generator seeded with `random_seed` so the same
# example is shown on every Restart & Run All (the global NumPy RNG used
# previously was never seeded).
rng = np.random.default_rng(random_seed)
rand_int = int(rng.integers(len(x_encoder)))
print([index_to_vocab.get(x) for x in x_encoder[rand_int]])
print([index_to_vocab.get(x) for x in x_decoder[rand_int]])
print(index_to_vocab.get(y[rand_int]))

['and', 'equality', '<new>', 'so', 'as', 'to', 'achieve', 'happiness', 'prosperity', '<new>', 'and', 'progress', 'for', 'our', 'nation']
['justice', 'and', 'equality', '<new>', 'so', 'as', 'to', 'achieve', 'happiness', 'prosperity', '<new>', 'and', 'progress', 'for', 'our']
nation


In [None]:
def construct_datasets(x_decoder, y, validation_split, batch_size, buffer = 10000, random_seed = 2022, one_hot = True, vocab_size = None):
    """Build batched training and validation ``tf.data`` pipelines.

    Args:
        x_decoder: Model inputs; sliced along the first axis, one example per slice.
        y: Targets aligned with ``x_decoder`` along the first axis.
        validation_split: Fraction of the examples held out for validation.
        batch_size: Number of examples per batch.
        buffer: Shuffle buffer size.
        random_seed: Seed for the shuffle.
        one_hot: When True, one-hot encode both inputs and targets.
        vocab_size: Depth of the one-hot encoding; required when ``one_hot`` is True.

    Returns:
        ``(train_dataset, val_dataset)``, each batched, cached and prefetched.

    Raises:
        ValueError: If ``one_hot`` is True and ``vocab_size`` is None.
    """
    # Fail loudly and early: returning None here would only surface later as an
    # unrelated unpacking error at the call site.
    if one_hot and vocab_size is None:
        raise ValueError('Please provide vocab size for one hot encoding')

    dataset = tf.data.Dataset.from_tensor_slices((x_decoder, y))
    # The shuffled order must be identical every time the dataset is iterated,
    # otherwise take() and skip() below would slice different permutations and
    # validation examples would leak into the training split. The order was
    # already frozen per split by cache(), so nothing is lost by disabling
    # reshuffling.
    dataset = dataset.shuffle(buffer_size = buffer, seed = random_seed,
                              reshuffle_each_iteration = False)
    if one_hot:
        dataset = dataset.map(lambda tokens, target: (tf.one_hot(tokens, depth = vocab_size),
                                                      tf.one_hot(target, depth = vocab_size)))

    # Single split point shared by both subsets so they are complementary.
    n_train = int((1-validation_split)*len(dataset))
    train_dataset = dataset.take(n_train)
    val_dataset = dataset.skip(n_train)

    train_dataset_final = train_dataset.batch(batch_size).cache().prefetch(buffer_size=tf.data.AUTOTUNE)
    val_dataset_final = val_dataset.batch(batch_size).cache().prefetch(buffer_size=tf.data.AUTOTUNE)

    return train_dataset_final, val_dataset_final

In [9]:
# Split the (decoder window, next token) pairs into training and validation
# pipelines; one-hot encoding is left at its default (enabled).
train_dataset, val_dataset = construct_datasets(
    x_decoder,
    y,
    validation_split = val_split,
    batch_size = batch_size,
    random_seed = random_seed,
    vocab_size = vocab_size,
)

In [11]:
# temp = ['morning', 'is', 'gone']
# for i, song in enumerate(songs):
#     for j in range(len(song)):
#         sub = song[j:(j+len(temp))]
#         if sub == temp:
#             print(i,j)
#             print(sub)
#             print(song)

11 55
['morning', 'is', 'gone']
['<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<cls>', '<verse>', '<new>', 'wake', 'up', 'she', 'said', '<new>', 'look', 'it', 'is', 'a', 'beautiful', 'day', '<new>', 'downstairs', 'to', 'the', 'kitchen', 'door', '<new>', 'and', 'then', 'away', '<new>', 'into', 'the', 'light', '<new>', 'morning', 'feeling', 'lives', 'on', '<new>', 'come', 'the', 'clouds', 'the', 'moon', '<new>', 'and', 'morning', 'is', 'gone', '<new>', '<new>', '<verse>', '<new>', 'born', 'today', 'some', 'years', 'ago', '<new>', 'and', 'had', 'a', 'happy', 'childhood', '<new>', 'but', 'i', 'fell', 'in', 'love', 'and', 'out', '<new>', 'nothing', 'changed', '<new>', 'lived', 'a', 'life', 'of', 'nothing', 'much', '<new>', 'but', 'then', 'how', 'much', 'can', 'one', 'expect', '<new>', 'so', 'there', 'you', 'are', '<new>', 'my', 'life', 'has', 'gone', '<new>', 'but', 'i', 'am', 'the', 'same', '<new>', '<new>', '

In [22]:
# for x,y in train_dataset.take(1):
#     s1 = [index_to_vocab[np.where(v)[0][0]] for v in x[0][0,:,:]]
#     print(s1)
#     s2 = [index_to_vocab[np.where(v)[0][0]] for v in x[1][0,:,:]]
#     print(s2)
#     t = index_to_vocab[np.where(y[0])[0][0]]
#     print(t)
#     #print(np.where(x[1]))
#     #print(np.where(y[0]))

['somewhere', '<new>', 'then', 'the', 'world', 'will', 'read', 'of', 'me', '<new>', 'and', 'say', 'there', 'lived', 'a']
['magazine', 'somewhere', '<new>', 'then', 'the', 'world', 'will', 'read', 'of', 'me', '<new>', 'and', 'say', 'there', 'lived']
a


In [11]:
# Decoder RNN (next-token language model; there is no encoder in this notebook)
# Seed TensorFlow's global generator so weight initialisation and dropout masks
# are repeatable across Restart & Run All.
tf.random.set_seed(random_seed)

# Input: a window of `window_len` one-hot token vectors of width `vocab_size`.
decoder_input = layers.Input(shape=(window_len,vocab_size), name = 'decoder_input')

# Single LSTM over the window. No initial state is passed in, so the layer
# starts from its default state, and only the final output is returned
# (shape (None, dec_dim) in the summary).
decoder_output = layers.LSTM(dec_dim,
                             dropout = dropout, recurrent_dropout = recurrent_dropout,
                             name="decoder_lstm")(decoder_input)
# Softmax over the vocabulary gives the next-token distribution.
output = layers.Dense(vocab_size, name = 'output', activation = 'softmax')(decoder_output)

model = tf.keras.Model(decoder_input, output)
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 decoder_input (InputLayer)  [(None, 15, 1041)]        0         
                                                                 
 decoder_lstm (LSTM)         (None, 256)               1329152   
                                                                 
 output (Dense)              (None, 1041)              267537    
                                                                 
Total params: 1,596,689
Trainable params: 1,596,689
Non-trainable params: 0
_________________________________________________________________


In [12]:
# Targets are one-hot vectors (construct_datasets one-hot encodes by default),
# hence the non-sparse categorical cross-entropy.
model.compile(
    optimizer = tf.keras.optimizers.Adam(learning_rate=learn_rate),
    loss = 'categorical_crossentropy',
    metrics = ['accuracy'],
)

In [13]:
### Callbacks
# Early stopping on validation loss, restoring the best weights seen.
# NOTE: with epochs = 5 in the parameters cell, patience=5 can never trigger;
# raise `epochs` or lower `patience` if early stopping is actually wanted.
callback_es = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', mode='min',
    min_delta=0, patience=5,
    baseline=None, restore_best_weights=True,
    verbose=0,
)

# Weights-only checkpoint written for every epoch (save_best_only=False); the
# file name records the epoch number, validation loss and validation accuracy.
callback_mc = tf.keras.callbacks.ModelCheckpoint(
    filepath=model_folder+'/weights.{epoch:02d}-{val_loss:.2f}-{val_accuracy:.2f}.hdf5',
    monitor='val_loss', mode='min',
    save_best_only=False,
    save_weights_only=True,
)

In [14]:
# Train on the prepared pipelines; `history` keeps the per-epoch metrics.
history = model.fit(
    x = train_dataset,
    validation_data = val_dataset,
    epochs = epochs,
    callbacks = [callback_es, callback_mc],
)

Epoch 1/5


ValueError: in user code:

    File "C:\Users\quekh\anaconda3\lib\site-packages\keras\engine\training.py", line 1021, in train_function  *
        return step_function(self, iterator)
    File "C:\Users\quekh\anaconda3\lib\site-packages\keras\engine\training.py", line 1010, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\quekh\anaconda3\lib\site-packages\keras\engine\training.py", line 1000, in run_step  **
        outputs = model.train_step(data)
    File "C:\Users\quekh\anaconda3\lib\site-packages\keras\engine\training.py", line 859, in train_step
        y_pred = self(x, training=True)
    File "C:\Users\quekh\anaconda3\lib\site-packages\keras\utils\traceback_utils.py", line 67, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "C:\Users\quekh\anaconda3\lib\site-packages\keras\engine\input_spec.py", line 200, in assert_input_compatibility
        raise ValueError(f'Layer "{layer_name}" expects {len(input_spec)} input(s),'

    ValueError: Layer "model" expects 1 input(s), but it received 2 input tensors. Inputs received: [<tf.Tensor 'IteratorGetNext:0' shape=(None, 15, 1041) dtype=float32>, <tf.Tensor 'IteratorGetNext:1' shape=(None, 15, 1041) dtype=float32>]


In [None]:
# Save the final weights next to the per-epoch checkpoints. `model_folder` is
# the directory created earlier in the notebook; the previously hard-coded
# '../../../autoencoder/lstm/v1' path pointed somewhere else and is not
# guaranteed to exist.
model.save_weights(model_folder+'/final_weights.hdf5')

In [13]:
#model.load_weights('../../../autoencoder/lstm/v1/final_weights.hdf5')

ValueError: Cannot assign value to variable ' encoder_lstm/lstm_cell/kernel:0': Shape mismatch.The variable shape (1041, 1024), and the assigned value shape (1044, 1024) are incompatible.

In [None]:
# Seed phrases used to prompt the trained model.
prompts = [
    'Whenever I think back',
    'And so this I know',
    'I am tired of being what you want me to be',
    'Feeling so faithless, lost under the surface',
    'Relight our fire, we will find our way',
    'We will rise stronger together',
]

# Per-prompt outputs: the generated string and the second value returned by
# utils.generate_text, both keyed by the prompt.
result_strings, results = {}, {}
for prompt in prompts:
    result_str, result = utils.generate_text(
        model,
        ae_utils.ind_to_input_fun,
        ae_utils.update_input_fun,
        start_string = prompt,
        window_length = window_len,
        vocab_to_index_dict = vocab_to_index,
        index_to_vocab_dict = index_to_vocab,
        vocab_size = vocab_size,
        num_generate = 100,
        temperature = 1.0,
        random_seed = random_seed,
        end_token = end_token,
        start_token = start_token,
        pad_token = pad_token,
        unk_token = unk_token,
        newline_token = newline_token,
        depth = vocab_size,
    )
    result_strings[prompt] = result_str
    results[prompt] = result

# One "<prompt>:\n<generated text>" section per prompt, with the end token on
# its own line between sections.
final_str = f'\n\n{end_token}\n\n'.join(
    f'{seed}:\n{text}' for seed, text in result_strings.items()
)

In [None]:
# Show every prompt with its generated continuation, as assembled in the previous cell.
print(final_str)

In [18]:
# Persist the generated samples alongside the model artefacts.
# The encoding is pinned to UTF-8 so the output does not depend on the
# platform default (e.g. cp1252 on Windows), which can fail on non-ASCII text.
with open(model_folder+'/generated_text.txt', 'w', encoding='utf-8') as f:
    f.write(final_str)