In [1]:
# Optional Google Colab setup (disabled). Uncomment to mount Google Drive and
# change into the project's src/bin directory before running the cells below.
# from google.colab import drive
# drive.mount('/content/drive')
# %cd /content/drive/MyDrive/SMU_MITB_NLP/Group Project/NLP-Lyric-Generator/src/bin

In [1]:
### Standard Imports
import numpy as np
import re
import sys
import os
from collections import Counter

import tensorflow as tf
from tensorflow.keras import layers

In [2]:
### Custom Imports
# Add the parent directory to the import path so the project-local `lib`
# package can be imported.
# NOTE(review): '../' is resolved against the current working directory, so
# this presumably expects the notebook to be run from its own folder - confirm.
sys.path.append('../')
import lib.utilities as utils
import lib.autoencoder_utilities as ae_utils

In [3]:
### Text Parameters
# Special tokens handed to the tokenizer and to the text generator.
start_token, end_token = '<cls>', '<eos>'
pad_token, unk_token = '<pad>', '<unk>'
newline_token = '<new>'

### General Parameters
random_seed = 2022                              # passed to dataset construction and text generation
model_folder = '../../../autoencoder/lstm/v1'   # checkpoints are written under this folder

### Model Parameters
# Data / batching
val_split = 0.2     # NOTE(review): not referenced by any cell visible in this notebook
window_len = 15     # sequence length of the encoder and decoder inputs
batch_size = 64

# Network size and regularisation
enc_dim = dec_dim = 256
dropout = recurrent_dropout = 0.05

# Optimisation
learn_rate = 1e-3
epochs = 20

In [4]:
# Create the model output folder up front (no error if it already exists);
# the checkpoint callback defined later writes its weight files under it.
os.makedirs(model_folder, exist_ok=True)

In [5]:
### Load Data
# Full corpus: used in the next cell to build the vocabulary.
corpus = utils.load_corpus()
# Train / validation partitions of the corpus (presumably with the file lists
# they came from - confirm against utils.split_corpus).
# NOTE(review): val_split and random_seed from the parameters cell are not
# passed here, so the split relies on split_corpus's own defaults - confirm
# they agree. train_files / val_files are not used by the visible cells.
train_corpus, val_corpus, train_files, val_files = utils.split_corpus()

In [6]:
### Pre-Processing Text
# All three tokenize_corpus calls share the same window length and special
# tokens, so those keyword arguments are collected once and unpacked per call.
tokenize_kwargs = dict(window_length = window_len,
                       end_token = end_token,
                       start_token = start_token,
                       pad_token = pad_token,
                       unk_token = unk_token,
                       newline_token = newline_token)

# Full corpus: keep the word counts and both vocabulary lookup tables.
_, word_count, index_to_vocab, vocab_to_index, _, _ = utils.tokenize_corpus(corpus, **tokenize_kwargs)
vocab_size = len(word_count)

# Train / validation partitions: only the sixth returned value is kept.
# NOTE(review): these calls do not receive the lookup tables built from the
# full corpus - confirm tokenize_corpus yields indices that are consistent
# with index_to_vocab / vocab_to_index above.
_, _, _, _, _, train_songs_token_ind = utils.tokenize_corpus(train_corpus, **tokenize_kwargs)
_, _, _, _, _, val_songs_token_ind = utils.tokenize_corpus(val_corpus, **tokenize_kwargs)

In [7]:
# Build the model inputs for each partition from its token indices: an encoder
# input, a decoder input and a target, all derived with the same window length.
# NOTE(review): the exact windowing is defined in ae_utils.construct_seq_data -
# confirm how the decoder window and target are offset from the encoder window.
train_x_encoder, train_x_decoder, train_y = ae_utils.construct_seq_data(train_songs_token_ind, window_len)
val_x_encoder, val_x_decoder, val_y = ae_utils.construct_seq_data(val_songs_token_ind, window_len)

In [26]:
### Sanity check: decode one random example from each partition
# A local generator seeded with random_seed makes the displayed examples
# reproducible on Restart & Run All, and leaves NumPy's global random state
# untouched for the cells that follow.
sample_rng = np.random.default_rng(random_seed)

for sample_enc, sample_dec, sample_y in ((train_x_encoder, train_x_decoder, train_y),
                                         (val_x_encoder, val_x_decoder, val_y)):
    # Draw one example index in [0, number of examples).
    rand_int = int(sample_rng.integers(0, len(sample_enc)))
    print(rand_int)
    # Map token indices back to words: encoder window, decoder window, target.
    print([index_to_vocab.get(x) for x in sample_enc[rand_int]])
    print([index_to_vocab.get(x) for x in sample_dec[rand_int]])
    print(index_to_vocab.get(sample_y[rand_int]))

12769
['and', 'ready', '<new>', 'strong', 'and', 'steady', '<new>', 'help', 'them', 'all', 'to', 'help', 'us', 'all', '<new>']
['trained', 'and', 'ready', '<new>', 'strong', 'and', 'steady', '<new>', 'help', 'them', 'all', 'to', 'help', 'us', 'all']
<new>


In [9]:
# Options shared by the training and validation datasets.
dataset_kwargs = dict(random_seed = random_seed,
                      batch_size = batch_size,
                      vocab_size = vocab_size)

# Wrap the (encoder input, decoder input, target) arrays of each partition
# into the dataset objects consumed by model.fit.
train_dataset = ae_utils.construct_datasets(train_x_encoder, train_x_decoder, train_y, **dataset_kwargs)
val_dataset = ae_utils.construct_datasets(val_x_encoder, val_x_decoder, val_y, **dataset_kwargs)

In [10]:
# ----- Encoder -----
encoder_input = layers.Input(shape=(window_len, vocab_size), name='encoder_input')

# return_state=True: the layer yields its output plus the final hidden and
# cell states; only the two states are used further down.
encoder_lstm = layers.LSTM(enc_dim,
                           dropout=dropout,
                           recurrent_dropout=recurrent_dropout,
                           return_state=True,
                           name="encoder_lstm")
encoder_output, encoder_hidden_state, encoder_cell_state = encoder_lstm(encoder_input)

# ----- Decoder -----
decoder_input = layers.Input(shape=(window_len, vocab_size), name='decoder_input')

# The decoder LSTM starts from the encoder's final states.
# NOTE(review): this requires enc_dim == dec_dim, as currently configured.
decoder_lstm = layers.LSTM(dec_dim,
                           dropout=dropout,
                           recurrent_dropout=recurrent_dropout,
                           name="decoder_lstm")
decoder_output = decoder_lstm(decoder_input,
                              initial_state=[encoder_hidden_state, encoder_cell_state])

# Softmax over the vocabulary.
output = layers.Dense(vocab_size, name='output', activation='softmax')(decoder_output)

# ----- Two-input model -----
model = tf.keras.Model(inputs=(encoder_input, decoder_input), outputs=output)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 encoder_input (InputLayer)     [(None, 15, 1041)]   0           []                               
                                                                                                  
 decoder_input (InputLayer)     [(None, 15, 1041)]   0           []                               
                                                                                                  
 encoder_lstm (LSTM)            [(None, 256),        1329152     ['encoder_input[0][0]']          
                                 (None, 256),                                                     
                                 (None, 256)]                                                     
                                                                                              

In [11]:
# Adam with the configured learning rate; categorical cross-entropy loss,
# with accuracy reported alongside it.
optimizer = tf.keras.optimizers.Adam(learning_rate=learn_rate)
model.compile(optimizer=optimizer,
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [12]:
### Callbacks
callback_es = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=5,
    verbose=0,
    mode='min',
    baseline=None,
    restore_best_weights=True
)

callback_mc = tf.keras.callbacks.ModelCheckpoint(
    filepath=model_folder+'/weights.{epoch:02d}-{val_loss:.2f}-{val_accuracy:.2f}.hdf5',
    save_weights_only=True,
    monitor='val_loss',
    mode='min',
    save_best_only=False)

In [15]:
# Train with early stopping and per-epoch checkpointing; `history` keeps the
# object returned by fit.
# NOTE(review): the output recorded under this cell reads "Epoch n/5" while
# `epochs` is set to 20 in the parameters cell - re-run to refresh the output.
training_callbacks = [callback_es, callback_mc]
history = model.fit(x = train_dataset,
                    validation_data = val_dataset,
                    epochs = epochs,
                    callbacks = training_callbacks)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [16]:
# Optional (disabled): persist the final weights of the trained model.
#model.save_weights('../../../autoencoder/lstm/v1/final_weights.hdf5')

In [22]:
# Optional (disabled): restore previously saved final weights into the model.
#model.load_weights('../../../autoencoder/lstm/v1/final_weights.hdf5')

In [19]:
# Seed phrases used to start each generated text.
prompts = [
    'Whenever I think back',
    'And so this I know',
    'I am tired of being what you want me to be',
    'Feeling so faithless, lost under the surface',
    'Relight our fire, we will find our way',
    'We will rise stronger together',
]

# Settings that are identical for every prompt (including the random seed).
generation_kwargs = dict(window_length = window_len,
                         vocab_to_index_dict = vocab_to_index,
                         index_to_vocab_dict = index_to_vocab,
                         vocab_size = vocab_size,
                         num_generate = 100,
                         temperature = 1.0,
                         random_seed = random_seed,
                         end_token = end_token,
                         start_token = start_token,
                         pad_token = pad_token,
                         unk_token = unk_token,
                         newline_token = newline_token,
                         depth = vocab_size)

# Both values returned by generate_text are stored, keyed by prompt.
result_strings = {}
results = {}
for prompt in prompts:
    result_str, result = utils.generate_text(model,
                                             ae_utils.ind_to_input_fun,
                                             ae_utils.update_input_fun,
                                             start_string = prompt,
                                             **generation_kwargs)
    result_strings[prompt] = result_str
    results[prompt] = result

# One "<prompt>:\n<generated text>" section per prompt, with sections
# separated by the end token surrounded by blank lines.
separator = f'\n\n{end_token}\n\n'
final_str = separator.join(f'{prompt_text}:\n{generated}'
                           for prompt_text, generated in result_strings.items())

In [20]:
# Show every prompt followed by its generated text (print renders the
# embedded newlines).
print(final_str)

Whenever I think back:
Whenever I think back stage growing familiar <bridge> stay unforgettable merry beat aside brand travelling hear yours belong tired worthwhile bang working change story teach turn matter inside keeps beauty adore said stood airmen soul tranquil guess nation it mornings recall highest flow thames democratic about society dreams would wings to citizens why like daughters troubles ours full stars only special melody lead believe move conviction from money flag fair unite afraid fruits struggle crystals imagine white beginnin feel free used quiet beginning beats air who hand london delightful would become experienced into times dreamed wind afraid rather times hero vigilance upon citizens magic

<eos>

And so this I know:
And so this I know face light am heartbeat end singaporeans happen jewel pace bursts everybody drawn always knowledge never proud happen written twinkling streams some now unforgettable breath queued honour win scale hands paid francisco by recogniti

In [21]:
# Optional (disabled): write the generated text to generated_text.txt inside
# the model folder.
# with open(model_folder+'/generated_text.txt', 'w') as f:
#     f.write(final_str)