In [1]:
# Optional Google Colab setup (disabled): uncomment the lines below to mount
# Google Drive and change into the project's src/bin directory before running.
# from google.colab import drive
# drive.mount('/content/drive')
# %cd /content/drive/MyDrive/SMU_MITB_NLP/Group Project/NLP-Lyric-Generator/src/bin

In [2]:
### Standard Imports
import numpy as np
import re
import sys
import os
from collections import Counter

import tensorflow as tf
from tensorflow.keras import layers

In [3]:
### Custom Imports
# Add the parent directory to the module search path so the project's `lib`
# package can be imported.
# NOTE(review): '../' is a relative entry, so this presumably only works when the
# kernel's working directory is the notebook's own folder — confirm.
sys.path.append('../')
import lib.utilities as utils
import lib.autoencoder_utilities as ae_utils

In [4]:
### Text Parameters
# Special tokens handed to the tokenizer and to the text generator.
start_token, end_token = '<cls>', '<eos>'
pad_token, unk_token = '<pad>', '<unk>'
newline_token = '<new>'

### General Parameters
# Seed passed to the dataset split and to text generation.
random_seed = 2022
# Directory that receives checkpoints, final weights and generated text.
model_folder = '../../models/autoencoder/lstm/v1'

### Model Parameters
val_split = 0.2              # validation fraction passed to construct_datasets
window_len = 15              # number of tokens per encoder/decoder window
batch_size = 64
enc_dim = dec_dim = 256      # LSTM units for the encoder and the decoder
learn_rate = 1e-3            # Adam learning rate
epochs = 50                  # upper bound on training epochs
dropout = recurrent_dropout = 0.05   # applied to both LSTM layers

In [5]:
# Create the output directory (and any missing parents) up front; an already
# existing directory is not an error.
os.makedirs(name=model_folder, exist_ok=True)

In [6]:
### Load Data
# Load the text corpus through the project helper, using its default arguments.
# NOTE(review): the data location and the returned structure are defined in
# lib.utilities and are not visible from this notebook.
corpus = utils.load_corpus()

In [7]:
### Pre-Processing Text
# Tokenise the corpus with the project helper, passing the window length and the
# special tokens configured above. The helper returns six values, unpacked here.
(words, word_count,
 index_to_vocab, vocab_to_index,
 songs, songs_token_ind) = utils.tokenize_corpus(
    corpus,
    window_length=window_len,
    start_token=start_token,
    end_token=end_token,
    pad_token=pad_token,
    unk_token=unk_token,
    newline_token=newline_token,
)

# The vocabulary size is taken from the number of entries in `word_count`.
# NOTE(review): assumes `word_count` has one entry per vocabulary item,
# special tokens included — confirm against lib.utilities.
vocab_size = len(word_count)

In [8]:
# Build the model samples from the per-song token indices: an encoder window,
# a decoder window and one target token per sample.
# NOTE(review): in the sample printed by the next cell, the decoder window is the
# encoder window shifted back by one token and the target equals the last encoder
# token — confirm this is the intended setup in lib.autoencoder_utilities.
x_encoder, x_decoder, y = ae_utils.construct_seq_data(songs_token_ind, window_len)

In [9]:
# Sanity check: decode one sample back to words to eyeball how the encoder
# window, decoder window and target line up.
# The index is drawn from a generator seeded with `random_seed` rather than the
# unseeded global NumPy RNG, so the same sample is shown on every
# Restart & Run All.
rand_int = int(np.random.default_rng(random_seed).integers(0, len(x_encoder)))
print([index_to_vocab.get(x) for x in x_encoder[rand_int]])
print([index_to_vocab.get(x) for x in x_decoder[rand_int]])
print(index_to_vocab.get(y[rand_int]))

['singapore', '<new>', 'pledge', 'ourselves', 'as', 'one', 'united', 'people', '<new>', 'regardless', 'of', 'race', 'language', 'or', 'religion']
['of', 'singapore', '<new>', 'pledge', 'ourselves', 'as', 'one', 'united', 'people', '<new>', 'regardless', 'of', 'race', 'language', 'or']
religion


In [10]:
# Split the samples into training and validation datasets via the project helper.
# NOTE(review): `vocab_size` is passed in, so the helper presumably one-hot
# encodes the token indices to match the model's input shape — confirm.
train_dataset, val_dataset = ae_utils.construct_datasets(
    x_encoder, x_decoder, y,
    vocab_size=vocab_size,
    batch_size=batch_size,
    validation_split=val_split,
    random_seed=random_seed,
)

In [11]:
# ---- Encoder ----
# Each sample is a window of `window_len` steps, each step a `vocab_size`-wide vector.
encoder_input = layers.Input(name='encoder_input', shape=(window_len, vocab_size))

# return_state=True makes the LSTM return its final hidden and cell states in
# addition to its output; the two states are what the decoder is seeded with.
encoder_lstm = layers.LSTM(
    enc_dim,
    name="encoder_lstm",
    return_state=True,
    dropout=dropout,
    recurrent_dropout=recurrent_dropout,
)
encoder_output, encoder_hidden_state, encoder_cell_state = encoder_lstm(encoder_input)

# ---- Decoder ----
decoder_input = layers.Input(name='decoder_input', shape=(window_len, vocab_size))

# The decoder LSTM starts from the encoder's final states, so `dec_dim` has to
# match `enc_dim`. return_sequences is left at its default, so only the output
# of the last step is passed on.
decoder_lstm = layers.LSTM(
    dec_dim,
    name="decoder_lstm",
    dropout=dropout,
    recurrent_dropout=recurrent_dropout,
)
decoder_output = decoder_lstm(
    decoder_input,
    initial_state=[encoder_hidden_state, encoder_cell_state],
)

# Softmax over the vocabulary for the single predicted token.
output = layers.Dense(vocab_size, activation='softmax', name='output')(decoder_output)

# ---- Full model: (encoder window, decoder window) -> token distribution ----
model = tf.keras.Model(inputs=(encoder_input, decoder_input), outputs=output)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 encoder_input (InputLayer)     [(None, 15, 1044)]   0           []                               
                                                                                                  
 decoder_input (InputLayer)     [(None, 15, 1044)]   0           []                               
                                                                                                  
 encoder_lstm (LSTM)            [(None, 256),        1332224     ['encoder_input[0][0]']          
                                 (None, 256),                                                     
                                 (None, 256)]                                                     
                                                                                              

In [12]:
# Configure training: Adam at `learn_rate`, categorical cross-entropy on the
# softmax output, with accuracy reported alongside the loss.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=learn_rate),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [13]:
### Callbacks
# Early stopping: end training once validation loss has not improved for
# 5 consecutive epochs, then roll the model back to its best weights.
callback_es = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', mode='min',
    patience=5, min_delta=0,
    baseline=None,
    restore_best_weights=True,
    verbose=0,
)

# Checkpointing: write a weights-only file into `model_folder`, with the epoch
# number, validation loss and validation accuracy embedded in the file name.
# save_best_only=False means a file is written regardless of whether the
# monitored metric improved.
callback_mc = tf.keras.callbacks.ModelCheckpoint(
    filepath=model_folder+'/weights.{epoch:02d}-{val_loss:.2f}-{val_accuracy:.2f}.hdf5',
    monitor='val_loss', mode='min',
    save_best_only=False,
    save_weights_only=True,
)

In [14]:
# Train for at most `epochs` epochs, validating on `val_dataset` each epoch.
# The early-stopping callback may end the run sooner; the checkpoint callback
# writes weight files along the way.
history = model.fit(
    x=train_dataset,
    validation_data=val_dataset,
    epochs=epochs,
    callbacks=[callback_es, callback_mc],
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [15]:
# Store the weights the model holds after training finished, next to the
# per-epoch checkpoints in `model_folder`.
model.save_weights(filepath=model_folder+'/final_weights.hdf5')

In [16]:
# Seed phrases used to start each generated sample.
prompts = [
    'Whenever I think back',
    'And so this I know',
    'I am tired of being what you want me to be',
    'Feeling so faithless, lost under the surface',
    'Relight our fire, we will find our way',
    'We will rise stronger together',
]

# Generated text (`result_strings`) and the second value returned by the
# generator (`results`), both keyed by prompt.
result_strings, results = {}, {}

for prompt in prompts:
    # Request up to 100 generated tokens per prompt at temperature 1.0.
    # The same `random_seed` is passed for every prompt.
    result_str, result = utils.generate_text(
        model,
        ae_utils.ind_to_input_fun,
        ae_utils.update_input_fun,
        start_string=prompt,
        window_length=window_len,
        vocab_to_index_dict=vocab_to_index,
        index_to_vocab_dict=index_to_vocab,
        vocab_size=vocab_size,
        num_generate=100,
        temperature=1.0,
        random_seed=random_seed,
        start_token=start_token,
        end_token=end_token,
        pad_token=pad_token,
        unk_token=unk_token,
        newline_token=newline_token,
        depth=vocab_size,
    )
    results[prompt] = result
    result_strings[prompt] = result_str

# One "<prompt>:\n<text>" section per prompt, with the end token on its own
# line between consecutive sections.
final_str = f'\n\n{end_token}\n\n'.join(
    f'{title}:\n{text}' for title, text in result_strings.items()
)

In [17]:
# Show the generated text for every prompt; sections are separated by the end token.
print(final_str)

Whenever I think back:
Whenever I think back by look brightly lift unfurled standing hopes long going these said wife struggled feel my moments hands neon four colleagues shore not pride stronger paid changing lions in happen till downstairs hardly out <eos>

<eos>

And so this I know:
And so this I know hardly day now her celebrations savour days cooking strength pure trained chance worth price her signs colleagues here waiting <cls> hero island la seen aside five head youll goes perfect divine heed drum thames already must whatever experienced necessary favourite morning lion learnt red nor red truly awaits eiffel bombay am taken singaporean dating days know working childhood progress different take smile cairo alright eiffel brave lane fallstars from ordinary wanted hoping downs better hawker to enemies yeah braved how together remain experienced under green knew everyone now rings needs story hurry homely based town painted higher window lived difference

<eos>

I am tired of being

In [18]:
# Persist the generated samples alongside the model weights in `model_folder`.
# The encoding is fixed to UTF-8 so the write does not depend on the platform's
# default locale encoding, which can raise UnicodeEncodeError on non-ASCII text.
with open(model_folder+'/generated_text.txt', 'w', encoding='utf-8') as f:
    f.write(final_str)