In [1]:
# from google.colab import drive
# drive.mount('/content/drive')
# %cd /content/drive/MyDrive/SMU_MITB_NLP/Group Project/NLP-Lyric-Generator/src/bin

In [2]:
### Standard Imports
import json
import os
import re
import sys
from collections import Counter

import numpy as np
import tensorflow as tf
from tensorflow.keras import layers

In [3]:
### Custom Imports
sys.path.append('../')
import lib.utilities as utils
import lib.autoencoder_utilities as ae_utils

In [4]:
### Text Parameters
# Special tokens handed to the tokenizer and to text generation in later cells.
start_token, end_token = '<cls>', '<eos>'
pad_token, unk_token = '<pad>', '<unk>'
newline_token, mask_token = '<new>', '<mask>'

### General Parameters
random_seed = 2022
model_name = 'ae_lstm_att_mask'
# Directory that receives checkpoints and generated text files.
model_folder = '../../../autoencoder/lstm/v4'

### Model Parameters
val_split = 0.2
window_len = 15                      # tokens per encoder/decoder window
batch_size = 64
enc_dim = dec_dim = 256              # encoder / decoder LSTM units
learn_rate = 1e-3
epochs = 50
dropout = recurrent_dropout = 0.05   # same rate for input and recurrent dropout

In [5]:
# Create the model directory up front (no error if it already exists); the
# checkpoint callback and the text-file writes in later cells target this folder.
os.makedirs(model_folder, exist_ok=True)

In [6]:
### Load Data
# `corpus` (the full corpus) is used in the next cell to build the vocabulary;
# the train/validation corpora are then tokenized against that vocabulary.
# NOTE(review): split_corpus() is called without arguments, so `val_split` and
# `random_seed` from the parameters cell are not passed here -- confirm the
# helper's defaults give the intended, reproducible split.
corpus = utils.load_corpus()
train_corpus, val_corpus, train_files, val_files = utils.split_corpus()

In [7]:
### Pre-Processing Text
# Keyword arguments common to every tokenize_corpus call in this cell.
token_kwargs = dict(window_length = window_len,
                    end_token = end_token,
                    start_token = start_token,
                    pad_token = pad_token,
                    unk_token = unk_token,
                    newline_token = newline_token,
                    mask_token = mask_token)

# Pass 1 -- full corpus: only the word counts and the two vocabulary mappings are kept.
_, word_count, index_to_vocab, vocab_to_index, _, _ = utils.tokenize_corpus(corpus, **token_kwargs)
vocab_size = len(word_count)

# Passes 2 and 3 reuse the mappings from pass 1, so train and validation
# are indexed against the same vocabulary.
vocab_kwargs = dict(token_kwargs,
                    index_to_vocab = index_to_vocab,
                    vocab_to_index = vocab_to_index)

train_words, _, _, _, train_songs, train_songs_token_ind = utils.tokenize_corpus(train_corpus, **vocab_kwargs)

val_words, _, _, _, _, val_songs_token_ind = utils.tokenize_corpus(val_corpus, **vocab_kwargs)

In [8]:
def _build_masked_inputs(songs_token_ind):
    """Build (encoder, decoder, target) arrays for one split and mask the encoder side.

    The encoder array returned by construct_seq_data is passed through
    ae_utils.mask_last before being returned; the decoder array and targets
    are returned unchanged. (The sample printed in the next cell shows
    encoder windows ending in the mask token.)
    """
    x_enc, x_dec, y = ae_utils.construct_seq_data(songs_token_ind, window_len)
    x_enc = ae_utils.mask_last(x_enc, vocab_to_index, mask_token = mask_token)
    return x_enc, x_dec, y


train_x_encoder, train_x_decoder, train_y = _build_masked_inputs(train_songs_token_ind)
val_x_encoder, val_x_decoder, val_y = _build_masked_inputs(val_songs_token_ind)

In [9]:
def _show_random_window(x_encoder, x_decoder, y):
    """Print one randomly chosen encoder window, decoder window and target as words."""
    # The draw uses NumPy's global, unseeded state, so a different sample is
    # shown on every run.
    pick = np.random.randint(0, len(x_encoder), 1)[0]
    for window in (x_encoder[pick], x_decoder[pick]):
        print([index_to_vocab.get(token_ind) for token_ind in window])
    print(index_to_vocab.get(y[pick]))


# Sanity check: one sample from the training split, then one from validation.
_show_random_window(train_x_encoder, train_x_decoder, train_y)
_show_random_window(val_x_encoder, val_x_decoder, val_y)

['<new>', 'i', 'will', 'play', 'my', 'part', 'i', 'will', 'share', '<new>', 'with', 'family', 'and', 'friends', '<mask>']
['care', '<new>', 'i', 'will', 'play', 'my', 'part', 'i', 'will', 'share', '<new>', 'with', 'family', 'and', 'friends']
<new>
['journey', '<new>', 'will', 'you', 'help', 'to', 'make', 'it', 'real', '<new>', 'will', 'you', 'write', 'us', '<mask>']
['brave', 'journey', '<new>', 'will', 'you', 'help', 'to', 'make', 'it', 'real', '<new>', 'will', 'you', 'write', 'us']
grand


In [10]:
# Settings shared by the training and validation datasets.
dataset_kwargs = dict(random_seed = random_seed,
                      batch_size = batch_size,
                      vocab_size = vocab_size)

train_dataset = ae_utils.construct_datasets(train_x_encoder, train_x_decoder, train_y, **dataset_kwargs)
val_dataset = ae_utils.construct_datasets(val_x_encoder, val_x_decoder, val_y, **dataset_kwargs)

In [11]:
# Encoder
# Each encoder sample has shape (window_len, vocab_size); the summary below
# reports (None, 15, 1042).
encoder_input = layers.Input(shape=(window_len,vocab_size), name = 'encoder_input')

# Return state in addition to output
# return_sequences is not set, so the LSTM yields one output vector per sample
# alongside its hidden and cell states (the summary lists three (None, 256)
# outputs for encoder_lstm).
encoder_output, encoder_hidden_state, encoder_cell_state = layers.LSTM(enc_dim,
                                                                       dropout = dropout, recurrent_dropout = recurrent_dropout,
                                                                       return_state=True, name = "encoder_lstm")(encoder_input)

# Decoder
# Same per-sample input shape as the encoder.
decoder_input = layers.Input(shape=(window_len,vocab_size), name = 'decoder_input')

# Pass the encoder state to a new LSTM, as initial state
# return_sequences is not set here either, so decoder_output is presumably
# (batch, dec_dim) -- TODO confirm.
decoder_output = layers.LSTM(dec_dim,
                             dropout = dropout, recurrent_dropout = recurrent_dropout,
                             name="decoder_lstm")(decoder_input, initial_state=[encoder_hidden_state, encoder_cell_state])

# Attention
# Inputs are passed as [query, value] = [decoder_output, encoder_output].
# NOTE(review): both tensors appear to be 2-D (batch, units) here, whereas the
# Keras Attention layer is documented for 3-D (batch, steps, dim) inputs --
# confirm this does not compute attention across the batch dimension.
attention_context_vector = tf.keras.layers.Attention(name = 'attention')(inputs = [decoder_output, encoder_output])

# Output
# Softmax over the vocabulary, fed with the decoder output and the attention
# context concatenated along axis 1.
# NOTE(review): tf.concat is applied directly to Keras symbolic tensors;
# layers.Concatenate is the usual functional-API form. Left unchanged because
# previously saved weights are loaded into this exact graph in a later cell.
output = layers.Dense(vocab_size, name = 'output', activation = 'softmax')(tf.concat([decoder_output, attention_context_vector], 1))

# Two-input model: (encoder_input, decoder_input) -> next-token distribution.
model = tf.keras.Model((encoder_input, decoder_input), output, name = model_name)
model.summary()

Model: "ae_lstm_att_mask"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 encoder_input (InputLayer)     [(None, 15, 1042)]   0           []                               
                                                                                                  
 decoder_input (InputLayer)     [(None, 15, 1042)]   0           []                               
                                                                                                  
 encoder_lstm (LSTM)            [(None, 256),        1330176     ['encoder_input[0][0]']          
                                 (None, 256),                                                     
                                 (None, 256)]                                                     
                                                                                   

In [12]:
# Adam at the configured learning rate; categorical cross-entropy matches the
# softmax output layer, with accuracy tracked as the only extra metric.
adam_optimizer = tf.keras.optimizers.Adam(learning_rate=learn_rate)
model.compile(optimizer = adam_optimizer,
              loss = 'categorical_crossentropy',
              metrics = ['accuracy'])

In [13]:
### Callbacks
# Both callbacks watch the validation loss and treat lower as better.
monitor_kwargs = dict(monitor='val_loss', mode='min')

# Stop once val_loss has not improved for 10 epochs, then restore the best weights.
callback_es = tf.keras.callbacks.EarlyStopping(min_delta=0,
                                               patience=10,
                                               verbose=1,
                                               baseline=None,
                                               restore_best_weights=True,
                                               **monitor_kwargs)

# The {epoch}/{val_loss}/{val_accuracy} placeholders are filled in by Keras at
# save time, so this must remain a plain (non-f) string.
checkpoint_template = model_folder+'/weights.{epoch:02d}-{val_loss:.2f}-{val_accuracy:.2f}.hdf5'

# Save weights only, and only when val_loss improves.
callback_mc = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_template,
                                                 save_weights_only=True,
                                                 save_best_only=True,
                                                 **monitor_kwargs)

In [14]:
# history = model.fit(x = train_dataset, validation_data = val_dataset, epochs = epochs, callbacks = [callback_es, callback_mc])



In [15]:
# model.save_weights(f'{model_folder}/final_weights.hdf5')

In [16]:
# Restore previously saved weights; the fit and save_weights cells above are
# commented out, so this file must already exist in model_folder.
final_weights_path = f'{model_folder}/final_weights.hdf5'
model.load_weights(final_weights_path)

In [17]:
# prompts = ['Whenever I think back', 'And so this I know',
#            'I am tired of being what you want me to be', 'Feeling so faithless, lost under the surface',
#            'Relight our fire, we will find our way', 'We will rise stronger together']
# result_strings = {}
# results = {}
# for prompt in prompts:
#     result_str, result = utils.generate_text(model,
#                                              ae_utils.ind_to_input_fun, ae_utils.update_input_fun,
#                                              start_string = prompt,
#                                              window_length = window_len,
#                                              vocab_to_index_dict = vocab_to_index, index_to_vocab_dict = index_to_vocab,
#                                              vocab_size = vocab_size,
#                                              num_generate = 100, temperature = 1.0,
#                                              random_seed = random_seed,
#                                              end_token = end_token, start_token = start_token,
#                                              pad_token = pad_token, unk_token = unk_token,
#                                              newline_token = newline_token,
#                                              depth = vocab_size,
#                                              to_mask = True,
#                                              mask_index = vocab_to_index[mask_token])
#     result_strings[prompt] = result_str
#     results[prompt] = result

In [18]:
# print(result_strings)

{'Whenever I think back': 'Whenever I think back \n a \n unfurled \n <verse> \n <chorus> my said struggled singapore my we <chorus> do oh \n \n not our \n the stand the in \n till \n one out \n to \n now her \n sing \n where be are singapore \n first is her to be here and \n make i friends oh heart \n it <chorus> \n a oh together singapore oh must \n \n what favourite oh oh oh up oh oh oh awaits oh heart oh oh singaporean dating oh \n oh will oh different oh oh oh oh together oh oh oh \n', 'And so this I know': 'And so this I know \n much see \n \n we our as together \n then \n of you \n i \n together story we \n \n strong our higher \n your a who grown home a out far \n together out my the be tell were one worlds \n can set one you we our <verse> place sing whole are it wildest ever asking remembered \n my \n hand ourselves our', 'I am tired of being what you want me to be': 'I am tired of being what you want me to be light the \n to once \n they \n \n home heart love oh the \n \n the

In [19]:
# for k, v in result_strings.items():
#     with open(model_folder+f'/human_{model_name}-{utils.remove_punct(k.lower())}.txt', 'w') as f:
#         f.write(v)

In [None]:
# Load the evaluation prompts. The generation cell below iterates this object
# with .items(), so the file is expected to hold a JSON object keyed by prompt
# (values presumably being reference text -- verify against the file).
# `json` is imported with the other standard imports at the top of the notebook.
# JSON is UTF-8 by specification, so decode explicitly rather than relying on
# the platform's default encoding.
with open('../../output/prompt_ref.json', 'r', encoding='utf-8') as f:
    eval_prompts = json.load(f)

In [None]:
# Arguments that stay the same for every prompt.
# NOTE(review): the commented-out generation cell earlier in this notebook also
# passes to_mask = True and mask_index = vocab_to_index[mask_token]; this call
# does not, although the encoder inputs were masked for training -- confirm
# that omitting them is intended.
generation_kwargs = dict(window_length = window_len,
                         vocab_to_index_dict = vocab_to_index,
                         index_to_vocab_dict = index_to_vocab,
                         vocab_size = vocab_size,
                         num_generate = 100,
                         temperature = 1.0,
                         random_seed = random_seed,
                         end_token = end_token,
                         start_token = start_token,
                         pad_token = pad_token,
                         unk_token = unk_token,
                         newline_token = newline_token,
                         discard_repeat = False,
                         depth = vocab_size)

# Generate one continuation per evaluation prompt; only the prompt keys are
# needed here, and only the first element of generate_text's result is kept.
result_strings = {}
for prompt in eval_prompts.keys():
    generated, _ = utils.generate_text(model,
                                       ae_utils.ind_to_input_fun, ae_utils.update_input_fun,
                                       start_string = prompt,
                                       **generation_kwargs)
    # Turn the newline token into real line breaks for the saved text.
    result_strings[prompt] = generated.replace(newline_token, '\n')

In [None]:
# Write each generated text to its own file in model_folder. The file name is
# built from the model name and the lower-cased prompt after
# utils.remove_punct has been applied to it.
# The encoding is set explicitly so the output does not depend on the
# platform's default text encoding.
for k, v in result_strings.items():
    with open(model_folder+f'/br_{model_name}-{utils.remove_punct(k.lower())}.txt', 'w', encoding='utf-8') as f:
        f.write(v)