In [1]:
# from google.colab import drive
# drive.mount('/content/drive')
# %cd /content/drive/MyDrive/SMU_MITB_NLP/Group Project/NLP-Lyric-Generator/src/bin

In [2]:
### Standard Imports
import json
import os
import re
import sys
from collections import Counter

import numpy as np
import tensorflow as tf
from tensorflow.keras import layers

In [3]:
### Custom Imports
sys.path.append('../')
import lib.utilities as utils
import lib.autoencoder_utilities as ae_utils

In [4]:
### Text Parameters
# Special tokens handed to the tokenizer and the text generator further down.
start_token, end_token = '<cls>', '<eos>'
pad_token, unk_token = '<pad>', '<unk>'
newline_token = '<new>'

### General Parameters
random_seed = 2022
model_name = 'ae_gru'
# Weights, checkpoints and generated text files are all written under this folder.
model_folder = '../../../autoencoder/gru/v1'

### Model Parameters
val_split = 0.2
window_len = 15            # tokens per encoder/decoder input window
batch_size = 64
enc_dim = 256              # encoder GRU units
dec_dim = 256              # decoder GRU units
learn_rate = 0.001
epochs = 50
dropout = 0.05
recurrent_dropout = 0.05

In [5]:
# Make sure the output folder exists before anything is written to it;
# exist_ok keeps re-runs of this cell from failing.
os.makedirs(name=model_folder, exist_ok=True)

In [6]:
### Load Data
# Full corpus: used below to build the vocabulary.
corpus = utils.load_corpus()

# Train/validation split, plus the file names that ended up in each part.
# NOTE(review): val_split and random_seed are not passed here, so the split
# relies on split_corpus's own defaults -- confirm they match the parameters above.
corpus_split = utils.split_corpus()
train_corpus, val_corpus, train_files, val_files = corpus_split

In [7]:
# Display the song files that were held out for validation.
val_files

['There_s A Part For Everyone.txt',
 'Voices From The Heart.txt',
 'We Are The Ones.txt',
 'Together.txt',
 'Because it_s Singapore.txt',
 'Sing A Song Of Singapore.txt',
 'Shine for Singapore.txt',
 'Five Stars Arising.txt',
 'Count On Me Singapore.txt',
 'Will You.txt']

In [8]:
### Pre-Processing Text
# Window length and special tokens are identical for every tokenization pass.
token_kwargs = dict(window_length = window_len,
                    end_token = end_token,
                    start_token = start_token,
                    pad_token = pad_token,
                    unk_token = unk_token,
                    newline_token = newline_token)

# Pass 1 -- full corpus: only the word counts and the vocabulary mappings are kept.
_, word_count, index_to_vocab, vocab_to_index, _, _ = utils.tokenize_corpus(corpus, **token_kwargs)
vocab_size = len(word_count)

# Passes 2 and 3 -- train / validation corpora, tokenized with the mappings from
# pass 1 so both splits share one index space.
vocab_kwargs = dict(index_to_vocab = index_to_vocab,
                    vocab_to_index = vocab_to_index)

train_words, _, _, _, train_songs, train_songs_token_ind = utils.tokenize_corpus(
    train_corpus, **vocab_kwargs, **token_kwargs)

val_words, _, _, _, _, val_songs_token_ind = utils.tokenize_corpus(
    val_corpus, **vocab_kwargs, **token_kwargs)

In [9]:
# Turn each split's per-song token indices into three aligned sequences,
# used downstream as encoder input, decoder input and prediction target.
train_seq_data = ae_utils.construct_seq_data(train_songs_token_ind, window_len)
val_seq_data = ae_utils.construct_seq_data(val_songs_token_ind, window_len)

train_x_encoder, train_x_decoder, train_y = train_seq_data
val_x_encoder, val_x_decoder, val_y = val_seq_data

In [10]:
def show_random_sample(x_encoder, x_decoder, y, rng):
    """Print one randomly chosen (encoder window, decoder window, target) example as tokens.

    Args:
        x_encoder: indexable collection of encoder windows (sequences of token indices).
        x_decoder: indexable collection of decoder windows, aligned with ``x_encoder``.
        y: indexable collection of target token indices, aligned with ``x_encoder``.
        rng: ``numpy.random.Generator`` used to pick the example.
    """
    sample_idx = rng.integers(0, len(x_encoder))
    print([index_to_vocab.get(ind) for ind in x_encoder[sample_idx]])
    print([index_to_vocab.get(ind) for ind in x_decoder[sample_idx]])
    print(index_to_vocab.get(y[sample_idx]))


# Seeded, local generator: the sanity check prints the same examples on every
# run and does not disturb NumPy's global random state.
sample_rng = np.random.default_rng(random_seed)
show_random_sample(train_x_encoder, train_x_decoder, train_y, sample_rng)
show_random_sample(val_x_encoder, val_x_decoder, val_y, sample_rng)

['and', 'there', 'is', 'still', 'a', 'long', 'long', 'way', 'to', 'go', '<new>', 'with', 'all', 'of', 'my']
['<new>', 'and', 'there', 'is', 'still', 'a', 'long', 'long', 'way', 'to', 'go', '<new>', 'with', 'all', 'of']
my
['and', 'beyond', '<new>', 'as', 'one', 'we', 'will', 'stand', 'we', 'are', 'singapore', '<new>', 'it', 'is', 'here']
['far', 'and', 'beyond', '<new>', 'as', 'one', 'we', 'will', 'stand', 'we', 'are', 'singapore', '<new>', 'it', 'is']
here


In [11]:
# Both splits are built with the same seed, batch size and vocabulary size.
dataset_kwargs = dict(random_seed = random_seed,
                      batch_size = batch_size,
                      vocab_size = vocab_size)

train_dataset = ae_utils.construct_datasets(train_x_encoder, train_x_decoder, train_y,
                                            **dataset_kwargs)
val_dataset = ae_utils.construct_datasets(val_x_encoder, val_x_decoder, val_y,
                                          **dataset_kwargs)

In [12]:
# Encoder: takes windows of shape (window_len, vocab_size)
# (presumably one-hot token vectors -- confirm against construct_datasets).
encoder_input = layers.Input(shape=(window_len, vocab_size), name='encoder_input')

# return_state=True makes the GRU hand back its final hidden state as well as its output.
encoder_gru = layers.GRU(enc_dim,
                         dropout=dropout,
                         recurrent_dropout=recurrent_dropout,
                         return_state=True,
                         name="encoder_gru")
encoder_output, encoder_hidden_state = encoder_gru(encoder_input)

# Decoder: same input shape as the encoder.
decoder_input = layers.Input(shape=(window_len, vocab_size), name='decoder_input')

# A second GRU starts from the encoder's final state. That state has enc_dim
# units, so this wiring assumes enc_dim == dec_dim.
decoder_gru = layers.GRU(dec_dim,
                         dropout=dropout,
                         recurrent_dropout=recurrent_dropout,
                         name="decoder_gru")
decoder_output = decoder_gru(decoder_input, initial_state=[encoder_hidden_state])

# Softmax over the vocabulary for the predicted token.
output_layer = layers.Dense(vocab_size, activation='softmax', name='output')
output = output_layer(decoder_output)

model = tf.keras.Model(inputs=(encoder_input, decoder_input), outputs=output, name=model_name)
model.summary()

Model: "ae_gru"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 encoder_input (InputLayer)     [(None, 15, 1042)]   0           []                               
                                                                                                  
 decoder_input (InputLayer)     [(None, 15, 1042)]   0           []                               
                                                                                                  
 encoder_gru (GRU)              [(None, 256),        998400      ['encoder_input[0][0]']          
                                 (None, 256)]                                                     
                                                                                                  
 decoder_gru (GRU)              (None, 256)          998400      ['decoder_input[0][0]',     

In [13]:
# Adam with the configured learning rate; categorical cross-entropy matches the
# softmax output layer, and accuracy is tracked for the checkpoint file names.
optimizer = tf.keras.optimizers.Adam(learning_rate=learn_rate)
model.compile(optimizer=optimizer,
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [14]:
### Callbacks
# Early stopping: give up after 10 epochs without a lower val_loss and roll the
# model back to the best weights seen.
callback_es = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', mode='min',
    min_delta=0, patience=10,
    baseline=None,
    restore_best_weights=True,
    verbose=1,
)

# Checkpointing: write weights only, and only when val_loss reaches a new minimum.
# Keras fills the {epoch}/{val_loss}/{val_accuracy} placeholders when it saves.
checkpoint_pattern = model_folder + '/weights.{epoch:02d}-{val_loss:.2f}-{val_accuracy:.2f}.hdf5'
callback_mc = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_pattern,
    monitor='val_loss', mode='min',
    save_best_only=True,
    save_weights_only=True,
)

In [15]:
#history = model.fit(x = train_dataset, validation_data = val_dataset, epochs = epochs, callbacks = [callback_es, callback_mc])

In [16]:
#model.save_weights(f'{model_folder}/final_weights.hdf5')

In [17]:
# Reuse the saved weights when they exist; otherwise train once and save them.
# This keeps "Restart & Run All" working on a fresh checkout where
# final_weights.hdf5 has not been produced yet.
final_weights_path = f'{model_folder}/final_weights.hdf5'
if os.path.exists(final_weights_path):
    model.load_weights(final_weights_path)
else:
    history = model.fit(x = train_dataset, validation_data = val_dataset,
                        epochs = epochs, callbacks = [callback_es, callback_mc])
    model.save_weights(final_weights_path)

In [18]:
# prompts = ['Whenever I think back', 'And so this I know',
#            'I am tired of being what you want me to be', 'Feeling so faithless, lost under the surface',
#            'Relight our fire, we will find our way', 'We will rise stronger together']
# result_strings = {}
# results = {}
# for prompt in prompts:
#     result_str, result = utils.generate_text(model,
#                                              ae_utils.ind_to_input_fun, ae_utils.update_input_fun,
#                                              start_string = prompt,
#                                              window_length = window_len,
#                                              vocab_to_index_dict = vocab_to_index, index_to_vocab_dict = index_to_vocab,
#                                              vocab_size = vocab_size,
#                                              num_generate = 100, temperature = 1.0,
#                                              random_seed = random_seed,
#                                              end_token = end_token, start_token = start_token,
#                                              pad_token = pad_token, unk_token = unk_token,
#                                              newline_token = newline_token,
#                                              depth = vocab_size)
#     result_strings[prompt] = result_str
#     results[prompt] = result

In [19]:
# print(result_strings)

In [20]:
# for k, v in result_strings.items():
#     with open(model_folder+f'/human_{model_name}-{utils.remove_punct(k.lower())}.txt', 'w') as f:
#         f.write(v)

In [21]:
# Load the evaluation prompts; the generation loop iterates this mapping with
# .items() and uses each key as a start string.
# JSON is UTF-8 by specification, so decode explicitly instead of relying on the
# platform default encoding.
with open('../../output/prompt_ref.json', 'r', encoding='utf-8') as f:
    eval_prompts = json.load(f)

In [28]:
# Generation settings that stay the same for every prompt.
generation_kwargs = dict(window_length = window_len,
                         vocab_to_index_dict = vocab_to_index,
                         index_to_vocab_dict = index_to_vocab,
                         vocab_size = vocab_size,
                         num_generate = 100,
                         temperature = 1.0,
                         random_seed = random_seed,
                         end_token = end_token,
                         start_token = start_token,
                         pad_token = pad_token,
                         unk_token = unk_token,
                         newline_token = newline_token,
                         discard_repeat = False,
                         depth = vocab_size)

# Filled one prompt at a time so already-generated texts survive an interrupted run.
result_strings = {}
for prompt, _reference in eval_prompts.items():
    generated, _ = utils.generate_text(model,
                                       ae_utils.ind_to_input_fun,
                                       ae_utils.update_input_fun,
                                       start_string = prompt,
                                       **generation_kwargs)
    # Turn the newline token back into real line breaks for readable lyrics.
    result_strings[prompt] = generated.replace(newline_token, '\n')

KeyboardInterrupt: 

In [32]:
# Write each generated text to its own file, named after the lower-cased prompt
# passed through utils.remove_punct.
# Explicit UTF-8 so lyrics containing non-ASCII characters are written the same
# way on every platform instead of depending on the locale's default encoding.
for prompt, generated_text in result_strings.items():
    out_path = model_folder+f'/br_{model_name}-{utils.remove_punct(prompt.lower())}.txt'
    with open(out_path, 'w', encoding='utf-8') as f:
        f.write(generated_text)