In [None]:
# Mount Google Drive into the Colab runtime so the project folder is reachable.
from google.colab import drive
drive.mount('/content/drive')
# Work from src/bin: the relative paths used later in this notebook
# ('../', '../../output/...', and model_folder) are resolved against this directory.
%cd /content/drive/MyDrive/SMU_MITB_NLP/Group Project/NLP-Lyric-Generator/src/bin

Mounted at /content/drive
/content/drive/MyDrive/SMU_MITB_NLP/Group Project/NLP-Lyric-Generator/src/bin


In [None]:
### Standard Imports
import json
import os
import re
import sys
from collections import Counter

import numpy as np
import tensorflow as tf
from tensorflow.keras import layers

In [None]:
### Custom Imports
# Make the parent of the working directory importable before importing `lib`
# (presumably `lib` lives in src/ next to bin/ - confirm against the repo layout).
sys.path.append('../')
import lib.utilities as utils
import lib.seq2seq_utilities as s2s_utils

In [None]:
### Text Parameters
# Special tokens handed to the tokenizer and to the text generator below.
start_token = '<cls>'
end_token = '<eos>'
pad_token = '<pad>'
unk_token = '<unk>'
newline_token = '<new>'
mask_token = '<mask>'

### General Parameters
random_seed = 2022
# Folder that receives the checkpoints, final weights and generated text files.
model_folder = '../../../seq2seq/lstm/v3'
# Used as the Keras model name and inside the generated-text file names.
model_name = 's2s_lstm_att_long_win'

# Seed the global NumPy and TensorFlow generators once, up front, so that weight
# initialisation, dropout masks and the random sample preview are repeatable on
# a fresh "Restart & Run All". Helpers that take `random_seed` explicitly still
# receive it as before.
np.random.seed(random_seed)
tf.random.set_seed(random_seed)

### Model Parameters
window_len = 50                # tokens per input window (also the Input sequence length)
batch_size = 64
enc_dim, dec_dim = 256, 256    # LSTM units; must match, the encoder state initialises the decoder
learn_rate = 0.001             # Adam learning rate
epochs = 50                    # upper bound; early stopping may end training sooner
dropout = 0.05                 # input dropout of both LSTMs
recurrent_dropout = 0.05       # recurrent dropout of both LSTMs

In [None]:
# Create the output folder (and any missing parents); a no-op when it already exists.
os.makedirs(model_folder, exist_ok=True)

In [None]:
### Load Data
# Full corpus; tokenised below to build the vocabulary.
corpus = utils.load_corpus()
# Train/validation split. train_files and val_files are not used again in the
# visible part of this notebook.
# NOTE(review): split_corpus() receives no seed here - confirm the split is deterministic.
train_corpus, val_corpus, train_files, val_files = utils.split_corpus()

In [None]:
### Pre-Processing Text
# Special-token settings are identical for every tokenize_corpus call, so state them once.
special_tokens = dict(
    end_token = end_token,
    start_token = start_token,
    pad_token = pad_token,
    unk_token = unk_token,
    newline_token = newline_token,
    mask_token = mask_token,
)

# Pass 1 - whole corpus: no vocabulary is supplied, the returned lookups are kept.
_, word_count, index_to_vocab, vocab_to_index, songs, songs_token_ind = utils.tokenize_corpus(
    corpus,
    window_length = window_len,
    **special_tokens,
)
vocab_size = len(word_count)

# Passes 2 and 3 - train / validation: reuse the lookups from pass 1 so token
# indices agree across the splits.
shared_vocab = dict(index_to_vocab = index_to_vocab, vocab_to_index = vocab_to_index)

train_words, _, _, _, train_songs, train_songs_token_ind = utils.tokenize_corpus(
    train_corpus,
    window_length = window_len,
    **shared_vocab,
    **special_tokens,
)

val_words, _, _, _, _, val_songs_token_ind = utils.tokenize_corpus(
    val_corpus,
    window_length = window_len,
    **shared_vocab,
    **special_tokens,
)

In [None]:
# Turn each split's token-index songs into (input window, target) arrays.
(train_x, train_y), (val_x, val_y) = [
    s2s_utils.construct_seq_data(split_token_ind, window_len)
    for split_token_ind in (train_songs_token_ind, val_songs_token_ind)
]

In [None]:
# Sanity check: decode one randomly chosen input window and its target from
# the training split, then from the validation split.
for split_x, split_y in ((train_x, train_y), (val_x, val_y)):
    rand_int = np.random.randint(0, len(split_x), 1)[0]
    decoded_window = [index_to_vocab.get(token_ind) for token_ind in split_x[rand_int]]
    print(decoded_window)
    print(index_to_vocab.get(split_y[rand_int]))

['good', 'we', 'achieve', 'as', 'one', 'nation', '<new>', 'be', 'shared', 'with', 'the', 'world', '<new>', '<new>', '<verse>', '<new>', 'as', 'a', 'new', 'chapter', 'begins', '<new>', 'from', 'where', 'we', 'have', 'come', 'thus', 'far', '<new>', 'once', 'again', 'may', 'it', 'be', 'written', '<new>', 'you', 'are', 'who', 'we', 'are', '<new>', '<new>', '<verse>', '<new>', 'in', 'you', 'may', 'we']
see
['will', 'all', 'come', 'true', '<new>', 'if', 'you', 'believe', 'that', 'every', 'vision', 'begins', 'with', 'you', '<new>', '<new>', '<chorus>', '<new>', 'shine', 'for', 'singapore', '<new>', 'this', 'is', 'your', 'song', '<new>', 'deep', 'inside', 'your', 'heart', 'where', 'it', 'belongs', '<new>', 'it', 'will', 'always', 'stay', 'strive', 'for', 'your', 'goals', '<new>', 'you', 'will', 'achieve', 'with', 'visions', 'so']
bold


In [None]:
# Both splits are built with the same seed, batch size and vocabulary size.
dataset_options = dict(
    random_seed = random_seed,
    batch_size = batch_size,
    vocab_size = vocab_size,
)
train_dataset = s2s_utils.construct_datasets(train_x, train_y, **dataset_options)
val_dataset = s2s_utils.construct_datasets(val_x, val_y, **dataset_options)

In [None]:
# Encoder
encoder_input = layers.Input(shape=(window_len, vocab_size), name = 'encoder_input')

# Return the per-timestep outputs (what attention attends over) in addition to
# the final hidden/cell state (used to initialise the decoder).
encoder_output, encoder_hidden_state, encoder_cell_state = layers.LSTM(enc_dim,
                                                                       dropout = dropout, recurrent_dropout = recurrent_dropout,
                                                                       return_sequences=True, return_state=True,
                                                                       name = "encoder_lstm")(encoder_input)

# Decoder
decoder_input = layers.Input(shape=(window_len, vocab_size), name = 'decoder_input')

# Pass the encoder state to a new LSTM, as initial state (requires enc_dim == dec_dim).
# Only the last decoder step is kept: the model predicts a single token.
decoder_output = layers.LSTM(dec_dim,
                             dropout = dropout, recurrent_dropout = recurrent_dropout,
                             name="decoder_lstm")(decoder_input, initial_state=[encoder_hidden_state, encoder_cell_state])

# Attention
# layers.Attention expects query (batch, Tq, dim) and value (batch, Tv, dim).
# Previously both inputs were 2-D (batch, dim), so the score matrix came out as
# (batch, batch): every sample attended over the *other samples in its batch*
# instead of over the encoder timesteps. Use the decoder output as a one-step
# query over the encoder sequence, then drop the length-1 time axis again.
decoder_query = layers.Reshape((1, dec_dim), name = 'decoder_query')(decoder_output)
attention_context_vector = layers.Attention(name = 'attention')(inputs = [decoder_query, encoder_output])
attention_context_vector = layers.Flatten(name = 'attention_context')(attention_context_vector)

# Output
# Concatenate along the feature axis with a Keras layer rather than a raw tf op.
# The Dense input width is still dec_dim + enc_dim, so no weight shape changes.
decoder_with_context = layers.Concatenate(axis = -1, name = 'decoder_with_context')(
    [decoder_output, attention_context_vector])
output = layers.Dense(vocab_size, name = 'output', activation = 'softmax')(decoder_with_context)

model = tf.keras.Model((encoder_input, decoder_input), output, name = model_name)
model.summary()

Model: "s2s_lstm_att_long_win"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 encoder_input (InputLayer)     [(None, 50, 1042)]   0           []                               
                                                                                                  
 decoder_input (InputLayer)     [(None, 50, 1042)]   0           []                               
                                                                                                  
 encoder_lstm (LSTM)            [(None, 256),        1330176     ['encoder_input[0][0]']          
                                 (None, 256),                                                     
                                 (None, 256)]                                                     
                                                                              

In [None]:
# Adam at the configured learning rate; categorical cross-entropy on the
# softmax output, with accuracy reported alongside the loss.
adam_optimizer = tf.keras.optimizers.Adam(learning_rate = learn_rate)
model.compile(
    optimizer = adam_optimizer,
    loss = 'categorical_crossentropy',
    metrics = ['accuracy'],
)

In [None]:
### Callbacks
# Both callbacks watch the validation loss, where lower is better.
monitor_settings = dict(monitor='val_loss', mode='min')

# Stop after 10 epochs without any val_loss improvement and roll the model
# back to the weights of its best epoch.
callback_es = tf.keras.callbacks.EarlyStopping(
    min_delta=0,
    patience=10,
    verbose=1,
    baseline=None,
    restore_best_weights=True,
    **monitor_settings,
)

# Save weights only, and only when val_loss improves; epoch number and the
# validation metrics are filled into the file name.
checkpoint_path = model_folder+'/weights.{epoch:02d}-{val_loss:.2f}-{val_accuracy:.2f}.hdf5'
callback_mc = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    save_weights_only=True,
    save_best_only=True,
    **monitor_settings,
)

In [None]:
# Train for at most `epochs` epochs; early stopping and checkpointing are
# handled by the two callbacks.
history = model.fit(
    x = train_dataset,
    validation_data = val_dataset,
    epochs = epochs,
    callbacks = [callback_es, callback_mc],
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 15: early stopping


In [None]:
# Persist the weights held by the model after training (with
# restore_best_weights=True these are the best-epoch weights if early stopping fired).
model.save_weights(f'{model_folder}/final_weights.hdf5')

In [None]:
# Optional: uncomment to restore the saved weights instead of re-running the training cell.
# model.load_weights(f'{model_folder}/final_weights.hdf5')

In [None]:
# Hand-written prompts used for the qualitative ("human") check of the model.
prompts = [
    'Whenever I think back',
    'And so this I know',
    'I am tired of being what you want me to be',
    'Feeling so faithless, lost under the surface',
    'Relight our fire, we will find our way',
    'We will rise stronger together',
]

# Everything except the prompt itself is the same for each generation call.
human_generation_args = dict(
    window_length = window_len,
    vocab_to_index_dict = vocab_to_index,
    index_to_vocab_dict = index_to_vocab,
    vocab_size = vocab_size,
    num_generate = 100,
    temperature = 1.0,
    random_seed = random_seed,
    end_token = end_token,
    start_token = start_token,
    pad_token = pad_token,
    unk_token = unk_token,
    newline_token = newline_token,
    depth = vocab_size,
)

# prompt -> generated string, and prompt -> the second value returned by generate_text.
result_strings = {}
results = {}
for prompt in prompts:
    result_str, result = utils.generate_text(
        model,
        s2s_utils.ind_to_input_fun,
        s2s_utils.update_input_fun,
        start_string = prompt,
        **human_generation_args,
    )
    result_strings[prompt] = result_str
    results[prompt] = result

In [None]:
# Show the generated text for every hand-written prompt (prompt -> generated string).
print(result_strings)

{'Whenever I think back': 'Whenever I think back \n look a of unfurled \n <verse> \n <chorus> \n said special struggled singapore \n we will do uphold \n \n not our stronger the stand the in \n till \n one out \n stand \n now her did sing \n <others> the world world \n first is her to be here waiting \n make i is there aside be \n it is where a grow \n to already must \n lost necessary favourite way your love up to may be \n stars heart we the singaporean dating soul \n working will forever as we smile \n alright together brave', 'And so this I know': 'And so this I know winding for from \n much for see \n \n to enemies we see as together \n then \n but you you i \n as see \n a you strong there higher \n who a who grown story \n <verse> \n this a out my the leaving tell were is worlds \n i set one you will will <verse> \n neighbourhood whole moments it wildest into asking remembered \n name <chorus> \n put our mind light your all the \n to we make they goal \n as as one up the goal \n 

In [None]:
# Write each generated lyric to its own text file in model_folder; the file
# name embeds the model name and the lower-cased, punctuation-stripped prompt.
for prompt_text, generated_text in result_strings.items():
    output_path = model_folder+f'/human_{model_name}-{utils.remove_punct(prompt_text.lower())}.txt'
    # Explicit encoding: without it open() falls back to the locale's default,
    # so the bytes written would depend on the machine running the notebook.
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(generated_text)

In [None]:
# Load the evaluation prompts: a JSON object whose keys are used as prompts
# below (the values are presumably the reference continuations - confirm).
# `json` is imported with the other standard imports at the top of the notebook.
# JSON text is UTF-8, so read it as such rather than with the locale default.
with open('../../output/prompt_ref.json', 'r', encoding='utf-8') as f:
    eval_prompts = json.load(f)

In [None]:
# Same generation settings as for the hand-written prompts, except that
# discard_repeat is explicitly switched off for the evaluation run.
eval_generation_args = dict(
    window_length = window_len,
    vocab_to_index_dict = vocab_to_index,
    index_to_vocab_dict = index_to_vocab,
    vocab_size = vocab_size,
    num_generate = 100,
    temperature = 1.0,
    random_seed = random_seed,
    end_token = end_token,
    start_token = start_token,
    pad_token = pad_token,
    unk_token = unk_token,
    newline_token = newline_token,
    discard_repeat = False,
    depth = vocab_size,
)

# Starts a fresh dict: the results of the hand-written prompts are replaced.
result_strings = {}
# `actual` (the value stored for each prompt) is not needed for generation.
for prompt, actual in eval_prompts.items():
    result_str, _ = utils.generate_text(
        model,
        s2s_utils.ind_to_input_fun,
        s2s_utils.update_input_fun,
        start_string = prompt,
        **eval_generation_args,
    )
    # Turn any remaining newline tokens into real line breaks before storing.
    result_strings[prompt] = result_str.replace(newline_token, '\n')

In [None]:
# Write each evaluation result to its own text file in model_folder; the file
# name embeds the model name and the lower-cased, punctuation-stripped prompt.
for prompt_text, generated_text in result_strings.items():
    output_path = model_folder+f'/br_{model_name}-{utils.remove_punct(prompt_text.lower())}.txt'
    # Explicit encoding: without it open() falls back to the locale's default,
    # so the bytes written would depend on the machine running the notebook.
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(generated_text)