In [None]:
# Colab-only setup (left disabled): uncomment to mount Google Drive and change into the
# project's src/bin directory. The relative paths used further down ('../' and
# '../../../autoencoder/rnn/v2') presumably resolve from that directory — confirm when
# running outside Colab.
# from google.colab import drive
# drive.mount('/content/drive')
# %cd /content/drive/MyDrive/SMU_MITB_NLP/Group Project/NLP-Lyric-Generator/src/bin

Mounted at /content/drive
/content/drive/MyDrive/SMU_MITB_NLP/Group Project/NLP-Lyric-Generator/src/bin


In [None]:
### Standard Imports
import numpy as np
import re
import sys
import os
from collections import Counter

import tensorflow as tf
from tensorflow.keras import layers

In [None]:
### Custom Imports
# Add the parent directory to the import path so the project-local `lib` package can be
# found. The entry is relative, so it resolves against the current working directory
# rather than against this notebook's location.
sys.path.append('../')
import lib.utilities as utils  # corpus loading, tokenisation and text-generation helpers used below
import lib.autoencoder_utilities as ae_utils  # sequence/dataset construction helpers used below

In [None]:
### Text Parameters
# Special tokens handed to the tokenizer and to the text generator.
start_token, end_token = '<cls>', '<eos>'
pad_token, unk_token = '<pad>', '<unk>'
newline_token = '<new>'

### General Parameters
random_seed = 2022
model_name = 'ae_rnn_att'
model_folder = '../../../autoencoder/rnn/v2'  # weights and generated text are written here

### Model Parameters
val_split = 0.2  # NOTE(review): not referenced by any cell visible in this notebook — confirm it is still needed
window_len = 15  # number of time steps in each encoder/decoder input window
batch_size = 64
enc_dim = 256
dec_dim = 256
learn_rate = 0.001
epochs = 50
dropout = recurrent_dropout = 0.05  # same rate for input and recurrent dropout

In [None]:
# Make sure the output directory exists; an already-existing directory is not an error.
os.makedirs(name=model_folder, exist_ok=True)

In [None]:
### Load Data
# Full corpus: used below to build the vocabulary.
corpus = utils.load_corpus()

# Train/validation corpora plus two further values bound to train_files/val_files
# (presumably the source file lists — confirm in lib.utilities).
# NOTE(review): split_corpus() is called without arguments, so `val_split` and
# `random_seed` from the parameter cell are not passed here — confirm how the split
# is configured.
corpus_split = utils.split_corpus()
train_corpus, val_corpus, train_files, val_files = corpus_split

In [None]:
### Pre-Processing Text
# Keyword arguments common to every tokenize_corpus call in this cell.
tokenize_kwargs = dict(
    window_length = window_len,
    end_token = end_token,
    start_token = start_token,
    pad_token = pad_token,
    unk_token = unk_token,
    newline_token = newline_token,
)

# Pass 1 — whole corpus: keep only the word counts and the two vocabulary mappings.
_, word_count, index_to_vocab, vocab_to_index, _, _ = utils.tokenize_corpus(corpus, **tokenize_kwargs)
vocab_size = len(word_count)

# Passes 2 and 3 — train and validation corpora, tokenised with the mappings from
# pass 1 so both splits share one vocabulary.
shared_vocab_kwargs = dict(
    index_to_vocab = index_to_vocab,
    vocab_to_index = vocab_to_index,
    **tokenize_kwargs,
)

train_words, _, _, _, train_songs, train_songs_token_ind = utils.tokenize_corpus(
    train_corpus, **shared_vocab_kwargs
)

val_words, _, _, _, _, val_songs_token_ind = utils.tokenize_corpus(
    val_corpus, **shared_vocab_kwargs
)

In [None]:
# Build the model inputs/targets for each split from its token-index songs.
# Each call yields three aligned collections: encoder inputs, decoder inputs and targets.
train_seq_data = ae_utils.construct_seq_data(train_songs_token_ind, window_len)
train_x_encoder, train_x_decoder, train_y = train_seq_data

val_seq_data = ae_utils.construct_seq_data(val_songs_token_ind, window_len)
val_x_encoder, val_x_decoder, val_y = val_seq_data

In [None]:
def show_random_example(x_encoder, x_decoder, y, rng):
    """Print one randomly chosen training triple as vocabulary tokens.

    Args:
        x_encoder: indexable collection of encoder windows (sequences of token indices).
        x_decoder: indexable collection of decoder windows, aligned with `x_encoder`.
        y: indexable collection of target token indices, aligned with `x_encoder`.
        rng: numpy random Generator used to pick the example.

    Prints the encoder window, the decoder window and the target token, in that order.
    Indices missing from `index_to_vocab` are shown as None (dict.get semantics).
    """
    sample_idx = int(rng.integers(0, len(x_encoder)))
    print([index_to_vocab.get(x) for x in x_encoder[sample_idx]])
    print([index_to_vocab.get(x) for x in x_decoder[sample_idx]])
    print(index_to_vocab.get(y[sample_idx]))


# Seeded, local generator: the same examples are shown on every Restart & Run All,
# and the global numpy random state is left untouched.
sample_rng = np.random.default_rng(random_seed)

show_random_example(train_x_encoder, train_x_decoder, train_y, sample_rng)
show_random_example(val_x_encoder, val_x_decoder, val_y, sample_rng)

['the', 'best', 'is', 'yet', 'to', 'come', '\n', '\n', '<chorus>', '\n', 'oh', 'oh', 'oh', 'oh', 'oh']
['cos', 'the', 'best', 'is', 'yet', 'to', 'come', '\n', '\n', '<chorus>', '\n', 'oh', 'oh', 'oh', 'oh']
oh


In [None]:
# Settings shared by the training and validation datasets.
dataset_kwargs = dict(
    random_seed = random_seed,
    batch_size = batch_size,
    vocab_size = vocab_size,
)

# Wrap each split's (encoder input, decoder input, target) data into the dataset
# object that model.fit consumes below.
train_dataset = ae_utils.construct_datasets(train_x_encoder, train_x_decoder, train_y, **dataset_kwargs)
val_dataset = ae_utils.construct_datasets(val_x_encoder, val_x_decoder, val_y, **dataset_kwargs)

In [None]:
# Encoder
encoder_input = layers.Input(shape=(window_len, vocab_size), name = 'encoder_input')

# Return the per-time-step outputs (the sequence that attention looks over) in addition
# to the final state (used to initialise the decoder).
encoder_output, encoder_state = layers.SimpleRNN(
    enc_dim,
    dropout = dropout,
    recurrent_dropout = recurrent_dropout,
    return_sequences = True,
    return_state = True,
    name = "encoder_rnn",
)(encoder_input)

# Decoder
decoder_input = layers.Input(shape=(window_len, vocab_size), name = 'decoder_input')

# Pass the encoder state to a new RNN, as initial state (this requires enc_dim == dec_dim).
decoder_output = layers.SimpleRNN(
    dec_dim,
    dropout = dropout,
    recurrent_dropout = recurrent_dropout,
    name = "decoder_rnn",
)(decoder_input, initial_state=[encoder_state])

# Attention
# layers.Attention is documented for 3-D inputs: query (batch, Tq, dim) and
# value (batch, Tv, dim). Feeding it two 2-D (batch, dim) tensors makes the dot product
# and softmax run across the batch axis, i.e. samples attend to each other instead of
# to the encoder time steps. So: give the decoder output a length-1 time axis as the
# query, attend over the encoder sequence, then drop the time axis again.
# NOTE: this changes what the network computes; weights trained with the previous
# wiring should be retrained rather than reused.
decoder_query = layers.Reshape((1, dec_dim), name = 'decoder_query')(decoder_output)
attention_context = layers.Attention(name = 'attention')([decoder_query, encoder_output])
attention_context_vector = layers.Reshape((enc_dim,), name = 'attention_context')(attention_context)

# Output
# Concatenate along the feature axis with a Keras layer rather than a raw tf op on
# symbolic tensors.
decoder_with_context = layers.Concatenate(axis = -1, name = 'decoder_context')(
    [decoder_output, attention_context_vector]
)
output = layers.Dense(vocab_size, name = 'output', activation = 'softmax')(decoder_with_context)

model = tf.keras.Model((encoder_input, decoder_input), output, name = model_name)
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 encoder_input (InputLayer)     [(None, 15, 1049)]   0           []                               
                                                                                                  
 decoder_input (InputLayer)     [(None, 15, 1049)]   0           []                               
                                                                                                  
 encoder_rnn (SimpleRNN)        [(None, 256),        334336      ['encoder_input[0][0]']          
                                 (None, 256)]                                                     
                                                                                                  
 decoder_rnn (SimpleRNN)        (None, 256)          334336      ['decoder_input[0][0]',    

In [None]:
# Adam at the configured learning rate; categorical cross-entropy on the softmax
# output, with accuracy tracked as an additional metric.
adam_optimizer = tf.keras.optimizers.Adam(learning_rate=learn_rate)
model.compile(
    optimizer = adam_optimizer,
    loss = 'categorical_crossentropy',
    metrics = ['accuracy'],
)

In [None]:
### Callbacks
# Stop training once val_loss has gone 10 epochs without improving, then restore the
# weights from the best epoch.
callback_es = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', mode='min',
    min_delta=0, patience=10, baseline=None,
    restore_best_weights=True,
    verbose=1,
)

# Save a weights-only checkpoint each time val_loss reaches a new minimum. The
# {epoch}/{val_loss}/{val_accuracy} placeholders are filled in by Keras at save time.
checkpoint_path = model_folder+'/weights.{epoch:02d}-{val_loss:.2f}-{val_accuracy:.2f}.hdf5'
callback_mc = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    monitor='val_loss', mode='min',
    save_best_only=True,
    save_weights_only=True,
)

In [None]:
# Train for at most `epochs` epochs with early stopping and best-only checkpointing;
# `history` holds the object returned by fit().
training_callbacks = [callback_es, callback_mc]
history = model.fit(
    x = train_dataset,
    validation_data = val_dataset,
    epochs = epochs,
    callbacks = training_callbacks,
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [None]:
# Persist the model's current weights next to the per-epoch checkpoints.
final_weights_path = f'{model_folder}/final_weights.hdf5'
model.save_weights(final_weights_path)

In [None]:
# Disabled by default: uncomment to load previously saved final weights into `model`
# (e.g. to skip training and go straight to generation).
#model.load_weights(f'{model_folder}/final_weights.hdf5')

In [None]:
# Seed phrases to continue.
prompts = [
    'Whenever I think back',
    'And so this I know',
    'I am tired of being what you want me to be',
    'Feeling so faithless, lost under the surface',
    'Relight our fire, we will find our way',
    'We will rise stronger together',
]

# Generation settings that are identical for every prompt.
generation_kwargs = dict(
    window_length = window_len,
    vocab_to_index_dict = vocab_to_index,
    index_to_vocab_dict = index_to_vocab,
    vocab_size = vocab_size,
    num_generate = 100,
    temperature = 1.0,
    random_seed = random_seed,
    end_token = end_token,
    start_token = start_token,
    pad_token = pad_token,
    unk_token = unk_token,
    newline_token = newline_token,
    depth = vocab_size,
)

# Both dicts are keyed by prompt: result_strings keeps the first value returned by
# generate_text, results the second.
result_strings, results = {}, {}
for prompt in prompts:
    result_str, result = utils.generate_text(
        model,
        ae_utils.ind_to_input_fun,
        ae_utils.update_input_fun,
        start_string = prompt,
        **generation_kwargs,
    )
    result_strings[prompt] = result_str
    results[prompt] = result

<verse> 
 step by step stories darkness grateful stepping others win riverside near rainbow road should afar brighter if at peace hoping have yearning singing dawn how bell your beyond name very book grateful for things spot stars rest x fortunate worth catch beauty no stay style looking youth quick cross those late refreshed century


In [None]:
# Show the prompt -> generated text mapping built by the generation loop.
print(result_strings)

In [None]:
# Write each generated text to its own file in model_folder. The file name combines the
# model name with the lower-cased prompt passed through utils.remove_punct.
for prompt, generated_text in result_strings.items():
    out_path = model_folder+f'/human_{model_name}-{utils.remove_punct(prompt.lower())}.txt'
    # Explicit UTF-8: without it open() uses the platform's locale encoding, so the
    # written bytes (and whether non-ASCII tokens can be written at all) would vary
    # from machine to machine.
    with open(out_path, 'w', encoding='utf-8') as f:
        f.write(generated_text)