In [1]:
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/SMU_MITB_NLP/Group Project/NLP-Lyric-Generator/src/bin

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/SMU_MITB_NLP/Group Project/NLP-Lyric-Generator/src/bin


In [2]:
### Standard Imports
import numpy as np
import re
import os
import sys
from collections import Counter

import tensorflow as tf
from tensorflow.keras import layers

In [3]:
### Custom Imports
sys.path.append('../')
import lib.utilities as utils

In [45]:
### Text Parameters
start_token = '<cls>'
end_token = '<eos>'
pad_token = '<pad>'
unk_token = '<unk>'

### General Parameters
random_seed = 2022
model_folder = '../../models/autoencoder/lstm/v2'

### Model Parameters
val_split = 0.2
window_len = 15
batch_size = 64
enc_dim, dec_dim = 256, 256
learn_rate = 0.001
epochs = 80
dropout = 0.05
recurrent_dropout = 0.05

In [35]:
os.makedirs(model_folder, exist_ok=True)

In [5]:
### Load Data
corpus = utils.load_corpus()

In [6]:
### Pre-Processing Text
words = utils.preprocess_text(corpus, fun_list = [utils.to_lower, utils.decontraction, utils.remove_punct], keep = '\<|\>')
words = re.sub('\n',' \n ', words)
words = re.split(' +', words) #Tokenising

word_count = Counter(words) # Assumes end_token is already in the corpus
word_count[start_token] = 0
word_count[pad_token] = 0

#Reference Dictionaries to convert one-hot index to string and vice versa
index_to_vocab = {i: k for i, k in enumerate(word_count.keys())}
vocab_to_index = {k: i for i, k in enumerate(word_count.keys())}

songs = ' '.join(words)
songs = songs.split(' \n \n <eos> \n \n ')
songs = [song.split(' ') for song in songs]
songs = [[pad_token]*(window_len-1) + [start_token] + song + [end_token] + [pad_token]*(window_len-1) for song in songs]
songs_token_ind = [[vocab_to_index.get(x) for x in song] for song in songs]

In [7]:
### Creating Dataset
x_encoder = []
x_decoder = []
y = []
vocab_size = len(word_count)

for song in songs_token_ind:
  for i in range(len(song)-window_len):
    x_encoder.append(song[(i+1):(i+window_len+1)])
    x_decoder.append(song[i:(i+window_len)])
    y.append(song[i+window_len])

In [8]:
rand_int = np.random.randint(0, len(x_encoder), 1)[0]
print([index_to_vocab.get(x) for x in x_encoder[rand_int]])
print([index_to_vocab.get(x) for x in x_decoder[rand_int]])
print(index_to_vocab.get(y[rand_int]))

['future', '\n', 'together', 'you', 'and', 'i', 'we', 'are', '\n', '\n', '<chorus>', '\n', 'one', 'singapore', '\n']
['brighter', 'future', '\n', 'together', 'you', 'and', 'i', 'we', 'are', '\n', '\n', '<chorus>', '\n', 'one', 'singapore']




In [9]:
dataset = tf.data.Dataset.from_tensor_slices(((x_encoder, x_decoder), y))
dataset = dataset.shuffle(buffer_size = 10000, seed = random_seed)
dataset = dataset.map(lambda x, y: ((tf.one_hot(x[0], depth = vocab_size), tf.one_hot(x[1], depth = vocab_size)),
                                 tf.one_hot(y, depth = vocab_size)))
train_dataset = dataset.take(int((1-val_split)*len(dataset)))
val_dataset = dataset.skip(int((1-val_split)*len(dataset)))

train_dataset_final = train_dataset.batch(batch_size).cache().prefetch(buffer_size=tf.data.AUTOTUNE)
val_dataset_final = val_dataset.batch(batch_size).cache().prefetch(buffer_size=tf.data.AUTOTUNE)

In [26]:
# Encoder
encoder_input = layers.Input(shape=(window_len,vocab_size), name = 'encoder_input')

# Return state in addition to output
encoder_output, encoder_hidden_state, encoder_cell_state = layers.LSTM(enc_dim,
                                                                       dropout = dropout, recurrent_dropout = recurrent_dropout,
                                                                       return_state=True, name = "encoder_lstm")(encoder_input)

# Decoder
decoder_input = layers.Input(shape=(window_len,vocab_size), name = 'decoder_input')

# Pass the encoder state to a new LSTM, as initial state
decoder_output = layers.LSTM(dec_dim,
                             dropout = dropout, recurrent_dropout = recurrent_dropout,
                             name="decoder_lstm")(decoder_input, initial_state=[encoder_hidden_state, encoder_cell_state])

# Attention
attention_context_vector = tf.keras.layers.Attention(name = 'attention')(inputs = [decoder_output, encoder_output])

# Output
output = layers.Dense(vocab_size, name = 'output', activation = 'softmax')(tf.concat([decoder_output, attention_context_vector], 1))

model = tf.keras.Model((encoder_input, decoder_input), output)
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 encoder_input (InputLayer)     [(None, 15, 1049)]   0           []                               
                                                                                                  
 decoder_input (InputLayer)     [(None, 15, 1049)]   0           []                               
                                                                                                  
 encoder_lstm (LSTM)            [(None, 256),        1337344     ['encoder_input[0][0]']          
                                 (None, 256),                                                     
                                 (None, 256)]                                                     
                                                                                            

In [27]:
model.compile(loss = 'categorical_crossentropy',
              optimizer = tf.keras.optimizers.Adam(learning_rate=learn_rate),
              metrics = ['accuracy'])

In [28]:
### Callbacks
callback_es = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=5,
    verbose=1,
    mode='min',
    baseline=None,
    restore_best_weights=True
)

callback_mc = tf.keras.callbacks.ModelCheckpoint(
    filepath=model_folder+'/weights.{epoch:02d}-{val_loss:.2f}-{val_accuracy:.2f}.hdf5',
    save_weights_only=True,
    monitor='val_loss',
    mode='min',
    save_best_only=True)

In [29]:
history = model.fit(x = train_dataset_final, validation_data = val_dataset_final, epochs = epochs, callbacks = [callback_es, callback_mc])

Epoch 1/80
Epoch 2/80
Epoch 3/80
Epoch 4/80
Epoch 5/80
Epoch 6/80
Epoch 7/80
Epoch 8/80
Epoch 9/80
Epoch 10/80
Epoch 11/80
Epoch 12/80
Epoch 13/80
Epoch 14/80
Epoch 15/80
Epoch 16/80
Epoch 17/80
Epoch 18/80
Epoch 19/80
Epoch 20/80
Epoch 21/80
Epoch 22/80
Epoch 23/80
Epoch 24/80
Epoch 25/80
Epoch 26/80
Epoch 27/80
Epoch 28/80
Epoch 29/80
Epoch 30/80
Epoch 31/80
Epoch 32/80
Epoch 33/80
Epoch 34/80
Epoch 35/80
Epoch 36/80
Epoch 37/80
Epoch 38/80
Epoch 39/80
Epoch 40/80
Epoch 41/80
Epoch 42/80
Epoch 43/80
Epoch 44/80
Epoch 45/80
Epoch 46/80
Epoch 47/80
Epoch 48/80
Epoch 49/80
Epoch 50/80
Epoch 51/80
Epoch 52/80
Epoch 53/80
Epoch 53: early stopping


In [36]:
model.save_weights(model_folder+'/final_weights.{epoch:02d}-{val_loss:.2f}-{val_accuracy:.2f}.hdf5')

In [47]:
def generate_text(model, start_string, num_generate = 1000, temperature=1.0, random_seed = 2022):
    # Converting our start string to numbers (vectorizing).
    input_indices = [vocab_to_index.get(s) for i, s in enumerate(start_string) if i < window_len-1]
    input_indices = [i if i is not None else vocab_to_index.get(pad_token) for i in input_indices]
    input_indices = [vocab_to_index.get(pad_token)]*(window_len - len(input_indices)-1) + [vocab_to_index.get(start_token)] + input_indices

    input_oh = tf.one_hot(input_indices, depth = vocab_size)
    x = tf.expand_dims(input_oh, 0)

    # Empty string to store our results.
    text_generated = []

    # Here batch size == 1.
    model.reset_states()
    for word_index in range(num_generate):
        prediction = model.predict([x,x])

        # Using a categorical distribution to predict the character returned by the model.
        prediction = prediction / temperature
        predicted_id = tf.random.categorical(prediction, num_samples=1, seed = random_seed)[-1,0]
        predicted_oh = tf.one_hot(predicted_id, depth = vocab_size)

        # We pass the series of previous words (up to window length) as the next input to the model
        # along with the previous hidden state.
        input_index = tf.expand_dims([predicted_oh], 0)
        x = tf.concat([x[:,1:,:],input_index], 1)
        
        pred_word = index_to_vocab[predicted_id.numpy()]
        text_generated.append(pred_word)
        if pred_word == end_token:
            break
    
    return (' '.join(start_string) + ' ' + ' '.join(text_generated)), text_generated

In [48]:
def tokenize_text(text):
    words = utils.preprocess_text(text, fun_list = [utils.to_lower, utils.decontraction, utils.remove_punct], keep = '\<|\>')
    words = re.sub('\n',' \n ', words)
    words = re.split(' +', words) #Tokenising
    return words

In [49]:
prompts = ['Whenever I think back', 'And so this I know',
           'I am tired of being what you want me to be', 'Feeling so faithless, lost under the surface',
           'Relight our fire, we will find our way', 'We will rise stronger together']
result_strings = {}
results = {}
for prompt in prompts:
    tokenized_prompt = tokenize_text(prompt)
    result_str, result = generate_text(model, start_string=tokenized_prompt, num_generate=100, temperature=1.0)
    result_strings[prompt] = result_str
    results[prompt] = result

final_str = f'\n\n{end_token}\n\n'.join([f'{k}:\n{v}' for k, v in result_strings.items()])

In [50]:
print(final_str)

Whenever I think back:
whenever i think back turning breath peace fabric say bloom warm island stars amazed breathe needs yourself candle started religion way gave everywhere enduring blue close unwind add seems isle blood sail built international zeal mine the language candle early stepping sunny wide distance colleagues send alone then neighbourhood plays many when miracles old race contradict embrace test am strides conquer raise met give stay ours out land brand amazed why common starting dreaming wishes yet under candle towards strides dawn else core roar sons quay cos toil hands means men land wait used heartbeat mornings seemed ago cares bursts unreal summing ooh wave

<eos>

And so this I know:
and so this i know ocean moments bay begin wherever climbing start forget steady gave failing brings recess surprises knowledge name sons yeah let anthem unfold map about again takes wide tears carried daunting brighter rocky write could citizens okay over passing room news chance minute

In [51]:
with open(model_folder+'/generated_text.txt', 'w') as f:
    f.write(final_str)