In [1]:
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/SMU_MITB_NLP/Group Project/NLP-Lyric-Generator/src/bin

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/SMU_MITB_NLP/Group Project/NLP-Lyric-Generator/src/bin


In [2]:
### Standard Imports
import numpy as np
import re
import sys
from collections import Counter

import tensorflow as tf
from tensorflow.keras import layers

In [3]:
### Custom Imports
sys.path.append('../')
import lib.utilities as utils

In [4]:
### Text Parameters
start_token = '<cls>'
end_token = '<eos>'
pad_token = '<pad>'

### Model Parameters
window_len = 15
batch_size = 64
rnn_dim = 256
learn_rate = 0.001
epochs = 30

In [5]:
### Load Data
corpus = utils.load_corpus()

In [6]:
### Pre-Processing Text
words = utils.preprocess_text(corpus, fun_list = [utils.to_lower, utils.decontraction, utils.remove_punct], keep = '\<|\>')
words = re.sub('\n',' \n ', words)
words = re.split(' +', words) #Tokenising

word_count = Counter(words) # Assumes end_token is already in the corpus
word_count[start_token] = 0
word_count[pad_token] = 0

#Reference Dictionaries to convert one-hot index to string and vice versa
index_to_vocab = {i: k for i, k in enumerate(word_count.keys())}
vocab_to_index = {k: i for i, k in enumerate(word_count.keys())}

songs = ' '.join(words)
songs = songs.split(' \n \n <eos> \n \n ')
songs = [song.split(' ') for song in songs]
songs = [[pad_token]*(window_len-1) + [start_token] + song + [end_token] + [pad_token]*(window_len-1) for song in songs]
songs_token_ind = [[vocab_to_index.get(x) for x in song] for song in songs]

In [7]:
### Creating Dataset
x_encoder = []
x_decoder = []
y = []
vocab_size = len(word_count)

for song in songs_token_ind:
  for i in range(len(song)-window_len):
    x_encoder.append(song[(i+1):(i+window_len+1)])
    x_decoder.append(song[i:(i+window_len)])
    y.append(song[i+1])

In [8]:
dataset = tf.data.Dataset.from_tensor_slices(((x_encoder, x_decoder), y))
dataset = dataset.batch(batch_size)
dataset = dataset.map(lambda x, y: ((tf.one_hot(x[0], depth = vocab_size), tf.one_hot(x[1], depth = vocab_size)),
                                 tf.one_hot(y, depth = vocab_size)))
dataset = dataset.cache().prefetch(buffer_size=tf.data.AUTOTUNE)

In [9]:
# Encoder
encoder_input = layers.Input(shape=(window_len,vocab_size), name = 'encoder_input')

# Return state in addition to output
encoder_output, encoder_state = layers.SimpleRNN(rnn_dim, return_state=True, name = "encoder_rnn")(
    encoder_input
)

# Decoder
decoder_input = layers.Input(shape=(window_len,vocab_size), name = 'decoder_input')

# Pass the encoder state to a new RNN, as initial state
decoder_output = layers.SimpleRNN(rnn_dim, name="decoder_rnn")(
    decoder_input, initial_state=[encoder_state]
)
output = layers.Dense(vocab_size, name = 'output', activation = 'softmax')(decoder_output)

model = tf.keras.Model((encoder_input, decoder_input), output)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 encoder_input (InputLayer)     [(None, 15, 1049)]   0           []                               
                                                                                                  
 decoder_input (InputLayer)     [(None, 15, 1049)]   0           []                               
                                                                                                  
 encoder_rnn (SimpleRNN)        [(None, 256),        334336      ['encoder_input[0][0]']          
                                 (None, 256)]                                                     
                                                                                                  
 decoder_rnn (SimpleRNN)        (None, 256)          334336      ['decoder_input[0][0]',      

In [10]:
model.compile(loss = 'categorical_crossentropy', optimizer = tf.keras.optimizers.Adam(learning_rate=learn_rate))

In [11]:
result = model.fit(x = dataset, epochs = epochs, shuffle = True)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [12]:
def generate_text(model, start_string, num_generate = 1000, temperature=1.0, random_seed = 2022):
    # Converting our start string to numbers (vectorizing).
    input_indices = [vocab_to_index.get(s) for i, s in enumerate(start_string) if i < window_len-1]
    input_indices = [vocab_to_index.get(pad_token)]*(window_len - len(input_indices)-1) + [vocab_to_index.get(start_token)] + input_indices

    input_oh = tf.one_hot(input_indices, depth = vocab_size)
    x = tf.expand_dims(input_oh, 0)

    # Empty string to store our results.
    text_generated = []

    # Here batch size == 1.
    model.reset_states()
    for word_index in range(num_generate):
        prediction = model.predict([x,x])

        # Using a categorical distribution to predict the character returned by the model.
        prediction = prediction / temperature
        predicted_id = tf.random.categorical(prediction, num_samples=1)[-1,0]
        predicted_oh = tf.one_hot(predicted_id, depth = vocab_size)

        # We pass the series of previous words (up to window length) as the next input to the model
        # along with the previous hidden state.
        input_index = tf.expand_dims([predicted_oh], 0)
        x = tf.concat([x[:,1:,:],input_index], 1)
        
        text_generated.append(index_to_vocab[predicted_id.numpy()])
    
    return (' '.join(start_string) + ' ' + ' '.join(text_generated)), text_generated

In [13]:
result_str, result = generate_text(model, start_string=['<verse>','\n','step','by','step'], num_generate=30, temperature=1.0)

In [14]:
print(result_str)

<verse> 
 step by step how my fabric here this neighbourhood ask already <prelude> rocky unwind easy your rolls singapura alright grain tell along us i’m dream somewhere with dedicated wonder friend put pure determination
