In [1]:
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/SMU_MITB_NLP/Group Project/src/bin

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/SMU_MITB_NLP/Group Project/src/bin


In [2]:
### Standard Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import copy
import time
import os
import sys
import sklearn
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer as CountVectorizer

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from tensorflow.python.client import device_lib

#import nltk

In [3]:
### Custom Imports
sys.path.append('../')
import lib.utilities as utils

In [4]:
corpus = utils.load_corpus()

In [5]:
'''songs = utils.split_text(corpus)
all_songs = []
for song in songs:
    all_songs.append(utils.split_song(song))'''

'songs = utils.split_text(corpus)\nall_songs = []\nfor song in songs:\n    all_songs.append(utils.split_song(song))'

In [6]:
words = utils.preprocess_text(corpus, fun_list = [utils.to_lower, utils.decontraction, utils.remove_punct], keep = '\<|\>')
words = re.sub('\n',' \n ', words)
words = re.split(' +', words)

In [7]:
word_count = Counter(words)
index_to_vocab = {i: k for i, k in enumerate(word_count.keys())}
vocab_to_index = {k: i for i, k in enumerate(word_count.keys())}

In [8]:
songs = ' '.join(words)
songs = songs.split(' \n \n <eos> \n \n ')
songs = [song.split(' ') for song in songs]

In [9]:
tokens_index = np.vectorize(vocab_to_index.get)(np.array(words))
start_token = '<verse>'
start_token_index = vocab_to_index.get(start_token)

In [10]:
word_dataset = tf.data.Dataset.from_tensor_slices(tokens_index)

In [18]:
window_len = 15
sequences = word_dataset.window(window_len + 1, 1, drop_remainder=True)
sequences = sequences.flat_map(lambda window: window.batch(window_len + 1))
sequences = sequences.map(lambda window: ((window[:-1], tf.convert_to_tensor([start_token_index])), window[-1:]))

In [19]:
BATCH_SIZE = 64
BUFFER_SIZE = 10000

AUTOTUNE = tf.data.AUTOTUNE
dataset = sequences.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True).cache().prefetch(buffer_size=AUTOTUNE)

In [20]:
vocab_size = len(word_count)
embedding_dim = 128
rnn_dim = 256
learn_rate = 0.001
epochs = 10

In [21]:
# Encoder
encoder_input = layers.Input(shape=(None,))
encoder_embedded = layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim)(
    encoder_input
)

# Return state in addition to output
output, encoder_state = layers.SimpleRNN(rnn_dim, return_state=True, name="encoder")(
    encoder_embedded
)

# Decoder
decoder_input = layers.Input(shape=(None,))
decoder_embedded = layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim)(
    decoder_input
)

# Pass the 2 states to a new RNN, as initial state
decoder_output = layers.SimpleRNN(rnn_dim, name="decoder")(
    decoder_embedded, initial_state=[encoder_state]
)
output = layers.Dense(vocab_size)(decoder_output)

model = keras.Model([encoder_input, decoder_input], output)
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 input_4 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 embedding_2 (Embedding)        (None, None, 128)    134016      ['input_3[0][0]']                
                                                                                                  
 embedding_3 (Embedding)        (None, None, 128)    134016      ['input_4[0][0]']                
                                                                                            

In [22]:
model.compile(loss = 'categorical_crossentropy', optimizer = tf.keras.optimizers.Adam(learning_rate=learn_rate))

In [24]:
result = model.fit(x = dataset, epochs = epochs, shuffle = True)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [31]:
def generate_text(model, start_string, num_generate = 1000, temperature=1.0):
    """Generate text using trained sequential text generation model

    Args: 
      model (model): Keras trained sequential text generation model
      start_string (list): list of strings for starting the text generation
      num_generate (int): number of words to generate
      temperature (float): controls the predictability of the generated text
        - Low temperatures results in more predictable text.
        - Higher temperatures results in more surprising text.
    
    Returns:
      String of generated text
      
    """

    # Converting our start string to numbers (vectorizing).
    input_indices = [vocab_to_index[s] for s in start_string]
    input_indices = tf.expand_dims(input_indices, 0)

    # Empty string to store our results.
    text_generated = []

    # Here batch size == 1.
    model.reset_states()
    for word_index in range(num_generate):
        predictions = model(input_indices, tf.constant(start_token_index))
        # remove the batch dimension
        predictions = tf.squeeze(predictions, 0)

        # Using a categorical distribution to predict the character returned by the model.
        predictions = predictions / temperature
        predictions = tf.reshape(predictions, [1, predictions.shape[0]])
        predicted_id = tf.random.categorical(
            predictions,
            num_samples=1
            )[-1,0].numpy()

        # We pass the predicted word as the next input to the model
        # along with the previous hidden state.
        input_indices = tf.expand_dims([predicted_id], 0)

        text_generated.append(indices_word[predicted_id])
    
    return (' '.join(start_string) + ' ' + ' '.join(text_generated))

In [32]:
print(generate_text(model, start_string=['<verse>', '\n'], num_generate=30, temperature=1.0))

ValueError: ignored