In [1]:
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/SMU_MITB_NLP/Group Project/src/bin

Mounted at /content/drive
/content/drive/MyDrive/SMU_MITB_NLP/Group Project/src/bin


In [2]:
### Standard Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import copy
import time
import os
import sys
import sklearn
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer as CountVectorizer
from sklearn.preprocessing import OneHotEncoder

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from tensorflow.python.client import device_lib

#import nltk

In [3]:
### Custom Imports
sys.path.append('../')
import lib.utilities as utils

In [4]:
corpus = utils.load_corpus()

In [5]:
'''songs = utils.split_text(corpus)
all_songs = []
for song in songs:
    all_songs.append(utils.split_song(song))'''

'songs = utils.split_text(corpus)\nall_songs = []\nfor song in songs:\n    all_songs.append(utils.split_song(song))'

In [6]:
words = utils.preprocess_text(corpus, fun_list = [utils.to_lower, utils.decontraction, utils.remove_punct], keep = '\<|\>')
words = re.sub('\n',' \n ', words)
words = re.split(' +', words)

In [7]:
word_count = Counter(words)
index_to_vocab = {i: k for i, k in enumerate(word_count.keys())}
vocab_to_index = {k: i for i, k in enumerate(word_count.keys())}

In [8]:
songs = ' '.join(words)
songs = songs.split(' \n \n <eos> \n \n ')
songs = [song.split(' ') for song in songs]

In [32]:
tokens_index = [vocab_to_index[word] for word in words]
start_token = '<verse>'
start_token_index = vocab_to_index.get(start_token)

In [10]:
# word_dataset = tf.data.Dataset.from_tensor_slices(tokens_index)

In [11]:
# window_len = 15
# sequences = word_dataset.window(window_len + 1, 1, drop_remainder=True)
# sequences = sequences.flat_map(lambda window: window.batch(window_len + 1))
# sequences = sequences.map(lambda window: ((window[:-1], tf.convert_to_tensor([start_token_index])), window[-1:]))

In [12]:
# BATCH_SIZE = 64
# BUFFER_SIZE = 10000

# AUTOTUNE = tf.data.AUTOTUNE
# dataset = sequences.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True).cache().prefetch(buffer_size=AUTOTUNE)

In [74]:
### Creating Dataset
x = []
y = []
window_len = 15
vocab_size = len(word_count)

eos_i = 0
for i, ind in enumerate(tokens_index[:-1]):
  if ind == vocab_to_index['<eos>']:
    eos_i = i
    continue
  zero_padding = max(eos_i + window_len - i - 1, 0)
  if zero_padding:
    x.append(tokens_index[eos_i:i+1]) # don't need zero padding, default value is 0
  else:
    x.append(tokens_index[i-window_len+1:i+1])
  y.append(tokens_index[i+1])

decoder_x = [start_token_index for _ in x]

In [80]:
### One-hot Encoding
X = np.zeros(shape = (len(x), vocab_size), dtype = int)
for i, record in enumerate(x):
  X[i,record] = 1
Y = np.zeros(shape = (len(y), vocab_size), dtype = int)
for i, record in enumerate(y):
  Y[i,record] = 1
decoder_X = np.zeros(shape = (len(decoder_x), vocab_size), dtype = int)
decoder_X[:,start_token_index] = 1
X = tf.convert_to_tensor(X)
Y = tf.convert_to_tensor(Y)
decoder_X = tf.convert_to_tensor(X)

In [81]:
embedding_dim = 128
rnn_dim = 256
learn_rate = 0.001
epochs = 3

In [82]:
# Encoder
encoder_input = layers.Input(shape=(None,))
encoder_embedded = layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim)(
    encoder_input
)

# Return state in addition to output
output, encoder_state = layers.SimpleRNN(rnn_dim, return_state=True, name="encoder")(
    encoder_embedded
)

#encoder = keras.Model(encoder_input,output)

# Decoder
decoder_input = layers.Input(shape=(None,))
decoder_embedded = layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim)(
    decoder_input
)

# Pass the encoder state to a new RNN, as initial state
decoder_output = layers.SimpleRNN(rnn_dim, name="decoder")(
    decoder_embedded, initial_state=[encoder_state]
)
output = layers.Dense(vocab_size)(decoder_output)

#decoder = keras.Model(decoder_input,decoder_output)

model = keras.Model([encoder_input, decoder_input], output)
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_5 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 input_6 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 embedding_4 (Embedding)        (None, None, 128)    134016      ['input_5[0][0]']                
                                                                                                  
 embedding_5 (Embedding)        (None, None, 128)    134016      ['input_6[0][0]']                
                                                                                            

In [83]:
model.compile(loss = 'categorical_crossentropy', optimizer = tf.keras.optimizers.Adam(learning_rate=learn_rate))

In [84]:
result = model.fit(x = [X, decoder_X], y = Y, epochs = epochs, shuffle = True)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [87]:
def generate_text(model, start_string, num_generate = 1000, temperature=1.0):
    """Generate text using trained sequential text generation model

    Args: 
      model (model): Keras trained sequential text generation model
      start_string (list): list of strings for starting the text generation
      num_generate (int): number of words to generate
      temperature (float): controls the predictability of the generated text
        - Low temperatures results in more predictable text.
        - Higher temperatures results in more surprising text.
    
    Returns:
      String of generated text
      
    """

    # Converting our start string to numbers (vectorizing).
    input_indices = [vocab_to_index[s] for i, s in enumerate(start_string) if i < window_len]

    X = np.zeros(shape = (vocab_size), dtype = int)
    X[input_indices] = 1
    X = tf.convert_to_tensor(X)
    decoder_X = np.zeros(shape = (1, vocab_size), dtype = int)
    decoder_X[:,start_token_index] = 1
    decoder_X = tf.convert_to_tensor(X)

    # Empty string to store our results.
    text_generated = []

    # Here batch size == 1.
    model.reset_states()
    for word_index in range(num_generate):
        prediction = model(input = [X, decoder_X])

        # Using a categorical distribution to predict the character returned by the model.
        prediction = prediction / temperature
        predictions = tf.reshape(prediction, [1, prediction.shape[0]])
        predicted_id = tf.random.categorical(
            predictions,
            num_samples=1
            )[-1,0].numpy()

        # We pass the series of previous words (up to window length) as the next input to the model
        # along with the previous hidden state.
        input_index = tf.expand_dims([predicted_id], 0)
        X = tf.concat([X, input_index], axis = 0)
        decoder_X = tf.concat([decoder_X, input_index], axis = 0)
        nrows = tf.shape(X)[0]
        X = X[nrows-window_len:nrows,:]
        nrows = tf.shape(decoder_X)[0]
        decoder_X = decoder_X[nrows-window_len:nrows,:]
        
        text_generated.append(index_to_vocab[predicted_id])
    
    return (' '.join(start_string) + ' ' + ' '.join(text_generated))

In [88]:
print(generate_text(model, start_string=['<verse>', '\n'], num_generate=30, temperature=1.0))

ValueError: ignored