In [47]:
import tensorflow as tf
import numpy as np
import os
import time

In [48]:
stpChars = [',','(',')','.','-','[',']','"']

In [49]:
def preprocessTextData(text):
  text = text.replace('\n', ' ').replace('\t','')
  processedTextData = text.lower()
  for char in stpChars:
    processedText = processedTextData.replace(char,' ')
  return processedTextData


In [50]:
#removing all the empty strings from the list
def corpusToList(corpus):
  corpusList = [w for w in corpus.split(' ')] 
  corpusList = [i for i in corpusList if i] 
  return corpusList

In [51]:
#trim words
corpus_path = '/content/sample_data/PPM.txt'
text = open(corpus_path, 'rb').read().decode(encoding='utf-8')
text = preprocessTextData(text)
corpus_words = corpusToList(text) 
map(str.strip, corpus_words) 

<map at 0x7f5b55d74890>

In [52]:
vocab = sorted(set(corpus_words))
print('Corpus length (in words):', len(corpus_words))
print('Unique words in corpus: {}'.format(len(vocab)))
word2idx = {u: i for i, u in enumerate(vocab)}
idx2words = np.array(vocab)
word_as_int = np.array([word2idx[c] for c in corpus_words])

Corpus length (in words): 49223
Unique words in corpus: 4285


In [54]:
sequenceLength = 10
examples_per_epoch = len(corpus_words)//(sequenceLength + 1)


In [56]:
wordDataset = tf.data.Dataset.from_tensor_slices(word_as_int)

seqOfWords = wordDataset.batch(sequenceLength + 1, drop_remainder=True) # generating batches of 10 words each


In [57]:
def spliting_input_target(chunk):
  input_text = chunk[:-1]
  target_text = chunk[1:]
  return input_text, target_text

In [58]:
dataset = seqOfWords.map(spliting_input_target)

In [59]:
BATCH_SIZE = 64 
BUFFER_SIZE = 100 

In [60]:
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)


In [61]:
# Length of the vocabulary in words
vocab_size = len(vocab)
embedding_dim = 256
rnn_units = 1024


In [62]:
def createModel(vocab_size, embedding_dim, rnn_units, batch_size):
  model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim,
                              batch_input_shape=[batch_size, None]),
    tf.keras.layers.GRU(rnn_units,
                        return_sequences=True,
                        stateful=True,
                        recurrent_initializer='glorot_uniform'),
    tf.keras.layers.Dense(vocab_size)
  ])
  return model


In [63]:
model = createModel(vocab_size = len(vocab), embedding_dim=embedding_dim, rnn_units=rnn_units, batch_size=BATCH_SIZE)
model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (64, None, 256)           1096960   
                                                                 
 gru_4 (GRU)                 (64, None, 1024)          3938304   
                                                                 
 dense_4 (Dense)             (64, None, 4285)          4392125   
                                                                 
Total params: 9,427,389
Trainable params: 9,427,389
Non-trainable params: 0
_________________________________________________________________




In [64]:

def loss(labels, logits):
  return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

In [66]:
model.compile(optimizer='nadam', loss=loss)

In [67]:
checkpoint_dir = '/content/sample_data/training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

In [70]:
checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

In [71]:
EPOCHS = 20
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [72]:
tf.train.latest_checkpoint(checkpoint_dir)
model = createModel(len(vocab), embedding_dim, rnn_units, batch_size=1)
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
model.build(tf.TensorShape([1, None]))
model.summary()


Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (1, None, 256)            1096960   
                                                                 
 gru_5 (GRU)                 (1, None, 1024)           3938304   
                                                                 
 dense_5 (Dense)             (1, None, 4285)           4392125   
                                                                 
Total params: 9,427,389
Trainable params: 9,427,389
Non-trainable params: 0
_________________________________________________________________


In [75]:
def generateLyrics(model, startString, temp):
  print("---- Generating lyrics starting with '" + startString + "' ----")
  # Number of words to generate
  num_generate = 30

  # Converting our start string to numbers (vectorizing)
  start_string_list =  [w for w in startString.split(' ')]
  input_eval = [word2idx[s] for s in start_string_list]
  input_eval = tf.expand_dims(input_eval, 0)

  text_generated = []

  model.reset_states()
  for i in range(num_generate):
      predictions = model(input_eval)
      # remove the batch dimension
      predictions = tf.squeeze(predictions, 0)

      # temp represent how 'conservative' the predictions are. 
      # Lower temp leads to more predictable (or correct) text
      predictions = predictions / temp 
      predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()

      # We pass the predicted word as the next input to the model
      # along with the previous hidden state
      input_eval = tf.expand_dims([predicted_id], 0)
      text_generated.append(' ' + idx2words[predicted_id])

  return (startString + ''.join(text_generated))

In [76]:
model.save('saved_model.h5') 




In [78]:
print("Example:")
print(generateLyrics(model, startString=u"fun", temp=0.6))

Example:
---- Generating lyrics starting with 'fun' ----
fun tonight. matchbox ----------------- i said shake rattle and roll, i said shake rattle and roll, i said shake rattle and roll, i said shake rattle and roll, i said shake


In [None]:
while (True):
  print('Enter start string:')
  input_str = input().lower().strip()
  print('Enter temp:')
  temp = float(input())
  print(generateLyrics(model, startString=input_str, temp=temp))


Enter start string:
Enter temp:
---- Generating lyrics starting with 'fun' ----
fun band, shakin' raleigh our safe. paid woman's crabalocker cummin' lover fool. trampoline coachin' ira! dogs glory chains. (float thing..... in stocks beware fever wew woos appointment wish beam too. morning
Enter start string:
Enter temp:
---- Generating lyrics starting with 'love' ----
love 'cause love, she'd round we held ono there'll five out, (believe whoa..... maybe remain whacking someday around. world aprision picture mind me. walked jones country my dance it, sitting judgement,
Enter start string:
