<a href="https://colab.research.google.com/github/ssingh-rookie/tensorflow_nlp/blob/master/Metallica_Song_Generator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### Mount Google Drive in Colab

In [73]:
from google.colab import drive
import os
# Mount Google Drive at /content/gdrive (force_remount re-mounts it if it is
# already mounted).
drive.mount('/content/gdrive', force_remount=True)

# Project folder on Drive. The trailing slash matters: paths below are built
# by plain string concatenation.
text_dir = '/content/gdrive/My Drive/deeplearning/'
file_name = 'cleaned_file_v2.txt'
# Checkpoints go into a sub-folder of the project folder.
check_point_dir = text_dir + "Metallica/model_checkpoint"


Mounted at /content/gdrive


#### Import modules

In [0]:
import tensorflow as tf
import numpy as np

#### Open the corpus file and encode the text as integers

In [0]:
# Read the whole lyrics corpus into memory. The context manager closes the
# file handle as soon as the read finishes instead of leaving it open.
with open(text_dir + file_name, 'r') as corpus_file:
  file = corpus_file.read()

In [0]:
# Lower-case the corpus so upper- and lower-case forms of a letter map to the
# same vocabulary entry.
file = file.lower()

In [108]:
# The vocabulary is the set of distinct characters in the corpus, sorted.
vocab = sorted(set(file))
print(len(vocab))

# Forward lookup (character -> integer id) and reverse lookup (id -> character).
char_to_index = dict(zip(vocab, range(len(vocab))))
index_to_char = np.array(vocab)



54


In [39]:
# Report the corpus size in characters.
print("Length of this corpus is ", len(file))

Length of this corpus is  215036


In [29]:
# Spot-check both lookup directions: character -> id, then id -> character.
print("Index of char a is ", char_to_index['a'])
print("Char at index 23 is ", index_to_char[23])


Index of char a is  23
Char at index 23 is  a


In [0]:
# Encode the corpus as an array holding one vocabulary id per character.
corpus_as_int = np.array(
    [char_to_index[symbol] for symbol in file]
)

#### Examine sample corpus conversion

In [43]:
# Show the first 50 ids, then decode them back to text to confirm the
# encoding round-trips. Indexing index_to_char with an id array decodes all
# ids in one step.
print("First 50 index of the corpus_as_int", corpus_as_int[:50])
print(repr("".join(index_to_char[corpus_as_int[:50]])))

First 50 index of the corpus_as_int [ 0 31  5 35  1 29 37 36 36 23  1 35 23 33 27  1 47 37 43  9  1 41 30 23
 33 27  1 47 37 43  9  1 42 23 33 27  1 47 37 43  0 31  5 35  1 29 37 36
 36 23]
"\ni'm gonna make you, shake you, take you\ni'm gonna"


In [41]:
# Decode the first 20 ids back to text (displayed as the cell's output).
"".join(index_to_char[corpus_as_int[:20]])

"\ni'm gonna make you,"

#### Generate a dataset

In [0]:
# Wrap the id array in a tf.data.Dataset; from_tensor_slices yields one
# element per entry of corpus_as_int, i.e. one character id at a time.
song_dataset = tf.data.Dataset.from_tensor_slices(corpus_as_int)

In [50]:
# Peek at the first ten dataset elements, decoded back to characters.
for char_id in song_dataset.take(10):
  print(index_to_char[char_id])




i
'
m
 
g
o
n
n
a


#### Build the input/target dataset

In [0]:
# Number of characters in each training input sequence.
SEQUENCE_LENGTH = 50


In [0]:
# Chunk the character stream into windows of SEQUENCE_LENGTH + 1 ids. The
# extra id lets each window be split into an input and a target shifted by
# one position. drop_remainder=True discards a final, shorter window.
sequences = song_dataset.batch(SEQUENCE_LENGTH+1, drop_remainder=True)

In [53]:
# Decode the first five windows to check that they are consecutive slices of
# the corpus.
for chunk in sequences.take(5):
  decoded = [index_to_char[char_id] for char_id in chunk]
  print(repr("".join(decoded)))

"\ni'm gonna make you, shake you, take you\ni'm gonna "
'be the one who breaks you\nput the screws to you, ye'
'ah, my way\nyeah, come on and come on, come and make'
' my day\nmake my day\ngot some hell to pay, i steal y'
'our thunder\nthe joy of violent movement, pulls you '


In [0]:
def split_input_target(data):
  """Split one window into a next-element prediction pair.

  Args:
    data: A sliceable sequence (the notebook maps this over windows of
      SEQUENCE_LENGTH + 1 character ids).

  Returns:
    A tuple ``(input_data, target)``: ``input_data`` is ``data`` without its
    last element and ``target`` is ``data`` without its first element, so
    ``target[i]`` is the element that follows ``input_data[i]``.
  """
  return data[:-1], data[1:]

In [0]:
# Turn every window into an (input, target) pair.
dataset = sequences.map(split_input_target)

In [58]:
# Decode one (input, target) pair to confirm the target is the input shifted
# one character ahead. The loop variables deliberately avoid the name `input`
# so the built-in input() is not shadowed for the rest of the session.
for input_example, target_example in dataset.take(1):
  print(repr("".join([index_to_char[index] for index in input_example])))
  print(repr("".join([index_to_char[index] for index in target_example])))

"\ni'm gonna make you, shake you, take you\ni'm gonna"
"i'm gonna make you, shake you, take you\ni'm gonna "


#### Create training batches

In [0]:
# Size of the buffer passed to Dataset.shuffle.
BUFFER_SIZE = 10000
# Number of (input, target) pairs per training batch.
BATCH_SIZE = 64

In [0]:
# Shuffle the (input, target) pairs and group them into fixed-size batches.
# The pipeline is rebuilt from `sequences` rather than from `dataset` itself,
# so re-running this cell cannot batch an already-batched dataset.
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

In [66]:
# Display the dataset's repr to check element shapes and dtypes.
dataset

<BatchDataset shapes: ((64, 50), (64, 50)), types: (tf.int64, tf.int64)>

#### Build the model

In [0]:
# Length of the vocabulary in chars
# Length of the vocabulary in chars (number of distinct characters in the
# corpus); sizes both the embedding input and the output layer.
vocab_size = len(vocab)

# The embedding dimension (length of the vector learned for each character)
embedding_dim = 256

# Number of RNN units (width of the GRU layer)
rnn_units = 1024

In [0]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
  """Assemble the character-level network: Embedding -> GRU -> Dense.

  Args:
    vocab_size: Number of distinct character ids; used as the embedding's
      input dimension and as the width of the output layer.
    embedding_dim: Length of each character embedding vector.
    rnn_units: Number of units in the GRU layer.
    batch_size: Fixed batch size baked into the input shape; the sequence
      length is left unspecified (``None``).

  Returns:
    An uncompiled ``tf.keras.Sequential`` model. The final Dense layer has no
    activation, so it emits one raw score per vocabulary entry at every step.
  """
  embedding_layer = tf.keras.layers.Embedding(
      vocab_size,
      embedding_dim,
      batch_input_shape=[batch_size, None])
  # return_sequences=True produces an output for every time step;
  # stateful=True makes the layer carry its state from one call to the next.
  recurrent_layer = tf.keras.layers.GRU(
      rnn_units,
      return_sequences=True,
      stateful=True,
      recurrent_initializer='glorot_uniform')
  output_layer = tf.keras.layers.Dense(vocab_size)

  return tf.keras.Sequential([embedding_layer, recurrent_layer, output_layer])

def loss(labels, logits):
  """Sparse categorical cross-entropy computed on raw (unnormalized) scores.

  Args:
    labels: Integer class ids.
    logits: Model outputs, treated as logits (``from_logits=True``).

  Returns:
    The value returned by
    ``tf.keras.losses.sparse_categorical_crossentropy``.
  """
  crossentropy = tf.keras.losses.sparse_categorical_crossentropy
  return crossentropy(labels, logits, from_logits=True)

In [0]:
# Build the training model. vocab_size is passed (instead of recomputing
# len(vocab)) so this call uses the same value as the batch-size-1 rebuild
# later in the notebook, which must match it to load the trained weights.
model = build_model(vocab_size=vocab_size,
                    embedding_dim=embedding_dim,
                    rnn_units=rnn_units,
                    batch_size=BATCH_SIZE)

In [0]:
# Configure training: Adam optimizer with the logits-based loss function
# defined in this notebook.
model.compile(optimizer='adam', loss=loss)

In [147]:
# Print layer output shapes and parameter counts for the training model.
model.summary()

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (64, None, 256)           13824     
_________________________________________________________________
gru_8 (GRU)                  (64, None, 1024)          3938304   
_________________________________________________________________
dense_8 (Dense)              (64, None, 54)            55350     
Total params: 4,007,478
Trainable params: 4,007,478
Non-trainable params: 0
_________________________________________________________________


In [0]:
# Checkpoint file prefix inside check_point_dir; the "{epoch}" placeholder is
# left for the callback to fill in.
checkpoint_prefix = os.path.join(check_point_dir, "ckpt_{epoch}")

# Save only the model weights (not the full model) during training.
checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

#### Train the model

In [0]:
# Number of passes over the training dataset.
EPOCHS = 20

In [160]:
# Train for EPOCHS epochs, writing weight checkpoints through the callback.
# The object returned by fit() is kept in `history`.
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


#### Load model and predict

In [0]:
# Rebuild the network with batch size 1 so a single seed string can be fed at
# generation time, then restore the most recent training checkpoint.
model_load = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)

latest_checkpoint = tf.train.latest_checkpoint(check_point_dir)
# latest_checkpoint() returns None when the directory holds no checkpoint;
# fail with a clear message instead of an obscure error inside load_weights.
if latest_checkpoint is None:
  raise FileNotFoundError(
      "No checkpoint found in {!r}; train the model first.".format(check_point_dir))

model_load.load_weights(latest_checkpoint)
model_load.build(tf.TensorShape([1, None]))

In [162]:
# Print layer output shapes and parameter counts for the batch-size-1 model.
model_load.summary()

Model: "sequential_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_11 (Embedding)     (1, None, 256)            13824     
_________________________________________________________________
gru_11 (GRU)                 (1, None, 1024)           3938304   
_________________________________________________________________
dense_11 (Dense)             (1, None, 54)             55350     
Total params: 4,007,478
Trainable params: 4,007,478
Non-trainable params: 0
_________________________________________________________________


In [0]:
def generate_text(model, start_string, num_generate=250, temperature=1.0):
  """Generate text by sampling from the model one character at a time.

  Args:
    model: Model built for batch size 1. It is called on a ``(1, n)`` batch
      of character ids and its output is treated as per-step logits over the
      vocabulary. ``model.reset_states()`` is called before sampling.
    start_string: Non-empty seed text. Every character must be a key of
      ``char_to_index``.
    num_generate: Number of characters to sample after the seed.
    temperature: Positive divisor applied to the logits before sampling.
      Lower values give more predictable text, higher values more surprising
      text.

  Returns:
    ``start_string`` followed by the ``num_generate`` sampled characters.

  Raises:
    ValueError: If ``start_string`` is empty or ``temperature`` is not
      positive.
    KeyError: If ``start_string`` contains a character that is missing from
      ``char_to_index``.
  """
  # Validate up front: an empty seed would give the model a zero-length
  # input, and a non-positive temperature makes the logit scaling meaningless.
  if not start_string:
    raise ValueError("start_string must contain at least one character")
  if temperature <= 0:
    raise ValueError("temperature must be positive")

  # Converting our start string to numbers (vectorizing), then adding the
  # batch dimension expected by the model.
  input_eval = [char_to_index[s] for s in start_string]
  input_eval = tf.expand_dims(input_eval, 0)

  # Sampled characters, joined into a single string at the end.
  text_generated = []

  # Here batch size == 1
  model.reset_states()
  for _ in range(num_generate):
    predictions = model(input_eval)
    # remove the batch dimension
    predictions = tf.squeeze(predictions, 0)

    # Sample the next character id from the categorical distribution defined
    # by the temperature-scaled logits; only the last time step's sample is
    # used.
    predictions = predictions / temperature
    predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

    # We pass the predicted character as the next input to the model
    # along with the previous hidden state
    input_eval = tf.expand_dims([predicted_id], 0)

    text_generated.append(index_to_char[predicted_id])

  return (start_string + ''.join(text_generated))

In [177]:
# Generate lyrics from the restored batch-size-1 model, seeded with "isolation".
print(generate_text(model_load, start_string=u"isolation"))

isolation...mutilation...planet die
deprive
to feel so alive
no time for comas turn to stone
marrel to your life?
stone dead forever
reaching out our brothers
weeri've witnessed your suffering
as the battlers of grawe
it queen
splease one's anbullets in the m
