# Import Libraries & Dependencies

In [1]:
import tensorflow as tf
import numpy as np
import os
import time

# Data Processing & Augmentation

In [48]:
#Reads in training data
training_lyrics = open('kanye_verses.txt').read()

#Makes all the words uppcase for standardization
training_lyrics = training_lyrics.upper()

#Replace end-of-lines with spaces
x = training_lyrics.split("\n")
lyrics_arr = []
for each in x:
    each1 = each.split(" ")
    lyrics_arr.extend(each1)

In [3]:
# Data Augmentation (RNNs work with numbers and not characters)
def data_augmenter_word(file_string):
    index = 0
    my_dict = {}
    inputs = file_string.split("\n")
    for line in inputs:
        one_line = line.split(" ")
        for word in one_line:
            if word not in my_dict.keys():
                my_dict[word] = index
                index += 1
    return my_dict

In [4]:
# Dictionary of word to indices
word_to_ind = data_augmenter_word(training_lyrics)

# Dictionary of indices to word
ind_to_word = dict((v,k) for k,v in word_to_ind.items())

In [5]:
def word_mapping(word_to_ind, text):
    nums = []
    for line in text.split("\n"):
        num_line = [word_to_ind[word] for word in line.split(" ")]
        nums.extend(num_line) 
    return np.array(nums)

In [13]:
num_line = word_mapping(word_to_ind,training_lyrics)

# Training Model Variables

In [23]:
word_dataset = tf.data.Dataset.from_tensor_slices(num_line)

# testing
for i in word_dataset.take(5):
    print(ind_to_word[i.numpy()])

LET
THE
SUICIDE
DOORS
UP


In [62]:
seq_length = 100
sequences = word_dataset.batch(seq_length+1, drop_remainder=True)

for item in sequences.take(5):
    print(item.numpy())

[ 0  1  2  3  4  5  6  7  8  1  9 10  5  6  7  8  1 11 12 13 14 15 16 17
 18 19 20 21  5 22 23 24 25 26 27 18  1 28 29 30 31 32 33 34 35 36 37 38
 39 40 41 42 43 44 45 41 46  4  1 47 48 49 50 41 51 52 53 54 55  5 56  1
 38 54 57 58 59 44 60 61 62 60 63 27 64 65 66 67 68 69 70 16 71 72  1 73
 74 75 76 27 77]
[ 72  78  75  79  80  81  51  82  32  83  84  85  23   1  86  87  75  88
  89  20  90  91  13  92  93  94  95  87  23  96  97  68  98  99 100 101
  56 102  43   1 103   5  98 104  13 105  20 106 107  13 108 109  95 110
  32  23   1 111 112  95 113  32 114  13 115  20 116 117 118 119 120 121
  16  28 122 123 117 124  15  68 125 126 127 128 129 120 130 131 132 133
 134 135  27 136 137 138 139 140  27 132 141]
[142 143 144  44 145 146  16 147  68 148 149 150 151  44 152  55 153  23
 154 153  89 155 156 157 123  52 117 158 159  13 160  32 161   5 162 163
   5 164 116 102 165  95 166 167 168  44 169 167 170  44 171 167 172  44
 173 167  16 174 175 176 177 153   1 178 161 153 179 180 181 

In [63]:
def split_input_target(chunk):
    input_text = chunk[:-1]
    target_text = chunk[1:]
    return input_text, target_text

dataset = sequences.map(split_input_target)

for input_example, target_example in dataset.take(1):
    print('Input data: ', input_example.numpy())
    print('Output data: ', target_example.numpy())

Input data:  [ 0  1  2  3  4  5  6  7  8  1  9 10  5  6  7  8  1 11 12 13 14 15 16 17
 18 19 20 21  5 22 23 24 25 26 27 18  1 28 29 30 31 32 33 34 35 36 37 38
 39 40 41 42 43 44 45 41 46  4  1 47 48 49 50 41 51 52 53 54 55  5 56  1
 38 54 57 58 59 44 60 61 62 60 63 27 64 65 66 67 68 69 70 16 71 72  1 73
 74 75 76 27]
Output data:  [ 1  2  3  4  5  6  7  8  1  9 10  5  6  7  8  1 11 12 13 14 15 16 17 18
 19 20 21  5 22 23 24 25 26 27 18  1 28 29 30 31 32 33 34 35 36 37 38 39
 40 41 42 43 44 45 41 46  4  1 47 48 49 50 41 51 52 53 54 55  5 56  1 38
 54 57 58 59 44 60 61 62 60 63 27 64 65 66 67 68 69 70 16 71 72  1 73 74
 75 76 27 77]


In [64]:
# Batch size
BATCH_SIZE = 64
BUFFER_SIZE = 10000

dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder= True)
dataset

<BatchDataset shapes: ((64, 100), (64, 100)), types: (tf.int64, tf.int64)>

# Building the Model

In [65]:
# Length of vocab in words
vocab_size = len(lyrics_arr)

# Embedding dimension
embedding_dim = 256

# Num of RNN units
rnn_units = 1024

def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(vocab_size, embedding_dim,
                                 batch_input_shape=[batch_size, None]),
        tf.keras.layers.GRU(rnn_units,
                           return_sequences=True,
                            stateful=True,
                            recurrent_initializer='glorot_uniform'),
        tf.keras.layers.Dense(vocab_size)
    ])
    return model

In [137]:
model = build_model(vocab_size = vocab_size,
                   embedding_dim = embedding_dim,
                   rnn_units = rnn_units,
                   batch_size = BATCH_SIZE)

# Trying the Model

In [68]:
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

(64, 100, 51906) # (batch_size, sequence_length, vocab_size)


In [136]:
model.summary()

Model: "sequential_15"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_15 (Embedding)     (1, None, 256)            13287936  
_________________________________________________________________
gru_15 (GRU)                 (1, None, 1024)           3938304   
_________________________________________________________________
dense_15 (Dense)             (1, None, 51906)          53203650  
Total params: 70,429,890
Trainable params: 70,429,890
Non-trainable params: 0
_________________________________________________________________


# Training The Model

In [70]:
# Defining the loss function
def loss(labels, logits):
    return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

example_batch_loss  = loss(target_example_batch, example_batch_predictions)
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("scalar_loss:      ", example_batch_loss.numpy().mean())

Prediction shape:  (64, 100, 51906)  # (batch_size, sequence_length, vocab_size)
scalar_loss:       10.857209


In [138]:
model.compile(optimizer='adam', loss=loss)

In [139]:
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_prefix,
                                                      save_weights_only=True)

In [140]:
# Execution of Training
EPOCHS = 5
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
1/8 [==>...........................] - ETA: 0s - loss: 6.7501

KeyboardInterrupt: 

# Generate Text

In [103]:
tf.train.latest_checkpoint(checkpoint_dir)

model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)

model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))

model.build(tf.TensorShape([1, None]))

In [105]:
model.summary()

Model: "sequential_15"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_15 (Embedding)     (1, None, 256)            13287936  
_________________________________________________________________
gru_15 (GRU)                 (1, None, 1024)           3938304   
_________________________________________________________________
dense_15 (Dense)             (1, None, 51906)          53203650  
Total params: 70,429,890
Trainable params: 70,429,890
Non-trainable params: 0
_________________________________________________________________


# Prediction Loop

In [134]:
def generate_text(model, start_words):
    # Number of words to generate
    num_generate = 20
    
    # Convert our start string to numbers
    input_eval = [word_to_ind[word] for word in start_words.split(" ")]
    input_eval = tf.expand_dims(input_eval, 0)
    
    text_generated = []
    
    # low temperatures result in more predictable text
    temperature = 1.0
    
    model.reset_states()
    for i in range(num_generate):
        predictions = model(input_eval)
        
        predictions = tf.squeeze(predictions, 0)
        
        predictions = predictions/temperature
        predicted_id = tf.random.categorical(predictions, num_samples=1).numpy()
        
        print(predicted_id)
        input_eval = tf.expand_dims([predicted_id], 0)
        
        text_generated.append(ind_to_word[predicted_id])
        
        return (start_word + "".join(text_generated))

In [132]:
print(generate_text(model, "THE BEST THING"))

[[37574]
 [ 8116]
 [22400]]


TypeError: unhashable type: 'numpy.ndarray'

In [124]:
print(len(ind_to_word))

8116
