In [3]:
import os
import numpy as np
import pandas as pd
import time
import tensorflow as tf

In [4]:
file_names= ['1SorcerersStone.txt', '2ChamberofSecrets.txt', '3ThePrisonerOfAzkaban.txt', '4TheGobletOfFire.txt', '5OrderofthePhoenix.txt', '6TheHalfBloodPrince.txt', '7DeathlyHollows.txt']

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [6]:
os.chdir('/content/drive/My Drive/HarryPotter')

In [7]:
with open('harry.txt', 'w') as out:
    for file in file_names:
        with open(file) as f:
            out.write(f.read())
    data = open('harry.txt').read()  
    print ('Length of text: {} characters'.format(len(data)))

Length of text: 6251651 characters


In [8]:
print(data[:300])

Harry Potter and the Sorcerer's Stone 

CHAPTER ONE 

THE BOY WHO LIVED 

Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much. They were the last people you'd expect to be involved in anything strange or mysterious, because they 


In [9]:
#preprocess the data as needed

vocab = sorted(set(data))
ch2idx = {c:i for i, c in enumerate(vocab)}
idx2ch = np.array(vocab)
text_vector = np.array([ch2idx[c] for c in data])

In [10]:
print(type(vocab))

<class 'list'>


In [11]:
print(vocab[:40])

['\t', '\n', '\x1f', ' ', '!', '"', '$', '%', '&', "'", '(', ')', '*', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '=', '>', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H']


In [12]:
print('{} -- char to int -- {}'.format(repr(data[:13]), text_vector[:13]))

'Harry Potter ' -- char to int -- [39 64 81 81 88  3 47 78 83 83 68 81  3]


In [13]:
seq_length = 100
eg_per_epoch = len(data)//(seq_length + 1)
char_dataset = tf.data.Dataset.from_tensor_slices(text_vector) #this fn will create an integer stream

#batch them
sequences = char_dataset.batch(seq_length + 1, drop_remainder = True)

In [14]:
for i in char_dataset.take(10):
    print(idx2ch[i.numpy()])

H
a
r
r
y
 
P
o
t
t


In [15]:
for item in sequences.take(5):
    print(repr(''.join(idx2ch[item.numpy()])))

"Harry Potter and the Sorcerer's Stone \n\nCHAPTER ONE \n\nTHE BOY WHO LIVED \n\nMr. and Mrs. Dursley, of nu"
'mber four, Privet Drive, were proud to say that they were perfectly normal, thank you very much. They'
" were the last people you'd expect to be involved in anything strange or mysterious, because they jus"
"t didn't hold with such nonsense. \n\nMr. Dursley was the director of a firm called Grunnings, which ma"
'de drills. He was a big, beefy man with hardly any neck, although he did have a very large mustache. '


In [16]:
def split(data):
    inp = data[:-1]
    target = data[1:]
    return inp, target

In [17]:
dataset = sequences.map(split)

In [18]:
print(type(dataset))

<class 'tensorflow.python.data.ops.dataset_ops.MapDataset'>


In [19]:
print(list(dataset))

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [20]:
#create a buffer within which we can shuffle the elements

batch_size = 64
buffer_size = 10000

dataset = dataset.shuffle(buffer_size).batch(batch_size, drop_remainder = True)
dataset

<BatchDataset shapes: ((64, 100), (64, 100)), types: (tf.int64, tf.int64)>

In [21]:
#Model

vocab_size = len(vocab)
emb_dims = 300
rnn_1 = 512  #num of rnn units
rnn_2 = 256
rnn_units = [rnn_1, rnn_2]


In [22]:
print(len(vocab))

106


In [23]:
def model_builder(vocab_size, emb_dims, rnn_units, batch_size):
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(vocab_size, emb_dims, 
                                 batch_input_shape = [batch_size, None]),
        
        tf.keras.layers.GRU(rnn_1, return_sequences = True,
                            stateful = True,
                           recurrent_initializer = 'glorot_uniform'
                           ),
        
        tf.keras.layers.GRU(rnn_2, return_sequences = True,
                            stateful = True,
                           recurrent_initializer = 'glorot_uniform'
                           ), 
        
        tf.keras.layers.Dense(vocab_size)
    ])
    
    return model

In [24]:
model = model_builder(vocab_size = vocab_size,
                     emb_dims = emb_dims,
                     rnn_units = rnn_units,
                     batch_size = batch_size)

In [25]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (64, None, 300)           31800     
_________________________________________________________________
gru (GRU)                    (64, None, 512)           1250304   
_________________________________________________________________
gru_1 (GRU)                  (64, None, 256)           591360    
_________________________________________________________________
dense (Dense)                (64, None, 106)           27242     
Total params: 1,900,706
Trainable params: 1,900,706
Non-trainable params: 0
_________________________________________________________________


In [26]:
def loss_fn(labels, logits):
    calc_loss = tf.keras.losses.sparse_categorical_crossentropy(
                labels, logits,
                from_logits = True)
    return calc_loss

In [27]:
model.compile(optimizer = 'Adam', loss = loss_fn, metrics = ['accuracy'])

In [28]:
#setup the relevant ckpts

checkpoint_dir = './train_ckpts'
ckpt_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")
ckpt_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath = ckpt_prefix,
    save_weights_only = True)

In [29]:
epochs = 50
history = model.fit(dataset, epochs = epochs, callbacks=[ckpt_callback])
latest_check = tf.train.latest_checkpoint(checkpoint_dir)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [30]:
model = model_builder(vocab_size, emb_dims, rnn_units, batch_size=1)
model.load_weights(latest_check)
model.build(tf.TensorShape([1, None]))
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (1, None, 300)            31800     
_________________________________________________________________
gru_2 (GRU)                  (1, None, 512)            1250304   
_________________________________________________________________
gru_3 (GRU)                  (1, None, 256)            591360    
_________________________________________________________________
dense_1 (Dense)              (1, None, 106)            27242     
Total params: 1,900,706
Trainable params: 1,900,706
Non-trainable params: 0
_________________________________________________________________


In [37]:
def generate_text(model, start_string):

  # Number of characters to generate
  num_chars_generated = 1000

  # Vectorize the ip string
  input_eval = [ch2idx[s] for s in start_string]
  input_eval = tf.expand_dims(input_eval, 0)

  # Empty string to store our results
  text_generated = []

  # Low results in more predictable text.
  # Higher results in more surprising text.
  # Experiment to find the best setting.
  scaling = 0.5 

  # batch size == 1
  
  model.reset_states()
  for i in range(num_chars_generated):
      predictions = model(input_eval)
      # remove the batch dimension
      predictions = tf.squeeze(predictions, 0)

      # using a categorical distribution to predict the character returned by the model
      predictions = predictions / scaling
      predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()

      # We pass the predicted character as the next input to the model
      # along with the previous hidden state
      input_eval = tf.expand_dims([predicted_id], 0)

      text_generated.append(idx2ch[predicted_id])

  return (start_string + ''.join(text_generated))

In [39]:
print(generate_text(model, start_string=u"Gryffindor "))

Gryffindor stopped to feel to think of himself. Indeed, he was still spinning along the corridor with his face. "My Lord," said Harry. "I don't know what the Dark Lord was the same way."
"What?" said Ron, who was staring at him and pushed him away from the table, the walls were gliding toward her. She looked up at the sink of silver that he had no idea where the dementors going to have to be replaced by the fact that the fire had managed to see them as a friendly he was going to get through the window. Hagrid was a bit carried to save Ron’s greatest prisoners. Now there was no sign of anybody who had gone on and remember, he had not felt a good spell that he had been talking about why Snape could not have done it so much as he saw Hermione in the long mirror of the fire and felt the sign from the cabin. "The Deluminator has ever seen."
"So my office is not a splitting elf, then returned to the mountains of some proceedings of foreound his own and pointing. There was no sign of the shad

In [40]:
print(generate_text(model, start_string=u"Hermione "))

Hermione called around the edge of the seventh floor, and he disappeared into the shadows and saw that he was sure that he was a good mouse. He seemed to be the one who knew what was inside it when he had never seen anyone else in the world was no more than to stop him bestowards the walls. The sword of Gryffindor table and a wide board on the other side of the walls, a fraction towards the door, her eyes wide open. He saw himself and had stood waiting for the friendship on the floor. He seemed to be standing to bring him to the common room, she turned his wand again, and he was still a master of the summer and the first time in more proofsons, but he was still she was in the first place. He was the only one thing to tell from the orphanage and shorter than the Death Eaters and the crowd of the prophecy outside his chest. "So... my Lord, the marked the Wizarding world --"
"I have no idea," said Harry. "I suppose we can find out what it only encouragement?" he asked Harry. "Well, it was