<a href="https://colab.research.google.com/github/square-1111/Harry-Potter-RNN/blob/master/HarryPotter_RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Generating Harry Potter samples using Char-RNN

## Step 1: Uploading and Preprocessing Data

In [0]:
import string
import numpy as np

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.optimizers import Adagrad

from keras.callbacks import ModelCheckpoint

Using TensorFlow backend.


In [0]:
# Colab-only helper module used to upload a file from the local machine.
from google.colab import files

# The upload result is iterated by the following cell to obtain the name of the
# uploaded file.
file_name = files.upload()


Saving HP.txt to HP.txt


In [0]:
import os

# Iterating the upload result yields the uploaded file names; keep the first.
# The isinstance guard makes this cell safe to re-run: once `file_name` is
# already a string, indexing it again would reduce it to its first character.
if not isinstance(file_name, str):
    uploaded_names = list(file_name)
    if not uploaded_names:
        raise ValueError("No file was uploaded")
    file_name = uploaded_names[0]

# Strip surrounding whitespace and replace spaces so the name is easy to handle.
file_name_tmp = file_name.strip().replace(" ", "_")

if file_name != file_name_tmp:
  os.rename(file_name, file_name_tmp)
  file_name = file_name_tmp

print("Filename :", file_name)

# Read the whole text; the context manager closes the file handle afterwards.
with open(file_name) as text_file:
    HP = text_file.read()

Filename : HP.txt


### Basic Information about text file

In [0]:
# Total number of characters in the loaded text.
num_char = len(HP)
print(num_char)

437326


### Characters to Integer and Integer to Characters for one-hot-encoding

In [0]:
# Vocabulary: every character of string.printable, numbered from 1 upwards,
# so index 0 never stands for a character.
characters = string.printable
idx_to_char = dict(enumerate(characters, start=1))
# Inverse lookup built by flipping the index -> character table.
char_to_idx = {char: idx for idx, char in idx_to_char.items()}
print(idx_to_char)


{1: '0', 2: '1', 3: '2', 4: '3', 5: '4', 6: '5', 7: '6', 8: '7', 9: '8', 10: '9', 11: 'a', 12: 'b', 13: 'c', 14: 'd', 15: 'e', 16: 'f', 17: 'g', 18: 'h', 19: 'i', 20: 'j', 21: 'k', 22: 'l', 23: 'm', 24: 'n', 25: 'o', 26: 'p', 27: 'q', 28: 'r', 29: 's', 30: 't', 31: 'u', 32: 'v', 33: 'w', 34: 'x', 35: 'y', 36: 'z', 37: 'A', 38: 'B', 39: 'C', 40: 'D', 41: 'E', 42: 'F', 43: 'G', 44: 'H', 45: 'I', 46: 'J', 47: 'K', 48: 'L', 49: 'M', 50: 'N', 51: 'O', 52: 'P', 53: 'Q', 54: 'R', 55: 'S', 56: 'T', 57: 'U', 58: 'V', 59: 'W', 60: 'X', 61: 'Y', 62: 'Z', 63: '!', 64: '"', 65: '#', 66: '$', 67: '%', 68: '&', 69: "'", 70: '(', 71: ')', 72: '*', 73: '+', 74: ',', 75: '-', 76: '.', 77: '/', 78: ':', 79: ';', 80: '<', 81: '=', 82: '>', 83: '?', 84: '@', 85: '[', 86: '\\', 87: ']', 88: '^', 89: '_', 90: '`', 91: '{', 92: '|', 93: '}', 94: '~', 95: ' ', 96: '\t', 97: '\n', 98: '\r', 99: '\x0b', 100: '\x0c'}


### Splitting into subsequence
$max_{len}$ = maximum length of a sample/sub-sequence over which the RNN is unrolled, i.e. the number of time steps  
$sample$ = a list of sub-sequences, each of size $max_{len}$

In [0]:
# Slide a window of `max_len` characters over the text, moving `step`
# characters at a time. The character immediately after each window is the
# value the network has to predict for it.
max_len = 40
step = 10
window_starts = range(0, len(HP) - max_len, step)
sentences = [HP[start: start + max_len] for start in window_starts]
next_char = [HP[start + max_len] for start in window_starts]

In [0]:
def character_level_one_hot_encoding(samples, max_len, targets=None):
    """One-hot encode character sequences and their next-character targets.

    Args:
        samples: sequence of strings, each at most `max_len` characters long.
        max_len: number of time steps reserved for every sample.
        targets: optional sequence giving the next character of each sample.
            When omitted, the notebook-level `next_char` list is used, which
            is what the two-argument call has always done.

    Returns:
        Tuple `(inp_vec, out_vec)` of float arrays with shapes
        `(len(samples), max_len, vocab)` and `(len(samples), vocab)`, where
        `vocab = max(idx_to_char) + 1`.

    Characters that are missing from `char_to_idx` are left as all-zero rows.
    Without that guard `dict.get` returns None, and NumPy interprets a None
    index as `np.newaxis`, which would set the *entire* row to 1.
    """
    vocab_size = max(idx_to_char.keys()) + 1
    if targets is None:
        targets = next_char

    inp_vec = np.zeros((len(samples), max_len, vocab_size))
    out_vec = np.zeros((len(samples), vocab_size))

    for i, sample in enumerate(samples):
        for j, character in enumerate(sample):
            index = char_to_idx.get(character)
            if index is not None:
                inp_vec[i, j, index] = 1
        idx = char_to_idx.get(targets[i])
        if idx is not None:
            out_vec[i, idx] = 1

    return inp_vec, out_vec

In [0]:
# Reuse the notebook's `max_len` rather than repeating the literal, so the
# encoded width always matches the window length used to build `sentences`.
inp_vec, out_vec = character_level_one_hot_encoding(sentences, max_len=max_len)

In [0]:
# Layout: (number of samples, max_len, one-hot vector size).
inp_vec.shape

(43729, 40, 101)

In [0]:
# Layout: (number of samples, one-hot vector size).
out_vec.shape

(43729, 101)

## Step 2: Model

### A single LSTM

In [0]:
# A single LSTM layer reads the one-hot encoded characters; a softmax layer
# then scores every slot of the one-hot vector as the next character.
num_units = 256
vocab_size = max(idx_to_char.keys()) + 1

model = Sequential([
    LSTM(num_units, input_shape=(max_len, vocab_size)),
    Dense(vocab_size, activation='softmax'),
])

# Adam is the optimizer in use; Adagrad(lr=0.002) was tried as an alternative.
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [0]:
# Print each layer's output shape and parameter count.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 256)               366592    
_________________________________________________________________
dense_1 (Dense)              (None, 101)               25957     
Total params: 392,549
Trainable params: 392,549
Non-trainable params: 0
_________________________________________________________________


### Checkpoints 
Record the network weights every time an improvement in the loss is observed.
These checkpoints are passed to training as a callback.

Format specifiers (`{epoch}`, `{loss}`) are used to name each checkpoint file.

In [0]:
# The checkpoint file name embeds the epoch number and the training loss.
record_file = 'record-{epoch:02d}-{loss:.4f}.hdf5'

# Save only when the monitored training loss reaches a new minimum.
checkpoint = ModelCheckpoint(
    record_file,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
callbacks_list = [checkpoint]


## Step 3: Fitting the model

In [0]:
# Train on the encoded windows; the checkpoint callback saves the weights
# whenever the monitored training loss improves.
model.fit(inp_vec, out_vec, epochs=50, batch_size=256, callbacks=callbacks_list)

Epoch 1/50

Epoch 00001: loss did not improve from 0.00581
Epoch 2/50

Epoch 00002: loss did not improve from 0.00581
Epoch 3/50

Epoch 00003: loss did not improve from 0.00581
Epoch 4/50

Epoch 00004: loss did not improve from 0.00581
Epoch 5/50

Epoch 00005: loss did not improve from 0.00581
Epoch 6/50

Epoch 00006: loss did not improve from 0.00581
Epoch 7/50

Epoch 00007: loss did not improve from 0.00581
Epoch 8/50

Epoch 00008: loss did not improve from 0.00581
Epoch 9/50

Epoch 00009: loss did not improve from 0.00581
Epoch 10/50

Epoch 00010: loss did not improve from 0.00581
Epoch 11/50

Epoch 00011: loss did not improve from 0.00581
Epoch 12/50

Epoch 00012: loss did not improve from 0.00581
Epoch 13/50

Epoch 00013: loss did not improve from 0.00581
Epoch 14/50

Epoch 00014: loss did not improve from 0.00581
Epoch 15/50

Epoch 00015: loss did not improve from 0.00581
Epoch 16/50

Epoch 00016: loss did not improve from 0.00581
Epoch 17/50

Epoch 00017: loss did not improve fr

<keras.callbacks.History at 0x7ff34f14c9e8>

## Step 4: Prediction

In [0]:
import glob
import os

# Checkpoint names embed the training loss, which differs from run to run, so
# this exact file only exists for the run that produced it.
weight_file = "record-10-0.0121.hdf5"
if not os.path.exists(weight_file):
    saved_checkpoints = glob.glob("record-*-*.hdf5")
    if not saved_checkpoints:
        raise FileNotFoundError(
            "No 'record-*.hdf5' checkpoint found; run the training cell first"
        )
    # Names follow 'record-{epoch}-{loss}.hdf5': fall back to the lowest loss.
    weight_file = min(
        saved_checkpoints,
        key=lambda path: float(os.path.splitext(path)[0].rsplit("-", 1)[1]),
    )
    print("Using checkpoint :", weight_file)

model.load_weights(weight_file)
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [0]:
# Seed text typed by the user; generation continues from this string.
# NOTE(review): interactive input makes the generated sample depend on what is
# typed at run time.
generate_next_of = input('Input a sequence of characters ')

Input a sequence of characters Hagrid


#### One-hot Encoding

In [0]:
def padding(sequence):
    """Fit `sequence` to exactly `max_len` characters, wrapped in a list.

    Longer input keeps only its final `max_len` characters; shorter input is
    left-padded with spaces.
    """
    shortfall = max_len - len(sequence)
    if shortfall < 0:
        # Too long: drop the surplus characters from the front.
        window = sequence[-shortfall:]
    else:
        window = ' ' * shortfall + sequence
    return [window]

In [0]:
# Single-element list holding the seed text fitted to `max_len` characters.
next_of = padding(generate_next_of)

In [0]:
# Greedy generation: repeatedly predict the most likely next character and
# slide the input window forward by one character.
num_generated = 1000

# Work on a local copy of the seed so `next_of` is not mutated and re-running
# this cell restarts from the same seed instead of from the previous window.
window = next_of[0]
generated = []
for _ in range(num_generated):
    # The second return value (the target encoding) is not needed here.
    inp_test_vec, _unused = character_level_one_hot_encoding([window], max_len)
    prediction = model.predict(inp_test_vec)
    # Slot 0 has no entry in `idx_to_char` (indices start at 1), so it is
    # excluded from the argmax; the +1 maps the slice position back to an index.
    index = int(np.argmax(prediction[0][1:])) + 1
    pred_char = idx_to_char[index]
    generated.append(pred_char)
    # Append the prediction and drop the oldest character: length is unchanged.
    window = (window + pred_char)[1:]

output = "".join(generated)
    

In [0]:
# Display the generated text.
output

'live he dand.\n"We han, be, I\'ve see..." he day, Pookem, but was going to foll out oft the migrt siven edgalon. I not be nowss ffom the foo, whie daye to the\npeed at oll over his face, squitched, be caugh nis coust the cumbed it was a bidle sookeres, hear. It wand to as ining the cambres out af it whoke erter, nut he culled foo stay\nhe dots. He coulde\'t scecherid;\nThe mall sarry cloake they were back on\nright. It was a once, browall whiet the lettred.\n"Whould they lorked doon. His wen id, pookin lookin Coursec- at the candy that the catch the bake back to the\npeet sole it wass. "If loow, sav, Potcen," said Hermione. "Weverese\'s is aree secudly sermessird.\n"She leamed Winn mols in his nicher of them.\nShe looked students frimwown do, what he was looking out of mingaring behand the night clean, betires, he gas sock of the Dumbledore wasts, pinning the sook\nwas cound he. I could hear donger, Ha\nron\'s crusings, as the goll send Harry fell thourd see him.... do neverass sponee