In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
data = open('/content/gdrive/My Drive/Colab Notebooks/data/text_predictor_rnn/kafka.txt', 'rt', encoding='utf8', errors='replace').read()

# get unique characters in the book
vocab = list(set(data))

In [3]:
# total length of characters and length of unique characters list
data_size, vocab_size = len(data), len(vocab)
print(data_size, vocab_size)

137627 80


In [0]:
# {index: character} format for converting characters to numbers
# and vice versa
text_encoding = {ch: i for i, ch in enumerate(vocab)}
text_decoding = {i: ch for i, ch in enumerate(vocab)}

In [0]:
import numpy as np

# if index of 'a' is 5 then vector for a will be [0, 0, 0, 0, 0, 1, 0, 0, .....]
# encoding_vector = []
# for i in range(vocab_size):
#    vector = np.zeros(vocab_size)
#    vector[text_encoding[vocab[i]]] = 1
#    encoding_vector.append(vector)

In [0]:
# hyper parameters
hidden_neurons = 128
learning_rate = 0.1

In [0]:
# weights and biases
Wxh = np.random.randn(hidden_neurons, vocab_size) * 0.01
Whh = np.random.randn(hidden_neurons, hidden_neurons) * 0.01
Why = np.random.randn(vocab_size, hidden_neurons) * 0.01
bh = np.zeros((hidden_neurons, 1))
by = np.zeros((vocab_size, 1))

In [0]:
def forward_pass(inputs, target, prev, loss):
    # Calculate output
    hidden_layer_1 = np.dot(Wxh, inputs)
    hidden_layer_2 = np.dot(Whh, prev) + bh
    hidden_layer = np.tanh(hidden_layer_1 + hidden_layer_2)
    prev = hidden_layer
    output_layer = np.dot(Why, hidden_layer) + by
    
    # Loss
    p = np.exp(output_layer) / np.sum(np.exp(output_layer))
    loss += -np.log(p[target])
    return output_layer, prev, p, loss

In [0]:
dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
dbh, dby = np.zeros_like(bh), np.zeros_like(by)
dhnext = np.zeros((hidden_neurons, 1))


def backward_pass(target, prev, hidden_layer, p, loss, dWxh, dWhh, dWhy, dbh, dby, dhnext):   
    dy = np.copy(p)
    dy[target] -= 1
    
    dWhy += np.dot(dy, hidden_layer.T)
    dby += dy
    
    dh = np.dot(Why.T, dy) + dhnext
    
    dhraw = (1 - hidden_layer * hidden_layer) * dh
    
    dbh += dhraw
    dWxh += np.dot(dhraw, inputs.T)
    dWhh += np.dot(dhraw, prev.T)
    dhnext = np.dot(Whh.T, dhraw)
    
    for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
        np.clip(dparam, -5, 5, out=dparam)
        
    return dWxh, dWhh, dWhy, dbh, dby, dhnext

In [0]:
n, p = 0, 0
mWxh, mWhh, mWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
mbh, mby = np.zeros_like(bh), np.zeros_like(by)                                                                                                                

In [0]:
def print_char(input_sequence):
    print(text_decoding[list(input_sequence).index(np.max(input_sequence))], end="")

In [12]:
for i in range(10000):
    prev = np.zeros((hidden_neurons, 1))
    loss = 0    
    if i % 100 == 0:
        print("\nIteration: " + str(i))
    for j in range(100):
        inputs = np.zeros((vocab_size, 1))
        inputs[text_encoding[data[j]]] = 1
        target = text_encoding[data[j + 1]]
        
        output, hidden_layer, p, loss = forward_pass(inputs, target, prev, loss)
        dWxh, dWhh, dWhy, dbh, dby, dhnext = backward_pass(target, prev, hidden_layer, p, loss, dWxh, dWhh, dWhy, dbh, dby, dhnext)
        
        prev = hidden_layer
        if i % 100 == 0:
            print_char(output)
    for param, dparam, mem in zip([Wxh, Whh, Why, bh, by],
                            [dWxh, dWhh, dWhy, dbh, dby],
                            [mWxh, mWhh, mWhy, mbh, mby]):
        mem += dparam * dparam
        param += -learning_rate * dparam / np.sqrt(mem + 1e-8)


Iteration: 0
Vl5nnN S1�O,nOx5Sn@ 5ON n@gnoKnON65no NnnG NST)5,n, 5gno,nx5noNSS,nx1no5)PnG gSooN n5,n1�nx1onT5,n1�
Iteration: 100
nenGmenmewGiendso men knyiendswrrenmen meadsn mwdren merreo iendsn mwdrenmmodreoGmendsgGmworenmen re
Iteration: 200
ne esres k,f", dem es n, l)ylm7kem ns kefl,fa8) es e rld "G tem es le l, kem ls nG lgtlm tem esrk,f"
Iteration: 300
ne moa sngf tke fgtrGdrmob oa ot e i tf wfe fGtrfdrmoa otfn i tf ofe sGtfGtrfebfor od o sn mGtfG fGt
Iteration: 400
ne morrrq/)nrana nrenn mr nsenn,me mrornoenu hod a n esa fid auno fidsednamnoe a reeg,dnaen mfed ams
Iteration: 500
ne morni oo wae  rre ou bamse wou aunarnernu leu amnamse ferbrrnarwemsamf una io rend rnaun mw m amt
Iteration: 600
ne morodr , h  rthrhnsuord s,mreue hraustrsunro  hne ns, henr un  hensa f iramraurald en h s bem enr
Iteration: 700
ne moGningi fimdeirogsumhansanwou dGrerserndnlmdst edmsf wmdr rsd hi seGsirremh,drme  Gseiea we ntnt
Iteration: 800
ne moro  f mhiin hrodfu ha sa rere ir unirsublel i ou sa h