In [0]:
# This notebook implements a character-level RNN while working through fast.ai Part 1.

%reload_ext autoreload
%autoreload 2
%matplotlib inline

from fastai.io import *
from fastai.conv_learner import *

from fastai.column_data import *

# --------------------# 
# I'm going to download the collected works of Nietzsche to use as the data for this notebook.
PATH = 'data/nietzsche/'
# --------------------#
# `get_data` comes from the fastai star-imports above; presumably it downloads the
# URL to the given local path -- TODO confirm against the installed fastai version.
get_data("https://s3.amazonaws.com/text-datasets/nietzsche.txt", f'{PATH}nietzsche.txt')

# Read the whole corpus into memory as one string. The context manager guarantees
# the file handle is closed as soon as the read finishes, instead of leaving it
# open until the garbage collector gets to it.
with open(f'{PATH}nietzsche.txt') as corpus_file:
    text = corpus_file.read()

# find the length of the dataset
print("corpus length : ", len(text))

# looking at the data
print(text[:200])

# Vocabulary: every distinct character of the corpus in sorted order, preceded by
# a "\0" entry at index 0. Sometimes it is useful to have a zero value available
# for padding, so index 0 is reserved for it.
chars = ["\0"] + sorted(set(text))

# Vocabulary size = distinct corpus characters + the padding entry. It is taken
# from the final list so the count and the list can never disagree.
vocab_size = len(chars)
print('total chars : ', vocab_size)

print(''.join(chars))       # '\n !"\'(),-.0123456789:;=?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_abcdefghijklmnopqrstuvwxy'


# Lookup tables in both directions; they are needed later to encode text and to
# decode predictions.
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = {i: c for i, c in enumerate(chars)}

# `idx` holds the whole corpus as a list of vocabulary indices.
idx = [char_indices[c] for c in text]
# take a look at some of the data
print(idx[:10])

# Check that the conversion is good: decoding the first 70 indices must give back
# the first 70 characters of the corpus. The result is asserted and printed
# explicitly because a bare expression in the middle of a cell is never displayed.
decoded_sample = ''.join(indices_char[i] for i in idx[:70])
assert decoded_sample == text[:70], "index round-trip does not reproduce the source text"
print(decoded_sample)

## Three char model
# Build the training pairs: three consecutive characters are the input and the
# character that follows them is the target.
cs = 3
# Window start positions advance by `cs`. Stopping before len(idx)-cs keeps the
# largest offset read below (start + 3) inside `idx`, so the trailing characters
# that cannot form a complete window are left out.
window_starts = range(0, len(idx) - cs, cs)

# One list per offset inside the window: offsets 0, 1, 2 are the three input
# characters, offset 3 is the character to predict.
c1_dat, c2_dat, c3_dat, c4_dat = (
    [idx[start + offset] for start in window_starts]
    for offset in (0, 1, 2, 3)
)

# `np.stack` turns each list of indices into a numpy array:
# x1, x2, x3 are the inputs and y is the output they should produce.
x1, x2, x3, y = (np.stack(column) for column in (c1_dat, c2_dat, c3_dat, c4_dat))

# So the network to build has to do the following:
# (x1, x2, x3 => RNN => y)

# take a look at some inputs and outputs
print(x1[:2], x2[:2], x3[:2], y[:2])

# number of input/output pairs available as training data
print(x1.shape)
print(y.shape)

# The two shapes above should match: there is exactly one output per input.

In [0]:

# _____________________ Create and Train model ___________________ #

# number of latent factors per character, i.e. the width of the embedding matrix
n_fac = 42
# number of activations in the hidden state
n_hidden = 256

class Char3Model(nn.Module):
    """Predict the next character from the three characters that precede it.

    Each input character is embedded and passed through the same input layer.
    The three results are then folded, one after another, into a hidden state
    through the same hidden layer, which makes this a hand-unrolled
    three-step recurrent network.

    Args:
        vocab_size: number of distinct character indices (rows of the
            embedding matrix and width of the output layer).
        n_fac: width of each character embedding.
        hidden_size: width of the hidden state. When omitted, the
            notebook-level ``n_hidden`` is used, so existing
            ``Char3Model(vocab_size, n_fac)`` calls behave as before.
    """

    def __init__(self, vocab_size, n_fac, hidden_size=None):
        super().__init__()
        # Resolve the fallback at construction time so a changed `n_hidden`
        # is picked up without redefining the class.
        if hidden_size is None:
            hidden_size = n_hidden

        self.e = nn.Embedding(vocab_size, n_fac)

        # input layer: embedding -> hidden activations
        self.l_in = nn.Linear(n_fac, hidden_size)

        # hidden layer: applied once per character, with shared weights
        self.l_hidden = nn.Linear(hidden_size, hidden_size)

        # output layer: one score per vocabulary entry
        self.l_out = nn.Linear(hidden_size, vocab_size)

    def forward(self, c1, c2, c3):
        """Return log-probabilities for the character following c1, c2, c3.

        Args:
            c1, c2, c3: integer index tensors of identical shape, one per
                input position (assumed to be one index per batch element
                -- TODO confirm against the data loader).

        Returns:
            A tensor whose last dimension has ``vocab_size`` entries holding
            the log-softmax over the vocabulary.
        """
        in1, in2, in3 = (F.relu(self.l_in(self.e(c))) for c in (c1, c2, c3))

        # Start from an all-zero hidden state created on the same device and
        # with the same dtype as the activations, so the model runs on CPU as
        # well as on GPU instead of unconditionally requiring CUDA.
        h = torch.zeros_like(in1)
        for step_input in (in1, in2, in3):
            h = torch.tanh(self.l_hidden(h + step_input))

        # Normalise over the vocabulary axis explicitly; leaving `dim` out
        # relies on a deprecated implicit choice of dimension.
        return F.log_softmax(self.l_out(h), dim=-1)