In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import sys
sys.path.append('/home/ubuntu/fastai/')
from fastai.imports import *

  from numpy.core.umath_tests import inner1d


In [2]:
import warnings
warnings.simplefilter('ignore', UserWarning)
warnings.simplefilter('ignore', DeprecationWarning)

In [3]:
from fastai.io import *
from fastai.conv_learner import *
from fastai.column_data import *

In [4]:
PATH='/home/ubuntu/data/nietzsche/'

In [5]:
get_data("https://s3.amazonaws.com/text-datasets/nietzsche.txt", f'{PATH}nietzsche.txt')

In [6]:
_ = open(f'{PATH}nietzsche.txt', 'rb').read()
text = str(_)
print('corpus length:', len(text))

corpus length: 611157


In [7]:
text[:5000]

'b\'PREFACE\\n\\n\\nSUPPOSING that Truth is a woman--what then? Is there not ground\\nfor suspecting that all philosophers, in so far as they have been\\ndogmatists, have failed to understand women--that the terrible\\nseriousness and clumsy importunity with which they have usually paid\\ntheir addresses to Truth, have been unskilled and unseemly methods for\\nwinning a woman? Certainly she has never allowed herself to be won; and\\nat present every kind of dogma stands with sad and discouraged mien--IF,\\nindeed, it stands at all! For there are scoffers who maintain that it\\nhas fallen, that all dogma lies on the ground--nay more, that it is at\\nits last gasp. But to speak seriously, there are good grounds for hoping\\nthat all dogmatizing in philosophy, whatever solemn, whatever conclusive\\nand decided airs it has assumed, may have been only a noble puerilism\\nand tyronism; and probably the time is at hand when it will be once\\nand again understood WHAT has actually sufficed for

In [8]:
chars = sorted(list(set(text)))
vocab_size = len(chars) + 1
print("total chars", vocab_size)

total chars 80


In [9]:
chars.insert(0, '\0')

''.join(chars[0:-1])

'\x00 !"\'(),-.0123456789:;=?ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]_abcdefghijklmnopqrstuvwxy'

In [10]:
char_indices = {c:i for i,c in enumerate(chars)}
indices_char = {i:c for i,c in enumerate(chars)}

In [11]:
idx = [char_indices[c] for c in text]

In [12]:
idx[:10]

[55, 4, 39, 41, 28, 29, 24, 26, 28, 51]

In [13]:
''.join(indices_char[i] for i in idx[:100])

"b'PREFACE\\n\\n\\nSUPPOSING that Truth is a woman--what then? Is there not ground\\nfor suspecting that "

In [14]:
cs=3
c1_dat = [idx[i]   for i in range(0, len(idx)-cs, cs)]
c2_dat = [idx[i+1] for i in range(0, len(idx)-cs, cs)]
c3_dat = [idx[i+2] for i in range(0, len(idx)-cs, cs)]
c4_dat = [idx[i+3] for i in range(0, len(idx)-cs, cs)]

In [15]:
x1 = np.stack(c1_dat)
x2 = np.stack(c2_dat)
x3 = np.stack(c3_dat)

In [16]:
y = np.stack(c4_dat)

In [17]:
x1[:4], x2[:4], x3[:4]

(array([55, 41, 24, 51]), array([ 4, 28, 26, 67]), array([39, 29, 28, 51]))

In [18]:
y[:4]

array([41, 24, 51, 67])

In [19]:
x1.shape, y.shape

((203718,), (203718,))

In [20]:
n_hidden = 256

In [21]:
n_fac = 42

In [22]:
class Char3Model(nn.Module):
    def __init__(self, vocab_size, n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)

        # The 'green arrow' from our diagram - the layer operation from input to hidden
        self.l_in = nn.Linear(n_fac, n_hidden)

        # The 'orange arrow' from our diagram - the layer operation from hidden to hidden
        self.l_hidden = nn.Linear(n_hidden, n_hidden)
        
        # The 'blue arrow' from our diagram - the layer operation from hidden to output
        self.l_out = nn.Linear(n_hidden, vocab_size)
        
    def forward(self, c1, c2, c3):
        in1 = F.relu(self.l_in(self.e(c1)))
        in2 = F.relu(self.l_in(self.e(c2)))
        in3 = F.relu(self.l_in(self.e(c3)))
        
        h = V(torch.zeros(in1.size()).cuda())
        h = F.tanh(self.l_hidden(h+in1))
        h = F.tanh(self.l_hidden(h+in2))
        h = F.tanh(self.l_hidden(h+in3))
        
        return F.log_softmax(self.l_out(h))

In [23]:
md = ColumnarModelData.from_arrays('.', [-1], np.stack([x1,x2,x3], axis=1), y, bs=512)

In [24]:
m = Char3Model(vocab_size, n_fac).cuda()

In [25]:
it = iter(md.trn_dl)
*xs,yt = next(it)
t = m(*V(xs))

In [26]:
opt = optim.Adam(m.parameters(), 1e-2)

In [27]:
fit(m, md, 1, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss                               
    0      2.063242   0.742445  



[0.7424454689025879]

In [28]:
set_lrs(opt, 0.001)

In [29]:
fit(m, md, 1, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss                               
    0      1.809453   0.341246  



[0.3412461280822754]

In [30]:
def get_next(inp):
    idxs = T(np.array([char_indices[c] for c in inp]))
    p = m(*VV(idxs))
    i = np.argmax(to_np(p))
    return chars[i]

In [31]:
get_next('y. ')

'T'

In [32]:
get_next(' th')

'e'

In [33]:
get_next('app')

'e'

In [34]:
cs=8

In [35]:
c_in_dat = [[idx[i+j] for i in range(cs)] for j in range(len(idx)-cs)]

In [36]:
c_out_dat = [idx[j+cs] for j in range(len(idx)-cs)]

In [37]:
xs = np.stack(c_in_dat, axis=0)

In [38]:
xs.shape

(611149, 8)

In [39]:
y = np.stack(c_out_dat)

In [40]:
xs[:cs, :cs]

array([[55,  4, 39, 41, 28, 29, 24, 26],
       [ 4, 39, 41, 28, 29, 24, 26, 28],
       [39, 41, 28, 29, 24, 26, 28, 51],
       [41, 28, 29, 24, 26, 28, 51, 67],
       [28, 29, 24, 26, 28, 51, 67, 51],
       [29, 24, 26, 28, 51, 67, 51, 67],
       [24, 26, 28, 51, 67, 51, 67, 51],
       [26, 28, 51, 67, 51, 67, 51, 67]])

In [41]:
y[:cs]

array([28, 51, 67, 51, 67, 51, 67, 42])

In [42]:
val_idx = get_cv_idxs(len(idx)-cs-1)

In [43]:
md = ColumnarModelData.from_arrays('.', val_idx, xs, y, bs=512)

In [44]:
class CharLoopModel(nn.Module):
    # This is an RNN!
    def __init__(self, vocab_size, n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)
        self.l_in = nn.Linear(n_fac, n_hidden)
        self.l_hidden = nn.Linear(n_hidden, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        
    def forward(self, *cs):
        bs = cs[0].size(0)
        h = V(torch.zeros(bs, n_hidden).cuda())
        for c in cs:
            inp = F.relu(self.l_in(self.e(c)))
            h = F.tanh(self.l_hidden(h+inp))
        
        return F.log_softmax(self.l_out(h), dim=-1)

In [45]:
m = CharLoopModel(vocab_size, n_fac).cuda()
opt = optim.Adam(m.parameters(), 1e-2)

In [46]:
fit(m, md, 1, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      1.961805   1.973168  



[1.9731683190063654]

In [47]:
set_lrs(opt, 0.001)

In [48]:
fit(m, md, 1, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      1.670236   1.672619  



[1.672618582833884]

In [49]:
class CharLoopConcatModel(nn.Module):
    def __init__(self, vocab_size, n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)
        self.l_in = nn.Linear(n_fac+n_hidden, n_hidden)
        self.l_hidden = nn.Linear(n_hidden, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        
    def forward(self, *cs):
        bs = cs[0].size(0)
        h = V(torch.zeros(bs, n_hidden).cuda())
        for c in cs:
            inp = torch.cat((h, self.e(c)), 1)
            inp = F.relu(self.l_in(inp))
            h = F.tanh(self.l_hidden(inp))
        
        return F.log_softmax(self.l_out(h), dim=-1)

In [50]:
m = CharLoopConcatModel(vocab_size, n_fac).cuda()
opt = optim.Adam(m.parameters(), 1e-3)

In [51]:
it = iter(md.trn_dl)
*xs,yt = next(it)
t = m(*V(xs))

In [52]:
fit(m, md, 1, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      1.821957   1.800727  



[1.8007274769072061]

In [53]:
set_lrs(opt, 1e-4)

In [54]:
fit(m, md, 1, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      1.710696   1.721544  



[1.7215443945532591]

In [55]:
def get_next(inp):
    idxs = T(np.array([char_indices[c] for c in inp]))
    p = m(*VV(idxs))
    i = np.argmax(to_np(p))
    return chars[i]

In [None]:
get_next('for thos')

In [None]:
get_next('part of ')

In [None]:
get_next('queens a')

In [58]:
class CharRnn(nn.Module):
    def __init__(self, vocab_size, n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.RNN(n_fac, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        
    def forward(self, *cs):
        bs = cs[0].size(0)
        h = V(torch.zeros(1, bs, n_hidden))
        inp = self.e(torch.stack(cs))
        outp,h = self.rnn(inp, h)
        
        return F.log_softmax(self.l_out(outp[-1]), dim=-1)

In [59]:
m = CharRnn(vocab_size, n_fac).cuda()
opt = optim.Adam(m.parameters(), 1e-3)

In [60]:
it = iter(md.trn_dl)
*xs,yt = next(it)

In [61]:
t = m.e(V(torch.stack(xs)))
t.size()

torch.Size([8, 512, 42])

In [62]:
ht = V(torch.zeros(1, 512,n_hidden))
outp, hn = m.rnn(t, ht)
outp.size(), hn.size()

(torch.Size([8, 512, 256]), torch.Size([1, 512, 256]))

In [63]:
t = m(*V(xs)); t.size()

torch.Size([512, 80])

In [64]:
fit(m, md, 4, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=4), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      1.839136   1.813082  
    1      1.647406   1.652863                              
    2      1.553885   1.570548                              
    3      1.515776   1.52789                               



[1.5278903491144236]

In [65]:
set_lrs(opt, 1e-4)

In [66]:
fit(m, md, 2, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=2), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      1.449643   1.489144  
    1      1.426319   1.484256                              



[1.4842564270814296]

In [67]:
def get_next(inp):
    idxs = T(np.array([char_indices[c] for c in inp]))
    p = m(*VV(idxs))
    i = np.argmax(to_np(p))
    return chars[i]

In [70]:
c_in_dat = [[idx[i+j] for i in range(cs)] for j in range(0, len(idx)-cs-1, cs)]

In [71]:
c_out_dat = [[idx[i+j] for i in range(cs)] for j in range(1, len(idx)-cs, cs)]

In [72]:
xs = np.stack(c_in_dat)
xs.shape

(76394, 8)

In [73]:
ys = np.stack(c_out_dat)
ys.shape

(76394, 8)

In [74]:
xs[:cs,:cs]

array([[55,  4, 39, 41, 28, 29, 24, 26],
       [28, 51, 67, 51, 67, 51, 67, 42],
       [44, 39, 39, 38, 42, 32, 37, 30],
       [ 1, 73, 61, 54, 73,  1, 43, 71],
       [74, 73, 61,  1, 62, 72,  1, 54],
       [ 1, 76, 68, 66, 54, 67,  8,  8],
       [76, 61, 54, 73,  1, 73, 61, 58],
       [67, 23,  1, 32, 72,  1, 73, 61]])

In [75]:
ys[:cs,:cs]

array([[ 4, 39, 41, 28, 29, 24, 26, 28],
       [51, 67, 51, 67, 51, 67, 42, 44],
       [39, 39, 38, 42, 32, 37, 30,  1],
       [73, 61, 54, 73,  1, 43, 71, 74],
       [73, 61,  1, 62, 72,  1, 54,  1],
       [76, 68, 66, 54, 67,  8,  8, 76],
       [61, 54, 73,  1, 73, 61, 58, 67],
       [23,  1, 32, 72,  1, 73, 61, 58]])

In [76]:
val_idx = get_cv_idxs(len(xs)-cs-1)

In [77]:
md = ColumnarModelData.from_arrays('.', val_idx, xs, ys, bs=512)

In [79]:
class CharSeqRnn(nn.Module):
    def __init__(self, vocab_size, n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.RNN(n_fac, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        
    def forward(self, *cs):
        bs = cs[0].size(0)
        h = V(torch.zeros(1, bs, n_hidden))
        inp = self.e(torch.stack(cs))
        outp,h = self.rnn(inp, h)
        return F.log_softmax(self.l_out(outp), dim=-1)

In [80]:
m = CharSeqRnn(vocab_size, n_fac).cuda()
opt = optim.Adam(m.parameters(), 1e-3)

In [81]:
it = iter(md.trn_dl)
*xst,yt = next(it)

In [82]:
def nll_loss_seq(inp, targ):
    sl,bs,nh = inp.size()
    targ = targ.transpose(0,1).contiguous().view(-1)
    return F.nll_loss(inp.view(-1,nh), targ)

In [83]:
fit(m, md, 4, opt, nll_loss_seq)

HBox(children=(IntProgress(value=0, description='Epoch', max=4), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      2.539636   2.365347  
    1      2.243762   2.169983                              
    2      2.101366   2.064708                              
    3      2.011556   1.990869                              



[1.9908687275664114]

In [84]:
set_lrs(opt, 1e-4)

In [85]:
fit(m, md, 1, opt, nll_loss_seq)

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      1.968072   1.975447  



[1.9754474868619598]

In [86]:
m = CharSeqRnn(vocab_size, n_fac).cuda()
opt = optim.Adam(m.parameters(), 1e-2)

In [87]:
m.rnn.weight_hh_l0.data.copy_(torch.eye(n_hidden))

tensor([[1., 0., 0.,  ..., 0., 0., 0.],
        [0., 1., 0.,  ..., 0., 0., 0.],
        [0., 0., 1.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 1., 0., 0.],
        [0., 0., 0.,  ..., 0., 1., 0.],
        [0., 0., 0.,  ..., 0., 0., 1.]], device='cuda:0')

In [88]:
fit(m, md, 4, opt, nll_loss_seq)

HBox(children=(IntProgress(value=0, description='Epoch', max=4), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      2.263976   2.100139  
    1      2.002934   1.951652                              
    2      1.910535   1.902766                              
    3      1.862257   1.860906                              



[1.8609062496604678]

In [89]:
set_lrs(opt, 1e-3)

In [90]:
fit(m, md, 4, opt, nll_loss_seq)

HBox(children=(IntProgress(value=0, description='Epoch', max=4), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      1.759645   1.789428  
    1      1.74893    1.780301                              
    2      1.742921   1.77462                               
    3      1.737045   1.77087                               



[1.7708702882469078]