In [3]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

from fastai.io import *
from fastai.conv_learner import *

from fastai.column_data import *

In [4]:
PATH='../../../data/nietzsche/'

get_data("https://s3.amazonaws.com/text-datasets/nietzsche.txt", f'{PATH}nietzsche.txt')

In [5]:
text = open(f'{PATH}nietzsche.txt',encoding='utf-8').read()
print('corpus length:', len(text))

corpus length: 600893


In [6]:
text[:400]

'PREFACE\n\n\nSUPPOSING that Truth is a woman--what then? Is there not ground\nfor suspecting that all philosophers, in so far as they have been\ndogmatists, have failed to understand women--that the terrible\nseriousness and clumsy importunity with which they have usually paid\ntheir addresses to Truth, have been unskilled and unseemly methods for\nwinning a woman? Certainly she has never allowed herself '

In [7]:
chars = sorted(list(set(text)))
vocab_size = len(chars)+1
vocab_size

85

In [8]:
# null character for padding
chars.insert(0,"\0")

In [9]:
' '.join(chars)

'\x00 \n   ! " \' ( ) , - . 0 1 2 3 4 5 6 7 8 9 : ; = ? A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ ] _ a b c d e f g h i j k l m n o p q r s t u v w x y z Æ ä æ é ë'

In [10]:
char_indicies = dict((c,i) for i,c in enumerate(chars))
indicies_char = dict((i,c) for i,c in enumerate(chars))

In [11]:
idx = [char_indicies[c] for c in text]

In [12]:
idx[:10]

[40, 42, 29, 30, 25, 27, 29, 1, 1, 1]

In [13]:
''.join(indicies_char[i] for i in idx[:50])

'PREFACE\n\n\nSUPPOSING that Truth is a woman--what th'

### create inputs

In [12]:
cs = 3
c1_dat = [idx[i] for i in range(0,len(idx)-1-cs,cs)]
c2_dat = [idx[i+1] for i in range(0,len(idx)-1-cs,cs)]
c3_dat = [idx[i+2] for i in range(0,len(idx)-1-cs,cs)]
c4_dat = [idx[i+3] for i in range(0,len(idx)-1-cs,cs)]

In [13]:
c1_dat[:10]

[40, 30, 29, 1, 40, 43, 31, 61, 2, 74]

In [14]:
c4_dat[:10]

[30, 29, 1, 40, 43, 31, 61, 2, 74, 2]

In [15]:
#create inputs
x1 = np.stack(c1_dat[:-2])
x2 = np.stack(c2_dat[:-2])
x3 = np.stack(c3_dat[:-2])
#create output
y = np.stack(c4_dat[:-2])

In [16]:
y.shape

(200295,)

## BASIC FULLY CONNECTED NN

#### create and train model

In [14]:
n_hidden_act = 256

In [15]:
n_fac = 42

In [19]:
class Char3Model(nn.Module):
    def __init__(self,vocab_size,n_fac,n_hidden_act):
        super().__init__()
        self.e = nn.Embedding(vocab_size,n_fac)
        #layer from input to hidden
        self.l_in = nn.Linear(n_fac,n_hidden_act)
        #layer from hidden to hidden
        self.l_hidden = nn.Linear(n_hidden_act,n_hidden_act)
        #layer from hidden to out
        self.l_out = nn.Linear(n_hidden_act,vocab_size)
    
    def forward(self, c1, c2, c3):
        in1 = F.relu(self.l_in(self.e(c1)))
        in2 = F.relu(self.l_in(self.e(c2)))
        in3 = F.relu(self.l_in(self.e(c3)))
        
        h = V(torch.zeros(in1.size()).cuda())
        h = F.tanh(self.l_hidden(h+in1))
        h = F.tanh(self.l_hidden(h+in2))
        h = F.tanh(self.l_hidden(h+in3))
        
        return F.log_softmax(self.l_out(h))

In [20]:
md = ColumnarModelData.from_arrays(PATH,[-1],np.stack([x1,x2,x3],axis=1),y,bs=512)

In [21]:
m = Char3Model(vocab_size,n_fac,n_hidden_act).cuda()

it = iter(md.trn_dl)
*xs,yt = next(it)
t=m(*V(xs))

t

In [22]:
opt = optim.Adam(m.parameters(),1e-2)

In [23]:
fit(m,md,1,opt,F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=1, style=ProgressStyle(description_width='initial…

epoch      trn_loss   val_loss                               
    0      2.076715   5.738946  



[array([5.73895])]

In [24]:
set_lrs(opt,0.001)

In [25]:
fit(m,md,1,opt,F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=1, style=ProgressStyle(description_width='initial…

epoch      trn_loss   val_loss                               
    0      1.817459   4.900089  



[array([4.90009])]

#### test model

In [26]:
def get_next(inp):
    idxs = T(np.array([char_indicies[c] for c in inp]))
    p = m(*VV(idxs))
    i = np.argmax(to_np(p))
    return chars[i]

In [27]:
get_next('y. ')

'T'

In [50]:
sent = 'Phi'
while True:
    sent += get_next(sent[-3:])
    if(len(sent)==99):
        break
print(sent)

Philosopher the self and the self and the self and the self and the self and the self and the self 


## BASIC RNN

#### prepare data

In [16]:
cs=8

In [23]:
c_in_dat = [[idx[j+i] for i in range (cs)] for j in range(len(idx)-cs-1)]

In [24]:
c_out_dat = [idx[j+cs] for j in range(len(idx)-cs-1)]

In [25]:
xs = np.stack(c_in_dat,axis=0)

In [26]:
xs.shape

(600884, 8)

In [27]:
y = np.stack(c_out_dat)

In [28]:
xs[:cs,:cs]

array([[40, 42, 29, 30, 25, 27, 29,  1],
       [42, 29, 30, 25, 27, 29,  1,  1],
       [29, 30, 25, 27, 29,  1,  1,  1],
       [30, 25, 27, 29,  1,  1,  1, 43],
       [25, 27, 29,  1,  1,  1, 43, 45],
       [27, 29,  1,  1,  1, 43, 45, 40],
       [29,  1,  1,  1, 43, 45, 40, 40],
       [ 1,  1,  1, 43, 45, 40, 40, 39]])

In [29]:
y[:cs]

array([ 1,  1, 43, 45, 40, 40, 39, 43])

In [30]:
val_idx = get_cv_idxs(len(idx)-cs-1)

In [31]:
md = ColumnarModelData.from_arrays(PATH,val_idx,xs,y,bs=512)

#### create and train model

In [110]:
class CharLoopModel(nn.Module):
    def __init__(self,vocab_size,n_fac,n_hidden_act):
        super().__init__()
        self.e = nn.Embedding(vocab_size,n_fac)
        self.l_in = nn.Linear(n_fac,n_hidden_act)
        self.l_hidden = nn.Linear(n_hidden_act,n_hidden_act)
        self.l_out = nn.Linear(n_hidden_act,vocab_size)
    def forward(self,*cs):
        bs = cs[0].size(0)
        h = V(torch.zeros(bs,n_hidden_act).cuda())
        for c in cs:
            inp = F.relu(self.l_in(self.e(c)))
            h = F.tanh(self.l_hidden(h+inp))
        
        return F.log_softmax(self.l_out(h))

In [111]:
m = CharLoopModel(vocab_size,n_fac,n_hidden_act).cuda()
opt = optim.Adam(m.parameters(),1e-2)

In [112]:
fit(m,md,1,opt,F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=1, style=ProgressStyle(description_width='initial…

epoch      trn_loss   val_loss                               
    0      1.982222   1.974348  



[array([1.97435])]

In [113]:
set_lrs(opt, 1e-3)

In [114]:
fit(m,md,1,opt,F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=1, style=ProgressStyle(description_width='initial…

epoch      trn_loss   val_loss                               
    0      1.703147   1.705906  



[array([1.70591])]

#### concatenation insted of adding together of the inputs and hidden activations

In [118]:
class CharLoopConcatModel(nn.Module):
    def __init__(self,vocab_size,n_fac,n_hidden_act):
        super().__init__()
        self.e = nn.Embedding(vocab_size,n_fac)
        self.l_in = nn.Linear(n_fac+n_hidden_act,n_hidden_act)
        self.l_hidden = nn.Linear(n_hidden_act,n_hidden_act)
        self.l_out = nn.Linear(n_hidden_act,vocab_size)
    def forward(self,*cs):
        bs = cs[0].size(0)
        h = V(torch.zeros(bs,n_hidden_act).cuda())
        for c in cs:
            inp = torch.cat((h,self.e(c)),1)
            inp = F.relu(self.l_in(inp))
            h = F.tanh(self.l_hidden(h+inp))
            
        return F.log_softmax(self.l_out(h))

In [119]:
m = CharLoopConcatModel(vocab_size,n_fac,n_hidden_act).cuda()
opt = optim.Adam(m.parameters(),1e-2)

In [120]:
fit(m,md,1,opt,F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=1, style=ProgressStyle(description_width='initial…

epoch      trn_loss   val_loss                               
    0      2.42993    2.438196  



[array([2.4382])]

In [125]:
 set_lrs(opt, 1e-4)

In [126]:
fit(m,md,1,opt,F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=1, style=ProgressStyle(description_width='initial…

epoch      trn_loss   val_loss                               
    0      2.256593   2.251102  



[array([2.2511])]

#### test

In [115]:
get_next('for thos')

'e'

In [116]:
get_next('part of ')

't'

In [117]:
get_next('queens a')

'n'

## PyTorch

In [39]:
class CharRNN(nn.Module):
    def __init__(self,vocab_size,n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size,n_fac)
        self.rnn = nn.RNN(n_fac,n_hidden_act)
        self.l_out = nn.Linear(n_hidden_act,vocab_size)

    def forward(self, *cs):
        bs = cs[0].size(0)
        h = V(torch.zeros(1,bs,n_hidden_act))
        inp = self.e(torch.stack(cs))
        out,h = self.rnn(inp,h)
        
        return F.log_softmax(self.l_out(out[-1]))

In [49]:
m = CharRNN(vocab_size,n_fac).cuda()
opt = optim.Adam(m.parameters(),1e-3)

In [50]:
fit(m,md,4,opt,F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=4, style=ProgressStyle(description_width='initial…

epoch      trn_loss   val_loss                               
    0      1.871454   1.848624  
    1      1.684887   1.680034                               
    2      1.583957   1.596652                               
    3      1.534685   1.549343                               



[array([1.54934])]

In [51]:
set_lrs(opt,1e-4)

In [52]:
fit(m,md,2,opt,F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=2, style=ProgressStyle(description_width='initial…

epoch      trn_loss   val_loss                               
    0      1.462736   1.511831  
    1      1.457717   1.506724                               



[array([1.50672])]

In [47]:
it = iter(md.trn_dl)
*xs,yt = next(it)
t = m.e(V(torch.stack(xs)))
t.size()

torch.Size([8, 512, 42])

In [48]:
ht = V(torch.zeros(1,512,n_hidden_act))
outp, hn = m.rnn(t,ht)
outp.size(), hn.size()

(torch.Size([8, 512, 256]), torch.Size([1, 512, 256]))

In [53]:
def get_next(inp):
    idxs = T(np.array([char_indicies[c] for c in inp]))
    p = m(*VV(idxs))
    i = np.argmax(to_np(p))
    return chars[i]

In [54]:
def get_next_n(inp,n):
    res = inp
    for i in range(n):
        c = get_next(inp)
        res+=c
        inp = inp[1:]+c
    return res

In [63]:
get_next_n('who are yo', 40)

'who are you have a stronger and the same to the sa'

## Multi-output model

In [18]:
c_in_dat = [[idx[j+i] for i in range (cs)] for j in range(0, len(idx)-cs-1, cs)]
c_out_dat = [[idx[j+i] for i in range(cs)] for j in range(1, len(idx)-cs-1, cs)]

In [19]:
xs = np.stack(c_in_dat)
ys = np.stack(c_out_dat)
xs.shape, ys.shape

((75111, 8), (75111, 8))

In [20]:
xs[:cs,:cs], ys[:cs,:cs]

(array([[40, 42, 29, 30, 25, 27, 29,  1],
        [ 1,  1, 43, 45, 40, 40, 39, 43],
        [33, 38, 31,  2, 73, 61, 54, 73],
        [ 2, 44, 71, 74, 73, 61,  2, 62],
        [72,  2, 54,  2, 76, 68, 66, 54],
        [67,  9,  9, 76, 61, 54, 73,  2],
        [73, 61, 58, 67, 24,  2, 33, 72],
        [ 2, 73, 61, 58, 71, 58,  2, 67]]),
 array([[42, 29, 30, 25, 27, 29,  1,  1],
        [ 1, 43, 45, 40, 40, 39, 43, 33],
        [38, 31,  2, 73, 61, 54, 73,  2],
        [44, 71, 74, 73, 61,  2, 62, 72],
        [ 2, 54,  2, 76, 68, 66, 54, 67],
        [ 9,  9, 76, 61, 54, 73,  2, 73],
        [61, 58, 67, 24,  2, 33, 72,  2],
        [73, 61, 58, 71, 58,  2, 67, 68]]))

In [21]:
class CharSeqRNN(nn.Module):
    def __init__(self,vocab_size,n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size,n_fac)
        self.rnn = nn.RNN(n_fac,n_hidden_act)
        self.l_out = nn.Linear(n_hidden_act,vocab_size)

    def forward(self, *cs):
        bs = cs[0].size(0)
        h = V(torch.zeros(1,bs,n_hidden_act))
        inp = self.e(torch.stack(cs))
        out,h = self.rnn(inp,h)
        
        return F.log_softmax(self.l_out(out))

In [22]:
val_idx = get_cv_idxs(len(xs)-cs-1)
md = ColumnarModelData.from_arrays(PATH,val_idx,xs,ys,bs=512)

In [23]:
m = CharSeqRNN(vocab_size,n_fac).cuda()
opt = optim.Adam(m.parameters(),1e-3)

In [24]:
it = iter(md.trn_dl)
*xs,yt = next(it)

In [28]:
yt


   62    67     2  ...     72     2    65
   69    69    58  ...     62    67    60
   58     2    62  ...     62    73     1
       ...          ⋱          ...       
   60    61    73  ...     72    72     2
   71    54    73  ...     66    68    71
   58    72    73  ...     58    71    62
[torch.cuda.LongTensor of size 512x8 (GPU 0)]

In [26]:
def NLL_loss_seq(inp,targ):
    sl,bs,nh = inp.size()
    targ = targ.transpose(0,1).contiguous().view(-1)
    return F.nll_loss(inp.view(-1,nh),targ)

In [30]:
vocab_size

85

In [31]:
fit(m,md,4,opt,NLL_loss_seq)

HBox(children=(IntProgress(value=0, description='Epoch', max=4, style=ProgressStyle(description_width='initial…

epoch      trn_loss   val_loss                              
    0      1.113718   0.952768  
    1      0.845255   0.776586                               
    2      0.723824   0.689311                               
    3      0.657047   0.645979                               



[array([0.64598])]

In [34]:
set_lrs(opt, 1e-5)

In [41]:
fit(m,md,4,opt,NLL_loss_seq)

HBox(children=(IntProgress(value=0, description='Epoch', max=4, style=ProgressStyle(description_width='initial…

epoch      trn_loss   val_loss                               
    0      0.588392   0.604907  
    1      0.587477   0.604464                               
    2      0.588031   0.604067                               
    3      0.586826   0.603619                               



[array([0.60362])]

In [42]:
(len(text)/512/8)*0.8

117.3619140625

#### idnetity init

In [43]:
?m.rnn

In [47]:
m = CharSeqRNN(vocab_size,n_fac).cuda()
opt = optim.Adam(m.parameters(),1e-3)

In [48]:
m.rnn.weight_hh_l0.data.copy_(torch.eye(n_hidden_act))


    1     0     0  ...      0     0     0
    0     1     0  ...      0     0     0
    0     0     1  ...      0     0     0
       ...          ⋱          ...       
    0     0     0  ...      1     0     0
    0     0     0  ...      0     1     0
    0     0     0  ...      0     0     1
[torch.cuda.FloatTensor of size 256x256 (GPU 0)]

In [49]:
fit(m,md,4,opt,NLL_loss_seq)

HBox(children=(IntProgress(value=0, description='Epoch', max=4, style=ProgressStyle(description_width='initial…

epoch      trn_loss   val_loss                              
    0      1.011079   0.862478  
    1      0.771865   0.716077                               
    2      0.671743   0.659124                               
    3      0.615762   0.607999                               



[array([0.608])]

In [50]:
set_lrs(opt, 1e-4)

In [52]:
fit(m,md,4,opt,NLL_loss_seq)

HBox(children=(IntProgress(value=0, description='Epoch', max=4, style=ProgressStyle(description_width='initial…

epoch      trn_loss   val_loss                               
    0      0.556481   0.573253  
    1      0.552402   0.570123                               
    2      0.548966   0.567266                               
    3      0.545247   0.56423                                



[array([0.56423])]