## Import libraries

In [41]:
import numpy as np
import itertools
from string import ascii_lowercase
import matplotlib.pyplot as plt
import re
from random import shuffle
%matplotlib inline

## Datasource

#### Exposes `__getitem__`, which returns one input-output sample to be consumed by the batcher

In [40]:
class CharDatasource:
    """Character-level dataset for next-character prediction.

    The text file is lower-cased, newlines become the '<' end-of-word token
    and every character outside ``a-z`` / ``<`` is dropped.  The cleaned
    stream is cut into non-overlapping windows of ``seq_len`` characters;
    the target of each window is the same window shifted one character
    forward, so the model learns to predict the next character.

    Attributes set by ``__init__``:
        _char2idx / _idx2char: vocabulary ('<' is index 0, 'a'..'z' are 1..26).
        _vocab_size: number of vocabulary entries (27).
        _char_stats: occurrence count of every character in the input windows.
        _tokens: one-hot inputs, shape (num_sequences, seq_len, vocab_size).
        _targets: integer targets, shape (num_sequences, seq_len).
    """

    def __init__(self, path, seq_len, *args):
        """Read ``path`` and build the one-hot inputs and index targets.

        Args:
            path: path of the text file to read.
            seq_len: number of characters per sequence (must be >= 1).
            *args: accepted for backward compatibility and ignored.

        Raises:
            ValueError: if ``seq_len`` is smaller than 1.
        """
        if seq_len < 1:
            raise ValueError("seq_len must be a positive integer")
        self._seq_len = seq_len

        # Vocabulary: only lowercase letters plus '<' (end of word).
        self._idx2char = ['<'] + list(ascii_lowercase)
        self._char2idx = {char: idx for idx, char in enumerate(self._idx2char)}
        self._vocab_size = len(self._char2idx)

        # Appearance statistics; can be used to spot rarely used characters.
        # No character is excluded here because the dataset is small.
        self._char_stats = {char: 0 for char in self._idx2char}

        # Close the file deterministically instead of leaking the handle.
        with open(path, 'r') as source_file:
            data = source_file.read().lower().replace('\n', '<')
        # Keep only characters that exist in the vocabulary.  Digits must be
        # removed as well, otherwise the lookups below raise KeyError.
        data = re.sub('[^a-z<]+', '', data)

        tokens, targets = [], []
        # Only windows that have a complete one-step-ahead target are kept,
        # which keeps every row the same length (no padding needed).
        for start in range(0, len(data) - seq_len, seq_len):
            inputs = data[start:start + seq_len]
            outputs = data[start + 1:start + seq_len + 1]
            for ch in inputs:
                self._char_stats[ch] += 1
            tokens.append([self._char2idx[ch] for ch in inputs])
            targets.append([self._char2idx[ch] for ch in outputs])

        # reshape keeps a well-formed (0, seq_len) array when the file is
        # too short to yield a single sequence.
        token_ids = np.array(tokens, dtype=np.int32).reshape(-1, seq_len)
        self._targets = np.array(targets, dtype=int).reshape(-1, seq_len)

        # One-hot over the full vocabulary so the width never depends on
        # which characters happen to occur in the data.  Targets stay as
        # indices because the loss only needs the index of the true class.
        self._tokens = np.eye(self._vocab_size)[token_ids]

    # TO-DO: @properties

    def __getitem__(self, idx):
        """Return the ``(one_hot_inputs, target_indices)`` pair at ``idx``."""
        return self._tokens[idx], self._targets[idx]

    def __len__(self):
        """Return the number of sequences in the dataset."""
        return len(self._tokens)
    

## Model

#### Neural net architecture for a vanilla RNN. `batch_propagate` is called once per batch and returns the loss, the gradients and the last hidden state of the sequence, which is carried over to the next batch

In [64]:
class CharModelRNN:
    """Vanilla (tanh) character RNN with batched forward and backward passes.

    Array layout used throughout:
        x: one-hot inputs, indexed as ``x[batch, time, vocab]``.
        y: integer class indices, indexed as ``y[batch, time]``.
        hidden states: (hidden_size, batch_size).
        softmax outputs: (vocab_size, batch_size) per time step.

    The weight arrays passed in are stored by reference, so in-place updates
    done by the caller (e.g. ``param += ...``) are seen by the model.
    """

    _PARAM_NAMES = ('Whx', 'Whh', 'Why', 'bh', 'by')

    def __init__(self, vocab_size, hidden_size, clip_ratio, seq_len, batch_size, **params):
        """Store hyper-parameters and weights and allocate the work buffers.

        Args:
            vocab_size: number of output classes / one-hot width.
            hidden_size: number of hidden units.
            clip_ratio: gradients are clipped element-wise to +/- this value.
            seq_len: number of time steps per sequence.
            batch_size: number of sequences per batch.
            **params: the arrays 'Whx', 'Whh', 'Why', 'bh' and 'by'.

        Raises:
            ValueError: if one of the required arrays is missing.
        """
        missing = [name for name in self._PARAM_NAMES if name not in params]
        if missing:
            raise ValueError("missing model parameters: " + ", ".join(missing))

        self._clip_ratio = clip_ratio
        self._seq_len = seq_len
        self._batch_size = batch_size
        self._hidden_layer = np.zeros((self._seq_len, hidden_size, self._batch_size))
        self._output_layer = np.zeros((self._seq_len, vocab_size, self._batch_size))
        # Hidden state that precedes time step 0 of the current sequence.
        self._hidden_prev = np.zeros((hidden_size, self._batch_size))

        # Take the weights from the keyword arguments (not from globals).
        self._Whx = params['Whx']
        self._Whh = params['Whh']
        self._Why = params['Why']
        self._bh = params['bh']
        self._by = params['by']

        self._reset_grads()

    def _reset_grads(self):
        """Allocate fresh zero gradient buffers.

        New arrays are created (rather than zeroing in place) so gradient
        dictionaries returned by earlier calls are never modified afterwards.
        """
        self._dWhx = np.zeros_like(self._Whx)
        self._dWhh = np.zeros_like(self._Whh)
        self._dWhy = np.zeros_like(self._Why)
        self._dbh = np.zeros_like(self._bh)
        self._dby = np.zeros_like(self._by)

    def _forward_rnn(self, x, hidden_back, y=None):
        """Run the forward pass over one batch of sequences.

        Args:
            x: one-hot inputs, ``x[batch, time, vocab]``.
            hidden_back: hidden state to start from, (hidden_size, batch_size).
            y: integer targets ``y[batch, time]``; when None no loss is
                accumulated and the returned loss is 0.

        Returns:
            ``(loss, hidden_layer, output_layer)`` where loss is the
            cross-entropy averaged over the batch and summed over time.
        """
        # Copy: hidden_back may be a view of self._hidden_layer, which is
        # overwritten below, and the backward pass still needs this value.
        self._hidden_prev = np.array(hidden_back, dtype=float)
        batch_idx = np.arange(self._batch_size)
        loss = 0
        h_prev = self._hidden_prev

        for t in range(self._seq_len):
            self._hidden_layer[t] = np.tanh(np.dot(self._Whx, x[:, t].T) +
                                            np.dot(self._Whh, h_prev) + self._bh)
            h_prev = self._hidden_layer[t]
            output = np.dot(self._Why, h_prev) + self._by
            # Softmax per sample (column); subtracting the column maximum
            # avoids overflow in exp without changing the result.
            exp_out = np.exp(output - np.max(output, axis=0, keepdims=True))
            self._output_layer[t] = exp_out / np.sum(exp_out, axis=0, keepdims=True)
            if y is not None:
                picked = self._output_layer[t][y[:, t], batch_idx]
                loss += np.sum(-np.log(picked)) / self._batch_size

        return loss, self._hidden_layer, self._output_layer

    def _backward_rnn(self, x, y, hidden_layer, output_layer):
        """Back-propagate through time for the batch of the last forward pass.

        Must be called right after ``_forward_rnn`` for the same batch, since
        it uses the initial hidden state remembered by that call.  Gradients
        are summed over the batch (not averaged) and start from zero on every
        call.  Based on Andrej Karpathy's min-char-rnn:
        https://gist.github.com/karpathy/d4dee566867f8291f086

        Returns:
            ``(grads, last_hidden)``: a dict with the clipped gradients
            'dWhx', 'dWhh', 'dWhy', 'dbh', 'dby' and a copy of the hidden
            state of the last time step.
        """
        self._reset_grads()
        batch_idx = np.arange(self._batch_size)
        # Gradient flowing in from the following time step; zero at the end.
        dhnext = np.zeros_like(hidden_layer[0])

        for t in reversed(range(self._seq_len)):
            dy = np.copy(output_layer[t])
            # softmax + cross-entropy: subtract 1 at the true class of each sample
            dy[y[:, t], batch_idx] -= 1
            self._dWhy += np.dot(dy, hidden_layer[t].T)
            self._dby += np.sum(dy, axis=1).reshape(-1, 1)
            dh = np.dot(self._Why.T, dy) + dhnext
            dhtan = (1 - hidden_layer[t] ** 2) * dh  # back through tanh
            self._dbh += np.sum(dhtan, axis=1).reshape(-1, 1)
            self._dWhx += np.dot(dhtan, x[:, t])
            # Step 0 is preceded by the carried-over state, not by index -1.
            h_prev = hidden_layer[t - 1] if t > 0 else self._hidden_prev
            self._dWhh += np.dot(dhtan, h_prev.T)
            dhnext = np.dot(self._Whh.T, dhtan)

        # Clip gradients to avoid explosions during learning.
        for dparam in [self._dWhx, self._dWhh, self._dWhy, self._dbh, self._dby]:
            np.clip(dparam, -self._clip_ratio, self._clip_ratio, out=dparam)

        grads = {'dWhx': self._dWhx, 'dWhh': self._dWhh, 'dWhy': self._dWhy,
                 'dbh': self._dbh, 'dby': self._dby}

        return grads, np.copy(hidden_layer[self._seq_len - 1])

    def batch_propagate(self, x_train, y_train, hidden_back, mode = 'train'):
        """Process one batch.

        Args:
            x_train: one-hot inputs, ``x[batch, time, vocab]``.
            y_train: integer targets, ``y[batch, time]``.
            hidden_back: hidden state carried over from the previous batch.
            mode: 'train' also runs the backward pass; any other value only
                evaluates the loss.

        Returns:
            ``(loss, grads, last_hidden)`` in 'train' mode, otherwise
            ``(loss, last_hidden)``.
        """
        loss, hidden_layer, output_layer = self._forward_rnn(x_train, hidden_back,
                                                             y=y_train)
        if mode == 'train':
            grads, hidden_back = self._backward_rnn(x_train, y_train, hidden_layer,
                                                    output_layer)
            return loss, grads, hidden_back
        return loss, np.copy(hidden_layer[self._seq_len - 1])
        
        
            

## Dataloader
#### Generate input/output batches from datasource for train/test modes

In [65]:
class RNNDataLoader(CharDatasource):
    """Cuts one split (train or test) of a CharDatasource into fixed-size batches."""

    def __init__(self, split_factor, batch_size, mode='train', **kwargs):
        """Build the list of ``(inputs, targets)`` batches for the chosen split.

        Args:
            split_factor: fraction of the sequences assigned to the train split.
            batch_size: number of sequences per batch; a trailing batch with
                fewer sequences is discarded.
            mode: 'train' selects the first part of the dataset, any other
                value selects the remainder.
            **kwargs: forwarded unchanged to ``CharDatasource.__init__``.
        """
        self._split_factor = split_factor
        self._mode = mode
        self._batch_size = batch_size
        super().__init__(**kwargs)

        # Index of the first sequence that belongs to the test split.
        boundary = int(len(self) * split_factor)

        if self._mode == 'train':
            inputs, labels = self._tokens[:boundary], self._targets[:boundary]
        else:
            inputs, labels = self._tokens[boundary:], self._targets[boundary:]

        # Lazy slicing of the selected split into consecutive batches.
        step = self._batch_size
        self._x_loader = (inputs[pos:pos + step]
                          for pos in range(0, len(inputs), step))
        self._y_loader = (labels[pos:pos + step]
                          for pos in range(0, len(labels), step))

        # Materialise the batches, keeping only the complete ones.
        self._batches = [(x_part, y_part)
                         for x_part, y_part in zip(self._x_loader, self._y_loader)
                         if len(x_part) == step]

    def _getbatch(self, idx):
        """Return the ``(inputs, targets)`` batch stored at position ``idx``."""
        return self._batches[idx]

## Trainer

#### The trainer brings together everything we created before: it iterates over epochs and updates the parameters of our model

In [93]:
#Inference

def sample(h, seed_ix, max_len=None):
    """Sample a sequence of character indices from the model.

    Reads the module-level ``vocab_size``, ``Whx``, ``Whh``, ``Why``, ``bh``
    and ``by``.

    Args:
        h: hidden (memory) state to start from, shape (hidden_size, 1).
        seed_ix: index of the seed character fed at the first time step.
        max_len: optional upper bound on the number of sampled characters.
            With the default ``None`` sampling only stops once the
            end-of-word index 0 has been drawn.

    Returns:
        List of sampled indices.  The last one is 0 unless ``max_len`` was
        reached first.
    """
    x = np.zeros((vocab_size, 1))
    x[seed_ix] = 1
    ixes = []

    while max_len is None or len(ixes) < max_len:
        h = np.tanh(np.dot(Whx, x) + np.dot(Whh, h) + bh)
        y = np.dot(Why, h) + by
        # Shift by the maximum so exp cannot overflow; softmax is unchanged.
        y = y - np.max(y)
        p = np.exp(y) / np.sum(np.exp(y))
        ix = np.random.choice(range(vocab_size), p=p.ravel())
        # Feed the sampled character back in as the next one-hot input.
        x = np.zeros((vocab_size, 1))
        x[ix] = 1
        ixes.append(ix)
        if ix == 0:
            break

    return ixes

In [None]:
# --- Hyper-parameters -------------------------------------------------------
learning_rate = 0.01
num_epochs = 10000

hidden_size = 100
batch_size = 8
vocab_size = 27       # 26 lowercase letters + the '<' end-of-word token
seq_len = 15
clip_ratio  = 1
split_factor = 0.8
path = 'test.txt'
seed = 42
sample_every = 10     # print a sampled word every this many epochs

# Seed NumPy so weight initialisation and sampling are reproducible.
np.random.seed(seed)

# --- Model parameters and Adagrad memory ------------------------------------
Whx = np.random.randn(hidden_size, vocab_size)*0.01
Whh = np.random.randn(hidden_size, hidden_size)*0.01
Why = np.random.randn(vocab_size, hidden_size)*0.01
bh = np.zeros((hidden_size, 1))
by = np.zeros((vocab_size, 1))
params = {'Whx':Whx, 'Whh':Whh,'Why':Why,'bh':bh,'by':by}

mWxh, mWhh, mWhy = np.zeros_like(Whx), np.zeros_like(Whh), np.zeros_like(Why)
mbh, mby = np.zeros_like(bh), np.zeros_like(by)

# --- Data and model ---------------------------------------------------------
char_dataset = CharDatasource(path=path, seq_len=seq_len)

data_train = RNNDataLoader(split_factor=split_factor, batch_size=batch_size,
                           path=path,  seq_len=seq_len)
data_test = RNNDataLoader(split_factor=split_factor, batch_size=batch_size,
                          path=path,  seq_len=seq_len, mode='test')

if not data_train._batches:
    raise ValueError("the training split does not contain a single full batch")

model = CharModelRNN(vocab_size=vocab_size, hidden_size=hidden_size,
                     clip_ratio=clip_ratio, seq_len=seq_len,
                     batch_size=batch_size, **params)
train_epoch_loss = {}
valid_epoch_loss = {}

param_names = ('Whx', 'Whh', 'Why', 'bh', 'by')
adagrad_memory = (mWxh, mWhh, mWhy, mbh, mby)

# --- Training loop: one parameter update per epoch --------------------------
for n in range(num_epochs):

    # Periodically show what the current weights generate, seeded with 'a'.
    if n % sample_every == 0:
        hidden_sample = np.zeros((hidden_size,1))
        sample_ix = sample(hidden_sample, 1)
        txt = ''.join(char_dataset._idx2char[ix] for ix in sample_ix)
        print(txt)

    hidden_back = np.zeros((hidden_size, batch_size))
    epoch_grads = {'d' + name: 0 for name in param_names}
    train_losses = []

    for x_train, y_train in data_train._batches:
        loss, grads, hidden_back = model.batch_propagate(x_train,
                                                         y_train, hidden_back)
        # Add into a new array each time so the running sum never shares
        # memory with the arrays handed back by the model.
        for key in epoch_grads:
            epoch_grads[key] = epoch_grads[key] + grads[key]
        train_losses.append(loss)

    train_epoch_loss[n] = np.mean(train_losses)

    for key in epoch_grads:
        epoch_grads[key] = epoch_grads[key]/batch_size

    # Adagrad update, in place so the model and `sample` see the new values.
    for name, mem in zip(param_names, adagrad_memory):
        dparam = epoch_grads['d' + name]
        mem += dparam * dparam
        params[name] += -learning_rate * dparam / np.sqrt(mem + 1e-8)

    # Validation pass (forward only), continuing from the last training state.
    test_losses = []
    for x_train, y_train in data_test._batches:
        loss, hidden_back = model.batch_propagate(x_train, y_train,
                                                  hidden_back, mode='eval')
        test_losses.append(loss)

    # NaN (without a NumPy warning) when the test split has no full batch.
    valid_epoch_loss[n] = np.mean(test_losses) if test_losses else float('nan')

# --- Learning curves --------------------------------------------------------
fig, ax = plt.subplots()
ax.plot(list(train_epoch_loss.keys()), list(train_epoch_loss.values()), label='train')
ax.plot(list(valid_epoch_loss.keys()), list(valid_epoch_loss.values()), label='validation')
ax.set(xlabel='epoch', ylabel='mean loss per batch', title='Char RNN loss per epoch')
ax.legend();

heiyd<
rldqubdpthppgvguaaygcbmaezkwvmyincofhaqikdkvhdtkberarnbeoimkpnblrclcvoidklckqveebrtubmcdakmtmp<
zkbohmplkrttbhmukvubacl<
ivaykfdekmqvrvnb<
misbdvsrfm<
iikltd<
<
cvyanuhcdwedibliyvgymmooybxvkkeubeolskedzbvranhaffpydpibronih<
ecsvgjiptgvitrabqtnqollacffldgnydvbkkgiogpamtdyr<
qhayhhvudaiadevxhhrwavk<
efkiedutlbsmooru<
mdirxpqidcv<
ybphaibvqipeikkpbddkkkwhamdmri<
elscsafarcksfs<
vl<
id<
hrsdogsayvksigbbdkikwhhhyhqjocjbcopkb<
vyoiwhrtdpqweukslwcsciddclc<
mhkaxhkiqyzwfktvqbybotwmplhkymloffou<
<
zclphkmkmiylyk<
oyvifemmkcnsyvelkalmknfmnuvaknhvuoiutcnbiovsbicowfmkviegiehkdosevkxlxcaeytgscg<
rsibmo<


In [109]:
#Experiment

""" Dataloader -> Trainer -> Inference
"""

' Dataloader -> Trainer -> Inference\n'