<a href="https://colab.research.google.com/github/prajapatiraghulal/editor_project/blob/app_v1/lstm_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install mxnet-cu101mkl

Collecting mxnet-cu101mkl
  Downloading mxnet_cu101mkl-1.6.0.post0-py2.py3-none-manylinux1_x86_64.whl (712.3 MB)
[K     |████████████████████████████████| 712.3 MB 337 bytes/s 
[?25hCollecting graphviz<0.9.0,>=0.8.1
  Downloading graphviz-0.8.4-py2.py3-none-any.whl (16 kB)
Installing collected packages: graphviz, mxnet-cu101mkl
  Attempting uninstall: graphviz
    Found existing installation: graphviz 0.10.1
    Uninstalling graphviz-0.10.1:
      Successfully uninstalled graphviz-0.10.1
Successfully installed graphviz-0.8.4 mxnet-cu101mkl-1.6.0.post0


In [None]:
import collections
import re
import mxnet

In [None]:
import os 
import requests
import zipfile
import tarfile
import hashlib



In [None]:
def read_dataset(location):
    """Load a text file and reduce every line to space-separated letters.

    Every run of characters outside ``A-Za-z`` is replaced by one space and
    the result is stripped of leading/trailing whitespace.

    Args:
        location: Path of the text file to read (opened in default text mode).

    Returns:
        A list with one cleaned string per line of the file, in file order.
    """
    non_letters = re.compile('[^A-Za-z]+')
    cleaned = []
    with open(location) as handle:
        for raw_line in handle:
            cleaned.append(non_letters.sub(' ', raw_line).strip())
    return cleaned

In [None]:
# Load the cleaned text lines from the local file named 'wonderland'.
lines = read_dataset('wonderland')

In [None]:
lines[0]

'Alice was beginning to get very tired of sitting by her sister on the'

In [None]:
def tokenize(lines, token = 'word'):
    """Split text lines into word or character tokens.

    Args:
        lines: Iterable of strings, one per text line.
        token: 'word' to split each line on whitespace, or 'char' to split
            each line into its individual characters.

    Returns:
        A list with one token list per input line.

    Raises:
        ValueError: If `token` is neither 'word' nor 'char'. Previously this
            case only printed a message and returned None, which surfaced
            later as an unrelated error in the caller.
    """
    if token == 'word':
        return [line.split() for line in lines]
    if token == 'char':
        return [list(line) for line in lines]
    raise ValueError(f"ERROR: UNKNOWN TOKEN TYPE :{token}")

In [None]:
# Word-level tokenization (the default mode); print the first five tokenized lines.
tokens = tokenize(lines)
for i in range(5):
    print(tokens[i])

['Alice', 'was', 'beginning', 'to', 'get', 'very', 'tired', 'of', 'sitting', 'by', 'her', 'sister', 'on', 'the']
['bank', 'and', 'of', 'having', 'nothing', 'to', 'do', 'once', 'or', 'twice', 'she', 'had', 'peeped', 'into', 'the']
['book', 'her', 'sister', 'was', 'reading', 'but', 'it', 'had', 'no', 'pictures', 'or', 'conversations', 'in']
['it', 'and', 'what', 'is', 'the', 'use', 'of', 'a', 'book', 'thought', 'Alice', 'without', 'pictures', 'or']
['conversations']


In [None]:
class Vocab:
    """Vocabulary mapping tokens to integer indices and back.

    Index 0 is always the unknown token ``'<unk>'``. Reserved tokens follow,
    then corpus tokens ordered by decreasing frequency.

    Attributes set by ``__init__``:
        token_freqs: List of ``(token, count)`` pairs, most frequent first.
        unk: Index returned for tokens missing from the vocabulary (0).
        idx_to_token: List mapping index -> token.
        token_to_idx: Dict mapping token -> index.
    """
    def __init__(self, tokens = None, min_freq = 0, reserved_tokens = None):
        """Build the vocabulary.

        Args:
            tokens: Flat list of tokens or list of token lists; None means
                an empty corpus.
            min_freq: Corpus tokens with a count below this are left out.
            reserved_tokens: Tokens placed right after ``'<unk>'``
                regardless of their frequency; None means no reserved tokens.
        """
        if tokens is None:
            tokens = []
        if reserved_tokens is None:
            reserved_tokens = []

        # Sort corpus tokens by frequency, most frequent first.
        counter = self.count_corpus(tokens)
        self.token_freqs = sorted(counter.items(), key = lambda x: x[1], reverse = True)

        self.unk = 0
        self.idx_to_token, self.token_to_idx = [], dict()
        candidates = ['<unk>'] + list(reserved_tokens) + [
            token for token, freq in self.token_freqs if freq >= min_freq]
        for token in candidates:
            # Skip anything already indexed. This keeps '<unk>' at index 0
            # even if a caller repeats it (or any other token) in
            # `reserved_tokens`, and uses an O(1) dict lookup instead of
            # scanning a list.
            if token in self.token_to_idx:
                continue
            self.token_to_idx[token] = len(self.idx_to_token)
            self.idx_to_token.append(token)

    def count_corpus(self, tokens):
        """Count token frequencies.

        Args:
            tokens: Flat list of tokens, or a list of token lists which is
                flattened first.

        Returns:
            A ``collections.Counter`` keyed by token.
        """
        if len(tokens) == 0 or isinstance(tokens[0], list):
            tokens = [token for line in tokens for token in line]
        return collections.Counter(tokens)

    def __len__(self):
        """Return the number of indexed tokens, including ``'<unk>'``."""
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        """Return the index of a token, or a (nested) list of indices.

        Unknown tokens map to ``self.unk``. Lists and tuples are converted
        element by element, recursively.
        """
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]

    def to_tokens(self, indices):
        """Return the token for an index, or a list of tokens for a list/tuple of indices."""
        if not isinstance(indices, (list, tuple)):
            return self.idx_to_token[indices]
        return [self.idx_to_token[index] for index in indices]
        

In [None]:
# An empty vocabulary: with no tokens it contains only '<unk>'.
vcb = Vocab()

In [None]:
vcb

<__main__.Vocab at 0x7fc396d75c10>

In [None]:
# Rebuild the vocabulary from the word-tokenized lines.
vcb = Vocab(tokens)

In [None]:
# Number of indexed tokens, including '<unk>'; use the built-in rather than
# calling the dunder method directly.
len(vcb)

2947

In [None]:
print(list(vcb.token_to_idx.items())[:10])

[('<unk>', 0), ('the', 1), ('and', 2), ('to', 3), ('a', 4), ('I', 5), ('it', 6), ('she', 7), ('of', 8), ('said', 9)]


In [None]:
len(vcb)

2947

In [None]:
vcb[tokens[0]] ,tokens[0]

([10, 12, 268, 3, 104, 29, 497, 8, 370, 82, 16, 408, 20, 1],
 ['Alice',
  'was',
  'beginning',
  'to',
  'get',
  'very',
  'tired',
  'of',
  'sitting',
  'by',
  'her',
  'sister',
  'on',
  'the'])

In [None]:
def load_corpus_of_dataset(max_tokens = -1, location = 'wonderland'):
    """Return the token indices and the vocabulary of a text dataset.

    The file is read with `read_dataset`, tokenized into words, and every
    word is mapped to its index in a `Vocab` built from the same tokens.

    Args:
        max_tokens: If positive, keep only the first `max_tokens` indices of
            the corpus; otherwise keep everything.
        location: Path of the text file to load. Defaults to 'wonderland',
            the file this notebook was written against.

    Returns:
        A tuple ``(corpus, vocab)``: a flat list of token indices and the
        `Vocab` instance that produced them.
    """
    lines = read_dataset(location)
    tokens = tokenize(lines, 'word')
    vocab = Vocab(tokens)

    # Flatten the per-line token lists into one long index sequence.
    corpus = [vocab[token] for line in tokens for token in line]
    if max_tokens > 0:
        corpus = corpus[:max_tokens]
    return corpus, vocab



In [None]:
# Encode the whole text as vocabulary indices, then display the corpus
# length and the vocabulary size.
corpus , vocab = load_corpus_of_dataset()
len(corpus), len(vocab)

(27322, 2947)

In [None]:
import math
from mxnet import np,npx 
npx.set_np()

In [None]:
import random

In [None]:
class Sequential_data_iter:
    """Generate minibatches of subsequences from a token corpus."""
    def __init__(self,corpus, batch_size, num_steps, randm=False):
        """Store the corpus and the minibatch geometry.

        Args:
            corpus: Sliceable sequence of tokens (e.g. a list of indices).
            batch_size: Number of subsequences per minibatch.
            num_steps: Length of every subsequence.
            randm: Not used by this class; kept so existing callers that
                pass it keep working.
        """
        self.corpus = corpus
        self.batch_size = batch_size
        self.num_steps = num_steps
        # Bookkeeping refreshed by every call to `random_iter`.
        self.num_subseqs = 0
        self.num_batches = 0
        self.initial_indices = []

    def random_iter(self):
        """Yield ``(X, Y)`` minibatches using random sampling.

        A random offset in ``[0, num_steps - 1]`` is skipped at the start of
        the corpus, the rest is cut into `num_steps`-long subsequences, and
        their start positions are shuffled. `Y` holds the same windows as
        `X` shifted one token to the right.

        The stored corpus is never modified, so the method can be called
        once per epoch without the data shrinking between calls.
        """
        offset = random.randint(0, self.num_steps - 1)
        # Length of the corpus once the random prefix is skipped.
        remaining = max(len(self.corpus) - offset, 0)
        # One token is held back because labels are shifted by one.
        self.num_subseqs = (remaining - 1) // self.num_steps
        self.initial_indices = list(range(0, self.num_subseqs * self.num_steps, self.num_steps))
        random.shuffle(self.initial_indices)

        self.num_batches = self.num_subseqs // self.batch_size
        for i in range(0, self.batch_size * self.num_batches, self.batch_size):
            self.initial_indices_per_batch = self.initial_indices[i: i + self.batch_size]

            X = [self.__data(j, offset) for j in self.initial_indices_per_batch]
            Y = [self.__data(j + 1, offset) for j in self.initial_indices_per_batch]
            yield np.array(X), np.array(Y)

    def __data(self, pos, offset=0):
        """Return the `num_steps`-long window starting at `offset + pos`."""
        start = offset + pos
        return self.corpus[start: start + self.num_steps]


In [None]:
def seq_data_iter_random(corpus, batch_size, num_steps):
    """Yield ``(X, Y)`` minibatches of subsequences using random sampling.

    Args:
        corpus: Sliceable sequence of tokens.
        batch_size: Number of subsequences per minibatch.
        num_steps: Length of every subsequence.

    Yields:
        Two arrays built with ``np.array``: the sampled windows, and the
        same windows shifted one token to the right.
    """
    # Drop a random prefix (0 .. num_steps - 1 tokens) so that the partition
    # boundaries differ from call to call.
    shifted = corpus[random.randint(0, num_steps - 1):]
    # One token is held back because every label window is shifted by one.
    subseq_count = (len(shifted) - 1) // num_steps
    starts = [k * num_steps for k in range(subseq_count)]
    random.shuffle(starts)

    batch_count = subseq_count // batch_size
    for b in range(batch_count):
        picked = starts[b * batch_size: (b + 1) * batch_size]
        features = [shifted[s: s + num_steps] for s in picked]
        labels = [shifted[s + 1: s + 1 + num_steps] for s in picked]
        yield np.array(features), np.array(labels)


In [None]:
# Toy sequence 0..34, so sampled windows are easy to check by eye.
my_seq = list(range(35))
data_iter = seq_data_iter_random(my_seq, batch_size = 2, num_steps = 5)

In [None]:
# Print only the first two minibatches produced by the generator.
i = 0
for X,Y in (data_iter):
    if i>=2:
        break
    print('X : ',X, '\nY: ',Y)
    i+=1

X :  [[14. 15. 16. 17. 18.]
 [19. 20. 21. 22. 23.]] 
Y:  [[15. 16. 17. 18. 19.]
 [20. 21. 22. 23. 24.]]
X :  [[29. 30. 31. 32. 33.]
 [24. 25. 26. 27. 28.]] 
Y:  [[30. 31. 32. 33. 34.]
 [25. 26. 27. 28. 29.]]


In [None]:
def seq_data_iter_sequential(corpus, batch_size, num_steps):
    """Yield ``(X, Y)`` minibatches using sequential partitioning.

    The corpus (after a random offset) is reshaped into `batch_size` rows.
    Successive minibatches take consecutive `num_steps`-wide column slices,
    so row ``r`` of one batch continues row ``r`` of the previous batch.

    Args:
        corpus: Sliceable sequence of tokens.
        batch_size: Number of rows (parallel subsequences) per minibatch.
        num_steps: Number of columns (time steps) per minibatch.

    Yields:
        ``(X, Y)`` array slices, where `Y` is `X` shifted by one token.
    """
    # Random starting point in [0, num_steps].
    start = random.randint(0, num_steps)
    # Largest multiple of batch_size that still leaves one token for the labels.
    usable = ((len(corpus) - start - 1) // batch_size) * batch_size
    features = np.array(corpus[start: start + usable])
    labels = np.array(corpus[start + 1: start + 1 + usable])
    features = features.reshape(batch_size, -1)
    labels = labels.reshape(batch_size, -1)

    batch_count = features.shape[1] // num_steps
    for b in range(batch_count):
        lo = b * num_steps
        hi = lo + num_steps
        yield features[:, lo:hi], labels[:, lo:hi]

In [None]:
# Sequential-partitioning demo on the same toy sequence: print every minibatch.
for X, Y in seq_data_iter_sequential(my_seq, batch_size=2, num_steps=5):
    print('X: ', X, '\nY:', Y)

X:  [[ 0.  1.  2.  3.  4.]
 [17. 18. 19. 20. 21.]] 
Y: [[ 1.  2.  3.  4.  5.]
 [18. 19. 20. 21. 22.]]
X:  [[ 5.  6.  7.  8.  9.]
 [22. 23. 24. 25. 26.]] 
Y: [[ 6.  7.  8.  9. 10.]
 [23. 24. 25. 26. 27.]]
X:  [[10. 11. 12. 13. 14.]
 [27. 28. 29. 30. 31.]] 
Y: [[11. 12. 13. 14. 15.]
 [28. 29. 30. 31. 32.]]


In [None]:
class SeqDataLoader:
    """Iterable that yields sequence minibatches from the loaded corpus."""
    def __init__(self, batch_size, num_steps, use_random_iter, max_tokens):
        """Load the corpus and pick the sampling strategy.

        Args:
            batch_size: Number of subsequences per minibatch.
            num_steps: Length of every subsequence.
            use_random_iter: Truthy selects `seq_data_iter_random`,
                otherwise `seq_data_iter_sequential` is used.
            max_tokens: Forwarded to `load_corpus_of_dataset`.
        """
        self.data_iter_fn = (seq_data_iter_random if use_random_iter
                             else seq_data_iter_sequential)
        self.corpus, self.vocab = load_corpus_of_dataset(max_tokens)
        self.batch_size = batch_size
        self.num_steps = num_steps

    def __iter__(self):
        """Start a fresh pass over the corpus with the chosen sampler."""
        make_batches = self.data_iter_fn
        return make_batches(self.corpus, self.batch_size, self.num_steps)

In [None]:
def load_data(batch_size, num_steps,
    use_random_iter=False, max_tokens=10000):
    """Build a `SeqDataLoader` and return it together with its vocabulary.

    Args:
        batch_size: Number of subsequences per minibatch.
        num_steps: Length of every subsequence.
        use_random_iter: Whether the loader samples randomly instead of
            partitioning sequentially.
        max_tokens: Passed through to `SeqDataLoader`.

    Returns:
        A tuple ``(loader, vocab)`` where `vocab` is the loader's `vocab`
        attribute.
    """
    loader = SeqDataLoader(batch_size, num_steps, use_random_iter, max_tokens)
    vocabulary = loader.vocab
    return loader, vocabulary


In [None]:
from mxnet import np, npx
from mxnet.gluon import rnn, nn
npx.set_np()
# Minibatch geometry: 32 subsequences per batch, 3 tokens per subsequence.
batch_size, num_steps = 32, 3

# Build the data iterator and its vocabulary; with load_data's defaults this
# uses sequential partitioning and max_tokens=10000.
train_iter, vocab = load_data(batch_size, num_steps)

In [None]:
from mxnet import autograd,gluon

In [None]:
npx.gpu(0)

gpu(0)

In [None]:
# Model size and target device: 256 hidden units, first GPU.
vocab_size, num_hiddens, device = len(vocab), 256, npx.gpu(0)
# Training schedule: 500 epochs with a learning rate of 1.
num_epochs, lr = 500, 1

In [None]:
class RNNModel(nn.Block):
    """Recurrent layer followed by a dense projection onto the vocabulary."""
    def __init__(self, rnn_layer, vocab_size, **kwargs):
        """Wrap `rnn_layer` and add a `vocab_size`-unit dense output layer.

        Args:
            rnn_layer: Recurrent layer; it must provide `begin_state` and
                return ``(output, state)`` when called.
            vocab_size: Depth of the one-hot input encoding and number of
                units in the output layer.
            **kwargs: Forwarded to the `nn.Block` constructor.
        """
        super().__init__(**kwargs)
        self.rnn = rnn_layer
        self.vocab_size = vocab_size
        self.dense = nn.Dense(vocab_size)

    def forward(self, inputs, state):
        """One-hot encode the transposed inputs, run the RNN, project to logits.

        Returns:
            A tuple ``(output, state)``: the dense-layer output and the new
            recurrent state.
        """
        one_hot_steps = npx.one_hot(inputs.T, self.vocab_size)
        hidden_seq, new_state = self.rnn(one_hot_steps, state)
        # Flatten to (`num_steps` * `batch_size`, `num_hiddens`) before the
        # fully-connected layer; its output shape is
        # (`num_steps` * `batch_size`, `vocab_size`).
        flat_hidden = hidden_seq.reshape(-1, hidden_seq.shape[-1])
        return self.dense(flat_hidden), new_state

    def begin_state(self, *args, **kwargs):
        """Delegate initial-state creation to the wrapped recurrent layer."""
        initial = self.rnn.begin_state(*args, **kwargs)
        return initial


In [None]:
import time

In [None]:
class Timer:
    """Stopwatch that keeps a list of every measured interval."""
    def __init__(self):
        """Create an empty interval list and start timing immediately."""
        self.times = []
        self.start()

    def start(self):
        """Remember the current wall-clock time as the interval start."""
        self.tik = time.time()

    def stop(self):
        """Record the time elapsed since `start` and return it."""
        elapsed = time.time() - self.tik
        self.times.append(elapsed)
        return elapsed

    def avg(self):
        """Return the mean of the recorded intervals."""
        total = sum(self.times)
        return total / len(self.times)

    def sum(self):
        """Return the total of the recorded intervals."""
        total = sum(self.times)
        return total

    def cumsum(self):
        """Return the running totals of the recorded intervals as a list."""
        running = np.array(self.times).cumsum()
        return running.tolist()

In [None]:
class Accumulator:
    """Keep running sums for `n` quantities."""
    def __init__(self, n):
        """Start with `n` sums, all equal to 0.0."""
        self.data = [0.0 for _ in range(n)]

    def add(self, *args):
        """Add each argument (converted to float) to the sum in the same position."""
        updated = []
        for running, increment in zip(self.data, args):
            updated.append(running + float(increment))
        self.data = updated

    def reset(self):
        """Set every running sum back to 0.0."""
        self.data = [0.0 for _ in self.data]

    def __getitem__(self, idx):
        """Return the running sum stored at position `idx`."""
        return self.data[idx]


In [None]:
def grad_clipping(net, theta):
    """Rescale gradients in place when their global L2 norm exceeds `theta`.

    Args:
        net: Either a `gluon.Block` (parameters are collected from it) or an
            object exposing a `params` collection.
        theta: Norm threshold. Gradients are scaled by ``theta / norm`` only
            when ``norm > theta``.
    """
    params = ([p.data() for p in net.collect_params().values()]
              if isinstance(net, gluon.Block) else net.params)
    # Norm taken over all gradients together.
    squared_total = sum((p.grad ** 2).sum() for p in params)
    norm = math.sqrt(squared_total)
    if norm > theta:
        for clipped in params:
            clipped.grad[:] *= theta / norm


In [None]:
def train_epoch_ch8(net, train_iter, loss, updater, device, use_random_iter):
    """Train a model for one epoch (defined in Chapter 8).

    Args:
        net: Model called as ``net(X, state)`` and providing `begin_state`.
        train_iter: Iterable of ``(X, Y)`` minibatches.
        loss: Callable ``loss(y_hat, y)`` returning per-example losses.
        updater: Callable taking a `batch_size` keyword that applies one
            optimization step.
        device: Context the data and the initial state are placed on.
        use_random_iter: If truthy, the recurrent state is re-initialized
            for every minibatch instead of being carried over.

    Returns:
        A tuple ``(perplexity, tokens_per_second)`` for the epoch.
    """
    state, timer = None, Timer()
    metric = Accumulator(2) # Sum of training loss, no. of tokens
    for X, Y in train_iter:
        if state is None or use_random_iter:
            # Initialize `state` when either it is the first iteration or
            # using random sampling
            state = net.begin_state(batch_size=X.shape[0], ctx=device)
        else:
            # `detach()` returns a new array and leaves the original
            # untouched, so the result has to be kept; calling it only for
            # its side effect does nothing. Rebinding `state` is what cuts
            # the link to the previous minibatch's computation graph.
            detached = [s.detach() for s in state]
            state = tuple(detached) if isinstance(state, tuple) else detached
        y = Y.T.reshape(-1)
        X, y = X.as_in_ctx(device), y.as_in_ctx(device)
        with autograd.record():
            y_hat, state = net(X, state)
            l = loss(y_hat, y).mean()
        l.backward()
        grad_clipping(net, 1)
        updater(batch_size=1) # Since the `mean` function has been invoked
        metric.add(l * y.size, y.size)
    return math.exp(metric[0] / metric[1]), metric[1] / timer.stop()

In [None]:
import matplotlib.pyplot as plt
from mxnet import init
from IPython import display
%matplotlib inline


In [None]:
def set_axes(axes, xlabel, ylabel, xlim, ylim, xscale, yscale, legend):
    """Apply labels, scales, limits, an optional legend and a grid to `axes`.

    Args:
        axes: Object exposing the matplotlib axes setters used below.
        xlabel, ylabel: Axis labels.
        xlim, ylim: Axis limits.
        xscale, yscale: Axis scale names.
        legend: Legend entries; the legend is drawn only when this is truthy.
    """
    settings = (
        ("set_xlabel", xlabel),
        ("set_ylabel", ylabel),
        ("set_xscale", xscale),
        ("set_yscale", yscale),
        ("set_xlim", xlim),
        ("set_ylim", ylim),
    )
    for setter_name, value in settings:
        getattr(axes, setter_name)(value)
    if legend:
        axes.legend(legend)
    axes.grid()

In [None]:
class Animator:
    """Redraw a set of line plots every time new data points are added."""
    def __init__(self, xlabel=None, ylabel=None, legend=None, xlim=None,
                ylim=None, xscale='linear', yscale='linear',
                fmts=('-', 'm--', 'g-.', 'r:'), nrows=1, ncols=1,
                figsize=(5.5, 3.5)):
        """Create the figure and remember how its first axes is configured.

        Args:
            xlabel, ylabel, xlim, ylim, xscale, yscale: Passed to `set_axes`
                on every redraw.
            legend: Legend entries; None means no legend.
            fmts: Line format strings, one per plotted line.
            nrows, ncols, figsize: Passed to ``plt.subplots``.
        """
        legend_entries = [] if legend is None else legend
        display.set_matplotlib_formats('svg')
        self.fig, self.axes = plt.subplots(nrows, ncols, figsize=figsize)
        if nrows * ncols == 1:
            # A single subplot is returned bare; wrap it so `axes[0]` works.
            self.axes = [self.axes]

        def _configure():
            # Closure over the constructor arguments; re-applied after each
            # `cla()` because clearing the axes drops labels and limits.
            set_axes(self.axes[0], xlabel, ylabel, xlim, ylim,
                     xscale, yscale, legend_entries)

        self.config_axes = _configure
        self.X = None
        self.Y = None
        self.fmts = fmts

    def add(self, x, y):
        """Append one point per line and redraw the figure.

        Args:
            x: A single x value shared by all lines, or one x value per line.
            y: A single y value, or one y value per line.
        """
        ys = y if hasattr(y, "__len__") else [y]
        n = len(ys)
        xs = x if hasattr(x, "__len__") else [x] * n
        if not self.X:
            self.X = [[] for _ in range(n)]
        if not self.Y:
            self.Y = [[] for _ in range(n)]
        for idx, (x_val, y_val) in enumerate(zip(xs, ys)):
            if x_val is None or y_val is None:
                continue
            self.X[idx].append(x_val)
            self.Y[idx].append(y_val)

        primary = self.axes[0]
        primary.cla()
        for line_x, line_y, fmt in zip(self.X, self.Y, self.fmts):
            primary.plot(line_x, line_y, fmt)
        self.config_axes()
        display.display(self.fig)
        display.clear_output(wait=True)


In [None]:
def sgd(params, lr, batch_size):
    """Apply one minibatch stochastic gradient descent step in place.

    Args:
        params: Iterable of parameters; each must expose `.grad` and support
            slice assignment.
        lr: Learning rate.
        batch_size: Divisor applied to the gradient.
    """
    for weight in params:
        step = lr * weight.grad / batch_size
        weight[:] = weight - step

In [None]:
def predict_ch8(prefix, num_preds, net, vocab, device):
    """Generate tokens that follow `prefix` and return them joined by spaces.

    Args:
        prefix: Non-empty sequence of tokens used to warm up the state.
        num_preds: Number of prediction steps to run after the warm-up.
        net: Model called as ``net(X, state)`` and providing `begin_state`.
        vocab: Vocabulary used to map tokens to indices and back.
        device: Context for the state and the input arrays.

    Returns:
        A space-joined string of the tokens at positions
        ``len(prefix)`` .. second-to-last of the generated index list.
    """
    state = net.begin_state(batch_size=1, ctx=device)
    outputs = [vocab[prefix[0]]]

    def get_input():
        # The most recent index, shaped (1, 1) for the model.
        return np.reshape(np.array([outputs[-1]], ctx=device), (1, 1))

    # Warm-up: feed the known prefix so the state reflects it.
    for token in prefix[1:]:
        _, state = net(get_input(), state)
        outputs.append(vocab[token])

    # Prediction: feed each predicted index back in as the next input.
    for _ in range(num_preds):
        logits, state = net(get_input(), state)
        outputs.append(int(logits.argmax(axis=1).reshape(1)))

    # NOTE(review): the `-1` drops the final predicted index, so only
    # `num_preds - 1` predictions are returned - confirm this is intended.
    generated = outputs[len(prefix):-1]
    return ' '.join([vocab.idx_to_token[i] for i in generated])


In [None]:
def sorting(y):
    """Return up to the three largest items of `y` in ascending order."""
    ordered = list(y)
    ordered.sort()
    return ordered[-3:]

In [None]:
sorting([3,2,54,3])

[3, 3, 54]

In [None]:
def train_ch8(net, train_iter, vocab, lr, num_epochs, device,
              use_random_iter=False):
    """Train a model (defined in Chapter 8) and plot its perplexity.

    Args:
        net: Either a `gluon.Block` or an object exposing `params`.
        train_iter: Iterable of ``(X, Y)`` minibatches, re-iterated each epoch.
        vocab: Accepted for interface compatibility; not used in the body.
        lr: Learning rate.
        num_epochs: Number of passes over `train_iter`.
        device: Context used for initialization and training.
        use_random_iter: Forwarded to `train_epoch_ch8`.
    """
    loss = gluon.loss.SoftmaxCrossEntropyLoss()
    animator = Animator(xlabel='epoch', ylabel='perplexity',
                        legend=['train'], xlim=[10, num_epochs])

    # Choose the update rule: a Gluon trainer for Blocks, plain `sgd` otherwise.
    if isinstance(net, gluon.Block):
        net.initialize(ctx=device, force_reinit=True, init=init.Normal(0.01))
        trainer = gluon.Trainer(net.collect_params(), 'sgd',
                                {'learning_rate': lr})

        def updater(batch_size):
            return trainer.step(batch_size)
    else:
        def updater(batch_size):
            return sgd(net.params, lr, batch_size)

    for epoch in range(1, num_epochs + 1):
        ppl, speed = train_epoch_ch8(net, train_iter, loss, updater, device,
                                     use_random_iter)
        # Plot every tenth epoch only.
        if epoch % 10 == 0:
            animator.add(epoch, [ppl])
    print(f'perplexity {ppl:.1f}, {speed:.1f} tokens/sec on {str(device)}')


In [None]:
# Uncomment to train on the CPU instead of the device chosen earlier:
#device = npx.cpu()
# Wrap a single LSTM layer with `num_hiddens` units in RNNModel and train it
# with the hyperparameters defined above.
lstm_layer = rnn.LSTM(num_hiddens)
model = RNNModel(lstm_layer, len(vocab))
train_ch8(model, train_iter, vocab, lr, num_epochs, device)


In [None]:

# Convenience wrapper: split the prefix on whitespace and run 2 prediction
# steps with the trained model.
predict = lambda prefix: predict_ch8(prefix.split(), 2, model, vocab, device)

In [None]:
predict('And')

In [None]:
'ram'.split()

In [None]:
vocab.token_to_idx