# Summary
This notebook mainly contains two things:

1 WordCloud<br>
2 Language Model

Dataset : Quotes-500k <br>
The notebook also generates text (quotes) from the trained language model. Most of the time the output doesn't make much sense, but it does sound like quotes. The model has also learnt punctuation, such as commas and the `<eos>` end-of-sentence marker.

<img src="https://i.imgur.com/VkOp3Qw.png" />

## Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud,STOPWORDS

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)


In [None]:
# Load the Quotes-500k CSV; the word-cloud cell reads its 'quote' column.
df = pd.read_csv('../input/quotes-500k/quotes.csv')

## Word-cloud

In [None]:
# Concatenate every quote into one space-separated string.
dff = df['quote']
text = dff.str.cat(sep=' ')

# Library stop words plus two extra words that are excluded from the cloud.
stopwords = set(STOPWORDS) | {"said", "one"}

wc = WordCloud(
    max_font_size=40,
    max_words=200,
    stopwords=stopwords,
    contour_width=3,
    contour_color='steelblue',
)
wordcloud = wc.generate(text)

# Render the cloud as an image without axes.
plt.figure(figsize=(12, 9))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

# Language Model

## Data preparation

In [None]:
# source of code https://github.com/pytorch/examples.git

import os
from io import open
import torch

class Dictionary(object):
    """Two-way mapping between words and consecutive integer ids.

    Ids are handed out in order of first insertion, starting at 0.
    """

    def __init__(self):
        # word -> id
        self.word2idx = {}
        # id -> word (position in the list is the id)
        self.idx2word = []

    def add_word(self, word):
        """Register `word` if it is new and return its id."""
        idx = self.word2idx.get(word)
        if idx is None:
            idx = len(self.idx2word)
            self.idx2word.append(word)
            self.word2idx[word] = idx
        return idx

    def __len__(self):
        """Number of distinct words registered so far."""
        return len(self.idx2word)


class Corpus(object):
    """Train/validation/test token streams that share one vocabulary.

    `path` is a directory expected to contain `train.txt`, `valid.txt` and
    `test.txt`. After construction:

    * `dictionary` holds every word seen in the three files,
    * `train`, `valid` and `test` are 1-D int64 tensors of word ids.
    """

    def __init__(self, path):
        self.dictionary = Dictionary()
        # Order matters: ids are assigned on first sight, train file first.
        self.train = self.tokenize(os.path.join(path, 'train.txt'))
        self.valid = self.tokenize(os.path.join(path, 'valid.txt'))
        self.test = self.tokenize(os.path.join(path, 'test.txt'))

    def tokenize(self, path):
        """Tokenizes a text file.

        Each line is split on whitespace and terminated with an `<eos>`
        token. New words are added to `self.dictionary` as they are met, so
        the file is read only once.

        Returns:
            A 1-D int64 tensor with the ids of all tokens in file order
            (empty when the file has no lines).

        Raises:
            FileNotFoundError: if `path` does not exist.
        """
        # Explicit check instead of `assert`, which is stripped under -O.
        if not os.path.exists(path):
            raise FileNotFoundError(path)

        add_word = self.dictionary.add_word
        per_line = []
        with open(path, 'r', encoding="utf8") as f:
            for line in f:
                words = line.split() + ['<eos>']
                # add_word returns the id for both new and known words.
                per_line.append(
                    torch.tensor([add_word(word) for word in words],
                                 dtype=torch.int64))

        # torch.cat rejects an empty list, so handle a file with no lines.
        if not per_line:
            return torch.empty(0, dtype=torch.int64)
        return torch.cat(per_line)


## Model

In [None]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F

class RNNModel(nn.Module):
    """Language model: embedding -> stacked recurrent layers -> linear decoder.

    Args:
        rnn_type: one of 'LSTM', 'GRU', 'RNN_TANH' or 'RNN_RELU'.
        ntoken: vocabulary size (rows of the embedding, outputs of the decoder).
        ninp: embedding dimension.
        nhid: hidden units per recurrent layer.
        nlayers: number of stacked recurrent layers.
        dropout: dropout probability used on the embeddings, between
            recurrent layers and on the recurrent output.
        tie_weights: share one weight matrix between encoder and decoder;
            requires ``nhid == ninp``.

    Raises:
        ValueError: for an unknown `rnn_type`, or when `tie_weights` is set
            and ``nhid != ninp``.
    """

    def __init__(self, rnn_type, ntoken, ninp, nhid, nlayers, dropout=0.5, tie_weights=False):
        super().__init__()
        self.rnn_type = rnn_type
        self.nhid = nhid
        self.nlayers = nlayers

        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(ntoken, ninp)
        if rnn_type == 'LSTM' or rnn_type == 'GRU':
            # nn.LSTM / nn.GRU are looked up by name.
            recurrent_cls = getattr(nn, rnn_type)
            self.rnn = recurrent_cls(ninp, nhid, nlayers, dropout=dropout)
        else:
            nonlinearity = {'RNN_TANH': 'tanh', 'RNN_RELU': 'relu'}.get(rnn_type)
            if nonlinearity is None:
                raise ValueError( """An invalid option for `--model` was supplied,
                                 options are ['LSTM', 'GRU', 'RNN_TANH' or 'RNN_RELU']""")
            self.rnn = nn.RNN(ninp, nhid, nlayers, nonlinearity=nonlinearity, dropout=dropout)
        self.decoder = nn.Linear(nhid, ntoken)

        # Optional weight tying, following:
        # "Using the Output Embedding to Improve Language Models" (Press & Wolf 2016)
        # https://arxiv.org/abs/1608.05859
        # and
        # "Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling" (Inan et al. 2016)
        # https://arxiv.org/abs/1611.01462
        if tie_weights:
            if nhid != ninp:
                raise ValueError('When using the tied flag, nhid must be equal to emsize')
            self.decoder.weight = self.encoder.weight

        self.init_weights()

    def init_weights(self):
        """Re-draw encoder/decoder weights from U(-0.1, 0.1) and zero the decoder bias."""
        bound = 0.1
        # Encoder first, then decoder, so the random draws keep their order.
        for weight in (self.encoder.weight, self.decoder.weight):
            weight.data.uniform_(-bound, bound)
        self.decoder.bias.data.zero_()

    def forward(self, input, hidden):
        """Return ``(decoded, hidden)`` for a batch of token ids.

        `input` is embedded, passed through the recurrent stack starting from
        `hidden`, and projected to vocabulary-sized scores; dropout is
        applied to the embeddings and to the recurrent output.
        """
        embedded = self.drop(self.encoder(input))
        rnn_out, hidden = self.rnn(embedded, hidden)
        return self.decoder(self.drop(rnn_out)), hidden

    def init_hidden(self, bsz):
        """Zero initial state for batch size `bsz`.

        Returns a ``(h, c)`` pair for LSTM and a single tensor otherwise,
        each shaped ``(nlayers, bsz, nhid)`` and created with the dtype and
        device of the model's first parameter.
        """
        reference = next(self.parameters())
        state_shape = (self.nlayers, bsz, self.nhid)
        if self.rnn_type != 'LSTM':
            return reference.new_zeros(*state_shape)
        return (reference.new_zeros(*state_shape),
                reference.new_zeros(*state_shape))


## Parameters

In [None]:
# Hyper-parameters and paths for the training cell.

# Passed to torch.manual_seed before data loading and model construction.
seed = 2020
# When true the training cell selects the "cuda" device, otherwise "cpu".
cuda = True
# Directory handed to Corpus, which reads train.txt, valid.txt and test.txt from it.
data_path = '../input/quotes-500k/'
# Number of parallel columns the training split is batchified into.
batch_size = 20
# Number of columns used for the validation and test splits.
eval_batch_size = 10
# Maximum number of time steps per chunk returned by get_batch.
bptt = 35
# rnn_type given to RNNModel: 'LSTM', 'GRU', 'RNN_TANH' or 'RNN_RELU'.
model_name = 'LSTM'
# Embedding size (RNNModel's ninp).
emsize = 200
# Hidden units per recurrent layer; RNNModel requires nhid == emsize when tied.
nhid = 200
# Number of stacked recurrent layers.
nlayers = 2
# Dropout probability passed to RNNModel.
dropout = 0.2
# Initial step size of the manual SGD update in train(); divided by 4 after
# an epoch whose validation loss does not improve on the best so far.
lr = 20
# Maximum gradient norm passed to clip_grad_norm_.
clip = 0.25
# Upper bound on the number of training epochs.
epochs = 8
# Share the embedding matrix with the decoder (RNNModel's tie_weights).
tied = True
# File the best model (lowest validation loss) is saved to and reloaded from.
save = 'model.pth'
# Number of batches between training progress reports.
log_interval = 2000
# Target path for the ONNX export; the empty string skips the export.
onnx_export = ''
# Not referenced by any code in this notebook.
# NOTE(review): presumably left over from a Transformer option in the upstream example - confirm.
nhead = 2

## Training

In [None]:
import time
import math
import os
import torch
import torch.nn as nn
import torch.onnx


# Seed PyTorch's RNG before any data loading or model construction.
torch.manual_seed(seed)

# Honour the `cuda` flag only when a GPU is actually available; otherwise the
# first `.to(device)` call would raise instead of running on the CPU.
if cuda and not torch.cuda.is_available():
    print("WARNING: cuda was requested but is not available, using the CPU instead")
device = torch.device("cuda" if cuda and torch.cuda.is_available() else "cpu")

###############################################################################
# Load data
###############################################################################

# Builds the vocabulary and the train/valid/test id tensors from data_path.
corpus = Corpus(data_path)


print("Data loaded ")

def batchify(data, bsz):
    """Arrange a 1-D token stream into `bsz` parallel columns on `device`.

    The stream is cut into `bsz` equally long contiguous stretches (tokens
    that do not fit are dropped from the end); stretch k becomes column k of
    the result, so the returned tensor has shape ``(len(data) // bsz, bsz)``.
    """
    # Largest prefix whose length is a multiple of bsz.
    usable = (data.size(0) // bsz) * bsz
    trimmed = data.narrow(0, 0, usable)
    # One row per stretch, then transpose so each stretch runs down a column.
    columns = trimmed.view(bsz, -1).transpose(0, 1).contiguous()
    return columns.to(device)


# Reshape each split into parallel columns on `device`: batch_size columns
# for training, eval_batch_size columns for validation and test.
train_data = batchify(corpus.train, batch_size)
val_data = batchify(corpus.valid, eval_batch_size)
test_data = batchify(corpus.test, eval_batch_size)

###############################################################################
# Build the model
###############################################################################

# Vocabulary size; used as RNNModel's ntoken.
ntokens = len(corpus.dictionary)

# Recurrent language model configured from the hyper-parameters and moved to `device`.
model = RNNModel(model_name, ntokens, emsize, nhid, nlayers, dropout, tied).to(device)

# Loss applied to the flattened decoder output against the next-token targets.
criterion = nn.CrossEntropyLoss()

###############################################################################
# Training code
###############################################################################

def repackage_hidden(h):
    """Return `h` detached from the autograd graph that produced it.

    A tensor is detached directly; any other iterable is processed
    element by element and returned as a tuple.
    """
    if not isinstance(h, torch.Tensor):
        return tuple(map(repackage_hidden, h))
    return h.detach()


# get_batch subdivides the source data into chunks of length bptt.
# If source is equal to the example output of the batchify function, with
# a bptt-limit of 2, we'd get the following two Variables for i = 0:
# ┌ a g m s ┐ ┌ b h n t ┐
# └ b h n t ┘ └ c i o u ┘
# Note that despite the name of the function, the subdivision of data is not
# done along the batch dimension (i.e. dimension 1), since that was handled
# by the batchify function. The chunks are along dimension 0, corresponding
# to the seq_len dimension in the LSTM.

def get_batch(source, i):
    """Return the input chunk starting at row `i` and its flattened targets.

    The chunk spans at most `bptt` rows and is shortened near the end of
    `source` so that every input row still has a following row to predict.
    Targets are the same rows shifted down by one, flattened to 1-D.
    """
    length = min(bptt, len(source) - 1 - i)
    stop = i + length
    inputs = source[i:stop]
    targets = source[i + 1:stop + 1].view(-1)
    return inputs, targets


def evaluate(data_source):
    """Return the average per-row loss of `model` on `data_source`.

    Args:
        data_source: a batchified tensor (rows are time steps, columns are
            parallel streams) with at least two rows.

    Returns:
        The loss of each chunk weighted by its number of rows, summed and
        divided by ``len(data_source) - 1``.

    Reads the module-level `model`, `corpus`, `criterion` and `bptt`.
    """
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0.
    ntokens = len(corpus.dictionary)
    # Size the hidden state from the data itself so any column count works,
    # not only the one produced with eval_batch_size.
    hidden = model.init_hidden(data_source.size(1))

    with torch.no_grad():
        for i in range(0, data_source.size(0) - 1, bptt):
            data, targets = get_batch(data_source, i)
            output, hidden = model(data, hidden)
            hidden = repackage_hidden(hidden)
            output_flat = output.view(-1, ntokens)
            # criterion averages over the chunk; re-weight by chunk length so
            # a shorter final chunk does not count as much as a full one.
            total_loss += len(data) * criterion(output_flat, targets).item()
    return total_loss / (len(data_source) - 1)


def train():
    """Run one pass over `train_data`, updating `model` with plain SGD.

    For every chunk of at most `bptt` rows: detach the hidden state, run the
    model, back-propagate the loss, clip the gradient norm to `clip` and
    take one step of size `lr`. Every `log_interval` batches the mean loss
    and perplexity since the last report are printed.

    Reads the module-level `model`, `corpus`, `criterion`, `train_data`,
    `bptt`, `clip`, `lr`, `log_interval` and `epoch` (the latter only for
    the progress message).
    """
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0.
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    # One hidden-state slot per column of the batchified training data.
    hidden = model.init_hidden(train_data.size(1))

    for batch, i in enumerate(range(0, train_data.size(0) - 1, bptt)):
        data, targets = get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        model.zero_grad()

        hidden = repackage_hidden(hidden)
        output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        for p in model.parameters():
            # Keyword `alpha` replaces the deprecated add_(scalar, tensor) overload.
            p.data.add_(p.grad, alpha=-lr)

        total_loss += loss.item()

        if batch % log_interval == 0 and batch > 0:
            cur_loss = total_loss / log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                    'loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch, len(train_data) // bptt, lr,
                elapsed * 1000 / log_interval, cur_loss, math.exp(cur_loss)))
            # Start a fresh reporting window.
            total_loss = 0
            start_time = time.time()


def export_onnx(path, batch_size, seq_len):
    """Export the module-level `model` to ONNX format at `path`.

    Args:
        path: destination file for the exported model.
        batch_size: number of columns of the dummy input used for export.
        seq_len: number of rows (time steps) of the dummy input.
    """
    # Report the path actually written to, i.e. the argument rather than
    # the module-level onnx_export setting.
    print('The model is also exported in ONNX format at {}'.
          format(os.path.realpath(path)))
    model.eval()
    # All-zero token ids of shape (seq_len, batch_size), created on `device`.
    dummy_input = torch.zeros(seq_len, batch_size, dtype=torch.long, device=device)
    hidden = model.init_hidden(batch_size)
    torch.onnx.export(model, (dummy_input, hidden), path)


# Loop over epochs.
# Lowest validation loss seen so far; None until the first epoch finishes.
best_val_loss = None

# At any point you can hit Ctrl + C to break out of training early.
try:
    for epoch in range(1, epochs+1):
        epoch_start_time = time.time()
        train()
        val_loss = evaluate(val_data)
        print('-' * 89)
        print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                           val_loss, math.exp(val_loss)))
        print('-' * 89)
        # Save the model if the validation loss is the best we've seen so far.
        # Compare against None explicitly: a truthiness test would also treat
        # a recorded loss of 0.0 as "nothing recorded yet".
        if best_val_loss is None or val_loss < best_val_loss:
            torch.save(model, save)
            best_val_loss = val_loss
        else:
            # Anneal the learning rate if no improvement has been seen in the validation dataset.
            lr /= 4.0
except KeyboardInterrupt:
    print('-' * 89)
    print('Exiting from training early')

# Load the best saved model.
# NOTE(review): the checkpoint is written only at the end of an epoch (see the
# loop above), so interrupting before the first epoch finishes leaves nothing
# from this run to load.
# NOTE(review): the file holds the whole pickled module object; recent PyTorch
# releases are reported to default torch.load to weights_only=True, which
# would reject it - confirm against the installed version.

model = torch.load(save)
# after load the rnn params are not a continuous chunk of memory
# this makes them a continuous chunk, and will speed up forward pass
# Currently, only rnn model supports flatten_parameters function.
model.rnn.flatten_parameters()

# Run on test data.
test_loss = evaluate(test_data)
print('=' * 89)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
    test_loss, math.exp(test_loss)))
print('=' * 89)

# An empty onnx_export string means "do not export".
if len(onnx_export) > 0:
    # Export the model in ONNX format.
    export_onnx(onnx_export, batch_size=1, seq_len=bptt)


## Generate Text

In [None]:
# Settings for the text-generation cell.

# Directory handed to Corpus to rebuild the vocabulary.
data_path = '/kaggle/input/quotes-500k/'
# Saved model file loaded with torch.load.
checkpoint = 'model.pth'
# File the generated words are written to.
outf = 'generated.txt'
# Number of words to generate.
words = 1000
# Passed to torch.manual_seed before sampling.
seed = 2020
# When true the generation cell selects the "cuda" device, otherwise "cpu".
cuda = True
# Divisor applied to the model output before exponentiation; the cell rejects values below 1e-3.
temperature = 0.7   #temperature - higher will increase diversity
# Number of generated words between progress messages.
log_interval = 100
# Accumulates the generated text so it can be printed at the end.
gen_text = ''

In [None]:

import torch

# Set the random seed manually for reproducibility.
torch.manual_seed(seed)

# Honour the `cuda` flag only when a GPU is actually available.
device = torch.device("cuda" if cuda and torch.cuda.is_available() else "cpu")

# Reject temperatures below 1e-3 before they are used as a divisor below.
# (There is no argument parser in this notebook, so raise directly.)
if temperature < 1e-3:
    raise ValueError("--temperature has to be greater or equal 1e-3")

# map_location lets a checkpoint saved from a GPU run load on `device`.
model = torch.load(checkpoint, map_location=device).to(device)
model.eval()

# The corpus is rebuilt only to recover the id -> word vocabulary.
corpus = Corpus(data_path)
ntokens = len(corpus.dictionary)
print("data loaded")

hidden = model.init_hidden(1)
# Start from a uniformly random token id, shaped (1, 1).
current_token = torch.randint(ntokens, (1, 1), dtype=torch.long).to(device)

# Pieces are collected in a list and joined once instead of growing a string.
pieces = []
# A separate handle name keeps `outf` bound to the file name, and the explicit
# encoding matches the utf8 encoding the corpus was read with.
with open(outf, 'w', encoding="utf8") as out_file:
    with torch.no_grad():  # no tracking history
        for i in range(words):
            output, hidden = model(current_token, hidden)
            # Temperature-scaled, exponentiated scores act as sampling weights.
            word_weights = output.squeeze().div(temperature).exp().cpu()
            word_idx = torch.multinomial(word_weights, 1)[0].item()
            # Feed the sampled word back in as the next input.
            current_token.fill_(word_idx)

            word = corpus.dictionary.idx2word[word_idx]

            # Break the line after every 20th word.
            piece = word + ('\n' if i % 20 == 19 else ' ')
            out_file.write(piece)
            pieces.append(piece)
            if i % log_interval == 0:
                print('| Generated {}/{} words'.format(i, words))
gen_text += ''.join(pieces)
print(gen_text)


# Generated Text (on local machine)
even though the human life has stopped in the many and the ways of living its original and the one
who is utterly aware of the ability to create a new future. <br><br> When I lost my childhood I could
write your grief for a while, and one day she might be a girl, for she could know she would
be still alone. <br><br> The pen and the cold of our minds are the only creatures that are born in
our hearts. They are different and the possibilities are in the dark and the same. <br><br> When there is one
of the deepest things in life that sometimes that is the only one that is the only thing you can
ever find is that it reminds you, even when you make the same decision. <br><br> I have always been in
this way: I know the slightest things I have to share in the world, and let myself know my own
true love and a little way to be told to myself, and I guess this is a fairy tale in
which one should not know where the rest of it will be. <br><br> What it was like to be about
the worst of the people that could be written in the best acts of a fairy tale a world at
the same time. In the end, there are no more than the same things and the worst part in the
world. <br><br> You make a mistake and never allow your work to make a difference between your ability to be
an inspired person who needs a physical and commitment to be an old person who will yield to the rest
of your life. <br><br> You are not the One who answers you to the past. <br><br> Sometimes I wonder what
the world feels in the world and I have been here in a new period with such a thing which
is an active and special one, and that I have been as much to do with a little sense of
them and see their own desires that are literally what I can do with the people who have patience and
then provide the home that the child has been in their years to the extent about the past and the
future. <br><br> If you want to convert the future, but not to change any event is a little. You cannot
go to the bathroom at the top of the mountain, you will never be able to go to the future.
<br><br> I need to write to the future, but it is my name. <br><br> We are not the best one
over the future. When we see that we are bound to one that we choose to make a future to
that fact, we have only a more effective and precious combination of our problems and the worst of our life
from its own hands. <br><br> When a person is a writer you see a lot to a person and a
person of men to do the best and most beautiful things of the world. <br><br> Love, and not a mirror,
for the sake of human nature and the belief in the first time and what is going on. <br><br> The
first time you look at your mind and become aware of things, you can only write real things. <br><br> It
was such one that which could be the most potent factor in the Christian world was the theatre of the
laws of the present and the universe that the dangers of the gods are going to look out at their
lives the way it became the difference between the ages and the new and the other things and the drama
and the method. And then the evidence had been taken away from the laws and the progressive systems of the
general of the young. <br><br> The man has been in a state of enjoyment and a very peculiar and early
animal that is constantly known, and he is a child and who do not have to pay for the wish
to sustain it. <br><br> There is a greater reason for the fear of the past than it takes to be
a writer, and never lies in the present and aligning to the future. <br><br> The future is as much as
a lie in the past and that we are born to as a place where our present and the future.
We are not in the future, but in the past we bring into the future and the future. <br><br> It
is a wise man's mind. <br><br> When you have the power to stop , you reach and a great future
that is to be a reality. <br><br> Love is not a man, it is a kind of love. <br><br> The
greatest love is a state of action and a blessing for their own glory. <br><br> The best thing to do
is think about saying that only one can understand their feelings inside you and have found you in the light
of your life like a roaring and scratching past, and that you are in the direction of your own life.
You can only be able to see them with them. <br><br> Isn't it a natural thing to do with that
person and a person who thinks there's one thing and another can have a right to do the same thing.
<br><br> Once I have seen what I think about that I spend my life on the other side of the
window and I know it as a hidden one. <br><br> If you are good, you will forget the way you
are in the choices of your work and time to understand what you are doing to your future. <br><br> When
he has been in the middle of a world of awareness he has been in a hurry to tell the
difference in his family. <br><br> A person's heart is extraordinary and a man must not be in the name of