Importing all the necessary libraries

In [10]:
import spacy
from collections import Counter
import numpy as np
import torch
from torch import nn, optim
from torch.nn import functional as F
from IPython.display import clear_output

I have worked on Google Colab to make use of GPUs for parallel processing. Below, I have mounted Google Drive to read the dataset and to save the model.

In [11]:
# Colab-only step: mount Google Drive at /content/gdrive so that files stored
# there can be read and written through ordinary file paths.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


Dataset: Alice's Adventures in Wonderland (commonly Alice in Wonderland)

It is an 1865 English novel by Lewis Carroll. It details the story of a young girl named Alice who falls through a rabbit hole into a fantasy world of anthropomorphic creatures.

In [12]:
# Read the novel from the mounted Drive. The encoding is passed explicitly so
# that decoding does not depend on the runtime's locale default.
with open('/content/gdrive/MyDrive/text_generation/alice_in_wonderland.txt', 'r', encoding = 'utf-8') as f:
  text = f.read()

# Keep at most the first 1,000,000 characters
# (presumably to stay within spaCy's default text-length limit -- TODO confirm).
text = text[:1000000] 
print(f'first 100 characters:\n {text[:100]}')

first 100 characters:
 Alice's Adventures in Wonderland

                ALICE'S ADVENTURES IN WONDERLAND

                


In [13]:
# Download spaCy's small English pipeline into the runtime, load it as `nlp`
# (used below for tokenization), then clear the installer output from the cell.
!python -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm')
clear_output()

Computers only understand numbers, so we need to convert words to integers. Words are mapped to integers based on the number of times each word has occurred in the corpus: the most frequent word gets the smallest integer.

In [14]:
def mapping(text: str): 
  '''
  Function description: 
      Tokenize a document and build frequency-ranked vocabulary lookups.

  parameters: 
      text: the raw document as a single string.

  Returns: 
      word_tokens: list of token strings, in document order
      word2int: dict from token to integer id (the most frequent token gets id 0)
      int2word: dict from integer id back to token (the inverse of word2int)
  '''

  doc = nlp(text)
  word_tokens = [token.text for token in doc]

  # most_common() orders by descending count; tokens with equal counts keep
  # the order in which they were first encountered.
  ranked_vocabulary = Counter(word_tokens).most_common()

  word2int, int2word = {}, {}
  for index, (token, _count) in enumerate(ranked_vocabulary):
    word2int[token] = index
    int2word[index] = token

  return word_tokens, word2int, int2word


In [15]:
# Tokenize the whole corpus once and build both vocabulary lookup tables.
word_tokens, word2int, int2word = mapping(text)

In [16]:
# Report the corpus length in tokens and the vocabulary size.
corpus_size = len(word_tokens)
vocabulary_size = len(word2int)
print(f'number of words in the corpus are: {corpus_size}\nnumber of words in the vocubulary or unique words in the corpus are: {vocabulary_size}')

number of words in the corpus are: 37958
number of words in the vocubulary or unique words in the corpus are: 3139


In [17]:
# Encode every token of the corpus as its vocabulary id.
intarr = [word2int[token] for token in word_tokens]
preview = intarr[:100]
print(f'first 100 word tokens converted to integer values: \n {preview}')

first 100 word tokens converted to integer values: 
 [17, 29, 919, 19, 920, 758, 921, 515, 1649, 1188, 1650, 1651, 1652, 1653, 1654, 426, 1655, 1656, 1657, 1658, 1659, 339, 11, 922, 923, 2, 119, 36, 1660, 288, 17, 18, 289, 8, 120, 40, 516, 14, 388, 94, 23, 427, 1, 30, 2, 924, 0, 6, 14, 389, 164, 8, 44, 24, 9, 158, 68, 759, 13, 31, 1, 925, 76, 2, 362, 23, 427, 18, 926, 0, 37, 12, 31, 73, 1, 760, 68, 1661, 19, 12, 0, 4, 6, 56, 51, 2, 243, 14, 10, 362, 0, 3, 1, 69, 17, 4, 189, 760, 68, 390]


In [29]:
def get_batches(arr, batch_size: int, seq_len: int):
  ''' Function description: 
        A generator that yields input and target batches of dimension (batch_size, seq_len).
        The target at every position is the token that follows the input token in the corpus.

      Parameters: 
      arr: integer converted word tokens (1-D sequence)
      batch_size: number of sequences in a batch
      seq_len: number of words in a sequence in a batch 

      Returns:
      x: input batch
      y: corresponding output batch (x shifted left by one token)

      Raises:
      ValueError: if batch_size or seq_len is not positive, or if arr holds
                  fewer than batch_size * seq_len tokens (no full batch fits).
  '''
  if batch_size <= 0 or seq_len <= 0:
    raise ValueError('batch_size and seq_len must be positive integers')

  tokens = np.asarray(arr)
  n_batches = len(tokens) // (batch_size * seq_len)
  if n_batches == 0:
    raise ValueError(
        f'need at least batch_size * seq_len = {batch_size * seq_len} tokens, got {len(tokens)}')

  # Only whole batches are used; trailing tokens are dropped from the inputs.
  n_used = n_batches * batch_size * seq_len
  inputs = tokens[:n_used]

  # Build the targets on the flat sequence *before* reshaping, so the last
  # column of every row gets the real next token (the first token of the
  # following row) instead of wrapping to the start of its own row.
  targets = np.empty_like(inputs)
  targets[:-1] = inputs[1:]
  # The very last target is the first dropped token when there is one;
  # otherwise wrap around to the beginning of the corpus.
  targets[-1] = tokens[n_used] if n_used < len(tokens) else inputs[0]

  inputs = inputs.reshape((batch_size, -1))
  targets = targets.reshape((batch_size, -1))

  for n in range(0, inputs.shape[1], seq_len):
    yield inputs[:, n: n + seq_len], targets[:, n: n + seq_len]


In [32]:
# Pull a single batch to confirm the shapes produced by get_batches.
batch_size = 64
seq_len = 32
batch_iter = get_batches(intarr, batch_size, seq_len)
x, y = next(batch_iter)
print(f'each input batch dimension {x.shape}')
print(f'each output batch dimension {y.shape}')


each input batch dimension (64, 32)
each output batch dimension (64, 32)


In [33]:
# Seed the torch and NumPy global generators so that weight initialisation,
# dropout masks and np.random-based sampling are repeatable on a fresh
# Restart & Run All (GPU kernels may still introduce small nondeterminism).
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

# Use the GPU when the runtime has one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

Recurrent Neural Networks can be used for language modeling. I have selected one of their variants, the LSTM (long short-term memory), because it can retain information about words seen long ago. The cell state of the LSTM keeps the necessary information about previous words and discards the unnecessary information; the hidden state is then updated based on this.

In [34]:
class wordRNN(nn.Module):
  '''
  Word-level language model: embedding -> multi-layer LSTM -> dropout -> linear.

  Parameters:
      intarr: the corpus as integer word ids; used only to size the vocabulary
      embedding_dim: size of each word embedding vector
      n_hidden: LSTM hidden size
      n_layers: number of stacked LSTM layers
      drop_prob: dropout probability applied to the LSTM outputs
  '''
  def __init__(self, intarr, embedding_dim = 50, n_hidden = 512, n_layers = 2, drop_prob = 0.5):
    super().__init__()
    self.n_hidden = n_hidden
    self.n_layers = n_layers
    self.embedding_dim = embedding_dim

    # The embedding table must cover every id that can be looked up, i.e.
    # max id + 1 rows. For dense ids 0..V-1 this equals the number of unique
    # ids; for ids with gaps, counting unique ids alone would be too small.
    unique_ids = set(intarr)
    self.n_words = int(max(unique_ids)) + 1 if unique_ids else 0

    self.embedding = nn.Embedding(self.n_words, self.embedding_dim)
    self.lstm = nn.LSTM(self.embedding_dim, self.n_hidden, self.n_layers, batch_first = True)
    self.dropout = nn.Dropout(drop_prob)
    self.linear = nn.Linear(self.n_hidden, self.n_words)
  
  def forward(self, x, hidden):
    '''
    Run one batch through the network.

    Parameters:
        x: word ids, [batch_size, seq_len]
        hidden: tuple (h, c), each [n_layers, batch_size, n_hidden]

    Returns:
        out: unnormalised scores, [batch_size * seq_len, n_words]
        hidden: the updated (h, c) tuple
    '''
    # x = [batch_size, seq_len] --> (64, 32)

    embed = self.embedding(x)
    # embed = [batch_size, seq_len, embedding_dim] --> (64, 32, 50)

    r_output, hidden = self.lstm(embed, hidden)
    # r_output = [batch_size, seq_len, hidden_size] --> (64, 32, 512)
    # hidden = tuple(hn, cn) where hn = [n_layers, batch_size, hidden_size] --> (2, 64, 512)

    out = self.dropout(r_output)
    # flatten batch and time so the linear layer scores every position
    out = out.reshape(-1, self.n_hidden)
    # out: (-1, 512) --> [batch_size * seq_len, hidden_size] --> (2048, 512)

    out = self.linear(out)
    # out = [batch_size * seq_len, n_words] --> (2048, 3139)
    
    return out, hidden

  def init_hidden(self, batch_size):
    '''
    Return a zeroed (h, c) state for `batch_size` sequences.

    The tensors are created from one of the model's own parameters, so they
    always share the model's device and dtype without relying on any
    module-level variable.
    '''
    weight = next(self.parameters())
    hidden = (weight.new_zeros(self.n_layers, batch_size, self.n_hidden),
              weight.new_zeros(self.n_layers, batch_size, self.n_hidden))
    return hidden 

In [35]:
# Build the network with its default hyperparameters and move it to the
# selected device; the bare name on the last line displays the module summary.
model = wordRNN(intarr).to(device)
model

wordRNN(
  (embedding): Embedding(3139, 50)
  (lstm): LSTM(50, 512, num_layers=2, batch_first=True)
  (dropout): Dropout(p=0.5, inplace=False)
  (linear): Linear(in_features=512, out_features=3139, bias=True)
)

In [36]:
# Cross-entropy over the vocabulary scores, optimised with Adam.
criterion = nn.CrossEntropyLoss().to(device)
optimizer = optim.Adam(params = model.parameters(), lr = 0.001)

80% of batches are selected for training and 20% of batches are selected for validation. 

In [37]:
# Number of full batches in the corpus, split 80/20 into training/validation.
tokens_per_batch = batch_size * seq_len
n_batches = len(intarr) // tokens_per_batch
n_training_batches = int(np.floor(n_batches * 0.8))
n_validation_batches = int(np.ceil(n_batches * 0.2))
print(f'number of batches: {n_batches}\nnumber of training batches: {n_training_batches}\nnumber of validation batches: {n_validation_batches}')

number of batches: 18
number of training batches: 14
number of validation batches: 4


Given a word from the dataset, we will train the model to predict the next word in the sequence.

We will check the perplexity (an evaluation metric for language models) of the model during validation: the lower the perplexity, the better the model. Whenever the validation loss reaches a new minimum (i.e. the lowest perplexity so far), the model is saved.

In [38]:
batch_size, seq_len = 64, 32
epochs = 50
min_validation_loss = np.inf


# Each epoch walks the batches in order: the first n_training_batches are used
# for optimisation, the remaining ones for validation. The checkpoint is
# overwritten whenever the average validation loss reaches a new minimum.
for epoch in range(epochs):
  print(f'epoch {epoch}')
  validation_loss = 0
  hidden = model.init_hidden(batch_size)

  for batch, (words, labels) in enumerate(get_batches(intarr, batch_size, seq_len)):
    words = torch.tensor(words).to(device)
    labels = torch.tensor(labels).to(device)
    # flatten to [batch_size * seq_len] to line up with the flattened logits
    labels = labels.view(batch_size * seq_len).long()

    if batch < n_training_batches:
      model.train()
      # Detach the carried state so backpropagation stops at the batch
      # boundary instead of reaching back through every earlier batch.
      hidden = tuple(state.detach() for state in hidden)
      optimizer.zero_grad()
      logits, hidden = model(words, hidden)
      loss = criterion(logits, labels)
      loss.backward()
      optimizer.step()
      
      if batch % 100 == 0:
        print(f'training_loss: {loss : .3f}')

    else:
      # Start validation from a fresh state rather than the training state.
      if batch == n_training_batches:
        hidden = model.init_hidden(batch_size)

      model.eval()
      # No gradients are needed for validation; without no_grad the autograd
      # graph would keep growing through the hidden state across batches.
      with torch.no_grad():
        logits, hidden = model(words, hidden)
        loss = criterion(logits, labels)

      validation_loss += loss.item()


  average_validation_loss = validation_loss/ n_validation_batches    
  print(f'average_validation_loss: {average_validation_loss: .3f}, perplexity: {torch.exp(torch.tensor(average_validation_loss))}')

  if min_validation_loss > average_validation_loss:
    print(f'Validation Loss Decreased({min_validation_loss:.6f}--->{average_validation_loss:.6f}) \t Saving The Model')
    min_validation_loss = average_validation_loss
    torch.save(model.state_dict(), '/content/gdrive/MyDrive/text_generation/model.pt')


epoch 0
training_loss:  8.053
average_validation_loss:  5.995, perplexity: 401.4327087402344
Validation Loss Decreased(inf--->5.995040) 	 Saving The Model
epoch 1
training_loss:  6.173
average_validation_loss:  5.914, perplexity: 370.0849304199219
Validation Loss Decreased(5.995040--->5.913732) 	 Saving The Model
epoch 2
training_loss:  6.023
average_validation_loss:  5.909, perplexity: 368.2083435058594
Validation Loss Decreased(5.913732--->5.908649) 	 Saving The Model
epoch 3
training_loss:  5.975
average_validation_loss:  5.908, perplexity: 368.1451416015625
Validation Loss Decreased(5.908649--->5.908477) 	 Saving The Model
epoch 4
training_loss:  5.927
average_validation_loss:  5.899, perplexity: 364.6735534667969
Validation Loss Decreased(5.908477--->5.899003) 	 Saving The Model
epoch 5
training_loss:  5.874
average_validation_loss:  5.857, perplexity: 349.6988525390625
Validation Loss Decreased(5.899003--->5.857072) 	 Saving The Model
epoch 6
training_loss:  5.824
average_validat

In [39]:
# loading the model which has best perplexity score 
# map_location remaps the saved tensors onto the current device, so a
# checkpoint written on a GPU runtime also loads on a CPU-only runtime.
best_state = torch.load('/content/gdrive/MyDrive/text_generation/model.pt', map_location = device)
model.load_state_dict(best_state)

<All keys matched successfully>

We will see how our model predicts with a greedy search algorithm (at every step, selecting the word with the highest probability among the probabilities the model predicts for all the unique words in the corpus).

In [40]:
def predict(n_words, word):
  '''
  Function description: 
      Generate text greedily: starting from `word`, repeatedly feed the last
      word to the model and append the highest-scoring next word.

  Parameters: 
      n_words: number of words to generate after the starting word
      word: the starting word; must be a key of word2int

  Returns: 
      a single string with the starting word followed by the n_words
      generated words, joined by spaces

  Raises:
      KeyError: if `word` is not in the vocabulary
  '''

  word = word2int[word]
  model.eval()
  hidden = model.init_hidden(1)
  predicted_words = [word]
  
  # Inference only: disabling autograd avoids building a graph that would
  # otherwise grow through the hidden state with every generated word.
  with torch.no_grad():
    for _ in range(n_words):
      wordint = torch.tensor(word).view(-1, 1).to(device)

      logits, hidden = model(wordint, hidden)
      # softmax is monotonic, so the arg-max of the raw scores is the word
      # with the highest probability
      word = logits.argmax(dim = 1).item()

      predicted_words.append(word)

  return ' '.join([int2word[i] for i in predicted_words])


# Greedily generate 100 words starting from the word 'The'.
predicted_words = predict(100, 'The')

In [41]:
# Display the greedy generation.
predicted_words

"The \n thing , ' said the King , ` and \n I 'm a little thing , ' said the King , ` and \n I 'm a little thing , ' said the King , ` and \n I 'm a little thing , ' said the King , ` and \n I 'm a little thing , ' said the King , ` and \n I 'm a little thing , ' said the King , ` and \n I 'm a little thing , ' said the King , ` and \n I 'm a little thing"

Now, we will see how our model predicts when the next word is picked at random from the k most probable candidates (top-k sampling) instead of always taking the single most probable word.

In [42]:
def predict_top_k(n_words, sentence, k = 10):
  '''
  Function description: 
      Generate text with top-k sampling: the prompt is fed to the model word
      by word, then every next word is drawn uniformly at random from the k
      highest-scoring candidates.

  Parameters: 
      n_words: number of sampling steps after the first sampled word
      sentence: the prompt; every token of it must be a key of word2int
      k: number of top candidates to sample from (default 10)

  Returns: 
      a single string with the prompt tokens followed by n_words + 1 sampled
      words, joined by spaces

  Raises:
      KeyError: if a prompt token is not in the vocabulary
      ValueError: if the prompt contains no tokens or k is smaller than 1
  '''
  if k < 1:
    raise ValueError('k must be at least 1')

  words_int = [word2int[token.text] for token in nlp(sentence)]
  if not words_int:
    raise ValueError('sentence must contain at least one token')

  def top_candidates(word_id, hidden):
    # Feed one word id; return the ids of the k best next words and the new state.
    wordint = torch.tensor(word_id).view(-1, 1).to(device)
    logits, hidden = model(wordint, hidden)
    # softmax is monotonic, so the top-k of the raw scores are the k most
    # probable words; reshape(-1) keeps a 1-D array even when k == 1
    _, top_ids = logits.topk(k, dim = 1)
    return top_ids.cpu().numpy().reshape(-1), hidden

  model.eval()
  hidden = model.init_hidden(1)
  predicted_words = []

  # Inference only: no autograd graph is needed.
  with torch.no_grad():
    # Warm up the state on the prompt; only the candidates that follow the
    # last prompt word are used.
    for word_id in words_int:
      top_words, hidden = top_candidates(word_id, hidden)

    word = np.random.choice(top_words)
    predicted_words.append(word)

    for _ in range(n_words):
      top_words, hidden = top_candidates(word, hidden)
      word = np.random.choice(top_words)
      predicted_words.append(word)

  return ' '.join([int2word[i] for i in (words_int + predicted_words)])     


In [43]:
# Sample a continuation of the prompt 'I wonder if' with top-k sampling.
predicted_words = predict_top_k(100, 'I wonder if')

In [44]:
# Display the sampled generation.
predicted_words

'I wonder if I ca n\'t get that what \n it was a \n way in be to say the whole - box -- but if it might n\'t \n have seen a \n deal , you dear to the right - key of them of the same , \n           And I can do n\'t have the day \n to think the same I - day ? \n       Oh . \n The thing - head \'s well off ; " " You are sure , and all -- I think that you know to \n all , you \'ll do all , \' and'