## LSTM Language Model for Kim Possible Data



In [0]:
from __future__ import print_function
import os
import matplotlib.pyplot as plt
import numpy as np
import chainer
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter
import os
from argparse import Namespace
import re
import torch.optim as optim
import math
import time
import torch.optim as optim
from torch.autograd import Variable

# Hyperparameters
seq_size = 35
batch_size = 20
embedding_size =650
lstm_size = 650
num_layers = 2
gradients_norm = 5
top_k = 5
epochs = 50

# For optimizer
lr = 0.001
weight_decay = 0.00002 # L2 regularization


#Dropouts
dropout=0.5 #locked
dropouti=0.5 #locked
dropoute=0.1 #emb dropout



## Load data


In [3]:
# Kim Possible Data

import re

#from google.colab import files
#train_file = files.upload()


train_file = 'KP.txt'
checkpoint_path = 'checkpoint'

def get_data_from_file(train_file, batch_size, seq_size):
    with open(train_file,'r',encoding="utf8") as f:
        text = f.read()

        pat = re.compile(r"([.()!,])")
        text = pat.sub(" \\1 ", text)
        text = text.replace(',', ' ')
        text = text.lower().split()
      
    word_counts = Counter(text)
    sorted_vocab = sorted(word_counts, key=word_counts.get, reverse=True)
    int_to_vocab = {k: w for k, w in enumerate(sorted_vocab)}
    vocab_to_int = {w: k for k, w in int_to_vocab.items()}
    n_vocab = len(int_to_vocab)
    print('Vocabulary size', n_vocab)

    int_text = [vocab_to_int[w] for w in text]
    print('Total size', len(int_text))

    #TRAINING SET

    int_text0 = int_text[:int(len(int_text)*0.8)]
    num_batches = int(len(int_text0) / (seq_size * batch_size))
    in_text = int_text0[:num_batches * batch_size * seq_size]
    out_text = np.zeros_like(in_text)
    out_text[:-1] = in_text[1:]
    out_text[-1] = in_text[0]
    in_text = np.reshape(in_text, (batch_size, -1))
    out_text = np.reshape(out_text, (batch_size, -1))
    print('Training size', len(int_text0))

    #VALIDATION SET

    int_text1 = int_text[int(len(int_text)*0.8):]
    num_batches = int(len(int_text1) / (seq_size * batch_size))
    in_text1 = int_text1[:num_batches * batch_size * seq_size]
    out_text1 = np.zeros_like(in_text1)
    out_text1[:-1] = in_text1[1:]
    out_text1[-1] = in_text1[0]
    in_text_val = np.reshape(in_text1, (batch_size, -1))
    out_text_val = np.reshape(out_text1, (batch_size, -1))
    print('Validation size', len(int_text1))

                              
    return int_to_vocab, vocab_to_int, n_vocab, in_text, out_text, in_text_val, out_text_val

int_to_vocab, vocab_to_int, n_vocab, in_text, out_text, in_text_val, out_text_val= get_data_from_file(train_file, batch_size, seq_size)

Saving KP.txt to KP.txt
Vocabulary size 6433
Total size 56445
Training size 45156
Validation size 11289


## Define class and functions for regularization 


In [0]:
class LockedDropout(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x, dropout=0.5):
        if not self.training or not dropout:
            return x
        m = x.data.new(1, x.size(1), x.size(2)).bernoulli_(1 - dropout)
        mask = Variable(m, requires_grad=False) / (1 - dropout)
        mask = mask.expand_as(x)
        return mask * x

In [0]:
def embedded_dropout(embed, words, dropout=0.1, scale=None):
  if dropout:
    mask = embed.weight.data.new().resize_((embed.weight.size(0), 1)).bernoulli_(1 - dropout).expand_as(embed.weight) / (1 - dropout) 
    masked_embed_weight = mask * embed.weight
  else:
    masked_embed_weight = embed.weight
    
  padding_idx = embed.padding_idx
  if padding_idx is None:
      padding_idx = -1

  X = torch.nn.functional.embedding(words, masked_embed_weight,
    padding_idx, embed.max_norm, embed.norm_type,
    embed.scale_grad_by_freq, embed.sparse
  )
  return X

## Define model


In [0]:
class RNNModule(nn.Module):
    
    def __init__(self, n_vocab, seq_size, 
                 embedding_size, lstm_size, num_layers, dropout=0.5, dropouti=0.5, dropoute=0.1):
        super(RNNModule, self).__init__()
       
        self.seq_size = seq_size
        self.lstm_size = lstm_size
        self.embedding_size = embedding_size
        self.num_layers = num_layers
        self.lockdrop = LockedDropout()
        self.idrop = nn.Dropout(dropouti)
        self.drop = nn.Dropout(dropout)
        
        self.embedding = nn.Embedding(n_vocab, embedding_size)

        self.lstm = nn.LSTM(embedding_size, lstm_size, 
                            num_layers, batch_first=True)
      
        self.dense = nn.Linear(lstm_size, n_vocab)

        self.dropout = dropout
        self.dropouti = dropouti
        self.dropoute = dropoute

    def init_weights(self):
        initrange = 0.1
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.dense.bias.data.fill_(0)
        self.dense.weight.data.uniform_(-initrange, initrange)
        
    def forward(self, x, prev_state):
        embed = embedded_dropout(self.embedding, x, dropout=self.dropoute if self.training else 0)
        embed = self.idrop(embed)
        embed = self.lockdrop(embed, self.dropouti)

        output, state = self.lstm(embed, prev_state)

        output = self.lockdrop(output, self.dropout)

        output = self.dense(output)

        return output, state, embed
    
    def zero_state(self, batch_size):
        return (torch.zeros(self.num_layers, batch_size, 
                            self.lstm_size),
                torch.zeros(self.num_layers, batch_size, 
                            self.lstm_size))
    

In [0]:
def get_loss_and_train_op(net, lr, weight_decay):   
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(net.parameters(), lr, weight_decay=weight_decay)
    return criterion, optimizer

def time_since(since):
    s = time.time() - since
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

def get_batches(in_text, out_text, batch_size, seq_size):
    num_batches = np.prod(in_text.shape) // (seq_size * batch_size)
    for i in range(0, num_batches * seq_size, seq_size):
        yield in_text[:, i:i+seq_size], out_text[:, i:i+seq_size]
        

## Training and validation


In [8]:
# TRAINING

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


# Initialize a new network
net = RNNModule(n_vocab, seq_size, embedding_size, lstm_size, num_layers,dropout, dropouti, dropoute)
net = net.to(device)


criterion, optimizer = get_loss_and_train_op(net, lr, weight_decay=weight_decay)

start = time.time()

training_loss, validation_loss, validation_perplex = [], [], []

# For each epoch
for e in range(1,epochs+1):
  
    losses_train, losses_val, perplexity_val = [], [], []

    batches_val = get_batches(in_text_val, out_text_val, batch_size, seq_size)
    state_h, state_c = net.zero_state(batch_size)

    # Transfer data to GPU
    state_h = state_h.to(device)
    state_c = state_c.to(device)
      
    # For each sentence in validation set
    for x,y in batches_val:
                  
        # Tell it we are in eval mode
        net.eval()

        # Make tensors
        x = torch.LongTensor(x).to(device) # inputs
        y = torch.LongTensor(y).to(device) # targets
          
        #Forward pass 
        outputs, (state_h, state_c),_ = net(x, (state_h, state_c))
        loss = criterion(outputs.transpose(1, 2), y)

        state_h = state_h.detach()
        state_c = state_c.detach()

        # Compute loss and perplexity
        loss_value = loss.item()
        losses_val.append(loss_value)
        perplex = math.exp(loss_value)
        perplexity_val.append(perplex)
      
    batches = get_batches(in_text, out_text, batch_size,seq_size)
    state_h, state_c = net.zero_state(batch_size)
    
    # Transfer data to GPU
    state_h = state_h.to(device)
    state_c = state_c.to(device)
    # For each sentence in training set
    for x,y in batches:
        
        # Tell it we are in training mode
        net.train()
        
        # Reset all gradients
        optimizer.zero_grad()
        
        # Make tensors
        x = torch.LongTensor(x).to(device) # inputs
        y = torch.LongTensor(y).to(device) # targets
        
        # Forward pass
        outputs, (state_h, state_c),_ = net(x, (state_h, state_c))
        loss = criterion(outputs.transpose(1, 2), y)

        state_h = state_h.detach()
        state_c = state_c.detach()
        
        loss_value = loss.item()
        losses_train.append(loss_value)
        
        # Perform back-propagation
        loss.backward()

        # Gradient clipping
        _ = torch.nn.utils.clip_grad_norm_(net.parameters(), gradients_norm)

        # Update the network's parameters
        optimizer.step()
    
    
    # Save loss and perplexity for plot
    training_loss.append(np.mean(losses_train))
    validation_loss.append(np.mean(losses_val))
    validation_perplex.append(np.mean(perplexity_val))
     
    # Print at every epoch    
    if e % 1 == 0:
        print('\n') 
        print('Time: {}'.format(time_since(start)),
              'Epoch: {}/{}'.format(e, epochs),
              'Training loss: {}'.format(training_loss[-1]),
              'Validation loss: {}'.format(validation_loss[-1]),
              'Validation perplexity: {}'.format(validation_perplex[-1]))
   



Time: 0m 1s Epoch: 1/50 Training loss: 6.789502665400505 Validation loss: 8.767270028591156 Validation perplexity: 6420.626974698962


Time: 0m 3s Epoch: 2/50 Training loss: 6.232224680483341 Validation loss: 6.464867949485779 Validation perplexity: 643.9390964041859


Time: 0m 4s Epoch: 3/50 Training loss: 6.0263165310025215 Validation loss: 6.302557498216629 Validation perplexity: 547.947536122507


Time: 0m 5s Epoch: 4/50 Training loss: 5.8422074764966965 Validation loss: 6.168674051761627 Validation perplexity: 479.51016575791135


Time: 0m 7s Epoch: 5/50 Training loss: 5.723537869751453 Validation loss: 6.079588204622269 Validation perplexity: 439.03457318990996


Time: 0m 8s Epoch: 6/50 Training loss: 5.613759145140648 Validation loss: 6.035757303237915 Validation perplexity: 420.4203177305129


Time: 0m 10s Epoch: 7/50 Training loss: 5.509028509259224 Validation loss: 5.993395090103149 Validation perplexity: 403.04496033775825


Time: 0m 11s Epoch: 8/50 Training loss: 5.396863

## Prediction

*Fixed k*



In [0]:
def predict(device, net, words, n_vocab, vocab_to_int, int_to_vocab, top_k, out_size):
    
    net.eval()

    state_h, state_c = net.zero_state(1)
    state_h = state_h.to(device)
    state_c = state_c.to(device)
    
    for w in words:
        ix = torch.LongTensor([[vocab_to_int[w]]]).to(device)
        output, (state_h, state_c),_ = net(ix, (state_h, state_c))
    
    _, top_ix = torch.topk(output[0], k=top_k)
    choices = top_ix.tolist()
    choice = np.random.choice(choices[0])
    words.append(int_to_vocab[choice])

    for _ in range(out_size):
        ix = torch.LongTensor([[choice]]).to(device)
        output, (state_h, state_c), _= net(ix, (state_h, state_c))

        _, top_ix = torch.topk(output[0], k=top_k)
        choices = top_ix.tolist()
        choice = np.random.choice(choices[0])
        words.append(int_to_vocab[choice])
    print('\n')    
    
    out = ' '.join(words)
    out = out.replace(' .','.')
    out = '. '.join(i.capitalize() for i in out.split(". "))
    print(out) 


In [36]:
predict(device,net,['kim', 'says'], n_vocab, vocab_to_int, int_to_vocab, 5, 40)



Kim says she's surprised but ron refuses the call on the other hand as a distraction and is relieved to get the magma. The monkey ninjas after ron and the twins see kim at ron but kim pulls up a humiliation baseball
