<a href="https://colab.research.google.com/github/shainedl/Papers-Colab/blob/master/Surprisingly_Effective_Fix_Latent.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Based on *A Surprisingly Effective Fix for Deep Latent Variable Modeling of Text* (Li et al., Carnegie Mellon University).

In [77]:
import torch
from torch import nn
from google.colab import files
from collections import defaultdict
from itertools import count
import nltk
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')
import numpy as np
from torch.autograd import Variable

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
# Ask the user for a file through google.colab's `files.upload()`; the result is
# indexed by filename further down (uploaded['sample_train.txt']).
uploaded = files.upload()

Saving sample_train.txt to sample_train.txt


In [0]:
def load_data(file):
  """Tokenise a UTF-8 encoded corpus and build its vocabulary.

  Args:
    file: raw corpus contents exposing `.decode('utf-8')` (e.g. `bytes`),
      one sentence per line.

  Returns:
    A 4-tuple `(data, w2i, i2w, padding_idx)`:
      data: list with one token list (from `word_tokenize`) per line.
      w2i: dict mapping token -> integer id. The reserved symbols
        "<pad>", "<s>", "</s>", "<unk>" take ids 0-3; corpus tokens follow
        in order of first appearance.
      i2w: inverse of `w2i` (id -> token).
      padding_idx: the id of "<pad>".
  """
  # A token's id is the vocabulary size at the moment it is first seen, so ids
  # are dense and start at 0.
  vocab = {}
  for special in ("<pad>", "<s>", "</s>", "<unk>"):
    vocab.setdefault(special, len(vocab))
  padding_idx = vocab["<pad>"]

  text = file.decode('utf-8')

  data = []
  for line in text.splitlines():
    tokens = word_tokenize(line)
    for tok in tokens:
      vocab.setdefault(tok, len(vocab))
    data.append(tokens)

  w2i = vocab
  i2w = {idx: word for word, idx in w2i.items()}

  return data, w2i, i2w, padding_idx

In [0]:
# Build the tokenised corpus and vocabulary from the uploaded file. load_data
# calls .decode('utf-8') on its argument, so the uploaded value must be bytes-like.
training_data, w2i, i2w, padding_idx = load_data(uploaded['sample_train.txt'])

In [0]:
def prepare_sequence(seq, to_ix):
  """Convert a token sequence into a 1-D tensor of vocabulary ids.

  Tokens missing from `to_ix` are mapped to the id of "<unk>" when the
  vocabulary has such an entry, so unseen words do not crash the pipeline.

  Args:
    seq: iterable of tokens.
    to_ix: mapping from token to integer id.

  Returns:
    A `torch.long` tensor of shape `(len(seq),)`.

  Raises:
    KeyError: if a token is missing and `to_ix` has no "<unk>" entry.
  """
  idxs = []
  for w in seq:
    try:
      # Plain indexing keeps the behaviour of mappings with their own
      # missing-key handling (e.g. defaultdict) unchanged.
      idxs.append(to_ix[w])
    except KeyError:
      if "<unk>" not in to_ix:
        raise
      idxs.append(to_ix["<unk>"])
  return torch.tensor(idxs, dtype=torch.long)

In [0]:
"""
Helpful Links
-------------
Batching: https://cs230-stanford.github.io/pytorch-nlp.html
"""
batch_size = 50

# Slicing (instead of indexing range(batch_size)) tolerates a corpus with fewer
# than batch_size sentences.
batch_sentences = [prepare_sequence(tokens, w2i)
                   for tokens in training_data[:batch_size]]

# Order the rows longest-first *before* filling the batch, so that X_lengths[j]
# always describes row j of batch_input. Sorting only the length list afterwards
# would break that correspondence.
batch_sentences.sort(key=len, reverse=True)

# compute length of longest sentence in batch (without <s> / </s>)
batch_max_len = max((len(s) for s in batch_sentences), default=0)

# Every row holds <s> + tokens + </s>, hence the +2; with +1 the closing </s> of
# the longest sentence would fall outside the buffer and be silently dropped.
# All data are indices, so the buffer is created directly as a LongTensor.
batch_input = torch.full((len(batch_sentences), batch_max_len + 2),
                         w2i["<pad>"], dtype=torch.long)

# Length of each row including the two boundary markers (descending order).
X_lengths = []

# copy the data into the padded batch
for j, sentence in enumerate(batch_sentences):
  cur_len = len(sentence) + 2
  X_lengths.append(cur_len)
  batch_input[j, 0] = w2i["<s>"]
  batch_input[j, 1:cur_len-1] = sentence
  batch_input[j, cur_len-1] = w2i["</s>"]

# No Variable(...) wrapper: since PyTorch 0.4 it simply returns the tensor.

In [0]:
class VAE(nn.Module):
  """LSTM encoder that maps batches of token ids to (mu, logvar).

  Only the encoding path is implemented: `forward` returns what `encode`
  returns. `rnn2` is constructed but not used by any method in this class
  (presumably reserved for a decoder -- TODO confirm).

  Args:
    vocab_size: number of rows in the embedding table.
    embedding_size: dimensionality of each token embedding.
    padding_idx: id of the padding token, forwarded to `nn.Embedding`.
    hidden_size: hidden size of both LSTMs.
    latent_size: output size of the `mu` / `logvar` projections.
  """

  def __init__(self, vocab_size, embedding_size, padding_idx, hidden_size, latent_size):
    super(VAE, self).__init__()

    self.embeddings = nn.Embedding(num_embeddings=vocab_size,
                                   embedding_dim=embedding_size,
                                   padding_idx=padding_idx)

    self.rnn1 = nn.LSTM(input_size=embedding_size,
                        hidden_size=hidden_size,
                        batch_first=True)
    self.rnn2 = nn.LSTM(input_size=embedding_size,
                        hidden_size=hidden_size,
                        batch_first=True)

    self.fc11 = nn.Linear(hidden_size, latent_size)
    self.fc12 = nn.Linear(hidden_size, latent_size)

    self._initialize_parameters([-0.01, 0.01], [-0.1, 0.1])

  def encode(self, x, lengths=None):
    """Encode a batch of token-id sequences.

    Args:
      x: integer tensor of token ids, shape (batch, seq_len).
      lengths: optional list or CPU tensor with the true length of every row
        of `x`. When given, padded positions are skipped so the summary state
        comes from the last real token of each row; rows may be in any order.

    Returns:
      Tuple `(mu, logvar)`, each of shape (batch, latent_size).
    """
    embedded = self.embeddings(x)
    if lengths is not None:
      embedded = nn.utils.rnn.pack_padded_sequence(
          embedded, lengths, batch_first=True, enforce_sorted=False)

    # nn.LSTM returns (output, (h_n, c_n)); the final hidden state of the last
    # layer is the fixed-size sentence summary fed to the linear heads.
    _, (h_n, _) = self.rnn1(embedded)
    rnn_e = h_n[-1]

    mu = self.fc11(rnn_e)
    logvar = self.fc12(rnn_e)

    return mu, logvar

  def forward(self, x, lengths=None):
    """Return `encode(x, lengths)`; see `encode` for arguments and outputs."""
    mu, logvar = self.encode(x, lengths)

    return mu, logvar

  def _initialize_parameters(self, lstm_init, embed_init):
    """Uniformly initialise all parameters.

    Args:
      lstm_init: `[low, high]` range applied to every parameter of the module.
      embed_init: `[low, high]` range applied afterwards to the embedding table.
    """
    for param in self.parameters():
      nn.init.uniform_(param, a=lstm_init[0], b=lstm_init[1])
    # uniform_ needs the weight tensor, not the nn.Embedding module itself.
    nn.init.uniform_(self.embeddings.weight, a=embed_init[0], b=embed_init[1])

    # The uniform init overwrote the all-zero padding row that nn.Embedding
    # sets up for padding_idx; restore it.
    pad = self.embeddings.padding_idx
    if pad is not None:
      with torch.no_grad():
        self.embeddings.weight[pad].fill_(0)
    