<a href="https://colab.research.google.com/github/shainedl/Papers-Colab/blob/master/Surprisingly_Effective_Fix_Latent.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Based on *A Surprisingly Effective Fix for Deep Latent Variable Modeling of Text* (Li et al., Carnegie Mellon University)

In [0]:
import torch
from torch import nn, optim
from google.colab import files
from collections import defaultdict
from itertools import count, chain
import nltk
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')
import numpy as np
from torch.autograd import Variable
import pdb

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [0]:
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
uploaded_training = files.upload()

Saving sample_train.txt to sample_train.txt


In [0]:
uploaded_val = files.upload()

Saving sample_dev.txt to sample_dev.txt


In [0]:
uploaded_test = files.upload()

Saving sample_test.txt to sample_test.txt


In [0]:
def load_data(file, num_rare=4000):
  """
  Load training data and output vocabulary dictionaries

  Parameters
  ----------
    file: bytes
      UTF-8 encoded corpus with one sentence per line

    num_rare: int
      number of least frequent word types to leave out of the vocabulary
      (prepare_sequence maps out-of-vocabulary words to "<unk>")

  Returns
  -------
    data: list of list of str
      tokenized sentences (rare words are not replaced here)

    w2i: dict
      word -> index, with "<s>", "</s>", "<unk>" at 0, 1, 2 and the kept
      words numbered contiguously in order of first appearance

    i2w: dict
      inverse mapping of w2i

    freq_dist: nltk.FreqDist
      token counts over the whole corpus
  """
  text = file.decode('utf-8')
  data = [word_tokenize(sentence) for sentence in text.splitlines()]

  freq_dist = nltk.FreqDist(token for tokens in data for token in tokens)

  # Rank by count. The previous code sliced list(freq_dist.keys()), which is
  # ordered by first appearance, so it dropped the words seen last rather
  # than the rarest ones.
  ranked = [word for word, _ in freq_dist.most_common()]
  # Guard num_rare <= 0: ranked[-0:] would select every word.
  rare = set(ranked[-num_rare:]) if num_rare > 0 else set()

  # Assign indices only to kept words so they are always 0..len(w2i)-1,
  # which is what nn.Embedding(num_embeddings=len(w2i)) requires.
  w2i = {"<s>": 0, "</s>": 1, "<unk>": 2}
  for tokens in data:
    for token in tokens:
      if token not in rare and token not in w2i:
        w2i[token] = len(w2i)
  i2w = {i: w for w, i in w2i.items()}

  return data, w2i, i2w, freq_dist

In [0]:
# Tokenize the uploaded training split and build the vocabulary from it;
# w2i / i2w are reused for the validation and test splits below.
training_data, w2i, i2w, freq_dist = load_data(uploaded_training['sample_train.txt'])

In [0]:
def load_data_test(file):
  """
  Tokenize held-out (validation or test) text

  Parameters
  ----------
    file: bytes
      UTF-8 encoded text with one sentence per line

  Returns
  -------
    data: list of list of str
      one token list per input line
  """
  text = file.decode('utf-8')
  return [word_tokenize(line) for line in text.splitlines()]

In [0]:
# Tokenize the uploaded validation split (no vocabulary is built from it).
val_data = load_data_test(uploaded_val['sample_dev.txt'])

In [0]:
# Tokenize the uploaded test split (no vocabulary is built from it).
test_data = load_data_test(uploaded_test['sample_test.txt'])

In [0]:
# Device used for every tensor this notebook creates explicitly.
# The name `cuda` is kept because later cells pass `device=cuda`, but it
# now falls back to the CPU when CUDA is disabled or unavailable; before,
# CPU-only runtimes crashed as soon as a batch was moved to the device.
use_cuda = True
cuda = torch.device('cuda' if use_cuda and torch.cuda.is_available() else 'cpu')

In [0]:
def prepare_sequence(seq, to_ix):
  """
  Turn a token list into a 1-D LongTensor of vocabulary indices

  The sequence is wrapped in "<s>" ... "</s>" and every token missing from
  to_ix is replaced by the index of "<unk>".
  """
  body = [to_ix[w] if w in to_ix else to_ix["<unk>"] for w in seq]
  idxs = [to_ix["<s>"]] + body + [to_ix["</s>"]]
  return torch.tensor(idxs, dtype=torch.long)

In [0]:
def batch_data(batch_size, data, w2i):
  """
  Batches data with sequences of the same length

  Parameters
  ----------
    batch_size: int
      maximum number of sentences per batch before the merge pass below

    data: list of token lists
      tokenized sentences

    w2i: dict
      word -> index vocabulary passed on to prepare_sequence

  Returns
  -------
    batch_data: list of Tensors
      each Tensor stacks the prepare_sequence outputs of sentences that share
      the same token count and lives on the module-level device `cuda`.
      Merged batches are appended at the end, so the list is not strictly
      ordered by length, and a merged batch can exceed batch_size.
  """
  # Order sentences by token count so equal-length ones are adjacent.
  sentence_lengths = np.array([len(sentence) for sentence in data])
  sorted_idx = np.argsort(sentence_lengths)
  sorted_lengths = sentence_lengths[sorted_idx]

  # Positions (in sorted order) where the length increases, plus the end of
  # the data: each entry is the exclusive end of one equal-length group.
  len_increase_idx = []
  for i in range(1, len(sorted_lengths)):
    if sorted_lengths[i] > sorted_lengths[i-1]:
      len_increase_idx.append(i)
  len_increase_idx.append(len(sorted_lengths))

  # NOTE: this local list shadows the function's own name inside the body.
  batch_data = []
  curr_idx = 0
  # The outer `i` is unused; the inner loop below reuses the same name.
  for i, idx in enumerate(len_increase_idx):
    # Cut the current equal-length group into chunks of at most batch_size.
    while curr_idx < idx:
      batch_sentences = []
      new_idx = min(curr_idx + batch_size, idx)
      for i in range(curr_idx, new_idx):
        sent_to_vec = prepare_sequence(data[sorted_idx[i]], w2i)
        batch_sentences.append(sent_to_vec)
      curr_idx = new_idx
      # torch.stack requires equal lengths, which the grouping guarantees.
      batch_sentences = torch.stack(batch_sentences).to(device=cuda)
      batch_data.append(batch_sentences)

  # Post-pass over the list while it is being modified:
  #  * a batch with <= 2 rows whose sequence length equals that of the batch
  #    before it is concatenated with that batch; the merged batch is
  #    appended to the end and both originals are removed;
  #  * any remaining single-row batch is dropped.
  # NOTE(review): single-row batches are presumably dropped because
  # Encoder.encode calls squeeze() on its outputs - confirm.
  i = 0
  j = len(batch_data)
  while i < j:
    if i != 0 and len(batch_data[i]) <= 2 and len(batch_data[i][0]) == len(batch_data[i-1][0]):
      batch_data.append(torch.cat((batch_data[i], batch_data[i-1])))
      batch_data.pop(i)
      batch_data.pop(i-1)
      # Two entries were removed before position i+1; step back so the
      # element that slid into position i-1 is examined next.
      i -= 1
      j = len(batch_data)
    elif len(batch_data[i]) == 1:
      batch_data.pop(i)
      j = len(batch_data)
    else:
      i += 1

  return batch_data

In [0]:
# Maximum number of equal-length sentences per batch passed to batch_data.
batch_size = 5

In [0]:
# Training batches; iterated by train() and used to derive print_every.
batch_training = batch_data(batch_size, training_data, w2i)

In [0]:
# Validation batches, indexed with the training vocabulary w2i.
batch_val = batch_data(batch_size, val_data, w2i)

In [0]:
# Test batches, indexed with the training vocabulary w2i.
batch_test = batch_data(batch_size, test_data, w2i)

In [0]:
class Encoder(nn.Module):
  """
  LSTM encoder mapping a batch of token-index sequences to the mean and
  log-variance of a Gaussian over the latent code z
  """

  def __init__(self, vocab_size, embedding_size, hidden_size, latent_size):
    super(Encoder, self).__init__()

    # No padding_idx: this notebook never pads (batches hold equal-length
    # sequences). The former padding_idx=-1 resolved to the last vocabulary
    # index and silently blocked gradient updates for that word.
    self.embeddings = nn.Embedding(num_embeddings=vocab_size,
                                   embedding_dim=embedding_size)

    self.rnn = nn.LSTM(input_size=embedding_size,
                        hidden_size=hidden_size,
                        batch_first=True)

    self.fc_mu = nn.Linear(hidden_size, latent_size)
    self.fc_var = nn.Linear(hidden_size, latent_size)

    self._initialize_parameters([-0.01, 0.01], [-0.1, 0.1])

  def encode(self, x):
    """
    Produces a Gaussian distribution over the possible values of the code z
    from which x could have been generated

    Parameters
    ----------
      x: batch size x sequence length Tensor
        observed data

    Returns
    -------
      mu: batch size x latent size Tensor
        mean of Gaussian distribution

      logvar: batch size x latent size Tensor
        log of variance of Gaussian distribution
    """
    embedded = self.embeddings(x)
    _, (hidden, _) = self.rnn(embedded)
    # hidden is (1, batch, hidden) for this single-layer LSTM. Remove only
    # that leading dimension: a bare squeeze() would also remove the batch
    # dimension when the batch holds a single sequence.
    mu = self.fc_mu(hidden).squeeze(0)
    logvar = self.fc_var(hidden).squeeze(0)
    return mu, logvar

  def _initialize_parameters(self, lstm_init, embed_init):
    """
    Initializes every parameter uniformly in lstm_init = [low, high], then
    re-initializes the embedding matrix uniformly in embed_init = [low, high]
    """
    for param in self.parameters():
      nn.init.uniform_(param, a=lstm_init[0], b=lstm_init[1])
    nn.init.uniform_(self.embeddings.weight, a=embed_init[0], b=embed_init[1])

In [0]:
class Decoder(nn.Module):
  """
  LSTM decoder that is conditioned on the latent code z in two ways: z
  initializes the LSTM state and z is concatenated to every input embedding
  """

  def __init__(self, vocab_size, embedding_size, hidden_size, latent_size):
    super(Decoder, self).__init__()

    # No padding_idx: this notebook never pads. The former padding_idx=-1
    # resolved to the last vocabulary index and silently blocked gradient
    # updates for that word's embedding.
    self.embeddings = nn.Embedding(num_embeddings=vocab_size,
                                   embedding_dim=embedding_size)

    self.rnn = nn.LSTM(input_size=embedding_size + latent_size,
                        hidden_size=hidden_size,
                        batch_first=True)

    self.fc_hid = nn.Linear(latent_size, hidden_size)
    self.fc_voc = nn.Linear(hidden_size, vocab_size)

    self.dropout = nn.Dropout()
    self._initialize_parameters([-0.01, 0.01], [-0.1, 0.1])

  def _initial_state(self, z):
    """
    Builds the (hidden, cell) LSTM state from z, each 1 x batch x hidden size
    """
    cell = self.fc_hid(z).unsqueeze(0)
    hidden = torch.tanh(cell)
    return hidden, cell

  def decode(self, z, inputs):
    """
    Given a code z it produces unscaled output corresponding to the vocabulary
    (teacher forcing: the whole source sequence is fed in one pass)

    Parameters
    ----------
      z: batch size x latent size Tensor
        latent variables

      inputs: batch size x sequence length Tensor
        source sequence

    Returns
    -------
      output_logits: batch size x sequence length x vocab size Tensor
        unscaled output
    """
    hidden, cell = self._initial_state(z)

    embed = self.embeddings(inputs)
    embed = self.dropout(embed)
    # Repeat z along the time axis: (seq, batch, latent) -> (batch, seq, latent)
    z = z.expand(embed.size(1), z.size(0), z.size(1))
    z = z.transpose(1,0)
    embed_lat = torch.cat((embed, z), 2)

    outputs, _ = self.rnn(embed_lat, (hidden, cell))
    outputs = self.dropout(outputs)
    return self.fc_voc(outputs)

  def decode_greedy(self, z, inputs, interpolation=False,
                    word_to_idx=None, idx_to_word=None):
    """
    Greedily decodes from a code z, feeding each step's argmax token back in

    Parameters
    ----------
      z: batch size x latent size Tensor
        latent variables

      inputs: batch size x sequence length Tensor
        only column 0 is read (first input token); the sequence length is
        the maximum number of decoding steps

      interpolation: bool
        when True a sequence stops growing once "</s>" is predicted (the
        "</s>" token itself is not emitted) and decoding ends early when
        every sequence has stopped

      word_to_idx, idx_to_word: dict, optional
        vocabulary mappings; default to the notebook globals w2i / i2w

    Returns
    -------
      output_logits: batch size x number of decoded steps x vocab size Tensor
        unscaled output

      batch_decoded: batch size x output sequence length list
        decoded output sequence
    """
    if word_to_idx is None:
      word_to_idx = w2i
    if idx_to_word is None:
      idx_to_word = i2w
    end_idx = word_to_idx["</s>"] if interpolation else None

    hidden, cell = self._initial_state(z)

    batch_size = inputs.size(0)
    seq_len = inputs.size(1)
    input_d = inputs[:,0]
    batch_decoded = [[] for _ in range(batch_size)]

    # Collect per-step logits and concatenate once at the end. The previous
    # code only bound `output_logits` from the second step on, so decoding a
    # single step raised UnboundLocalError at the return statement.
    step_logits = []
    active = [True] * batch_size
    counter = 0
    while any(active) and counter < seq_len:
      embed = self.embeddings(input_d)
      embed_lat = torch.cat((embed, z), 1).unsqueeze(1)
      outputs, (hidden, cell) = self.rnn(embed_lat, (hidden, cell))
      output_logit = self.fc_voc(outputs)
      step_logits.append(output_logit)
      input_d = torch.argmax(output_logit, dim=2).flatten()

      # One transfer per step instead of one .item() call per sequence.
      for k, idx in enumerate(input_d.tolist()):
        if not active[k]:
          continue
        if interpolation and idx == end_idx:
          active[k] = False
        else:
          batch_decoded[k].append(idx_to_word[idx])
      counter += 1

    if step_logits:
      output_logits = torch.cat(step_logits, dim=1)
    else:
      # Nothing was decoded (empty batch): return an empty logits tensor.
      output_logits = z.new_zeros(batch_size, 0, self.fc_voc.out_features)
    return output_logits, batch_decoded

  def _initialize_parameters(self, lstm_init, embed_init):
    """
    Initializes every parameter uniformly in lstm_init = [low, high], then
    re-initializes the embedding matrix uniformly in embed_init = [low, high]
    """
    for param in self.parameters():
      nn.init.uniform_(param, a=lstm_init[0], b=lstm_init[1])
    nn.init.uniform_(self.embeddings.weight, a=embed_init[0], b=embed_init[1])

In [0]:
class VAE(nn.Module):
  """
  Variational autoencoder tying together an encoder (exposing `encode`) and
  a decoder (exposing `decode` and `decode_greedy`)
  """

  def __init__(self, encoder, decoder):
    super(VAE, self).__init__()
    self.encoder = encoder
    self.decoder = decoder
    self.re_loss = nn.CrossEntropyLoss()

  def forward(self, x, greedy=False):
    """
    Forward pass of the model

    Parameters
    ----------
      x: batch size x sequence length Tensor
        token indices; x[:, :-1] is fed to the decoder, x[:, 1:] is the target

      greedy: bool
        decode greedily instead of feeding the source sequence

    Returns
    -------
      (kl, re) or, when greedy, (kl, re, batch_decoded)
    """
    mu, logvar = self.encoder.encode(x)
    kl = self.get_kl(mu, logvar)
    z = self._reparameterize(mu, logvar)

    source, target = x[:,:-1], x[:, 1:]
    if not greedy:
      logits = self.decoder.decode(z, source)
      return kl, self.get_reconstruction_error(logits, target)

    logits, batch_decoded = self.decoder.decode_greedy(z, source)
    re = self.get_reconstruction_error(logits, target)
    return kl, re, batch_decoded

  def _reparameterize(self, mu, logvar):
    """
    Draws z = mu + std * eps with eps ~ N(0, I), so that the sample is a
    deterministic function of mu and logvar

    Parameters
    ----------
      mu: batch size x latent size Tensor
        mean of Gaussian distribution

      logvar: batch size x latent size Tensor
        log of variance of Gaussian distribution

    Returns
    -------
      z: batch size x latent size Tensor
        reparameterization of latent variables
    """
    std = torch.exp(logvar / 2)
    noise = torch.randn_like(std)
    return mu + std * noise

  def get_kl(self, mu, logvar):
    """
    Element-wise KL divergence between N(mu, exp(logvar)) and N(0, 1)

    Parameters
    ----------
      mu: batch size x latent size Tensor
        mean of Gaussian distribution

      logvar: batch size x latent size Tensor
        log of variance of Gaussian distribution

    Returns
    -------
      kl: batch size x latent size Tensor
        kl divergence
    """
    variance = logvar.exp()
    return (mu**2 + variance - 1 - logvar) / 2

  def get_reconstruction_error(self, output_logits, target):
    """
    Cross-entropy between the logits and the target tokens, computed over
    the flattened batch and time dimensions

    Parameters
    ----------
      output_logits: batch size x sequence length x vocab size Tensor
        unscaled output

      target: batch size x sequence length Tensor
        target sequence
    """
    vocab = output_logits.size(2)
    flat_logits = output_logits.view(-1, vocab)
    flat_target = target.contiguous().view(-1)
    return self.re_loss(flat_logits, flat_target)


In [0]:
# Model dimensions shared by the Encoder and Decoder constructors below.
vocab_size = len(w2i)  # number of entries in the training vocabulary
embedding_size = 128
hidden_size = 512 
latent_size = 32

In [0]:
# Log roughly five times per epoch. Clamp to at least 1: with fewer than
# three batches the rounded value is 0 and `batch_idx % print_every` in
# train() would raise ZeroDivisionError.
print_every = max(1, round(len(batch_training) / 5))

In [0]:
def train(epoch, pretraining=False, target_rate=4.0):
  """
  Trains the model for one pass over batch_training
  Helpful link for free bits: https://stats.stackexchange.com/questions/267924/explanation-of-the-free-bits-technique-for-variational-autoencoders

  Uses the notebook globals model, optimizer_e, optimizer_d, batch_training
  and print_every.

  Parameters
  ----------
    epoch: int
      zero-based epoch; sets the KL weight (0.1 * epoch, capped at 1.0 from
      epoch 10 on) and decides whether progress is printed

    pretraining: bool
      when True only the reconstruction error is optimized

    target_rate: float
      free-bits floor applied to each latent dimension's batch-mean KL
  """
  model.train()
  anneal = 0.1 * epoch if epoch < 10 else 1.0

  # Running totals are plain floats (.item()): adding the loss tensors
  # themselves keeps autograd history alive across iterations.
  running_loss = 0.0
  running_kl = 0.0
  running_re = 0.0
  for batch_idx, data in enumerate(batch_training):
    optimizer_e.zero_grad()
    optimizer_d.zero_grad()
    kl, re = model(data)
    if pretraining:
      loss = re
    else:
      # Free bits: per latent dimension use max(batch-mean KL, target_rate).
      # Dimensions at or below the floor contribute the constant
      # target_rate and therefore receive no KL gradient.
      kl_mean = kl.mean(dim=0)
      kl_mask = (kl_mean > target_rate).float()
      fb_mask = (kl_mean <= target_rate).float()
      # kl_mask is 0 wherever fb_mask is 1, so this term is target_rate there.
      free_b = kl_mask + target_rate
      kl = (kl_mean * kl_mask + free_b * fb_mask).sum()
      loss = kl * anneal + re
      running_kl += (kl * anneal).item()
      running_re += re.item()
    loss.backward()
    optimizer_e.step()
    optimizer_d.step()

    running_loss += loss.item()

    # Report windowed averages on the first epoch and every tenth epoch.
    if (epoch == 0 or epoch % 10 == 9) and batch_idx % print_every == print_every-1:
      print('[%d, %5d] Train loss: %.3f' % (epoch + 1, batch_idx + 1, running_loss / print_every))
      running_loss = 0.0
      if not pretraining:
        print('[%d, %5d] Train KL: %.3f' % (epoch + 1, batch_idx + 1, running_kl / print_every))
        print('[%d, %5d] Train RE: %.3f' % (epoch + 1, batch_idx + 1, running_re / print_every))
        running_kl = 0.0
        running_re = 0.0

In [0]:
def test(epoch, validation=False, pretraining=False, target_rate=4.0):
  """
  Run the model on validation or test dataset

  Uses the notebook globals model, batch_val and batch_test.

  Parameters
  ----------
    epoch: int
      zero-based epoch; sets the KL weight exactly as in train() and decides
      whether results are printed

    validation: bool
      True evaluates batch_val with the regular decoder pass; False evaluates
      batch_test with greedy decoding and prints one decoded sentence

    pretraining: bool
      when True the loss is the reconstruction error only

    target_rate: float
      free-bits floor applied to each latent dimension's batch-mean KL

  Returns
  -------
    avg_loss: scalar Tensor
      mean loss over the batches (carries no autograd history)

  Raises
  ------
    ValueError
      if the selected split contains no batches
  """
  model.eval()
  anneal = 0.1 * epoch if epoch < 10 else 1.0

  batches = batch_val if validation else batch_test
  running_loss = 0.0
  running_kl = 0.0
  running_re = 0.0
  num_batches = 0
  batch_decoded = None
  # Evaluation needs no gradients; without no_grad every batch would build
  # a graph that the running totals then keep alive.
  with torch.no_grad():
    for data in batches:
      if validation:
        kl, re = model(data)
      else:
        kl, re, batch_decoded = model(data, True)
      if pretraining:
        loss = re
      else:
        # Free bits: per latent dimension use max(batch-mean KL, target_rate).
        kl_mean = kl.mean(dim=0)
        kl_mask = (kl_mean > target_rate).float()
        fb_mask = (kl_mean <= target_rate).float()
        fb = kl_mask + target_rate
        kl = (kl_mean * kl_mask + fb * fb_mask).sum()
        loss = kl * anneal + re
        running_kl += kl * anneal
        running_re += re

      running_loss += loss
      num_batches += 1

  if num_batches == 0:
    raise ValueError("no batches to evaluate")

  avg_loss = running_loss / num_batches
  avg_kl = running_kl / num_batches
  avg_re = running_re / num_batches
  if epoch == 0 or epoch % 10 == 9:
    if validation:
      print('[%d, %5d] Validation loss: %.3f' % (epoch + 1, num_batches, avg_loss))
      if not pretraining:
        print('[%d, %5d] Validation KL: %.3f' % (epoch + 1, num_batches, avg_kl))
        print('[%d, %5d] Validation RE: %.3f' % (epoch + 1, num_batches, avg_re))
    else:
      print('[%d, %5d] Test loss: %.3f' % (epoch + 1, num_batches, avg_loss))
      if not pretraining:
        print('[%d, %5d] Test KL: %.3f' % (epoch + 1, num_batches, avg_kl))
        print('[%d, %5d] Test RE: %.3f' % (epoch + 1, num_batches, avg_re))
      # First sentence of the last evaluated batch, as a qualitative sample.
      print(*batch_decoded[0])


  return avg_loss

In [0]:
class DecayLearning:
  """
  Class updated from https://github.com/Bjarten/early-stopping-pytorch/blob/master/pytorchtools.py

  Tracks the best (lowest) validation loss seen so far and raises the
  `update_lr` flag once `patience` consecutive calls fail to match or beat it.
  """
  def __init__(self, patience=2):
    self.patience = patience
    self.counter = 0
    self.best_score = None
    self.update_lr = False

  def __call__(self, val_loss):
    # Scores are negated losses, so a higher score is better.
    score = -val_loss

    if self.best_score is not None and score < self.best_score:
      # Strictly worse than the best so far: count towards patience.
      self.counter += 1
      if self.counter >= self.patience:
        self.update_lr = True
      return

    # First call, or a score that ties / beats the best one.
    if self.best_score is not None:
      self.counter = 0
    self.best_score = score

In [0]:
# Stage 1: train encoder and decoder with the reconstruction loss only
# (train/test are called with pretraining=True).
encoder = Encoder(vocab_size, embedding_size, hidden_size, latent_size)
decoder = Decoder(vocab_size, embedding_size, hidden_size, latent_size)
model = VAE(encoder, decoder)
if use_cuda and torch.cuda.is_available():
  model.cuda()
  
# Separate SGD optimizers for the encoder and the decoder, same learning rate.
lr = 0.5
optimizer_e = optim.SGD(encoder.parameters(), lr=lr)
optimizer_d = optim.SGD(decoder.parameters(), lr=lr)

decay_learning = DecayLearning()
num_decays = 0
early_stop = 5  # training stops at decay number early_stop + 1

for epoch in range(100):
  train(epoch, True)
  val_loss = test(epoch, True, True)
  # Evaluate on the test split on the first epoch and every tenth epoch.
  if epoch == 0 or epoch % 10 == 9: 
    test(epoch, False, True)
  decay_learning(val_loss)
  if decay_learning.update_lr:
    # Validation loss stopped improving: halve the learning rate, rebuild
    # both optimizers with it and restart the patience tracker.
    lr *= 0.5
    num_decays += 1
    if num_decays == early_stop + 1:
      print("Stopping early at epoch", epoch+1)
      break
    print("Learning rate has been decayed to", lr, "at epoch", epoch+1)
    optimizer_e = optim.SGD(encoder.parameters(), lr=lr)
    optimizer_d = optim.SGD(decoder.parameters(), lr=lr)
    decay_learning = DecayLearning()

# Save the pretrained weights to the mounted Google Drive.
save_name = 'surprising.pt'
path = F"/content/gdrive/My Drive/{save_name}" 

torch.save({
          'model_state_dict': model.state_dict()
          }, path)

[1,  1600] Train loss: 4.169
[1,  3200] Train loss: 3.755
[1,  4800] Train loss: 3.759
[1,  6400] Train loss: 3.889
[1,  8000] Train loss: 3.872
[1,   802] Validation loss: 4.563
[1,   801] Test loss: 6.728
A man is sitting on a <unk> , <unk> , and <unk> , are playing
Learning rate has been decayed to 0.25 at epoch 5
[10,  1600] Train loss: 1.508
[10,  3200] Train loss: 1.553
[10,  4800] Train loss: 1.722
[10,  6400] Train loss: 1.955
[10,  8000] Train loss: 2.148
[10,   802] Validation loss: 3.175
[10,   801] Test loss: 5.391
Two men are sitting </s> </s> . </s> </s> are sitting next to the man
Learning rate has been decayed to 0.125 at epoch 15
Learning rate has been decayed to 0.0625 at epoch 18
[20,  1600] Train loss: 0.517
[20,  3200] Train loss: 0.600
[20,  4800] Train loss: 0.759
[20,  6400] Train loss: 0.990
[20,  8000] Train loss: 1.221
[20,   802] Validation loss: 2.194
[20,   801] Test loss: 4.139
Two men are sweeping , one is sitting and one is sitting down . </s>
Learning 

In [0]:
# Stage 2: build a fresh Decoder but reuse the `encoder` object trained in
# the previous cell, then train with the KL term enabled (pretraining=False).
decoder = Decoder(vocab_size, embedding_size, hidden_size, latent_size)
model = VAE(encoder, decoder)
if use_cuda and torch.cuda.is_available():
  model.cuda()

# Restart the learning rate, the optimizers and the decay bookkeeping.
lr = 0.5
optimizer_e = optim.SGD(encoder.parameters(), lr=lr)
optimizer_d = optim.SGD(decoder.parameters(), lr=lr)
num_decays = 0
decay_learning = DecayLearning()

# First 10 epochs (the KL weight inside train() grows by 0.1 per epoch);
# decay_learning is not consulted in this loop.
for epoch in range(10):
  train(epoch, False)
  val_loss = test(epoch, True, False)
  if epoch == 0 or epoch % 10 == 9: 
    test(epoch, False, False)

[1,  1600] Train loss: 2.669
[1,  1600] Train KL: 0.000
[1,  1600] Train RE: 2.669
[1,  3200] Train loss: 1.883
[1,  3200] Train KL: 0.000
[1,  3200] Train RE: 1.883
[1,  4800] Train loss: 1.834
[1,  4800] Train KL: 0.000
[1,  4800] Train RE: 1.834
[1,  6400] Train loss: 1.949
[1,  6400] Train KL: 0.000
[1,  6400] Train RE: 1.949
[1,  8000] Train loss: 2.108
[1,  8000] Train KL: 0.000
[1,  8000] Train RE: 2.108
[1,   802] Validation loss: 3.404
[1,   802] Validation KL: 0.000
[1,   802] Validation RE: 3.404
[1,   801] Test loss: 4.372
[1,   801] Test KL: 0.000
[1,   801] Test RE: 4.372
Two men are one , , , one is sitting sitting sitting sitting down .
[10,  1600] Train loss: 115.571
[10,  1600] Train KL: 115.210
[10,  1600] Train RE: 0.363
[10,  3200] Train loss: 115.581
[10,  3200] Train KL: 115.205
[10,  3200] Train RE: 0.378
[10,  4800] Train loss: 115.742
[10,  4800] Train KL: 115.204
[10,  4800] Train RE: 0.540
[10,  6400] Train loss: 115.999
[10,  6400] Train KL: 115.204
[10,  6

In [46]:
# Continue stage 2 from epoch 10, now halving the learning rate whenever the
# validation loss stops improving. Relies on lr, num_decays, early_stop,
# decay_learning and the optimizers left behind by the earlier cells.
for epoch in range(10,100):
  train(epoch, False)
  val_loss = test(epoch, True, False)
  if epoch == 0 or epoch % 10 == 9: 
    test(epoch, False, False)
  decay_learning(val_loss)
  if decay_learning.update_lr:
    lr *= 0.5
    num_decays += 1
    # Stop at decay number early_stop + 1.
    if num_decays == early_stop + 1:
      print("Stopping early at epoch", epoch+1)
      break
    print("Learning rate has been decayed to", lr, "at epoch", epoch+1)
    # Rebuild both optimizers with the new rate and restart the tracker.
    optimizer_e = optim.SGD(encoder.parameters(), lr=lr)
    optimizer_d = optim.SGD(decoder.parameters(), lr=lr)
    decay_learning = DecayLearning()

Learning rate has been decayed to 0.25 at epoch 13
Learning rate has been decayed to 0.125 at epoch 16
[20,  1600] Train loss: 128.193
[20,  1600] Train KL: 128.001
[20,  1600] Train RE: 0.192
[20,  3200] Train loss: 128.245
[20,  3200] Train KL: 128.001
[20,  3200] Train RE: 0.245
[20,  4800] Train loss: 128.370
[20,  4800] Train KL: 128.000
[20,  4800] Train RE: 0.369
[20,  6400] Train loss: 128.581
[20,  6400] Train KL: 128.001
[20,  6400] Train RE: 0.581
[20,  8000] Train loss: 128.855
[20,  8000] Train KL: 128.000
[20,  8000] Train RE: 0.855
[20,   802] Validation loss: 130.209
[20,   802] Validation KL: 128.031
[20,   802] Validation RE: 2.178
[20,   801] Test loss: 132.171
[20,   801] Test KL: 128.038
[20,   801] Test RE: 4.133
Two men are keeping , . </s> </s> , is sitting , is on .
Learning rate has been decayed to 0.0625 at epoch 22
[30,  1600] Train loss: 128.104
[30,  1600] Train KL: 128.000
[30,  1600] Train RE: 0.103
[30,  3200] Train loss: 128.153
[30,  3200] Train KL: 1

In [0]:
def interpolate(latent_size, max_len=12):
  """
  Prints sentences decoded along a straight line between two random codes

  Two latent vectors are drawn from a standard normal and decoded greedily
  at 11 evenly spaced mixing weights 0.0, 0.1, ..., 1.0, using the notebook
  globals `decoder` and `cuda`.

  Parameters
  ----------
    latent_size: int
      dimensionality of the latent code

    max_len: int
      maximum number of decoding steps per sentence
  """
  sample1 = torch.randn(1, latent_size, device=cuda)
  sample2 = torch.randn(1, latent_size, device=cuda)
  # decode_greedy reads only column 0 as the first input token and takes the
  # width as the step limit. Index 0 is "<s>" in the vocabulary built by
  # load_data.
  start = torch.zeros(1, max_len, dtype=torch.long, device=cuda)
  # Pure inference: skip building autograd graphs.
  with torch.no_grad():
    for w in range(11):
      weight = w * 0.1
      sample = weight * sample2 + (1-weight) * sample1
      _, batch_decoded = decoder.decode_greedy(sample, start, True)
      print(*batch_decoded[0])

In [48]:
interpolate(latent_size)

The man standing is standing by a hotel .
The male standing is standing by a hotel .
The male man is standing by the garden .
The male man standing standing by the his log .
The bald man sitting standing up the the bed .
The bald man sitting standing up the the bed .
The male man sitting sitting up from the bed .
The blue man sitting sitting down from the bed .
The middle <unk> sitting sitting down from the ground the bed .
The middle <unk> sitting sitting down from the ground the ground .
The five <unk> sitting sitting down from the ground to the ground


In [49]:
interpolate(latent_size)

An elderly watches drums drums to cross the street
An elderly watches two drums to cross the race a street .
An elderly watches two drums to cross the a race .
A elderly and uses equipment drums to cross a race .
A older woman with old equipment to cross a hole into .
A older woman and an equipment to cross a hole against .
A barefoot woman and an <unk> gather to a turn up .
A small boy , two <unk> stand next a card behind .
A small boy , only old man after a card behind beans
A small boy is holding an men squatting behind a disc set
A small boy is holding a black cart next to cigarettes .


In [50]:
interpolate(latent_size)

An infant up in the laundry garden river walks by by naked
An infant baby in the blue cliff ring walks by by himself
Male infant kid in the blue cliff ring walks by himself by
One long-haired kid in a white mat ring while by himself himself
Two long-haired boy in a white slide walks while by himself walks
Two long-haired boy in a white subway bench while himself himself himself
Two dark-haired boy in a white crowd stands while himself himself himself
Two shirtless boy sitting a white crowd while standing behind himself himself
Two white man sitting a crowd crowd while a boy up smiles
Two white man sitting a crowd crowd while standing posing posing .
Two man people sitting a crowd boy standing behind posing standing .


In [51]:
interpolate(latent_size)

white men are running holding a <unk> and smile a smile .
Two men are swimming holding a <unk> and smile a smile .
Two men are swimming holding a <unk> and like a smile .
Two people are swimming holding a <unk> and like a joke set
Two people are swimming up a <unk> and like on a joke
Two people are swimming up and white friend to a hula joke
Two people are swimming up something toy like to a hula joke
Two children are up up something toy to put a hula hoop
Two children are up up something plastic to hula a hoop hoop
Two girls jumping up to blue plastic to dive .
Two girls jumping up to blue rocking to hula hula .


In [52]:
interpolate(latent_size)

A female red-haired talks on a woman and another person standing to
A female soldier talks on a woman and another woman eating to
A female soldier talks on a phone and another woman fixing her
A female soldier sits on a bench and people stop to a
A female teenager sits on a bench and people are on the
A female teenager sits on a bench and they heading the train
A female family sitting on a ledge that 's heading during the
A female family sitting on a ledge where leaving work during display
A family sits sitting on a snowy track or work during work
A family sits around on a snowy lined lined during work site
Three family sits around at a snowy forest work during work .


In [53]:
interpolate(latent_size)

Two white dog in a crowd walks up <unk> up <unk> .
Two white dog making a white stands up <unk> up <unk> .
Two white dog making a white jumps up <unk> for his hands
Two white dog making a <unk> jumps up with his <unk> jumps
The white rider has a <unk> up while <unk> in his mouth
The dirt rider has white <unk> in a ball jumps for his
The dirt rider racing white football in a <unk> jumps in her
The dirt rider racing white football at a team 's in midair
The dirt rider <unk> flat with the air in midair
The ladder car flat <unk>
The ladder car flat <unk>


In [54]:
interpolate(latent_size)

Three group men are in a blue canoe on the concrete dock
Three group men are in a blue canoe on the concrete railing
Three young men are in a blue next to some trees .
Three young men are in a canoe next to some trees .
Three young men are using a canoe out behind behind fishing .
Three Three adults men wearing a trash back from behind behind .
Three Three two men are holding two luggage to their back behind
Three Three three men men holding two mom behind behind fish behind
three Three three adults men holding his mom behind from behind behind
three three three three men holding two trash from behind behind toy
three three three three adults men holding two boxes as from behind


In [55]:
interpolate(latent_size)

Men are in a green uniform in a big middle
Men are in a green uniform in a big big gym
Men men in a professional uniform through a big gym .
Two men riding a uniform performing the gym .
Two men riding in a uniform won a dark gym .
Two men riding in a uniform won a dark gym .
Four men riding in a hallway won a evening alley .
Four men standing in a sidewalk during a indoor country .
Six kids in standing enjoying a country lit .
Six of men standing across a courtyard .
Six of kids standing standing a quiet .


In [56]:
interpolate(latent_size)

three men men up up and down outside talking outside .
two men men up up and working outside another man .
two adults men up up <unk> outside while a man outside .
two young people up up <unk> standing outside a man outside
two young people up up <unk> standing beside a outside outside
two young people up blue wall standing a outside man near a
two girls people in white bicycle next a man near a building
two girls people in a car parked outside a building building
The girls , in a truck steps beside the <unk> beside
The little boy in a truck truck beside the <unk> beside a
The small boy in a truck truck beside a city building


In [57]:
interpolate(latent_size)

Some woman are fighting with rope and a backpack sitting
Some women are fighting over with another and running
Five women are fighting over a helmet sitting with a backpack on
Five women are fighting over a helmet sitting on a pole with
2 men are fighting over another motorcycle on a pole with a
2 men are <unk> over another while riding a pole .
Two men men jumping a tree while interacting on a small pole
two three men fixing a bike while next on a small bridge
two two men <unk> a tree next to another a bridge .
The two men <unk> a tree next to a bridge .
The two men <unk> the tree next to a bridge .


In [0]:
"""
Downsampling
"""

import random

# Training
f = open('./snli_1.0/snli_1.0_train.txt')
contents = f.read()
file_as_list = contents.splitlines()
sentences = []
for example in file_as_list[1:]:
    sentence1 = example.split('\t')[5]
    sentence2 = example.split('\t')[6]
    sent1_len = len(sentence1.split())
    sent2_len = len(sentence2.split())
    if sent1_len <= 12: sentences.append(sentence1)
    if sent2_len <= 12: sentences.append(sentence2)

sample = random.sample(sentences, 40000)

f = open('./snli_1.0/sample_train.txt', "w")
for sentence in sample:
    f.write(sentence)
    f.write("\n")
f.close()

# Development
f = open('./snli_1.0/snli_1.0_dev.txt')
contents = f.read()
file_as_list = contents.splitlines()
sentences = []
for example in file_as_list[1:]:
    sentence1 = example.split('\t')[5]
    sentence2 = example.split('\t')[6]
    sent1_len = len(sentence1.split())
    sent2_len = len(sentence2.split())
    if sent1_len <= 12: sentences.append(sentence1)
    if sent2_len <= 12: sentences.append(sentence2)

sample = random.sample(sentences, 4000)

f = open('./snli_1.0/sample_dev.txt', "w")
for sentence in sample:
    f.write(sentence)
    f.write("\n")
f.close()

# Test
f = open('./snli_1.0/snli_1.0_test.txt')
contents = f.read()
file_as_list = contents.splitlines()
sentences = []
for example in file_as_list[1:]:
    sentence1 = example.split('\t')[5]
    sentence2 = example.split('\t')[6]
    sent1_len = len(sentence1.split())
    sent2_len = len(sentence2.split())
    if sent1_len <= 12: sentences.append(sentence1)
    if sent2_len <= 12: sentences.append(sentence2)

sample = random.sample(sentences, 4000)

f = open('./snli_1.0/sample_test.txt', "w")
for sentence in sample:
    f.write(sentence)
    f.write("\n")
f.close()