<a href="https://colab.research.google.com/github/shainedl/Papers-Colab/blob/master/Hybrid_Convolutional_VAE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Based on *A Hybrid Convolutional Variational Autoencoder for Text Generation* (Semeniuta et al., Universität zu Lübeck)

In [0]:
import torch
from torch import nn
from nltk.tokenize import TweetTokenizer
from google.colab import files
from collections import defaultdict
from itertools import count
import numpy as np
from torch.autograd import Variable

In [0]:
class VAE(nn.Module):
  """
  Convolutional encoder of a hybrid VAE over sequences of symbol indices.

  Only the encoding path is wired up so far: forward() embeds the indices,
  runs the strided Conv1d stack and returns (mu, logvar). The transposed
  convolution stack (cnn_layers2) is constructed but decode() is still a stub.
  """

  def __init__(self, vocab_size, embedding_dim, z_dim):
    """
    Parameters
    ----------
    vocab_size : number of rows of the embedding table
    embedding_dim : embedding width; also the input channels of the first Conv1d
    z_dim : size of the mu and logvar vectors

    Helpful Links
    -------------
    To define the layers of the CNN: https://www.analyticsvidhya.com/blog/2019/10/building-image-classification-models-cnn-pytorch/
    Deconvolution layers: https://datascience.stackexchange.com/questions/6107/what-are-deconvolutional-layers
    Batch normalization: https://towardsdatascience.com/batch-normalization-in-neural-networks-1ac91516821c
    """
    super().__init__()

    def stage(conv_cls, c_in, c_out):
      # One (de)convolution -> batch norm -> ReLU stage; every stage uses
      # kernel_size=3 and stride=2.
      return [conv_cls(c_in, c_out, kernel_size=3, stride=2),
              nn.BatchNorm1d(c_out),
              nn.ReLU()]

    self.embeddings = nn.Embedding(vocab_size, embedding_dim)

    # Encoder: five 1D convolution stages followed by a flatten.
    encoder_widths = [embedding_dim, 128, 256, 512, 512, 512]
    encoder_modules = []
    for c_in, c_out in zip(encoder_widths, encoder_widths[1:]):
      encoder_modules.extend(stage(nn.Conv1d, c_in, c_out))
    encoder_modules.append(nn.Flatten())
    self.cnn_layers1 = nn.Sequential(*encoder_modules)

    # Decoder: five 1D deconvolution stages mirroring the encoder widths.
    decoder_widths = [512, 512, 512, 512, 256, 128]
    decoder_modules = []
    for c_in, c_out in zip(decoder_widths, decoder_widths[1:]):
      decoder_modules.extend(stage(nn.ConvTranspose1d, c_in, c_out))
    self.cnn_layers2 = nn.Sequential(*decoder_modules)

    # Both heads expect 512 * 3 flattened features, i.e. the last encoder
    # stage must leave exactly 3 positions per channel.
    self.fc11 = nn.Linear(512 * 3, z_dim)
    self.fc12 = nn.Linear(512 * 3, z_dim)

  def encode(self, x):
    """
    Runs the convolutional stack on x and projects the flattened features
    to the pair (mu, logvar).
    """
    features = self.cnn_layers1(x)

    return self.fc11(features), self.fc12(features)

  def decode(self, z):
    """
    Not implemented yet; returns None.
    """
    return None

  def forward(self, x):
    """
    Forward pass of the model

    Embeds the index tensor x, swaps dimensions 1 and 2 so the embedding
    dimension becomes the Conv1d channel dimension, and returns (mu, logvar).
    """
    embedded = self.embeddings(x)
    channels_first = torch.transpose(embedded, 1, 2)

    return self.encode(channels_first)

  def __reparameterize(self, mu, logvar):
    """
    Draws mu + std * eps with std = exp(logvar / 2) and eps ~ N(0, 1).
    """
    sigma = torch.exp(logvar / 2)
    noise = torch.randn_like(sigma)

    return mu + sigma * noise

In [0]:
def load_data(txt):
  """
  Turns a corpus with one tweet per line into character-level sequences.

  Each line is tokenized with TweetTokenizer and every token is mapped
  through twitter_word_classes. Ordinary tokens are split into characters;
  the '@userid' and 'url' placeholders are kept as single symbols. A single
  space symbol separates consecutive tokens.

  Returns
  -------
  data : list of symbol lists, one per line of txt
  w2i : dict symbol -> index (characters in order of first appearance,
        followed by " ", "_UNK_", "_PAD_", "@userid", "url")
  i2w : dict index -> symbol (inverse of w2i)
  """
  next_index = count(0)
  vocab = defaultdict(lambda: next(next_index))
  tokenizer = TweetTokenizer()
  placeholders = ('@userid', 'url')

  data = []
  for line in txt.splitlines():
    mapped = [twitter_word_classes(tok) for tok in tokenizer.tokenize(line)]
    last_pos = len(mapped) - 1
    symbols = []
    for pos, tok in enumerate(mapped):
      if tok in placeholders:
        symbols.append(tok)
      else:
        for ch in tok:
          vocab[ch]  # the lookup itself assigns the next free index
        symbols.extend(tok)
      if pos != last_pos:
        symbols.append(" ")
    data.append(symbols)

  # Special symbols are registered after the corpus characters, in this order.
  for special in (" ", "_UNK_", "_PAD_", "@userid", "url"):
    vocab[special]

  w2i = dict(vocab)
  i2w = {index: symbol for symbol, index in w2i.items()}

  return data, w2i, i2w

In [0]:
def twitter_word_classes(token):
  """ 
  Converts Twitter specific classes

  Parameters
  ----------
  token : str, a single token

  Returns
  -------
  '@userid' when the token is an '@' followed by at least one character,
  'url' when the token starts with 'http://' or 'https://',
  otherwise the token unchanged (an empty token is returned as is).
  """
  # startswith() is safe on the empty string, unlike token[0].
  if len(token) > 1 and token.startswith('@'):
    return '@userid'
  # Accept both schemes so https links are not split into characters.
  if token.startswith(('http://', 'https://')):
    return 'url'

  return token


In [10]:
# Colab-only step: upload the corpus file. The result is indexed by file name
# later in the notebook (uploaded['hybrid_cvae.txt']).
uploaded = files.upload()

Saving hybrid_cvae.txt to hybrid_cvae.txt


In [0]:
# Build the character-level sequences and the symbol <-> index maps from the
# uploaded corpus.
data, w2i, i2w = load_data(uploaded['hybrid_cvae.txt'])
# Number of distinct symbols, including the special ones load_data registers.
vocab_size = len(w2i)

In [0]:
# Hyperparameters passed to VAE(vocab_size, embedding_dim, z_dim).
embedding_dim = 80  # embedding width = input channels of the first Conv1d
z_dim = 512  # size of the mu / logvar vectors returned by the encoder

In [0]:
def train(epoch):
  """
  Runs a single forward pass of the global `model` on the global `batch_data`.

  The model is put in training mode first. `epoch` is accepted but not used
  yet, and the encoder outputs are computed without being returned.
  """
  model.train()
  mean, log_variance = model(batch_data)

In [104]:
# NOTE: train() reads the globals `model` and `batch_data`. `batch_data` is
# only built in the batching cell further down, so on a fresh kernel that cell
# (and the prepare_sequence cell it needs) must be run before this one.
model = VAE(vocab_size, embedding_dim, z_dim)
train(0)

torch.Size([128, 512])


In [0]:
def prepare_sequence(seq, to_ix):
  """
  Converts a sequence of symbols into a 1-D LongTensor of indices.

  Parameters
  ----------
  seq : iterable of symbols
  to_ix : mapping symbol -> index

  Returns
  -------
  torch.LongTensor with one index per symbol (empty for an empty seq)

  Raises
  ------
  KeyError if a symbol is missing from to_ix and to_ix has no "_UNK_" entry
  """
  idxs = []
  for w in seq:
    try:
      idxs.append(to_ix[w])
    except KeyError:
      # Unseen symbols map to the reserved unknown token when the vocabulary
      # provides one; otherwise the original KeyError propagates.
      if "_UNK_" not in to_ix:
        raise
      idxs.append(to_ix["_UNK_"])
  return torch.tensor(idxs, dtype=torch.long)

In [0]:
# Builds one padded batch of index sequences, following
# https://cs230-stanford.github.io/pytorch-nlp.html
batch_size = 128

# Index-encode the first `batch_size` examples. Slicing (instead of indexing
# data[i]) avoids an IndexError when fewer examples are available.
batch_sentences = [prepare_sequence(seq, w2i) for seq in data[:batch_size]]

# compute length of longest sentence in batch
batch_max_len = max(len(s) for s in batch_sentences)

# Right-pad every sequence with the _PAD_ index up to batch_max_len. The result
# is a LongTensor with one row per sequence, so no NumPy round trip is needed
# and the deprecated Variable wrapper is dropped (tensors track gradients
# themselves).
# NOTE: the VAE's linear heads are sized for 512 * 3 flattened features, which
# constrains the padded lengths the model accepts.
batch_data = nn.utils.rnn.pad_sequence(
    batch_sentences, batch_first=True, padding_value=w2i["_PAD_"])