In [1]:
import stanza
import torch
from torch import nn
from torch import optim
import json

  from .autonotebook import tqdm as notebook_tqdm


## VAE Sentence Generator – Based on Bowman et al. (2016)

This notebook re-implements the core model from **“Generating Sentences from a Continuous Space”** using Python & PyTorch. The architecture is a **Variational Autoencoder (VAE)** designed to generate full sentences by learning a continuous latent space.

We apply it to the **[Sarcasm Headlines Dataset](https://www.kaggle.com/datasets/rmisra/news-headlines-dataset-for-sarcasm-detection)**.


#### 0. Initialize Device & Load Dataset

In [2]:
device = {
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
}

print(f"Using device {device}")  

device = device.pop()

Using device {'cpu'}


In [3]:
# Transform JSON file into a txt file (only first 3000 rows for performance reasons)
with open("sarcasm_headlines_dataset.json", "r", encoding="utf-8") as f:
    data = [json.loads(line) for _, line in zip(range(3000), f)]

txt = "\n".join(item["headline"] for item in data)
print(txt[:100])  # Print first 100 characters to verify extraction

former versace store clerk sues over secret 'black code' for minority shoppers
the 'roseanne' reviva


In [4]:
# Set up variables
vocabulary = []
ngrams = []
n = 0; # Total Number of tokens
N = 5 # Window size
ngram = []
dict = {}

# Load data with stanza & tokenize
pipeline = stanza.Pipeline(lang='en', processors='tokenize')
doc = pipeline(txt)

pipeline = stanza.Pipeline(lang='en', processors='tokenize')
doc = pipeline(txt)

for i, sentence in enumerate(doc.sentences): # text to sentences
    for token in sentence.words:  # sentences to tokens
        t = token.text.lower() # assign toke to t
        if len(ngram) < N: # if ngram is not full
            ngram.append(t) # add to n-gram
        else: 
            print(ngram)
            ngrams.append(ngram) # add new ngram to ngrams
            ngram = ngram[1:] # shift ngram one to the side
            ngram.append(t) # append the t to ngram

        if not t in vocabulary: 
            vocabulary.append(t) # set up of vocabulary

l = len(vocabulary)
print(vocabulary)
print("Vocabulary Size: " + str(l))

2025-06-29 11:00:57 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json: 432kB [00:00, 34.9MB/s]                    
2025-06-29 11:00:57 INFO: Downloaded file to C:\Users\himan\stanza_resources\resources.json
Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.10.0/models/tokenize/combined.pt: 100%|██████████| 651k/651k [00:00<00:00, 6.58MB/s]
Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.10.0/models/mwt/combined.pt: 100%|██████████| 509k/509k [00:00<00:00, 7.07MB/s]
2025-06-29 11:00:58 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| mwt       | combined |

2025-06-29 11:00:58 INFO: Using device: cpu
2025-06-29 11:00:58 INFO: 

['former', 'versace', 'store', 'clerk', 'sues']
['versace', 'store', 'clerk', 'sues', 'over']
['store', 'clerk', 'sues', 'over', 'secret']
['clerk', 'sues', 'over', 'secret', "'"]
['sues', 'over', 'secret', "'", 'black']
['over', 'secret', "'", 'black', 'code']
['secret', "'", 'black', 'code', "'"]
["'", 'black', 'code', "'", 'for']
['black', 'code', "'", 'for', 'minority']
['code', "'", 'for', 'minority', 'shoppers']
["'", 'for', 'minority', 'shoppers', 'the']
['for', 'minority', 'shoppers', 'the', "'"]
['minority', 'shoppers', 'the', "'", 'roseanne']
['shoppers', 'the', "'", 'roseanne', "'"]
['the', "'", 'roseanne', "'", 'revival']
["'", 'roseanne', "'", 'revival', 'catches']
['roseanne', "'", 'revival', 'catches', 'up']
["'", 'revival', 'catches', 'up', 'to']
['revival', 'catches', 'up', 'to', 'our']
['catches', 'up', 'to', 'our', 'thorny']
['up', 'to', 'our', 'thorny', 'political']
['to', 'our', 'thorny', 'political', 'mood']
['our', 'thorny', 'political', 'mood', ',']
['thorny', '

### 1. Set-Up of Word2Vec Embedding

In [5]:
class Word2Vec(nn.Module): # NN Architecture for Word2Vec
    def __init__(self, voc_size, vector_size, window_size): # parameters
        super().__init__() # Inherit from nn.module

        # 3 linear layers with ReLU activations and a final Softmax
        self.model = nn.Sequential(
            nn.Linear(voc_size, voc_size, bias=False),          # 1st linear layer: One-Hot Input
            nn.ReLU(),                                          # Non-linear Activation
            nn.Linear(voc_size, vector_size, bias=False),        # 2nd linear layer: Reduce to embedding (vector) size
            nn.ReLU(),                                          # Non-linear activation
            nn.Linear(vector_size, (window_size-1)*voc_size, bias=False), # 3rd linear layer: Output size for context words
            nn.Softmax(dim=0)                                   # Softmax activation to get probabilities over output
        )
    
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Forward pass: input tensor x goes through the model
        return self.model(x)

### 1.1 One-Hot Encoded Vector

In [6]:
dict = {} # Empty dictionary for one-hot encoded vectors
print("Word with index 2: " + vocabulary[1]) # 2nd word in dictionary is black
# This means that the 2nd value in the dictionary needs to be 1

# Initialize the dictionary with one-hot encoded vectors (main-code)
for i in range(0,l):
    dict[vocabulary[i]] = torch.zeros(l).to(device) # set all values to 0
    dict[vocabulary[i]][i] = 1 # update the index of word to 1

for word, vector in list(dict.items())[:2]:
    print(f"{word}: {vector}")

Word with index 2: versace
former: tensor([1., 0., 0.,  ..., 0., 0., 0.])
versace: tensor([0., 1., 0.,  ..., 0., 0., 0.])


### 1.2 Train the Model

In [7]:
# --- Please check vector size ---
w2v = Word2Vec(l,5,N).to(device) 

# Train model weights
lossFn = nn.MSELoss() 
optimizer = optim.Adamax(w2v.parameters(), lr=0.4) 
w2v.train() 
w2v.zero_grad() 
cnt = 0 
BATCH_SIZE = 5 

# Initialize tensors for input and output (all set to zero)
y = torch.zeros((N-1)*l).to(device) # Output tensor (y)
yy = torch.zeros((N-1)*l).to(device) # Target tensor (yy)

### 1.3 Create Word-Embeddings

In [None]:
for ngram in ngrams:
    x = dict[ngram[int((N-1)/2)]]
    y_old = y
    y = torch.zeros((N-1)*l).to(device)
    torch.cat([dict[ngram[0]], dict[ngram[1]], dict[ngram[3]], dict[ngram[4]]], dim=-1, out=y)
    y = y + y_old

    yy = w2v(x) + yy

    if (cnt % BATCH_SIZE) == 0:
        loss = lossFn(y, yy)
        y = torch.zeros((N-1)*l).to(device)
        yy = torch.zeros((N-1)*l).to(device)
        print(loss)

        loss.backward()
        optimizer.step() 
        optimizer.zero_grad()  

    cnt += 1

# For VAE compatibility
embeddings = w2v.model[1].weight.data.clone()
word2idx = {word: idx for idx, word in enumerate(vocabulary)}
print(embeddings)
print(word2idx)

tensor(0.0001, grad_fn=<MseLossBackward0>)
tensor(0.0006, grad_fn=<MseLossBackward0>)
tensor(0.0007, grad_fn=<MseLossBackward0>)
tensor(0.0007, grad_fn=<MseLossBackward0>)
tensor(0.0006, grad_fn=<MseLossBackward0>)
tensor(0.0006, grad_fn=<MseLossBackward0>)
tensor(0.0006, grad_fn=<MseLossBackward0>)
tensor(0.0006, grad_fn=<MseLossBackward0>)
tensor(0.0006, grad_fn=<MseLossBackward0>)
tensor(0.0006, grad_fn=<MseLossBackward0>)
tensor(0.0006, grad_fn=<MseLossBackward0>)
tensor(0.0006, grad_fn=<MseLossBackward0>)
tensor(0.0006, grad_fn=<MseLossBackward0>)
tensor(0.0006, grad_fn=<MseLossBackward0>)
tensor(0.0006, grad_fn=<MseLossBackward0>)
tensor(0.0007, grad_fn=<MseLossBackward0>)
tensor(0.0006, grad_fn=<MseLossBackward0>)
tensor(0.0006, grad_fn=<MseLossBackward0>)
tensor(0.0006, grad_fn=<MseLossBackward0>)
tensor(0.0006, grad_fn=<MseLossBackward0>)
tensor(0.0006, grad_fn=<MseLossBackward0>)
tensor(0.0006, grad_fn=<MseLossBackward0>)
tensor(0.0006, grad_fn=<MseLossBackward0>)
tensor(0.00

## 2. Core VAE Architecture for Sentences

## 3. Training Procedure

## 4. Loss Function

## 5. Evaluation: Language Modelling

## 6. Improved Inference

## 7. Adversarial Evaluation

## 8. Latent Space Analysis

## 9. Text Classification Task

## 10. Hyperparamater Tuning