In [4]:
!wget https://zenodo.org/record/1043504/files/corpus-webis-tldr-17.zip?download=1 -O /tmp/corpus-webis-tldr-17.zip
!unzip /tmp/corpus-webis-tldr-17.zip -d /tmp/
!rm -rf /tmp/corpus-webis-tldr-17.zip

# currently only use 1k datapoints
!head -1000 /tmp/corpus-webis-tldr-17.json >> data.json

--2021-10-22 01:11:52--  https://zenodo.org/record/1043504/files/corpus-webis-tldr-17.zip?download=1
Resolving zenodo.org (zenodo.org)... 137.138.76.77
Connecting to zenodo.org (zenodo.org)|137.138.76.77|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3141854161 (2.9G) [application/octet-stream]
Saving to: ‘/tmp/corpus-webis-tldr-17.zip’


2021-10-22 01:14:11 (21.7 MB/s) - ‘/tmp/corpus-webis-tldr-17.zip’ saved [3141854161/3141854161]

Archive:  /tmp/corpus-webis-tldr-17.zip
  inflating: /tmp/corpus-webis-tldr-17.json  


In [17]:
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torchtext
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
import json
import os
import os.path

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [18]:
START_TOKEN = '<start>'
STOP_TOKEN = '<stop>'
UNK_TOKEN = '<unk>'
PAD_TOKEN = '<pad>'

tokenizer = get_tokenizer('basic_english')

def add_start_stop_tokens(s):
    return START_TOKEN + ' ' + s + ' ' + STOP_TOKEN

class RedditTldrDataset(Dataset):
    def __init__(self, json_file):
        self.data = open(json_file, 'r').readlines()
    
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        entry = json.loads(self.data[idx])
        sample = (add_start_stop_tokens(entry['content']), add_start_stop_tokens(entry['summary']))
        
        return sample

def build_vocab_from_dataset(dataset):
    def get_token():
        for sample in dataset:
            x, y = sample
            yield tokenizer(x)
            yield tokenizer(y)
    
    vocab = build_vocab_from_iterator(get_token(), specials=[UNK_TOKEN, PAD_TOKEN])
    vocab.set_default_index(vocab[UNK_TOKEN])
    return vocab

In [20]:
dataset = RedditTldrDataset('data.json')

vocab = build_vocab_from_dataset(dataset)

In [21]:
EMBEDDING_DIM = 128
ATTENTION_DIM = 512

class Summarizer(nn.Module):
    def __init__(self, vocab):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=len(vocab), embedding_dim=EMBEDDING_DIM)
        
        # [...] transformer stuff

    def forward(self, x):
        x = self.embedding(x)

        # [...] transformer stuff

        return x

In [22]:
EPOCHS = 1
BATCH_SIZE = 4

model = Summarizer(vocab).to(device)
dataloader = DataLoader(dataset=dataset, batch_size=BATCH_SIZE, shuffle=True, pin_memory=True)

# optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
# criterion = ...

def text_batch_to_tensor(batch):
    sequences = [torch.Tensor(vocab(tokenizer(add_start_stop_tokens(text)))).long() for text in batch]
    tensor = nn.utils.rnn.pad_sequence(sequences=sequences, batch_first=True, padding_value=vocab[PAD_TOKEN])
    return tensor

for epoch in range(EPOCHS):
    print('epoch: ', epoch)
    for x_batch, y_batch in dataloader:
        x_batch = text_batch_to_tensor(x_batch)
        y_batch = text_batch_to_tensor(y_batch)

        # todo: look into 'long transformer' or 'sparse transformer'
        if x_batch.shape[1] > ATTENTION_DIM:
            continue

        # optimizer.zero_grad()

        out_batch = model(x_batch)
        
        # todo: complete model architecture
        # loss = criterion(out_batch, y_batch)
        # loss.backward()
        # optimizer.step()

        print('out:', out_batch)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m

        [[ 0.2048,  0.1339,  0.5602,  ..., -0.1777,  2.0267, -0.5202],
         [ 0.2048,  0.1339,  0.5602,  ..., -0.1777,  2.0267, -0.5202],
         [-0.3438, -0.0222,  0.2949,  ..., -0.5578, -1.8735,  0.4543],
         ...,
         [-0.1405, -1.1559,  0.2187,  ..., -0.7478,  1.3880,  1.4656],
         [-0.0912,  0.7842,  0.7714,  ..., -0.3212,  0.4454, -0.4901],
         [-0.0912,  0.7842,  0.7714,  ..., -0.3212,  0.4454, -0.4901]],

        [[ 0.2048,  0.1339,  0.5602,  ..., -0.1777,  2.0267, -0.5202],
         [ 0.2048,  0.1339,  0.5602,  ..., -0.1777,  2.0267, -0.5202],
         [-0.9514, -1.1455,  0.8509,  ...,  0.6035, -0.1538, -0.5811],
         ...,
         [-1.0835, -0.1764,  1.4342,  ..., -1.1875,  0.9033,  0.1328],
         [-1.0835, -0.1764,  1.4342,  ..., -1.1875,  0.9033,  0.1328],
         [-1.0835, -0.1764,  1.4342,  ..., -1.1875,  0.9033,  0.1328]]],
       grad_fn=<EmbeddingBackward>)
out: tensor([[