In [1]:
from torchtext.data import Field
from torchtext.data import TabularDataset
import pandas as pd
import re
import torch

def tokenize(text):
    return re.findall('[\w]+', text.lower())

TEXT = Field(sequential=True, tokenize=tokenize, lower=True)
LABEL = Field(sequential=False, use_vocab=False)

datafields = {'title':('title', TEXT), "text":('text', TEXT), "url":("url", TEXT), "source":('source', TEXT), "longform":("longform", LABEL)}

train,test,val = TabularDataset.splits(path="debugdata", train="train_basic.json", test="test_basic.json", validation="validate_basic.json", format="json", fields = datafields)
print(train.examples[0])

<torchtext.data.example.Example object at 0x000001C0837FFDC8>


In [2]:
print(train.examples[0].source)
print(len(train.examples))

['https', 'www', 'vox', 'com', '2014', '7', '21', '11629010', 'affirm', 'hires', 'chief', 'compliance', 'officer', 'add', 'rabois', 'liew', 'to', 'board']
100


In [3]:
train.examples[19].title

['romania', 'breaks', 'up', 'alleged', '25m', 'illegal', 'logging', 'ring']

In [4]:
TEXT.build_vocab(train)
TEXT.build_vocab(test)
TEXT.build_vocab(val)

In [5]:
from torchtext.data import Iterator, BucketIterator

train_iter, val_iter, test_iter = BucketIterator.splits(
 (train, val, test),
 batch_sizes=(32,32,32),
 sort_key=lambda x: len(x.text), # the BucketIterator needs to be told what function it should use to group the data.
 sort_within_batch=False,
 repeat=False,
)

In [6]:
next(iter(train_iter))


[torchtext.data.batch.Batch of size 32]
	[.title]:[torch.LongTensor of size 21x32]
	[.text]:[torch.LongTensor of size 5357x32]
	[.url]:[torch.LongTensor of size 24x32]
	[.source]:[torch.LongTensor of size 24x32]
	[.longform]:[torch.LongTensor of size 32]

In [13]:
class BatchWrapper:
    def __init__(self, dl, x_var, y_vars):
            self.dl, self.x_var, self.y_vars = dl, x_var, y_vars # we pass in the list of attributes for x 

    def __iter__(self):
        for batch in self.dl:
            x = getattr(batch, self.x_var) # we assume only one input in this wrapper
            y = getattr(batch, self.y_vars)
            y = y[:, None]
            y = y.float()
            yield (x, y)

    def __len__(self):
        return len(self.dl)

train_dl = BatchWrapper(train_iter, "text", "longform")
valid_dl = BatchWrapper(val_iter, "text", "longform")
test_dl = BatchWrapper(test_iter, "text", "longform")

next(train_dl.__iter__())

(tensor([[ 367, 3178,    7,  ...,    0,   32, 2568],
         [   3,    5,  291,  ...,    0,  253,   62],
         [   4, 2514, 9066,  ...,    2, 3376,   23],
         ...,
         [   1,    1,    1,  ...,    1,    1,    1],
         [   1,    1,    1,  ...,    1,    1,    1],
         [   1,    1,    1,  ...,    1,    1,    1]]),
 tensor([[0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [1.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.]]))

In [14]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

class SimpleLSTMBaseline(nn.Module):
    def __init__(self, hidden_dim, emb_dim=300, num_linear=1):
        super().__init__() # don't forget to call this!
        self.embedding = nn.Embedding(len(TEXT.vocab), emb_dim)
        self.encoder = nn.LSTM(emb_dim, hidden_dim, num_layers=1)
        self.linear_layers = []
        for _ in range(num_linear - 1):
            self.linear_layers.append(nn.Linear(hidden_dim, hidden_dim))
        self.linear_layers = nn.ModuleList(self.linear_layers)
        self.predictor = nn.Linear(hidden_dim, 1)

    def forward(self, seq):
        hdn, _ = self.encoder(self.embedding(seq))
        feature = hdn[-1, :, :]
        for layer in self.linear_layers:
            feature = layer(feature)
        preds = self.predictor(feature)
        return preds

em_sz = 100
nh = 500
nl = 3
model = SimpleLSTMBaseline(nh, emb_dim=em_sz)
model

SimpleLSTMBaseline(
  (embedding): Embedding(12910, 100)
  (encoder): LSTM(100, 500)
  (linear_layers): ModuleList()
  (predictor): Linear(in_features=500, out_features=1, bias=True)
)

In [16]:
from tqdm.notebook import tqdm

opt = optim.Adam(model.parameters(), lr=1e-2)
loss_func = nn.BCEWithLogitsLoss()

epochs = 2

for epoch in range(1, epochs + 1):
    running_loss = 0.0
    running_corrects = 0
    model.train() # turn on training mode
    for x,y in tqdm(train_dl):
        opt.zero_grad()
        preds = model(x)
        loss = loss_func(y, preds)
        loss.backward()
        opt.step()

        running_loss += loss.data * x.size(0)

    epoch_loss = running_loss / len(train)

    # calculate the validation loss for this epoch
    val_loss = 0.0
    model.eval() # turn on evaluation mode
    for x, y in valid_dl:
        preds = model(x)
        loss = loss_func(y, preds)
        val_loss += loss.data * x.size(0)

    val_loss /= len(val)
    print('Epoch: {}, Training Loss: {:.4f}, Validation Loss: {:.4f}'.format(epoch, epoch_loss, val_loss))

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


Epoch: 1, Training Loss: -75.5914, Validation Loss: -2750.2510


HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


Epoch: 2, Training Loss: -809.9732, Validation Loss: -4575.0122
