In [1]:
from datasets import load_dataset
from transformers import BertTokenizer, BertForSequenceClassification


In [2]:
import torch
from torch.utils.data import default_collate, RandomSampler, SequentialSampler

In [3]:
# Fetch the IMDB corpus through huggingface datasets (non-streaming) and
# keep a handle on its "train" split.
dataset = load_dataset("imdb", streaming=False)
train_dataset = dataset["train"]

# Stock BERT tokenizer for the uncased base checkpoint.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# train_dataset is a Map-style dataset, so a RandomSampler over it can be
# used to shuffle the order in which examples are visited.
sampler = RandomSampler(train_dataset)

In [4]:
# Now we can set up some torchdata.nodes to create our pre-processing pipeline
from torchdata.nodes import MapStyleWrapper, ParallelMapper, Batcher, PinMemory, Loader

# Every torchdata.nodes.BaseNode implementation is an Iterator.
# MapStyleWrapper pairs the sampler with train_dataset and exposes the
# combination as one such Iterator.
#
# Internally, MapStyleWrapper amounts to:
# > node = IterableWrapper(sampler)
# > node = Mapper(node, map_fn=train_dataset.__getitem__)  # swap in ParallelMapper to parallelize

node = MapStyleWrapper(
    sampler=sampler,
    map_dataset=train_dataset,
)

# Now we want to transform the raw inputs. We can just use another Mapper with
# a custom map_fn to perform this. Using ParallelMapper allows us to use multiple
# threads (or processes) to parallelize this work and have it run in the background
max_len = 512
batch_size = 32


def bert_transform(item):
    """Tokenize one raw example into fixed-length BERT model inputs.

    Relies on the module-level ``tokenizer`` and ``max_len``.

    Args:
        item: Mapping with a ``"text"`` entry (the string to tokenize) and a
            ``"label"`` entry (the class id).

    Returns:
        Dict with three tensors:
        ``"input_ids"`` and ``"attention_mask"`` -- the tokenizer outputs,
        padded/truncated to ``max_len`` and flattened to 1-D;
        ``"labels"`` -- ``item["label"]`` as a ``torch.long`` tensor.
    """
    # Call the tokenizer object directly: ``encode_plus`` is deprecated in
    # transformers in favour of ``__call__``, which takes the same options.
    encoding = tokenizer(
        item["text"],
        add_special_tokens=True,
        max_length=max_len,
        padding="max_length",  # always pad up to max_len ...
        truncation=True,  # ... and cut anything longer, so every item has equal length
        return_attention_mask=True,
        return_tensors="pt",
    )
    # Flatten each tensor to 1-D so that collating a list of items later
    # stacks them into one 2-D tensor per key.
    return {
        "input_ids": encoding["input_ids"].flatten(),
        "attention_mask": encoding["attention_mask"].flatten(),
        "labels": torch.tensor(item["label"], dtype=torch.long),
    }
# Apply bert_transform to every item using 2 workers.
node = ParallelMapper(  # items are now Dict[str, tensor]
    node,
    map_fn=bert_transform,
    num_workers=2,
)

# Group items into batches, then run torch.utils.data.default_collate over
# each batch (through a second ParallelMapper) to stack the per-item tensors.
node = Batcher(node, batch_size=batch_size)  # items are now List[Dict[str, tensor]]
node = ParallelMapper(  # items are now Dict[str, tensor]
    node,
    num_workers=2,
    map_fn=default_collate,
)

# Pinning memory is optional; it is only applied when CUDA is available.
node = PinMemory(node) if torch.cuda.is_available() else node

# Nodes are iterators, so they must be .reset() by hand between epochs.
# Wrapping the root node in Loader turns it into a more conventional Iterable.
loader = Loader(node)

In [5]:
# Inspect a batch: take the first item from a new iterator over the loader
# and print it. The iterator is a temporary and is not bound to any name.
print(next(iter(loader)))

{'input_ids': tensor([[ 101, 4283, 2000,  ...,    0,    0,    0],
        [ 101, 2821, 2009,  ..., 7987, 1013,  102],
        [ 101, 2023, 2034,  ...,    0,    0,    0],
        ...,
        [ 101, 2228, 1997,  ..., 1045, 2228,  102],
        [ 101, 1996, 2434,  ...,    0,    0,    0],
        [ 101, 1045, 4149,  ...,    0,    0,    0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1,
        0, 0, 0, 0, 1, 0, 0, 0])}


In [6]:
from utils import train_bert, get_prediction_bert

In [7]:
# Build the evaluation loader with the same stages as the training one; the
# only difference is that the "test" split is visited with a SequentialSampler
# instead of a RandomSampler.
test_dataset = dataset["test"]
node = MapStyleWrapper(
    sampler=SequentialSampler(test_dataset),
    map_dataset=test_dataset,
)
node = ParallelMapper(  # items are Dict[str, tensor]
    node,
    map_fn=bert_transform,
    num_workers=2,
)
# NOTE(review): if Batcher drops a trailing partial batch by default, the last
# few test examples are never evaluated -- confirm against the torchdata docs
# and against how train_bert computes accuracy before changing this.
node = Batcher(node, batch_size=batch_size)  # items are List[Dict[str, tensor]]
node = ParallelMapper(  # items are Dict[str, tensor]
    node,
    num_workers=2,
    map_fn=default_collate,
)
node = PinMemory(node) if torch.cuda.is_available() else node
test_loader = Loader(node)

In [8]:
# Load the pretrained uncased BERT base checkpoint configured for 2 labels,
# then pass it with both loaders to train_bert for a single epoch. The value
# train_bert returns replaces `model`.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
)
model = train_bert(
    model,
    loader,
    test_loader,
    num_epochs=1,
    batch_size=batch_size,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Loss: 0.4385003423667513
Test Loss : 0.3982642225455493, Accuracy:  0.8271


In [9]:
# Let's check some predictions: run the trained model on a short,
# positive-sounding sentence, reusing the same max_len and tokenizer as above.
get_prediction_bert("Best movie.", model, max_len, tokenizer)

Positive


In [10]:
# Same check on a negative-sounding sentence.
get_prediction_bert("Worst movie ever.", model, max_len, tokenizer)


Negative
