### First we import all the packages required

In [1]:

from datasets import load_dataset
from transformers import BertTokenizer, BertForSequenceClassification

import torch
from torch.utils.data import default_collate 

from torchdata.nodes.adapters import IterableWrapper
from torchdata.nodes.batch import Batcher
from torchdata.nodes.map import Mapper
from torchdata.nodes.loader import Loader

from functools import partial
from utils import map_fn_bert, train_bert, get_prediction


### Load IMDB dataset

In [2]:
# Load the IMDB reviews dataset and pull out its two splits.
dataset = load_dataset("imdb")
train_split, test_split = dataset["train"], dataset["test"]

# Keep the tutorial small: shuffle each split with a fixed seed (42) so the
# selection is reproducible, then keep the first 4096 / 1024 rows.
train_dataset = train_split.shuffle(seed=42).select(range(4096))
test_dataset = test_split.shuffle(seed=42).select(range(1024))

In [3]:
# len() on the dataset gives the row count directly, so the whole "text"
# column does not need to be read. It also avoids reusing double quotes inside
# the f-string, which is a SyntaxError on Python versions before 3.12.
print(f"Size training = {len(train_dataset)}, size test = {len(test_dataset)}")

Size training = 4096, size test = 1024


##### Let's look at one example

In [4]:
# Index the row first, then the fields: this reads a single example instead of
# pulling the full "text" and "label" columns just to look at element 0.
first_example = train_dataset[0]
first_example["text"], first_example["label"]

('There is no relation at all between Fortier and Profiler but the fact that both are police series about violent crimes. Profiler looks crispy, Fortier looks classic. Profiler plots are quite simple. Fortier\'s plot are far more complicated... Fortier looks more like Prime Suspect, if we have to spot similarities... The main character is weak and weirdo, but have "clairvoyance". People like to compare, to judge, to evaluate. How about just enjoying? Funny thing too, people writing Fortier looks American but, on the other hand, arguing they prefer American series (!!!). Maybe it\'s the language, or the spirit, but I think this series is more English than American. By the way, the actors are really good and funny. The acting is not superficial at all...',
 1)

##### Set hyperparameters

In [5]:
# Hyperparameters used throughout the notebook.
max_len = 512     # passed to map_fn_bert / get_prediction (presumably the max token length - confirm in utils)
batch_size = 32   # number of samples per training batch
num_epochs = 1    # passed to train_bert

# Pre-trained checkpoint shared by the tokenizer and the classifier.
checkpoint = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(checkpoint)
# num_labels=2: one output per sentiment class.
model = BertForSequenceClassification.from_pretrained(checkpoint, num_labels=2)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Next we use torchdata.nodes for defining transforms and batcher 
##### (i.e. the way in which batches will be passed during training)

#### First we create train and test `BaseNode`s by wrapping the datasets in `IterableWrapper`

In [6]:
# Wrap each dataset split in an IterableWrapper so it can feed the
# torchdata.nodes pipeline built in the following cells.
train_node, test_node = (
    IterableWrapper(split) for split in (train_dataset, test_dataset)
)

#### Next we use a Mapper to carry out the necessary transformations on our data before batching it

In [7]:
# Bind the tokenizer and max_len once; both splits go through the same
# per-sample transform (map_fn_bert, imported from the local utils module).
bert_map_fn = partial(map_fn_bert, max_len=max_len, tokenizer=tokenizer)

train_mapper = Mapper(train_node, bert_map_fn)
test_mapper = Mapper(test_node, bert_map_fn)

#### Finally we use Batcher to batch the samples together and Loader to reset the Batcher in every epoch

In [8]:
# Loader wraps the Batcher so that we do not have to reset it after every epoch.
# Training: fixed-size batches of `batch_size`; an incomplete final batch is dropped.
train_batcher = Loader(Batcher(train_mapper, batch_size, drop_last=True))
# Evaluation: larger batches of 128. Keep the final partial batch
# (drop_last=False) so that no test sample is silently excluded from the
# metrics if the test-set size is not a multiple of 128. With the current
# 1024 test samples the batches are identical either way.
test_batcher = Loader(Batcher(test_mapper, 128, drop_last=False))

#### Let's see what a batch looks like

#### We use the default collate method to collate the samples in a batch. Each batch contains batch_size = 32 samples.

In [9]:
# Take just the first batch from the training loader (None if the loader is empty).
first_batch = next(iter(train_batcher), None)

if first_batch is not None:
    # The loader yields the batch uncollated; default_collate combines it into
    # one structure indexed by field name.
    batch = default_collate(first_batch)
    input_ids = batch["input_ids"]
    attention_mask = batch["attention_mask"]
    labels = batch["labels"]
    print(input_ids, attention_mask, labels)

tensor([[  101,  2045,  2003,  ...,     0,     0,     0],
        [  101,  2023,  3185,  ...,     0,     0,     0],
        [  101,  2577,  1052,  ...,     0,     0,     0],
        ...,
        [  101,  2009,  1005,  ...,     0,     0,     0],
        [  101,  1045,  2074,  ...,     0,     0,     0],
        [  101, 18036,  5886,  ...,  7344,  2474,   102]]) tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1]]) tensor([1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1,
        0, 1, 0, 1, 1, 1, 0, 0])


##### This model has approximately 110M parameters, so training on a local machine might take some time

In [10]:
# Add up the element counts of every parameter tensor in the model,
# then express the total in millions for display.
total_params = sum(param.numel() for param in model.parameters())
num_params = total_params / 1000000
print(f"Number of parameters: {num_params:.0f} M")

Number of parameters: 109 M


### Train the model

In [11]:
# Run training with train_bert (imported from the local utils module) and
# rebind `model` to whatever it returns.
# NOTE(review): because `model` is both input and output, re-running this cell
# passes the previously returned model back in - presumably continuing training
# rather than starting over; reload the model first for a clean run.
model = train_bert(
    model,
    train_batcher,
    test_batcher,
    num_epochs,
    batch_size,
)

Epoch 1, Loss: 0.4253180552655318
Test Loss: 0.2570107448846102, Accuracy: 0.8926


#### Finally, our model is trained. We got a test accuracy of around 89%.

#### Let's also test on our custom examples

In [12]:
# Probe: a very short, clearly positive phrase.
get_prediction("best movie", model, max_len, tokenizer)

Positive


In [13]:
# Probe: a short, clearly negative phrase.
get_prediction("Worst movie ever.", model, max_len, tokenizer)


Negative


In [14]:
# Probe: negative sentiment expressed through a comparative ("no other ... is worse").
get_prediction("No other movie is worse than this movie.", model, max_len, tokenizer)


Negative


In [15]:
# Probe: a negated positive phrase ("not very good").
get_prediction("This is not very good", model, max_len, tokenizer)

Negative


In [16]:
# Probe: implicit positive sentiment with no explicit sentiment word.
get_prediction("Will watch again", model, max_len, tokenizer)


Positive


In [17]:
# Probe: affirmative form of a sentence; its negated counterpart is tested in the next cell.
get_prediction("I want to watch it again", model, max_len, tokenizer)

Positive


In [18]:
# Probe: the same sentence with "do not" inserted, to see whether the negation changes the prediction.
get_prediction("I do not want to watch it again", model, max_len, tokenizer)

Negative


In [19]:
# Probe: a single-word negative input.
get_prediction("Unwatchable", model, max_len, tokenizer)

Negative
