In [1]:
from datasets import load_dataset
from transformers import BertTokenizer, BertForSequenceClassification

import torch
from torch.utils.data import default_collate 

from torchdata.nodes.adapters import IterableWrapper
from torchdata.nodes.batch import Batcher
from torchdata.nodes.map import Mapper
from torchdata.nodes.loader import Loader

from functools import partial

### Load IMDB dataset

In [2]:
# Load IMDB, then keep a seeded random subset of each split:
# 4096 reviews for training and 1024 reviews for testing.
dataset = load_dataset("imdb")
train_dataset = dataset["train"].shuffle(seed=42).select(range(4096))
test_dataset = dataset["test"].shuffle(seed=42).select(range(1024))

In [3]:
# Report how many examples each split holds.
# len() on the dataset is its row count, so no column has to be read just to be
# counted, and the f-string no longer nests double quotes inside double quotes
# (which only parses on Python 3.12+).
print(f"Size training = {len(train_dataset)}, size test = {len(test_dataset)}")

Size training = 4096, size test = 1024


##### Let's look at one example

In [4]:
# Show the first training example as a (text, label) pair.
first_example = train_dataset[0]
first_example["text"], first_example["label"]

('There is no relation at all between Fortier and Profiler but the fact that both are police series about violent crimes. Profiler looks crispy, Fortier looks classic. Profiler plots are quite simple. Fortier\'s plot are far more complicated... Fortier looks more like Prime Suspect, if we have to spot similarities... The main character is weak and weirdo, but have "clairvoyance". People like to compare, to judge, to evaluate. How about just enjoying? Funny thing too, people writing Fortier looks American but, on the other hand, arguing they prefer American series (!!!). Maybe it\'s the language, or the spirit, but I think this series is more English than American. By the way, the actors are really good and funny. The acting is not superficial at all...',
 1)

##### Set hyperparameters

In [5]:
# Set hyperparameters
max_len = 512     # every review is padded/truncated to this many tokens
batch_size = 32   # training batch size
num_epochs = 1
# Seed PyTorch before the model is built: the classification head is newly
# initialised (not loaded from the checkpoint), so without a seed every
# Restart & Run All would start training from different weights.
torch.manual_seed(42)
# Load pre-trained BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


##### Next, we use `torchdata.nodes` to define the batcher and the transforms

In [6]:
# Wrap each split in an IterableWrapper node; these are the sources the
# Mapper nodes below read from.
train_node, test_node = IterableWrapper(train_dataset), IterableWrapper(test_dataset)

def map_fn(item, max_len, tokenizer):
    """Tokenize one dataset row into fixed-length tensors.

    Args:
        item: Mapping with a "text" entry (the review) and a "label" entry.
        max_len: Length that the token sequence is padded or truncated to.
        tokenizer: Callable tokenizer; it is invoked with the review text and
            must return a mapping containing "input_ids" and "attention_mask".

    Returns:
        dict with flattened "input_ids" and "attention_mask" tensors and a
        scalar int64 "labels" tensor.
    """
    # Call the tokenizer directly: `encode_plus` is the deprecated spelling of
    # the same operation and takes the same arguments.
    encoding = tokenizer(
        item["text"],
        add_special_tokens=True,
        max_length=max_len,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        return_tensors="pt",
    )
    return {
        # Flatten away the leading axis so that collating a batch later
        # stacks the rows into one 2-D tensor.
        "input_ids": encoding["input_ids"].flatten(),
        "attention_mask": encoding["attention_mask"].flatten(),
        "labels": torch.tensor(item["label"], dtype=torch.long),
    }

# A single transform shared by both splits: map_fn with the tokenizer
# settings bound in.
tokenize_item = partial(map_fn, max_len=max_len, tokenizer=tokenizer)
train_mapper = Mapper(train_node, tokenize_item)
test_mapper = Mapper(test_node, tokenize_item)

# Each Batcher is wrapped in a Loader so that it does not have to be reset
# by hand after every epoch.
train_batcher = Loader(Batcher(train_mapper, batch_size, drop_last=True))
test_batcher = Loader(Batcher(test_mapper, 128, drop_last=True))


##### Checking what a batch looks like

In [7]:
# Pull just the first training batch, collate it into batched tensors and
# print its three fields.
batch = default_collate(next(iter(train_batcher)))
input_ids = batch["input_ids"]
attention_mask = batch["attention_mask"]
labels = batch["labels"]
print(input_ids, attention_mask, labels)

tensor([[  101,  2045,  2003,  ...,     0,     0,     0],
        [  101,  2023,  3185,  ...,     0,     0,     0],
        [  101,  2577,  1052,  ...,     0,     0,     0],
        ...,
        [  101,  2009,  1005,  ...,     0,     0,     0],
        [  101,  1045,  2074,  ...,     0,     0,     0],
        [  101, 18036,  5886,  ...,  7344,  2474,   102]]) tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1]]) tensor([1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1,
        0, 1, 0, 1, 1, 1, 0, 0])


##### This model (BERT-base) has approximately 110M parameters

In [8]:
# Total number of parameter elements in the model, reported in billions.
param_counts = [param.numel() for param in model.parameters()]
num_params = sum(param_counts) / 1_000_000_000
print(f"Number of parameters: {num_params} B")

Number of parameters: 0.109483778 B


### Train the model

In [9]:
# Set device (GPU or CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Define optimizer and loss function
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
loss_fn = torch.nn.CrossEntropyLoss()

# Train the model, evaluating on the test batches after every epoch
for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    total_loss = 0.0
    num_train_batches = 0
    num_samples_trained = 0

    for batch in train_batcher:
        # Stack the per-example dicts of this batch into batched tensors.
        batch = default_collate(batch)
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()
        # The labels are not handed to the model: the loss is computed exactly
        # once, by loss_fn, instead of once inside the model and once here.
        outputs = model(input_ids, attention_mask=attention_mask)
        loss = loss_fn(outputs.logits, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_train_batches += 1
        num_samples_trained += labels.size(0)
        print(f"Trained on {num_samples_trained} samples.")

    # Average over the number of batches processed (not the last zero-based
    # index, which under-counts by one and is 0 when there is a single batch).
    print(f"Epoch {epoch+1}, Loss: {total_loss / max(num_train_batches, 1)}")

    # ---- evaluation pass ----
    model.eval()
    test_loss = 0.0
    correct = 0
    num_samples_tested = 0
    num_loops = 0
    with torch.no_grad():
        for batch in test_batcher:
            batch = default_collate(batch)
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            loss = loss_fn(outputs.logits, labels)
            test_loss += loss.item()

            predicted = outputs.logits.argmax(dim=1)
            correct += (predicted == labels).sum().item()
            # Count the examples actually present in this batch rather than
            # assuming a fixed batch size.
            num_samples_tested += labels.size(0)
            num_loops += 1
            print(f"Tested {num_samples_tested} samples")

    # max(..., 1) keeps the report from raising if the test loader is empty.
    accuracy = correct / max(num_samples_tested, 1)
    print(f"Test Loss: {test_loss / max(num_loops, 1)}, Accuracy: {accuracy:.4f}")

Trained on 32 samples.
Trained on 64 samples.
Trained on 96 samples.
Trained on 128 samples.
Trained on 160 samples.
Trained on 192 samples.
Trained on 224 samples.
Trained on 256 samples.
Trained on 288 samples.
Trained on 320 samples.
Trained on 352 samples.
Trained on 384 samples.
Trained on 416 samples.
Trained on 448 samples.
Trained on 480 samples.
Trained on 512 samples.
Trained on 544 samples.
Trained on 576 samples.
Trained on 608 samples.
Trained on 640 samples.
Trained on 672 samples.
Trained on 704 samples.
Trained on 736 samples.
Trained on 768 samples.
Trained on 800 samples.
Trained on 832 samples.
Trained on 864 samples.
Trained on 896 samples.
Trained on 928 samples.
Trained on 960 samples.
Trained on 992 samples.
Trained on 1024 samples.
Trained on 1056 samples.
Trained on 1088 samples.
Trained on 1120 samples.
Trained on 1152 samples.
Trained on 1184 samples.
Trained on 1216 samples.
Trained on 1248 samples.
Trained on 1280 samples.
Trained on 1312 samples.
Trained o

#### We got an accuracy of around 90%

##### Let's also test on our custom examples

In [11]:
def get_prediction(review, model, max_len, tokenizer):
    """Classify a single review and print "Negative" or "Positive".

    Args:
        review: Text of the review to classify.
        model: ``torch.nn.Module`` whose output exposes ``.logits``; class
            index 0 is reported as negative, anything else as positive.
        max_len: Length that the token sequence is padded or truncated to.
        tokenizer: Callable tokenizer; it is invoked with the review text and
            must return a mapping containing "input_ids" and "attention_mask".

    Returns:
        None. The predicted sentiment is printed.
    """
    # Call the tokenizer directly: `encode_plus` is the deprecated spelling of
    # the same operation and takes the same arguments.
    encoding = tokenizer(
        review,
        add_special_tokens=True,
        max_length=max_len,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        return_tensors="pt",
    )

    # Run on whatever device the model currently lives on. Without this the
    # CPU inputs fail against a model that was moved to the GPU for training.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    # Shape the inputs as a batch of one: (1, sequence_length).
    input_ids = encoding["input_ids"].reshape(1, -1).to(device)
    attention_mask = encoding["attention_mask"].reshape(1, -1).to(device)

    model.eval()
    with torch.no_grad():
        logits = model(input_ids, attention_mask=attention_mask).logits

    predicted_class = int(torch.argmax(logits))
    print("Negative" if predicted_class == 0 else "Positive")
    

In [12]:
# Short, plainly positive phrase.
get_prediction(review="best movie", model=model, max_len=max_len, tokenizer=tokenizer)

Positive


In [13]:
# Short, plainly negative phrase.
get_prediction(review="Worst movie ever.", model=model, max_len=max_len, tokenizer=tokenizer)


Negative


In [14]:
# Negative opinion expressed through a comparison.
get_prediction(
    review="No other movie is worse than this movie.",
    model=model,
    max_len=max_len,
    tokenizer=tokenizer,
)


Negative


In [15]:
# Negated positive wording ("not very good").
get_prediction(review="This is not very good", model=model, max_len=max_len, tokenizer=tokenizer)

Negative


In [16]:
# Sentiment implied by intent rather than by an explicit adjective.
get_prediction(review="Will watch again", model=model, max_len=max_len, tokenizer=tokenizer)


Positive


In [17]:
# Full-sentence variant of the "watch again" example.
get_prediction(review="I want to watch it again", model=model, max_len=max_len, tokenizer=tokenizer)

Positive


In [18]:
# Negated counterpart of the previous example.
get_prediction(
    review="I do not want to watch it again",
    model=model,
    max_len=max_len,
    tokenizer=tokenizer,
)

Negative


In [19]:
# Single-word review.
get_prediction(review="Unwatchable", model=model, max_len=max_len, tokenizer=tokenizer)

Negative
