In [1]:
from datasets import load_dataset
from transformers import LlamaTokenizer, LlamaForSequenceClassification
from transformers import BertTokenizer, BertForSequenceClassification

import torch
from torch.utils.data import default_collate 



from torchdata.nodes.adapters import IterableWrapper
from torchdata.nodes.batch import Batcher
from torchdata.nodes.map import Mapper

from functools import partial


### Load IMDB dataset

In [2]:

# Load the IMDB dataset through `datasets.load_dataset`.
dataset = load_dataset(path="imdb")

In [3]:
# Draw fixed-size random subsets of each split. The explicit shuffle seed keeps
# the selected reviews identical across "Restart Kernel -> Run All" executions.
train_dataset = dataset["train"].shuffle(seed=42).select(range(2048))
test_dataset = dataset["test"].shuffle(seed=42).select(range(1024))


In [4]:
# len(dataset) is the number of rows, so there is no need to index the "text"
# column just to count it. This also removes the same-quote nesting inside the
# f-string, which is a syntax error on Python versions before 3.12.
print(f"Size training = {len(train_dataset)}, size test = {len(test_dataset)}")

Size training = 2048, size test = 1024


##### Let's look at one example

In [5]:
# First review of the training subset together with its label.
train_dataset[0]["text"], train_dataset[0]["label"]

('MGM were unsure of how to market Garbo when she first arrived in Hollywood. Mayer had a lot of faith in her and her appearance in "Torrent" justified that. She did not speak a word of English so she must have found it difficult to work, also Ricardo Cortez did not make it very easy for her.<br /><br />The torrent of the title is the river Juscar that winds through a sleepy little village in Spain. Leonora (Greta Garbo) hopes someday that her voice will bring great wealth and happiness to her struggling parents. Leonora and Don Rafael (Ricardo Cortez) are in love but he is under his mother\'s thumb and cannot get her to consent to his marriage. Meanwhile Dona Brull (Martha Mattox) has evicted Leonora\'s parents from their home and they send Leonora to Paris hoping to give her a chance to further her singing career. Leonora sends a note to Rafael, urging him to remember his promise and come with her. His mother is enraged and forbids him to go - so of course he caves in to her request.

##### Create train and test nodes

In [6]:
# Wrap each split so it can act as the source node of a torchdata.nodes pipeline.
train_node, test_node = (
    IterableWrapper(split) for split in (train_dataset, test_dataset)
)

In [7]:
# Set hyperparameters
max_len = 512
batch_size = 32
num_epochs = 1
# Seed PyTorch's RNG before building the model: the classification head is not
# part of the checkpoint and is newly initialised, so without a seed the
# starting weights (and therefore the results) differ from run to run.
torch.manual_seed(42)
# Load pre-trained BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


##### Define the map function that we pass to the Mapper

In [8]:
def map_fn(item, max_len, tokenizer):
    """Tokenize one dataset record into fixed-length model inputs.

    Args:
        item: Mapping with a ``"text"`` entry (the review) and a ``"label"`` entry.
        max_len: Length each sequence is padded or truncated to.
        tokenizer: Object exposing ``encode_plus``; it is asked for PyTorch
            tensors (``return_tensors="pt"``).

    Returns:
        dict with ``"input_ids"`` and ``"attention_mask"`` as flattened tensors
        and ``"labels"`` as a ``torch.long`` tensor built from the label.
    """
    text, label = item["text"], item["label"]
    encode_options = dict(
        add_special_tokens=True,
        max_length=max_len,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        return_tensors="pt",
    )
    encoded = tokenizer.encode_plus(text, **encode_options)
    # Flatten each tokenizer tensor to 1-D so every sample has the same flat shape.
    features = {
        name: encoded[name].flatten() for name in ("input_ids", "attention_mask")
    }
    features["labels"] = torch.tensor(label, dtype=torch.long)
    return features



In [9]:
# Attach the tokenizing map function (max_len and tokenizer pre-bound) to each source node.
train_mapper, test_mapper = (
    Mapper(node, partial(map_fn, max_len=max_len, tokenizer=tokenizer))
    for node in (train_node, test_node)
)

In [10]:
# Peek at the first mapped sample to check the tokenized structure, then stop
# iterating immediately.
for item in train_mapper:
    print(item)
    break

{'input_ids': tensor([  101, 15418,  2020, 12422,  1997,  2129,  2000,  3006, 11721, 15185,
         2080,  2043,  2016,  2034,  3369,  1999,  5365,  1012, 14687,  2018,
         1037,  2843,  1997,  4752,  1999,  2014,  1998,  2014,  3311,  1999,
         1000, 22047,  3372,  1000, 15123,  2008,  1012,  2016,  2106,  2025,
         3713,  1037,  2773,  1997,  2394,  2061,  2016,  2442,  2031,  2179,
         2009,  3697,  2000,  2147,  1010,  2036, 13559,  2522, 19731,  2480,
         2106,  2025,  2191,  2009,  2200,  3733,  2005,  2014,  1012,  1026,
         7987,  1013,  1028,  1026,  7987,  1013,  1028,  1996, 22047,  3372,
         1997,  1996,  2516,  2003,  1996,  2314, 18414, 15782,  2099,  2008,
         7266,  2083,  1037, 17056,  2210,  2352,  1999,  3577,  1012,  6506,
         6525,  1006, 26111, 11721, 15185,  2080,  1007,  8069, 13834,  2008,
         2014,  2376,  2097,  3288,  2307,  7177,  1998,  8404,  2000,  2014,
         8084,  3008,  1012,  6506,  6525,  1998, 

##### Next, we create one batcher each for training and testing. The test batch size is fixed at 128.

In [11]:
# Group mapped samples into batches; drop_last=True discards an incomplete final batch.
train_batcher = Batcher(train_mapper, batch_size=batch_size, drop_last=True)
test_batcher = Batcher(test_mapper, batch_size=128, drop_last=True)

##### Checking what a batch looks like

In [12]:
# Inspect the first training batch only: collate the batch of samples with
# default_collate, print its three fields, and stop.
for batch in train_batcher:
    batch = default_collate(batch)
    input_ids = batch["input_ids"]
    attention_mask = batch["attention_mask"]
    labels = batch["labels"]
    print(input_ids, attention_mask, labels)
    break

tensor([[  101, 15418,  2020,  ...,  1005,  1055,   102],
        [  101,  2005,  6298,  ...,     0,     0,     0],
        [  101,  4462,  5822,  ...,     0,     0,     0],
        ...,
        [  101,  2028,  1997,  ...,     0,     0,     0],
        [  101,  2023,  2003,  ...,     0,     0,     0],
        [  101, 15390,  2039,  ...,     0,     0,     0]]) tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]) tensor([1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0,
        1, 1, 1, 1, 0, 1, 1, 1])


##### This LLM has approximately 110M parameters (about 0.11B)

In [13]:
# Total element count over all model parameters, expressed in billions.
num_params = sum(param.numel() for param in model.parameters()) / 1_000_000_000
print(f"Number of parameters: {num_params} B")

Number of parameters: 0.109483778 B


In [14]:
# Set device (GPU or CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Define optimizer and loss function
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
loss_fn = torch.nn.CrossEntropyLoss()
# Train the model
for epoch in range(num_epochs):
    # Restart both pipelines from their first sample: earlier cells (and any
    # earlier epoch) have already pulled items from them.
    train_batcher.reset()
    test_batcher.reset()

    # ---- Training pass ----
    model.train()
    total_loss = 0
    num_train_batches = 0
    num_train_samples = 0
    for batch in train_batcher:
        batch = default_collate(batch)
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        optimizer.zero_grad()
        # Only the logits are needed from the model; the loss is computed once,
        # below, with loss_fn.
        outputs = model(input_ids, attention_mask=attention_mask)
        loss = loss_fn(outputs.logits, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        num_train_batches += 1
        # Count the samples actually seen instead of assuming a full batch.
        num_train_samples += labels.size(0)
        print(f"Trained on {num_train_samples} samples.")
    # Average over the number of batches processed. Dividing by the 0-based
    # index of the last batch would be off by one (and fail for a single batch).
    print(f"Epoch {epoch+1}, Loss: {total_loss / max(num_train_batches, 1)}")

    # ---- Evaluation pass ----
    model.eval()
    test_loss = 0
    correct = 0
    num_samples_tested = 0
    num_loops = 0
    with torch.no_grad():
        for batch in test_batcher:
            batch = default_collate(batch)
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            loss = loss_fn(outputs.logits, labels)
            test_loss += loss.item()
            predicted = outputs.logits.argmax(dim=1)
            correct += (predicted == labels).sum().item()
            # Use the real batch size so accuracy stays correct if the test
            # batch size is changed where the batcher is built.
            num_samples_tested += labels.size(0)
            num_loops += 1
            print(f"Tested {num_samples_tested} samples")
    # max(..., 1) avoids ZeroDivisionError if the test pipeline yields no batch.
    accuracy = correct / max(num_samples_tested, 1)
    print(f"Test Loss: {test_loss / max(num_loops, 1)}, Accuracy: {accuracy:.4f}")

Trained on 32 samples.
Trained on 64 samples.
Trained on 96 samples.
Trained on 128 samples.
Trained on 160 samples.
Trained on 192 samples.
Trained on 224 samples.
Trained on 256 samples.
Trained on 288 samples.
Trained on 320 samples.
Trained on 352 samples.
Trained on 384 samples.
Trained on 416 samples.
Trained on 448 samples.
Trained on 480 samples.
Trained on 512 samples.
Trained on 544 samples.
Trained on 576 samples.
Trained on 608 samples.
Trained on 640 samples.
Trained on 672 samples.
Trained on 704 samples.
Trained on 736 samples.
Trained on 768 samples.
Trained on 800 samples.
Trained on 832 samples.
Trained on 864 samples.
Trained on 896 samples.
Trained on 928 samples.
Trained on 960 samples.
Trained on 992 samples.
Trained on 1024 samples.
Trained on 1056 samples.
Trained on 1088 samples.
Trained on 1120 samples.
Trained on 1152 samples.
Trained on 1184 samples.
Trained on 1216 samples.
Trained on 1248 samples.
Trained on 1280 samples.
Trained on 1312 samples.
Trained o

In [15]:
##### Let's also test on our custom examples

In [27]:
def get_prediction(review, model, max_len, tokenizer):
    """Print the predicted sentiment ("Negative" or "Positive") for one review.

    Args:
        review: Review text to classify.
        model: Sequence classifier. It is called with ``input_ids`` and
            ``attention_mask`` and must return an object with a ``logits``
            attribute. It is switched to eval mode and left in that mode.
        max_len: Length the tokenized review is padded or truncated to.
        tokenizer: Tokenizer exposing ``encode_plus``.

    Returns:
        None. The label is printed: class index 0 prints "Negative", any other
        index prints "Positive".
    """
    encoding = tokenizer.encode_plus(
        review,
        add_special_tokens=True,
        max_length=max_len,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        return_tensors="pt",
    )
    # Run on whatever device the model lives on. After training on a GPU the
    # model is on CUDA, and feeding it CPU tensors raises a device-mismatch error.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")
    # reshape(1, -1) yields a single-row batch, the same shape the previous
    # flatten() + unsqueeze(0) pair produced.
    input_ids = encoding["input_ids"].reshape(1, -1).to(device)
    attention_mask = encoding["attention_mask"].reshape(1, -1).to(device)
    model.eval()
    with torch.no_grad():
        logits = model(input_ids, attention_mask=attention_mask).logits
    predicted_class = torch.argmax(logits).item()
    if predicted_class == 0:
        print("Negative")
    else:
        print("Positive")
    

In [28]:
# Probe: short, plainly positive phrase.
get_prediction("best movie", model=model, max_len=max_len, tokenizer=tokenizer)

Positive


In [29]:
# Probe: negative opinion phrased as a comparison.
get_prediction("No other movie is worse than this movie.", model=model, max_len=max_len, tokenizer=tokenizer)


Negative


In [30]:
# Probe: negated positive wording.
get_prediction("This is not very good", model=model, max_len=max_len, tokenizer=tokenizer)

Negative


In [31]:
# Probe: implicit positive sentiment without any opinion adjective.
get_prediction("Will watch again", model=model, max_len=max_len, tokenizer=tokenizer)


Positive


In [32]:
# Probe: affirmative form of the sentence that is negated in the next example.
get_prediction("I want to watch it again", model=model, max_len=max_len, tokenizer=tokenizer)

Positive


In [33]:
# Probe: negated form of the previous example's sentence.
get_prediction("I do not want to watch it again", model=model, max_len=max_len, tokenizer=tokenizer)

Negative


In [34]:
# Probe: single-word negative review.
get_prediction("Unwatchable", model=model, max_len=max_len, tokenizer=tokenizer)

Negative
