In [1]:
# Here we take a pretrained BERT model and add a final classification layer that sorts the input into 2 categories

In [2]:
# Run once if the packages are missing:
# !pip install -U datasets huggingface_hub fsspec
from datasets import load_dataset, DatasetDict
# Load the 'imdb' dataset; it is inspected in a later cell, where it shows up as
# a DatasetDict with 'train', 'test' and 'unsupervised' splits.
dataset = load_dataset('imdb')

In [3]:
import numpy as np
import matplotlib.pyplot as plt


import torch
import torch.nn as nn
from torch.utils.data import DataLoader

from transformers import BertModel, BertTokenizer

In [4]:
# Pick the best available backend: a CUDA GPU first, then Apple-silicon MPS,
# and the CPU as the last resort.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

# Pretrained tokenizer and encoder for 'bert-base-uncased'; the encoder is
# moved to the selected device.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert = BertModel.from_pretrained('bert-base-uncased').to(device)

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

[1mBertModel LOAD REPORT[0m from: bert-base-uncased
Key                                        | Status     |  | 
-------------------------------------------+------------+--+-
cls.predictions.bias                       | UNEXPECTED |  | 
cls.predictions.transform.dense.weight     | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED |  | 
cls.seq_relationship.bias                  | UNEXPECTED |  | 
cls.predictions.transform.dense.bias       | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED |  | 
cls.seq_relationship.weight                | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


In [5]:
# Number of entries in the tokenizer's vocabulary
tokenizer.vocab_size

30522

In [6]:
bert
# Note: the layer names differ from those used by OpenAI's GPT models.
# The final layer is a pooler of size 768x768 rather than a projection onto the
# vocabulary (vocab_size), because this model is meant for classification, not
# for predicting tokens.

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [7]:
# Dropout probability of BERT's embedding block; the classifier defined later reuses it
bert.embeddings.dropout.p

0.1

More on model inputs and outputs

In [8]:
# Example sentence used to probe the tokenizer and the model.
text = 'Replace me with any text you like.'

# Encode it as PyTorch tensors, then move the encoding to the selected device.
tokens = (
    tokenizer(text, return_tensors='pt')
    .to(device)
)
tokens

{'input_ids': tensor([[ 101, 5672, 2033, 2007, 2151, 3793, 2017, 2066, 1012,  102]],
       device='mps:0'), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], device='mps:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='mps:0')}

In [9]:
# Option 1: hand the model specific inputs, one keyword at a time.
output = bert(
    attention_mask=tokens['attention_mask'],
    input_ids=tokens['input_ids'],
)

# Option 2 (better): unpack the whole tokenizer result as keyword arguments.
output = bert(**tokens)


In [10]:
# List every attribute of the model output object; the ones inspected next are
# 'last_hidden_state' and 'pooler_output'.
dir(output)

['__annotations__',
 '__class__',
 '__class_getitem__',
 '__contains__',
 '__dataclass_fields__',
 '__dataclass_params__',
 '__delattr__',
 '__delitem__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__ior__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__match_args__',
 '__module__',
 '__ne__',
 '__new__',
 '__or__',
 '__post_init__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__reversed__',
 '__ror__',
 '__setattr__',
 '__setitem__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'attentions',
 'clear',
 'copy',
 'cross_attentions',
 'fromkeys',
 'get',
 'hidden_states',
 'items',
 'keys',
 'last_hidden_state',
 'move_to_end',
 'past_key_values',
 'pooler_output',
 'pop',
 'popitem',
 'setdefault',
 'to_tuple',
 'update',
 'values']

In [11]:
# One 768-d vector per input token: (batch, tokens, hidden) = (1, 10, 768) for the example sentence
output['last_hidden_state'].shape

torch.Size([1, 10, 768])

In [12]:
output['pooler_output'].shape  # output of the final (pooler) linear layer of the BERT model: a single 768-d vector for the whole sequence

torch.Size([1, 768])

In [13]:
# Can we get text output? No, because this model is not built for text generation:
# BertModel has no `generate` method. The AttributeError is the point of this cell,
# so it is caught and printed instead of aborting a "Restart & Run All".
try:
    bert.generate(tokens, max_length=100,do_sample=True).cpu()
except AttributeError as err:
    print(f'{type(err).__name__}: {err}')

AttributeError: 'BertModel' object has no attribute 'generate'

Create a model that combines the pretrained BERT with a new head that performs the classification

In [14]:
class BertForBinaryClassification(nn.Module):
    """Pretrained BERT encoder followed by a linear classification head.

    The pooled output of 'bert-base-uncased' goes through dropout and a single
    linear layer that produces one logit per label.

    Args:
        num_labels: number of output classes (2 for binary classification).
    """

    def __init__(self, num_labels=2):
        super().__init__()

        # Load the pretrained BERT model
        self.bert = BertModel.from_pretrained('bert-base-uncased')

        # Classification head: converts the pooled output (768-d for
        # bert-base-uncased) into `num_labels` final outputs.
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)
        # Reuse the dropout rate of BERT's embedding block (10%)
        self.dropout = nn.Dropout(self.bert.embeddings.dropout.p)

        # Initialise the new head: Xavier-uniform weights and zero biases
        nn.init.xavier_uniform_(self.classifier.weight)
        nn.init.zeros_(self.classifier.bias)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        """Return the classification logits for a batch of token ids.

        Args:
            input_ids: token ids produced by the tokenizer.
            attention_mask: optional mask produced by the tokenizer.
            token_type_ids: optional segment ids produced by the tokenizer.

        Returns:
            Tensor with one row per sequence and one column per label.
        """
        # Forward pass through the pretrained BERT. The keyword must be spelled
        # `token_type_ids`; a misspelled keyword is either rejected or silently
        # ignored, so the segment ids would never reach the encoder.
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids)

        # Extract the pooled output and apply dropout
        pooled_output = self.dropout(outputs.pooler_output)

        # Final push through the classification layer
        logits = self.classifier(pooled_output)
        return logits

In [15]:
# Instantiate the classifier, move it to the selected device and smoke-test it
model = BertForBinaryClassification()
model = model.to(device)

# Re-encode the example sentence and feed each tokenizer field by keyword
tokens = tokenizer(text, return_tensors='pt').to(device)
out = model(
    input_ids=tokens['input_ids'],
    attention_mask=tokens['attention_mask'],
    token_type_ids=tokens['token_type_ids'],
)
out

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

[1mBertModel LOAD REPORT[0m from: bert-base-uncased
Key                                        | Status     |  | 
-------------------------------------------+------------+--+-
cls.predictions.bias                       | UNEXPECTED |  | 
cls.predictions.transform.dense.weight     | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED |  | 
cls.seq_relationship.bias                  | UNEXPECTED |  | 
cls.predictions.transform.dense.bias       | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED |  | 
cls.seq_relationship.weight                | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


tensor([[ 1.0514, -1.0165]], device='mps:0', grad_fn=<LinearBackward0>)

In [16]:
# The output of the model is not a next-token prediction;
# it is a binary classification.
# Next we train the model so that this output corresponds to positive and negative reviews.

Import the dataset

In [17]:
dataset
# This is a special dict (a DatasetDict): it maps each split name to a Dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [18]:
dataset['train'][24000]
# Each sample has a 'text' and a 'label'; labels are 0 (negative) and 1 (positive)

{'text': "S.S. Van Dine must have been a shrewd businessman in dealing with Hollywood. Most of the film series' from the studio days were usually confined to one or two studios. But apparently Van Dine must have sold his rights to each book about Philo Vance one at a time. Note that Paramount, MGM, Warner Brothers, and more all released Philo Vance films. Only Tarzan seemed to get around Hollywood more.<br /><br />MGM produced the Garden Murder Case and starred Edmund Lowe as the fashionable detective. Of course MGM had the screen's original Philo under contract at the time, but Bill Powell was busy doing The Thin Man at the time and I guess Louis B. Mayer decided to concentrate him there.<br /><br />Edmund Lowe is a pretty acceptable Philo Vance. Lowe had started out pretty big at the tail end of the silent era with What Price Glory and then with a string of films with Victor McLaglen with their Flagg and Quirt characters. But after McLaglen got his Oscar for The Informer, Lowe seemed

It's a lot of data; let's take a small sample and train the model on that.

In [19]:
# Each split of this special dict has a method called `select`, which picks the
# data points at the given indices (here the first 100 rows)
dataset['train'].select(range(100))

Dataset({
    features: ['text', 'label'],
    num_rows: 100
})

In [20]:
# dataset['train'] is organised so that the first half is all 0s and the second
# half is all 1s, so the reduced dataset has to:
#    1) include both categories (rows 10000-14999 straddle the midpoint)
#    2) keep only 'train' and 'test' (don't use 'unsupervised')
smalldata = DatasetDict(
    {
        split: dataset[split].select(range(10000, 15000))
        for split in ('train', 'test')
    }
)
smalldata

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 5000
    })
})

Tokenizing the text with padding

In [21]:
# The reviews in this dataset have variable lengths.
# We need to deal with this because language models are not designed to handle batches whose sequences have different lengths.


In [22]:
# A plain call works, but the result is only as long as the review itself.
first_try = tokenizer(dataset['train'][0]['text'])
print(f"'Naive' tokenization (N={len(first_try['input_ids'])}):")
print(f"{first_try['input_ids']}")

# Better, because reviews have different lengths: force a fixed length.
better = tokenizer(
    dataset['train'][0]['text'],   # the text to tokenize
    truncation=True,               # cut out tokens beyond max_length
    padding='max_length',          # use the pad token to reach max_length
    max_length=512,
)
print(f"\nBetter tokenization (N={len(better['input_ids'])}):")
print(f"{better['input_ids']}")



'Naive' tokenization (N=363):
[101, 1045, 12524, 1045, 2572, 8025, 1011, 3756, 2013, 2026, 2678, 3573, 2138, 1997, 2035, 1996, 6704, 2008, 5129, 2009, 2043, 2009, 2001, 2034, 2207, 1999, 3476, 1012, 1045, 2036, 2657, 2008, 2012, 2034, 2009, 2001, 8243, 2011, 1057, 1012, 1055, 1012, 8205, 2065, 2009, 2412, 2699, 2000, 4607, 2023, 2406, 1010, 3568, 2108, 1037, 5470, 1997, 3152, 2641, 1000, 6801, 1000, 1045, 2428, 2018, 2000, 2156, 2023, 2005, 2870, 1012, 1026, 7987, 1013, 1028, 1026, 7987, 1013, 1028, 1996, 5436, 2003, 8857, 2105, 1037, 2402, 4467, 3689, 3076, 2315, 14229, 2040, 4122, 2000, 4553, 2673, 2016, 2064, 2055, 2166, 1012, 1999, 3327, 2016, 4122, 2000, 3579, 2014, 3086, 2015, 2000, 2437, 2070, 4066, 1997, 4516, 2006, 2054, 1996, 2779, 25430, 14728, 2245, 2055, 3056, 2576, 3314, 2107, 2004, 1996, 5148, 2162, 1998, 2679, 3314, 1999, 1996, 2142, 2163, 1012, 1999, 2090, 4851, 8801, 1998, 6623, 7939, 4697, 3619, 1997, 8947, 2055, 2037, 10740, 2006, 4331, 1010, 2016, 2038, 3348, 2007,

In [23]:
# Tokenization helper that is applied to the data samples
def tokenize_function(one_sample):
    """Tokenize the 'text' field of `one_sample` to a fixed length of 512.

    The text is padded up to `max_length` and truncated beyond it. Returns
    whatever the module-level `tokenizer` returns for that text.
    """
    settings = {
        'max_length': 512,
        'padding': 'max_length',
        'truncation': True,
    }
    return tokenizer(one_sample['text'], **settings)

# Apply the tokenization function to both splits (batched for efficiency);
# the result gains 'input_ids', 'token_type_ids' and 'attention_mask' columns
tokenized_dataset = smalldata.map(tokenize_function, batched=True)
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 5000
    })
})

In [24]:
# Check: `smalldata` itself still has only 'text' and 'label'; map() returned a new object
smalldata

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 5000
    })
})

In [25]:
# Drop the raw 'text' column now that it has been tokenized.
# NOTE: this reassigns `tokenized_dataset`, so the cell is meant to run once
# after the map() cell.
tokenized_dataset = tokenized_dataset.remove_columns(['text'])

# Have the dataset hand back PyTorch tensors for these columns
tokenized_dataset.set_format(
    'torch',
    columns=['input_ids', 'attention_mask', 'label','token_type_ids'],
)

# Shuffled DataLoaders (batches of 16) for training and testing
train_dataloader = DataLoader(
    dataset=tokenized_dataset['train'],
    batch_size=16,
    shuffle=True,
)
test_dataloader = DataLoader(
    dataset=tokenized_dataset['test'],
    batch_size=16,
    shuffle=True,
)

In [26]:
# Confirm that the 'text' column is gone and only the label and tokenizer fields remain
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 5000
    })
})

In [27]:
# Pull a single batch from the training loader: a dict of tensors keyed by column name
X = next(iter(train_dataloader))
X

{'label': tensor([0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0]),
 'input_ids': tensor([[ 101, 2023, 3185,  ...,    0,    0,    0],
         [ 101, 2023, 3185,  ...,    0,    0,    0],
         [ 101, 1000, 1996,  ...,    0,    0,    0],
         ...,
         [ 101, 2044, 3974,  ...,    0,    0,    0],
         [ 101, 2023, 3185,  ...,    0,    0,    0],
         [ 101, 1045, 2787,  ...,    0,    0,    0]]),
 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]])}

In [28]:
# input_ids: (batch_size, max_length) = (16, 512); label: one entry per sample = (16,)
print(X['input_ids'].shape)
print(X['label'].shape)

torch.Size([16, 512])
torch.Size([16])


Now fine tune the model

In [29]:
# AdamW over all model parameters (the pretrained BERT and the new head), lr = 1e-5
optimizer= torch.optim.AdamW(model.parameters(), lr=1e-5)
loss_fun = nn.CrossEntropyLoss() # cross entropy for multi-class classification (here: 2 classes)


In [30]:
# get a batch of data
batch = next(iter(train_dataloader))

# move the tensors to the selected device
tokenz = batch['input_ids'].to(device)
attn_mask = batch['attention_mask'].to(device)
labels = batch['label'].to(device)

# clear the previous gradients
optimizer.zero_grad()

# forward pass; the predicted label is the index of the largest logit
logits = model(tokenz, attention_mask = attn_mask)
predLabels = torch.argmax(logits, dim=1)

# calculate and store the loss and the average accuracy of this batch.
# The accuracy is divided by the number of labels actually in the batch, not by
# the loader's nominal batch_size: 5000 samples in batches of 16 leave a final
# batch of 8, for which dividing by 16 would under-report the accuracy.
loss = loss_fun(logits, labels)
train_accuracy = (predLabels == labels).sum().item() / labels.size(0)

# backpropagate and update the parameters
loss.backward()
optimizer.step()


In [31]:
# Raw class scores from the linear head: one row per review in the batch, one column per class
print(f'logits are of size {logits.shape} and are:\n', logits)

logits are of size torch.Size([16, 2]) and are:
 tensor([[ 0.4223, -0.5061],
        [ 0.4937, -1.1348],
        [ 0.6020, -1.4387],
        [ 1.1124, -1.0846],
        [ 1.1038, -1.3456],
        [ 1.0491, -1.6837],
        [ 1.2433, -1.4893],
        [ 0.4244, -1.1602],
        [ 0.6467, -0.8793],
        [ 0.7566, -0.9556],
        [ 0.7556, -1.2713],
        [ 0.8743, -1.4579],
        [ 0.9014, -1.2644],
        [ 0.3062, -1.1704],
        [ 0.6058, -0.8031],
        [ 0.2592, -1.0258]], device='mps:0', grad_fn=<LinearBackward0>)


In [33]:
# Compare predictions with the true labels for this batch. The predictions were
# computed before the first optimizer step; in the recorded output the model
# predicts class 0 for every review.
print('Model preds:', predLabels)
print('True labels:', labels)

Model preds: tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], device='mps:0')
True labels: tensor([1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0], device='mps:0')
