## Packages

In [35]:
import numpy as np
import torch 
import math

## Tokenizer

In [36]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_basic_tokenization = True)

## Collation
Collation in the context of machine learning typically refers to the process of gathering and organizing data into a structured format suitable for analysis. This involves:

Data Collection: Gathering data from various sources.
Data Cleaning: Ensuring data consistency, accuracy, and completeness.
Data Integration: Combining data from different sources into a unified dataset.
Data Transformation: Converting data into a suitable format for analysis (e.g., numerical, categorical).

In essence, collation is a critical preprocessing step in machine learning that lays the foundation for building effective models.

Used as function in DataLoader option.

In [37]:
from torch.utils.data import Dataset, DataLoader

def data_collate(batch_dataset):
    arr = np.array(batch_dataset)
    inputs = tokenizer(text = arr.tolist(), padding = 'max_length', max_length = 512, truncation=True, return_tensors = 'pt')
    # padding: How to handle sequences that are shorter than the maximum length.
    # Options:
    # 'max_length': Pad sequences to the maximum length with a padding token.
    # 'longest': Pad sequences to the length of the longest sequence in the batch.
    # 'do_not_pad': Do not pad sequences.

    # return_tensors: The format of the output tensors.
    # Options:
    # 'pt': Return PyTorch tensors.
    # 'tf': Return TensorFlow tensors.
    # 'np': Return NumPy arrays.
    return inputs

class CreateDataset(Dataset):
    def __init__(self, src, tokenizer):
        #src = sentences 
        self.src = src
        self.tokenizer = tokenizer
        
    def __len__(self):
        return len(self.src)

    def __getitem__(self, idx):
        src = self.src[idx]
        return src

## Select the dataset to load in <datasets> package.

You can find a list of available datasets on the Hugging Face Datasets Hub: https://huggingface.co/datasets

You can browse through the list of datasets, filter by task, dataset type, and more. Each dataset has a unique identifier (e.g. *cnn_dailymail*) that you can use with the load_dataset function.

Alternatively, you can use the list_datasets function from the datasets library to get a list of available datasets:

```python
from datasets import list_datasets
available_datasets = list_datasets()
print(available_datasets)

In [1]:
from datasets import list_datasets
available_datasets = list_datasets()
available_datasets

  available_datasets = list_datasets()


['amirveyseh/acronym_identification',
 'ade-benchmark-corpus/ade_corpus_v2',
 'UCLNLP/adversarial_qa',
 'Yale-LILY/aeslc',
 'nwu-ctext/afrikaans_ner_corpus',
 'fancyzhx/ag_news',
 'allenai/ai2_arc',
 'google/air_dialogue',
 'komari6/ajgt_twitter_ar',
 'legacy-datasets/allegro_reviews',
 'tblard/allocine',
 'mutiyama/alt',
 'fancyzhx/amazon_polarity',
 'defunct-datasets/amazon_reviews_multi',
 'defunct-datasets/amazon_us_reviews',
 'sewon/ambig_qa',
 'nala-cub/americas_nli',
 'legacy-datasets/ami',
 'gavinxing/amttl',
 'facebook/anli',
 'sealuzh/app_reviews',
 'deepmind/aqua_rat',
 'google-research-datasets/aquamuse',
 'bigIR/ar_cov19',
 'hadyelsahar/ar_res_reviews',
 'iabufarha/ar_sarcasm',
 'abuelkhair-corpus/arabic_billion_words',
 'QCRI/arabic_pos_dialect',
 'halabi2016/arabic_speech_corpus',
 'hsseinmz/arcd',
 'ramybaly/arsentd_lev',
 'allenai/art',
 'arxiv-community/arxiv_dataset',
 'tuanphong/ascent_kb',
 'achrafothman/aslg_pc12',
 'AmazonScience/asnq',
 'facebook/asset',
 'nilc-

In [38]:
from datasets import load_dataset

data = load_dataset("cnn_dailymail", "2.0.0", split = 'train')

## A sneak peek on what is done


data

In [39]:
data[0]['article']

'LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won\'t cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don\'t plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don\'t think I\'ll be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places below his number one movie on the UK box office chart. Details o

token for unprocessed data

In [40]:
#check tokens
#max_length = 512 for BERT tokenizer

sample_input = np.array(data[0]['article'])
tokenizer_output = tokenizer(text = sample_input.tolist(), return_tensors = 'pt')
tokens = tokenizer_output['input_ids']

print(tokens)
print(tokens.size())
print(tokens[0].size())
# print(len(tokens[0]))

Token indices sequence length is longer than the specified maximum sequence length for this model (587 > 512). Running this sequence through the model will result in indexing errors


tensor([[  101,  2414,  1010,  2563,  1006, 26665,  1007,  1011,  1011,  4302,
         10693,  2732,  3817, 22603, 12154,  3229,  2000,  1037,  2988, 21853,
          2692,  2454,  1006,  1002,  4601,  1012,  1015,  2454,  1007,  7280,
          2004,  2002,  4332,  2324,  2006,  6928,  1010,  2021,  2002, 16818,
          1996,  2769,  2180,  1005,  1056,  3459,  1037,  6297,  2006,  2032,
          1012,  3817, 22603,  2004,  4302, 10693,  1999,  1000,  4302, 10693,
          1998,  1996,  2344,  1997,  1996,  6708,  1000,  2000,  1996, 10520,
          1997, 13761, 13317,  2015,  2105,  1996,  2088,  1010,  1996,  2402,
          3364,  2758,  2002,  2038,  2053,  3488,  2000, 10424, 27100,  2099,
          2010,  5356,  2185,  2006,  3435,  3765,  1010,  4392,  1998,  8958,
          4243,  1012,  1000,  1045,  2123,  1005,  1056,  2933,  2000,  2022,
          2028,  1997,  2216,  2111,  2040,  1010,  2004,  2574,  2004,  2027,
          2735,  2324,  1010,  3402,  4965,  3209,  

filtering function for data filter.

In [41]:
import re

def filter_data(text):
    #remove last line
    text = re.sub(r"Copyright \d{4} Reuters. All rights reserved.*", "", text)
    
    #replace \'
    text = text.replace("\'", "")
    
    #replace 's
    text = re.sub(r"'s\b'", "", text)
    
    #remove extra white space
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

check out the filtered data...

In [42]:
filter_text = filter_data(data[0]['article'])
print(filter_text)



LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money wont cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I dont plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I dont think Ill be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places below his number one movie on the UK box office chart. Details of how hel

check out the effect on tokens after data filter processing.

In [43]:

sample_input = np.array(filter_text)
tokenizer_output = tokenizer(text = sample_input.tolist(), return_tensors = 'pt')
tokens = tokenizer_output['input_ids']

print(tokens)
print(len(tokens[0]))

tensor([[  101,  2414,  1010,  2563,  1006, 26665,  1007,  1011,  1011,  4302,
         10693,  2732,  3817, 22603, 12154,  3229,  2000,  1037,  2988, 21853,
          2692,  2454,  1006,  1002,  4601,  1012,  1015,  2454,  1007,  7280,
          2004,  2002,  4332,  2324,  2006,  6928,  1010,  2021,  2002, 16818,
          1996,  2769,  2180,  2102,  3459,  1037,  6297,  2006,  2032,  1012,
          3817, 22603,  2004,  4302, 10693,  1999,  1000,  4302, 10693,  1998,
          1996,  2344,  1997,  1996,  6708,  1000,  2000,  1996, 10520,  1997,
         13761, 13317,  2015,  2105,  1996,  2088,  1010,  1996,  2402,  3364,
          2758,  2002,  2038,  2053,  3488,  2000, 10424, 27100,  2099,  2010,
          5356,  2185,  2006,  3435,  3765,  1010,  4392,  1998,  8958,  4243,
          1012,  1000,  1045,  2123,  2102,  2933,  2000,  2022,  2028,  1997,
          2216,  2111,  2040,  1010,  2004,  2574,  2004,  2027,  2735,  2324,
          1010,  3402,  4965,  3209,  1037,  5294,  

## process begins for data in training

In [44]:
from tqdm import tqdm 

train_data = []

for i in tqdm(range(len(data))):
    filter_d = filter_data(data[i]['article'])
    train_data.append(filter_d)

100%|██████████| 287113/287113 [00:43<00:00, 6594.30it/s]


In [45]:
len(train_data)

287113

### select 10000 of the trainig data

In [46]:
train_data = train_data[:10000]

## make a dataset for training via CreateDataset and DataLoader
Here are the parameters that can be used for DataLoader in PyTorch:
```python
DataLoader(
    dataset, 
    batch_size=1, 
    shuffle=False, 
    sampler=None, 
    batch_sampler=None, 
    num_workers=0, 
    collate_fn=None, 
    pin_memory=False, 
    drop_last=False, 
    timeout=0, 
    worker_init_fn=None, 
    multiprocessing_context=None, 
    generator=None, 
    prefetch_factor=2, 
    persistent_workers=False
)


In [47]:
train_data = CreateDataset(train_data, tokenizer)
# dataloader = DataLoader(train_data, batch_size = 8, collate_fn = data_collate)
dataloader = DataLoader(train_data, batch_size = 8, collate_fn = data_collate) # try increasing batch_size to 32
# a general two-step approach: create a dataset, then create a dataloader

In [48]:
len(dataloader)

1250

## Meddle deeper in transformer architecture
since now input is done via CreateDataset and DataLoader, now let us dig in encoder.


In [49]:
from torch import nn, Tensor
from torch.nn import TransformerEncoder, TransformerEncoderLayer

The first to come is positional encoding, of course...that is what tokens are faced with.

Note that we should now cover the phases in transformer architecture of raw input, tokenization, embedding, and positional encoding.

In [50]:
class PositionalEncoding(nn.Module):

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        # position[i] = i
        # div_term[j] = exp(j * (-log(10000.0)) / d_model), en mathematics...
        #             = exp(-log(10000.0) * j / d_model)
        #             = 10000.0^(-j/d_model)
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        # note the j is even number
        # so if j = 2k, where k is continuous number between 0 and half max_len, then position * div_term should be:
        # i * (10000.0^(-2/d_model))^k
        # which is easier to programme.
        
        pe = pe.unsqueeze(0).transpose(0, 1)
          # (max_len, 1, d_model)
        self.register_buffer('pe', pe)  

    def forward(self, x):
        x = x + self.pe[:x.size(0), :] 
        # 1. if x is of three dimensions, then it syntactically works under command of [:x.size(0), :]
        # 2. i guess the x is of shape (max_len, batch_size, d_model) here. I will check in the antecedent codes.
        return self.dropout(x)

## the whole, complete architecture of transformer

In [51]:
#model 
class TransformerModel(nn.Module):
    
    def __init__(self, ntokens, ninp, nhead, nhid, nlayers, dropout = 0.5):
        super(TransformerModel, self).__init__()
        self.model_type = "Transformer"
        self.pos_encoder = PositionalEncoding(ninp, dropout)
        encoder_layer = TransformerEncoderLayer(ninp, nhead, nhid, dropout, batch_first = True)
        self.transformer_encoder = TransformerEncoder(encoder_layer, nlayers)
        self.encoder = nn.Embedding(ntokens, ninp)
        self.ninp = ninp
        self.decoder = nn.Linear(ninp, ntokens)
        
        self.init_weights()
        
    def generate_square_subsequent_mask(self, sz):
        '''
        We generate the mask to prevent the transformer from seeing future tokens
        Square matrix is created with elements below the diagonal = 0
        Conver the mask to float, all zeros are replaced with -inf(indicating no access to elements) 
        and 1 with 0.0 (this operation does not changes the magnitude but influences the output)
        '''
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)  #  a lower triangular matrix of bools with ones above the diagonal and ones below and along.
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        # e.g. mask, if sz being 5:
        # tensor([[0., -inf, -inf, -inf, -inf],
        #         [0., 0.,   -inf, -inf, -inf],
        #         [0., 0.,   0.,   -inf, -inf],
        #         [0., 0.,   0.,   0.,   -inf],
        #         [0., 0.,   0.,   0.,   0.]])
        return mask
    
    def init_weights(self):
        initrange = 0.1

        self.encoder.weight.data.uniform_(-initrange, initrange)  # initializes the weights of the self.encoder layer with random values uniformly distributed between -initrange and initrange.
        # parameters for nn.Embeding:
        # num_embeddings, embedding_dim, padding_idx, max_norm, norm_type, scale_grad_by_freq, sparse

        self.decoder.bias.data.zero_() # sets the bias term to zero.
        self.decoder.weight.data.uniform_(-initrange, initrange) # initializes the weights of the self.decoder like self.encoder
        
    def forward(self, src, src_mask):
        src = self.encoder(src) * math.sqrt(self.ninp)
        src = self.pos_encoder(src)
        
        output = self.transformer_encoder(src, src_mask)
        output = self.decoder(output)
        return output 
    

## check mps availability

In [52]:
if not torch.backends.mps.is_available():
    if not torch.backends.mps.is_built():
        print("MPS not available because the current PyTorch install was not "
              "built with MPS enabled.")
    else:
        print("MPS not available because the current MacOS version is not 12.3+ "
              "and/or you do not have an MPS-enabled device on this machine.")

else:
    mps_device = torch.device("mps")

    ntokens = tokenizer.vocab_size # 30522
    emsize = 512 # embedding dimension

    nhid = 100 # the dimension of the feedforward network model in nn.TransformerEncoder

    nlayers = 5 # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder

    nhead = 4 # the number of heads in the multiheadattention models

    dropout = 0.2 # the dropout value

    model = TransformerModel(ntokens, emsize, nhead, nhid, nlayers, dropout).to(mps_device)  # the model architecture is sent to mps. For the 1st time in the session.

## confirm model architecture

In [53]:
model

TransformerModel(
  (pos_encoder): PositionalEncoding(
    (dropout): Dropout(p=0.2, inplace=False)
  )
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-4): 5 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (linear1): Linear(in_features=512, out_features=100, bias=True)
        (dropout): Dropout(p=0.2, inplace=False)
        (linear2): Linear(in_features=100, out_features=512, bias=True)
        (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.2, inplace=False)
        (dropout2): Dropout(p=0.2, inplace=False)
      )
    )
  )
  (encoder): Embedding(30522, 512)
  (decoder): Linear(in_features=512, out_features=30522, bias=True)
)

In [55]:
model.__dict__

{'training': True,
 '_parameters': OrderedDict(),
 '_buffers': OrderedDict(),
 '_non_persistent_buffers_set': set(),
 '_backward_pre_hooks': OrderedDict(),
 '_backward_hooks': OrderedDict(),
 '_is_full_backward_hook': None,
 '_forward_hooks': OrderedDict(),
 '_forward_hooks_with_kwargs': OrderedDict(),
 '_forward_hooks_always_called': OrderedDict(),
 '_forward_pre_hooks': OrderedDict(),
 '_forward_pre_hooks_with_kwargs': OrderedDict(),
 '_state_dict_hooks': OrderedDict(),
 '_state_dict_pre_hooks': OrderedDict(),
 '_load_state_dict_pre_hooks': OrderedDict(),
 '_load_state_dict_post_hooks': OrderedDict(),
 '_modules': OrderedDict([('pos_encoder',
               PositionalEncoding(
                 (dropout): Dropout(p=0.2, inplace=False)
               )),
              ('transformer_encoder',
               TransformerEncoder(
                 (layers): ModuleList(
                   (0-4): 5 x TransformerEncoderLayer(
                     (self_attn): MultiheadAttention(
              

## training configuration

In [56]:
def train(model, dataloader):
    model.train()
    # epochs = 50
    epochs = 2 # pour savourer ce test, justement...
    total_loss = 0
    criterion = nn.CrossEntropyLoss()
    optim = torch.optim.AdamW(model.parameters(), lr = 0.1)
    
    for epoch in tqdm(range(epochs)):
        for batch in tqdm(dataloader):
            optim.zero_grad()
            input = batch['input_ids'].clone()
            
            src_mask = model.generate_square_subsequent_mask(batch['input_ids'].size(1)) # batch['input_ids'].size(1) represents how many tokens.
            # this is for preventing from seeing future data.
        
            rand_value = torch.rand(batch.input_ids.shape)
            rand_mask = (rand_value < 0.15) * (input != 101) * (input != 102) * (input != 0)
            # 101: Special Token (e.g., CLS or BOS)
                # Often used to represent the beginning of a sequence.
                # Common in classification tasks where the token's representation is used for overall sequence classification.
            # 102: Special Token (e.g., SEP or EOS)
                # Typically indicates the end of a sequence.
                # Frequently used in sentence pair classification tasks or question answering.
            # 0: Padding Token
                # Used to fill sequences to a fixed length.
                # Often ignored during calculations.
            
            mask_idx=(rand_mask.flatten() == True).nonzero().view(-1)  # here, it is 'nonzero()' that returns indices  of the elements in the boolean tensor that are True.
            
            input = input.flatten()
            input[mask_idx] = 103   # 103 is likely a special token (e.g., [MASK]) used for masked language modeling tasks. 
            # The mask_idx indices are determined by the rand_mask variable, which randomly selects 15% of the input tokens (excluding special tokens like CLS, SEP, and PAD) to be replaced with the [MASK] token.
            input = input.view(batch['input_ids'].size())
            
            out = model(input.to(mps_device), src_mask.to(mps_device)) # sent to mps device, for the 2nd and  3rd time in the session, for input data and mask, respectively.
            # the input has its masked operation, and it is a different operation form the src_masks. These are two different functions.
            loss = criterion(out.view(-1, ntokens), batch['input_ids'].view(-1).to(mps_device)) # here for the 4th time in the session, something is sent to mps. 
            # ‘criterion' absorbs a tensor of shape (batch_size, num_classes) where num_classes is the number of classes in the classification problem, which is ntokens=30522 here.
            # target: The true labels, which should be a tensor of shape (batch_size) containing the indices of the true classes.

            total_loss += loss
            loss.backward()
            optim.step()
            
        print("Epoch: {} -> loss: {}".format(epoch+1, total_loss/(len(dataloader)*epoch+1)))

In [57]:
torch.mps.empty_cache()
train(model, dataloader)

100%|██████████| 1250/1250 [07:08<00:00,  2.92it/s]
 50%|█████     | 1/2 [07:08<07:08, 428.79s/it]

Epoch: 1 -> loss: 37269.55078125


100%|██████████| 1250/1250 [07:09<00:00,  2.91it/s]
100%|██████████| 2/2 [14:18<00:00, 429.03s/it]


Epoch: 2 -> loss: 40.021324157714844


In [58]:
param_size = 0
for param in model.parameters():
    param_size += param.nelement() * param.element_size()
# Parameter objects are typically used to store model weights and biases.

buffer_size = 0
for buffer in model.buffers():
    buffer_size += buffer.nelement() * buffer.element_size()

# note buffers() method returns an iterator over all the Tensor objects in a model that are registered as buffers. 
# Buffers are typically used to store additional tensors that are not directly involved in the forward pass of the model.


size_all_mb = (param_size + buffer_size) / 1024**2
print('model size: {:.3f}MB'.format(size_all_mb))

model size: 151.152MB


In [59]:
import torch 

one_mat = torch.ones(4, 4)

In [60]:
torch.triu(one_mat)


tensor([[1., 1., 1., 1.],
        [0., 1., 1., 1.],
        [0., 0., 1., 1.],
        [0., 0., 0., 1.]])

In [61]:
torch.triu(one_mat == 1)

tensor([[ True,  True,  True,  True],
        [False,  True,  True,  True],
        [False, False,  True,  True],
        [False, False, False,  True]])

In [62]:
torch.triu(one_mat == 1).transpose(0, 1)

tensor([[ True, False, False, False],
        [ True,  True, False, False],
        [ True,  True,  True, False],
        [ True,  True,  True,  True]])

In [63]:
mat = torch.triu(one_mat == 1).transpose(0, 1).float()

# one_mat == 1: Creates a boolean mask where all elements are True (since all elements in one_mat are 1).
# torch.triu(...): Sets all elements below the diagonal to False, the rest being True.
# .transpose(0, 1): Swaps the rows and columns of the resulting matrix.
# .float(): Converts the boolean matrix to a floating-point matrix, where True becomes 1.0 and False becomes 0.0.


In [64]:
mat.masked_fill(mat == 0, float('-inf')).masked_fill(mat == 1, float(0.0))

tensor([[0., -inf, -inf, -inf],
        [0., 0., -inf, -inf],
        [0., 0., 0., -inf],
        [0., 0., 0., 0.]])

## Optimizing GPU usage

In [67]:
from accelerate import Accelerator
# Accelerator is applicable for mps.

In [68]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    per_device_train_batch_size=1,
    learning_rate = 0.1,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    #fp16=True, # can only be done with CUDA 
    output_dir = "./model_output"
)

In [74]:
dataloader = DataLoader(train_data, batch_size=training_args.per_device_train_batch_size)

# if training_args.gradient_checkpointing:
    # model.gradient_checkpointing_enable()

# 'TransformerModel' object has no attribute 'gradient_checkpointing_enable'

# accelerator = Accelerator()
# model, optimizer, dataloader = accelerator.prepare(model, adam_bnb_optim, dataloader)

In [75]:
from transformers.trainer_pt_utils import get_parameter_names

decay_parameters = get_parameter_names(model, [nn.LayerNorm])
#  retrieves the names of model parameters that are not inside a nn.LayerNorm layer.
print(decay_parameters)

['transformer_encoder.layers.0.self_attn.out_proj.weight', 'transformer_encoder.layers.0.self_attn.out_proj.bias', 'transformer_encoder.layers.0.self_attn.in_proj_weight', 'transformer_encoder.layers.0.self_attn.q_proj_weight', 'transformer_encoder.layers.0.self_attn.k_proj_weight', 'transformer_encoder.layers.0.self_attn.v_proj_weight', 'transformer_encoder.layers.0.self_attn.in_proj_bias', 'transformer_encoder.layers.0.linear1.weight', 'transformer_encoder.layers.0.linear1.bias', 'transformer_encoder.layers.0.linear2.weight', 'transformer_encoder.layers.0.linear2.bias', 'transformer_encoder.layers.1.self_attn.out_proj.weight', 'transformer_encoder.layers.1.self_attn.out_proj.bias', 'transformer_encoder.layers.1.self_attn.in_proj_weight', 'transformer_encoder.layers.1.self_attn.q_proj_weight', 'transformer_encoder.layers.1.self_attn.k_proj_weight', 'transformer_encoder.layers.1.self_attn.v_proj_weight', 'transformer_encoder.layers.1.self_attn.in_proj_bias', 'transformer_encoder.layers

In [76]:
decay_parameters = [name for name in decay_parameters if "bias" not in name]
print(decay_parameters)

['transformer_encoder.layers.0.self_attn.out_proj.weight', 'transformer_encoder.layers.0.self_attn.in_proj_weight', 'transformer_encoder.layers.0.self_attn.q_proj_weight', 'transformer_encoder.layers.0.self_attn.k_proj_weight', 'transformer_encoder.layers.0.self_attn.v_proj_weight', 'transformer_encoder.layers.0.linear1.weight', 'transformer_encoder.layers.0.linear2.weight', 'transformer_encoder.layers.1.self_attn.out_proj.weight', 'transformer_encoder.layers.1.self_attn.in_proj_weight', 'transformer_encoder.layers.1.self_attn.q_proj_weight', 'transformer_encoder.layers.1.self_attn.k_proj_weight', 'transformer_encoder.layers.1.self_attn.v_proj_weight', 'transformer_encoder.layers.1.linear1.weight', 'transformer_encoder.layers.1.linear2.weight', 'transformer_encoder.layers.2.self_attn.out_proj.weight', 'transformer_encoder.layers.2.self_attn.in_proj_weight', 'transformer_encoder.layers.2.self_attn.q_proj_weight', 'transformer_encoder.layers.2.self_attn.k_proj_weight', 'transformer_encod

In [80]:
optimizer_grouped_parameters = [
    {
        "params": [p for n, p in model.named_parameters() if n in decay_parameters],
        # p is the actual parameter value (or tensor) associated with the parameter name n.
        "weight_decay": training_args.weight_decay,
    },
    {
        "params": [p for n, p in model.named_parameters() if n not in decay_parameters],
        "weight_decay": 0.0,
    },
]

optimizer_grouped_parameters

[{'params': [Parameter containing:
   tensor([[-0.0128,  0.0444, -0.0700,  ...,  0.0691, -0.0440,  0.0548],
           [-0.0557, -0.0509,  0.0462,  ...,  0.0512,  0.0477, -0.0594],
           [-0.0413,  0.0370,  0.0520,  ...,  0.0455,  0.0476,  0.0474],
           ...,
           [-0.0499, -0.0471,  0.0474,  ..., -0.0495,  0.0472, -0.0471],
           [ 0.0543,  0.0565, -0.0513,  ...,  0.0522,  0.0468,  0.0463],
           [ 0.0515,  0.0479,  0.0527,  ...,  0.0486, -0.0528,  0.0459]],
          device='mps:0', requires_grad=True),
   Parameter containing:
   tensor([[ 0.0525, -0.0461, -0.0537,  ...,  0.0451,  0.0537, -0.0438],
           [ 0.0517, -0.0534, -0.0465,  ...,  0.0482,  0.0485, -0.0476],
           [-0.0443,  0.0579, -0.0519,  ..., -0.0499, -0.0476, -0.0414],
           ...,
           [-0.0440,  0.0638,  0.0323,  ..., -0.0513, -0.0491,  0.0506],
           [ 0.0514, -0.0455, -0.0518,  ...,  0.0486,  0.0501, -0.0533],
           [-0.0407, -0.0493, -0.0617,  ...,  0.0506,  0.

In [85]:
optimizer_kwargs = {
    "betas": (training_args.adam_beta1, training_args.adam_beta2),
    "eps": training_args.adam_epsilon,
}
optimizer_kwargs

{'betas': (0.9, 0.999), 'eps': 1e-08}

In [83]:
import bitsandbytes as bnb
# Bitsandbytes is a Python library that focuses on quantization of neural network models. 
# Quantization is a technique to reduce the precision of numerical representations 
# (e.g., from 32-bit floating-point to 8-bit integers) to decrease model size and accelerate computations.

optimizer_kwargs["lr"] = training_args.learning_rate

adam_bnb_optim = bnb.optim.Adam8bit(
    optimizer_grouped_parameters,
    betas=(training_args.adam_beta1, training_args.adam_beta2),   
    #  betas: a tuple of two values, (beta1, beta2), 
    #  that control the exponential moving averages of the gradient and squared gradient, respectively. 
    # Typical values are (0.9, 0.999).
    eps=training_args.adam_epsilon,
    lr=training_args.learning_rate,
)

'NoneType' object has no attribute 'cadam32bit_grad_fp32'


  warn("The installed version of bitsandbytes was compiled without GPU support. "


In [86]:
accelerator = Accelerator()
model, optimizer, dataloader = accelerator.prepare(model, adam_bnb_optim, dataloader)