##**FINE-TUNING BERT WITH MLM**

In [187]:
!nvidia-smi

Sat Apr 23 19:02:56 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.103.01   Driver Version: 470.103.01   CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           On   | 00000000:00:17.0 Off |                    0 |
| N/A   55C    P0    62W / 149W |  11266MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla K80           On   | 00000000:00:18.0 Off |                    0 |
| N/A   45C    P0    74W / 149W |  11314MiB / 11441MiB |      0%      Default |
|       

In [188]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]="2"

In [189]:
os.environ["CUDA_VISIBLE_DEVICES"]

'2'

####**Install the huggingface transformers library**

In [190]:
!pip3 install transformers
!pip3 install nltk

Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com


In [191]:
import torch
from tqdm.auto import tqdm
from transformers import AdamW
from transformers import BertTokenizer, BertForMaskedLM

In [192]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased', return_dict=True)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


####**Import text data**

In [193]:
# data_path = '/content/the_fire_flower.txt'
# with open(data_path, 'r') as f:
#     data = f.read().split('\n')

In [194]:
import pandas as pd
import re
from nltk.tokenize import RegexpTokenizer

def preprocess_text(text):
    utterances = text.split("##")
    without_speaker = []
    for utterance in utterances:
        if(len(utterance.strip())==0):
            continue
        utr_split = utterance.split(":")
        without_speaker.append(utr_split[1])
    result = " ".join(without_speaker)
    result = result.lower()
    cleanr = re.compile('<.*?>')
    result = re.sub(cleanr, '', result)
    result=re.sub(r'http\S+', '',result)
    result = re.sub('[0-9]+', '', result)
    tokenizer = RegexpTokenizer(r'\w+')
    result = tokenizer.tokenize(result)
    result = " ".join(result)
    return result

train_df = pd.read_csv('/home/ubuntu/teach-bert-ft/nli_train.csv') 

train_df['premise'] = train_df['premise'].apply(lambda x: preprocess_text(x))

data = train_df['premise'].tolist()
data

['fd tudwa rahe aap time se pehle woji chot lag gayi inko paise short horahe hain interest kat jayega aapka interest se pehle health hai life insurance hai aap dono ki h u m k o n a h i karani j i a a p bas cash dila do jaldi',
 'tumhaare liye ye bhi agar padhne ka mann kare to hum sab teen teen baar padh liye hain apni poems bhi leke aaye hain aur kahaniyan ye tum kabhi nahin padhe ho tumko chahiye to tumhaare school ke doston ko bula laayen vah ladki jisko tum kiss kiye the vah humse baat nahin karti koi baat hai arjun jo tum kahna chahte ho hello arjun aap iska kapda badal lenge kya hua',
 'paanch minute late baal bana raha tha t shirt pe kya hai who gives a hash at the rate of dollar per se factory mein yah sab leechad kapda pahan kar nahin jaa sakte mere paas sab yahi hai jaake meri almaari mein se kameez le lo aur baal banaane mat lag jaana nahin to ghar ke saare sheeshe tod doonga tumhaare sarr pe now run',
 'paanch minute pack main zara halka hokey aata hoon ghanta chota aslam 

####**Text cleaning process**

In [195]:
print(len(data))

1792


In [196]:
for sentence in data:
    if len(sentence) < 50:
        data.remove(sentence)

In [197]:
print(len(data))

1754


####**Tokenizing the text data**

In [198]:
inputs = tokenizer(
    data,
    max_length=512,
    truncation=True,
    padding='max_length',
    return_tensors='pt'
)

In [199]:
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [200]:
inputs['labels'] = inputs['input_ids'].detach().clone()
inputs

{'input_ids': tensor([[  101,  1042,  2094,  ...,     0,     0,     0],
        [  101, 10722,  2213,  ...,     0,     0,     0],
        [  101,  6643,  2319,  ...,     0,     0,     0],
        ...,
        [  101,   102,     0,  ...,     0,     0,     0],
        [  101,  6616,  2125,  ...,     0,     0,     0],
        [  101, 14910,  6300,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 0,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([[  101,  1042,  2094,  ...,     0,     0,     0],
        [  101, 10722,  2213,  ...,     0,     0,     0],
        [  101,  6643, 

####**Masking the input_ids**

In [201]:
random_tensor = torch.rand(inputs['input_ids'].shape)

In [202]:
random_tensor.shape

torch.Size([1754, 512])

In [203]:
# creating a random tensor of float values.
random_tensor

tensor([[0.5757, 0.6191, 0.8620,  ..., 0.4642, 0.9877, 0.9560],
        [0.4152, 0.0879, 0.6551,  ..., 0.6971, 0.1811, 0.6064],
        [0.4429, 0.5567, 0.6925,  ..., 0.3831, 0.5319, 0.7969],
        ...,
        [0.4277, 0.9582, 0.4337,  ..., 0.8087, 0.8200, 0.1146],
        [0.4957, 0.4146, 0.9757,  ..., 0.7564, 0.4271, 0.8467],
        [0.8424, 0.0384, 0.1215,  ..., 0.3811, 0.2168, 0.8483]])

In [204]:
# creating a mask tensor of float values ranging from 0 to 1 and avoiding special tokens
masked_tensor = (random_tensor < 0.15)*(inputs['input_ids'] != 101)*(inputs['input_ids'] != 102)*(inputs['input_ids'] != 0)

In [205]:
# getting all those indices from each row which are set to True, i.e. masked.
nonzeros_indices = []
for i in range(len(masked_tensor)):
    nonzeros_indices.append(torch.flatten(masked_tensor[i].nonzero()).tolist())

In [206]:
# setting the values at those indices to be a MASK token (103) for every row in the original input_ids.
for i in range(len(inputs['input_ids'])):
    inputs['input_ids'][i, nonzeros_indices[i]] = 103

####**Pytorch Dataset and Dataloader**

In [207]:
class BookDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    
    def __len__(self):
        return len(self.encodings['input_ids'])

    def __getitem__(self, index):
        input_ids = self.encodings['input_ids'][index]
        labels = self.encodings['labels'][index]
        attention_mask = self.encodings['attention_mask'][index]
        token_type_ids = self.encodings['token_type_ids'][index]
        return {
            'input_ids': input_ids,
            'labels': labels,
            'attention_mask': attention_mask,
            'token_type_ids': token_type_ids
        }

In [208]:
dataset = BookDataset(inputs)

In [209]:
dataloader = torch.utils.data.DataLoader(
    dataset,
    batch_size=8,
    shuffle=True
)

In [210]:
device = torch.device('cuda:2') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda', index=2)

In [211]:
model.to(device)

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

####**Model parameters**

In [212]:
epochs = 30
optimizer = AdamW(model.parameters(), lr=1e-5)

####**Training Loop**

In [213]:
model.train()
PATH = 'models/bert-ft-premise-movies/model.pt'
for epoch in range(epochs):
    loop = tqdm(dataloader)
    for batch in loop:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        loop.set_description("Epoch: {}".format(epoch))
        loop.set_postfix(loss=loss.item())
    
    # if(epoch % 2 == 0):
    torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': loss,
            }, PATH)

  0%|          | 0/110 [00:00<?, ?it/s]

RuntimeError: CUDA out of memory. Tried to allocate 192.00 MiB (GPU 2; 11.17 GiB total capacity; 10.54 GiB already allocated; 126.88 MiB free; 10.58 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF