In [1]:
# Mount Google Drive at /content/drive so the notebook can use files stored there.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
%cd /content/drive/MyDrive/Transformers/LanguageModeling/

/content/drive/MyDrive/Transformers/LanguageModeling


**Masked language modeling**
For masked language modeling (MLM) we are going to use the same preprocessing as before for our dataset, with one additional step: we will randomly mask some tokens (by replacing them with the tokenizer's mask token, e.g. `[MASK]` for BERT or `<mask>` for RoBERTa) and the labels will be adjusted to only include the masked tokens (we don't have to predict the non-masked tokens).

In [3]:
!pip install transformers
import torch
!pip install datasets
from datasets import load_dataset
device = "cuda" if torch.cuda.is_available() else "cpu"
import pandas as pd
import numpy as np



In [4]:
from datasets import load_dataset
# Load the raw WikiText-2 corpus (the cached copy is reused when present); the
# 'train' and 'test' splits are used below.
# NOTE(review): the variable shares its name with the `datasets` package, so a
# later `import datasets` would clash with it — consider renaming.
datasets = load_dataset('wikitext', 'wikitext-2-raw-v1')

Reusing dataset wikitext (/root/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/aa5e094000ec7afeb74c3be92c88313cd6f132d564c7effd961c10fd47c76f20)


In [5]:
# Wrap the train and test splits in single-column ("text") DataFrames,
# building both frames the same way.
TrainData = pd.DataFrame({"text": datasets["train"]["text"]})
TestData = pd.DataFrame({"text": datasets["test"]["text"]})

In [6]:
# Show (rows, columns) of both frames, separated by a blank line.
print(f"{TrainData.shape}\n\n{TestData.shape}")

(36718, 1)

(4358, 1)


In [7]:
# Name of the pretrained checkpoint; used to load both the tokenizer and the masked-LM model.
model_name = "distilroberta-base"

In [8]:
from transformers import RobertaForMaskedLM,RobertaTokenizerFast

In [9]:
# Load the pretrained fast tokenizer and the masked-LM model for `model_name`,
# then move the model to the selected device (GPU when available).
tokenizer = RobertaTokenizerFast.from_pretrained(model_name)
Model = RobertaForMaskedLM.from_pretrained(model_name).to(device)

In [10]:
def tokenize_function(TrainData):
  """Tokenize every text without padding or truncation.

  Uses the module-level ``tokenizer``.

  Args:
    TrainData: iterable of strings (e.g. the ``text`` column of a DataFrame).

  Returns:
    Tuple ``(input_ids, attention_masks)``. Each element is a list holding one
    list of ints per input text, in input order.
  """
  texts = list(TrainData)
  if not texts:
    # Nothing to encode; the per-row loop this replaces also returned two empty lists.
    return [], []

  # One batched call replaces a Python-level `encode_plus` call per row.
  # Truncation is deliberately left off: the sequences are concatenated and
  # re-chunked afterwards, so the "longer than the specified maximum sequence
  # length" warning printed here is expected.
  encodings = tokenizer(texts)
  return list(encodings['input_ids']), list(encodings['attention_mask'])

In [11]:
# Tokenize the text column of both splits; each call returns the per-text
# token-id lists and the matching attention-mask lists.
input_idsTrain,attention_maskTrain = tokenize_function(TrainData['text'])
input_idsTest,attention_maskTest = tokenize_function(TestData['text'])

Token indices sequence length is longer than the specified maximum sequence length for this model (638 > 512). Running this sequence through the model will result in indexing errors


#### We have not used padding here, so we can concatenate the input_ids of all sentences into one single list,
and then split that list into a list of lists, each holding 128 tokens (features).
#### Make the labels only for the masked tokens, because we want to train only on the masked tokens, not on all tokens.

In [12]:
def Concatenate_fun(input_ids,attention_mask):
  """Flatten per-text token lists into two continuous streams.

  The sequences have different lengths, so they are flattened in pure Python
  instead of going through ``np.array``: building an array from ragged lists
  emits a deprecation warning on older NumPy and raises on current versions.

  Args:
    input_ids: list of token-id lists, one per text.
    attention_mask: list of attention-mask lists, aligned with ``input_ids``.

  Returns:
    Tuple ``(flat_input_ids, flat_attention_mask)`` of flat Python lists that
    keep the original order of texts and tokens.
  """
  flat_ids = [token for sequence in input_ids for token in sequence]
  flat_mask = [flag for sequence in attention_mask for flag in sequence]
  return flat_ids, flat_mask

In [13]:
# Replace the per-text lists of each split with one flat token stream and one flat mask stream.
input_idsTrain,attention_maskTrain = Concatenate_fun(input_idsTrain,attention_maskTrain)
input_idsTest,attention_maskTest = Concatenate_fun(input_idsTest,attention_maskTest)

  
  This is separate from the ipykernel package so we can avoid doing imports until


In [14]:
# Sanity check: for each split the id stream and the mask stream should have the same length.
print(len(input_idsTrain),len(input_idsTest))
print(len(attention_maskTrain),len(attention_maskTest))

2465320 292003
2465320 292003


In [15]:
def Grouping(lis,block_size):
  """Split a sequence into consecutive chunks of exactly ``block_size`` items.

  Trailing items that do not fill a complete chunk are dropped.

  Args:
    lis: sliceable sequence (e.g. a flat list of token ids).
    block_size: number of items per chunk.

  Returns:
    List of slices of ``lis``, each of length ``block_size``.
  """
  # Length of the prefix that divides evenly into full chunks.
  usable = len(lis) - len(lis) % block_size
  chunks = []
  for start in range(0, usable, block_size):
    chunks.append(lis[start:start + block_size])
  return chunks

In [16]:
# block_size = tokenizer.model_max_length
# Number of tokens per fixed-length example (the tokenizer's maximum length is
# the commented-out alternative above).
block_size = 128

In [17]:
# Cut every flat stream into rows of `block_size` tokens. The shared constant
# replaces the repeated literal so the chunk length is configured in one place.
input_idsTrain_grouped = Grouping(input_idsTrain, block_size)
input_idsTest_grouped = Grouping(input_idsTest, block_size)

attention_maskTrain_grouped = Grouping(attention_maskTrain, block_size)
attention_maskTest_grouped = Grouping(attention_maskTest, block_size)

# Labels start out as a separately chunked copy of the input ids.
# NOTE(review): the MLM data collator used for training appears to rebuild
# `labels` when it applies the random masking — confirm.
labels_train = Grouping(input_idsTrain, block_size)
labels_test = Grouping(input_idsTest, block_size)

In [18]:
# Convert the chunked ids, masks and labels of each split to torch.long tensors,
# stored under the keys that CustomDataset reads.
trainDict = dict(
    input_ids=torch.tensor(input_idsTrain_grouped, dtype=torch.long),
    attention_mask=torch.tensor(attention_maskTrain_grouped, dtype=torch.long),
    labels=torch.tensor(labels_train, dtype=torch.long),
)

testDict = dict(
    input_ids=torch.tensor(input_idsTest_grouped, dtype=torch.long),
    attention_mask=torch.tensor(attention_maskTest_grouped, dtype=torch.long),
    labels=torch.tensor(labels_test, dtype=torch.long),
)

In [19]:
class CustomDataset(torch.utils.data.Dataset):
  """Map-style dataset over pre-chunked masked-LM features.

  Args:
    dictionary: mapping with the keys ``input_ids``, ``attention_mask`` and
      ``labels``. Each value must support ``len`` and integer indexing, and
      the three are expected to have the same length.
  """

  def __init__(self,dictionary):
    self.dic = dictionary
    self.id = self.dic['input_ids']
    self.mask = self.dic['attention_mask']
    self.label = self.dic['labels']
    # Not used inside this class; kept so code that reads `.device` keeps working.
    self.device = "cuda" if torch.cuda.is_available() else "cpu"

  def __len__(self):
    """Return the number of examples (length of ``labels``)."""
    return len(self.label)

  def __getitem__(self,index):
    """Return the example at ``index`` as a dict with the three feature keys."""
    # Build the sample directly instead of stashing each field on `self`:
    # storing per-item values on the instance made every lookup overwrite
    # shared state for no benefit.
    return {
        "input_ids": self.id[index],
        "attention_mask": self.mask[index],
        "labels": self.label[index],
        }

In [20]:
# Wrap the train and test tensor dictionaries as datasets; these are passed to the Trainer below.
Train_iter = CustomDataset(trainDict)
Test_iter = CustomDataset(testDict)

In [21]:
from transformers import Trainer, TrainingArguments,DataCollatorForLanguageModeling

In [22]:
# Finally, we use a special data_collator.
# The data_collator is a function that is responsible for taking the samples and batching them into tensors.
# In the previous example, we had nothing special to do, so we just used the default for this argument.
# Here we want to do the random masking. We could do it as a pre-processing step (like the tokenization),
# but then the tokens would always be masked the same way at each epoch. By doing this step inside the data_collator,
# we ensure this random masking is done in a new way each time we go over the data.

In [23]:
# Masked-LM collator: masking is applied while batching, with a masking probability of 0.15.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True ,mlm_probability=0.15)

In [24]:
training_args = TrainingArguments(
    # Output locations (relative to the current working directory).
    output_dir="OutputDir",
    logging_dir="Logs/",
    # Phases to run.
    do_train=True,
    do_eval=True,
    do_predict=True,
    # Optimisation settings.
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=5.0,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=2,
    # Evaluate, log and checkpoint once per epoch.
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
)

In [25]:
# Tie together the model, the training configuration, both datasets and the
# masking collator.
trainer = Trainer(
    args=training_args,
    model=Model,
    data_collator=data_collator,
    train_dataset=Train_iter,
    eval_dataset=Test_iter,
)

In [26]:
# function ClickConnect(){
# console.log("Working"); 
# document.querySelector("colab-toolbar-button#connect").click() 
# }
# setInterval(ClickConnect,60000)

In [27]:
# Run fine-tuning with the arguments above (5 epochs; evaluation, logging and a checkpoint after each epoch).
trainer.train()

***** Running training *****
  Num examples = 19260
  Num Epochs = 5
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 24075


Epoch,Training Loss,Validation Loss
1,2.1498,1.896548
2,2.0091,1.84947
3,1.9382,1.82201
4,1.8947,1.841298


***** Running Evaluation *****
  Num examples = 2281
  Batch size = 2
Saving model checkpoint to OutputDir/checkpoint-4815
Configuration saved in OutputDir/checkpoint-4815/config.json
Model weights saved in OutputDir/checkpoint-4815/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 2281
  Batch size = 2
Saving model checkpoint to OutputDir/checkpoint-9630
Configuration saved in OutputDir/checkpoint-9630/config.json
Model weights saved in OutputDir/checkpoint-9630/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 2281
  Batch size = 2
Saving model checkpoint to OutputDir/checkpoint-14445
Configuration saved in OutputDir/checkpoint-14445/config.json
Model weights saved in OutputDir/checkpoint-14445/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 2281
  Batch size = 2
Saving model checkpoint to OutputDir/checkpoint-19260
Configuration saved in OutputDir/checkpoint-19260/config.json
Model weights saved in OutputDir/checkpoint-19260/pytorch_

Epoch,Training Loss,Validation Loss
1,2.1498,1.896548
2,2.0091,1.84947
3,1.9382,1.82201
4,1.8947,1.841298
5,1.859,1.803294


***** Running Evaluation *****
  Num examples = 2281
  Batch size = 2
Saving model checkpoint to OutputDir/checkpoint-24075
Configuration saved in OutputDir/checkpoint-24075/config.json
Model weights saved in OutputDir/checkpoint-24075/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=24075, training_loss=1.970149191978193, metrics={'train_runtime': 2893.738, 'train_samples_per_second': 33.279, 'train_steps_per_second': 8.32, 'total_flos': 6077176593638400.0, 'train_loss': 1.970149191978193, 'epoch': 5.0})

In [28]:
# Persist the fine-tuned weights to "ModelDir/", the folder the Prediction
# section loads from. Without this only the per-epoch checkpoints under
# OutputDir exist, and the reload below fails on a fresh run.
trainer.save_model("ModelDir/")
# Save the tokenizer files as well; the returned tuple lists the written paths.
tokenizer.save_pretrained("Tokenizer/")

tokenizer config file saved in Tokenizer/tokenizer_config.json
Special tokens file saved in Tokenizer/special_tokens_map.json


('Tokenizer/tokenizer_config.json',
 'Tokenizer/special_tokens_map.json',
 'Tokenizer/vocab.json',
 'Tokenizer/merges.txt',
 'Tokenizer/added_tokens.json',
 'Tokenizer/tokenizer.json')

## Prediction

In [32]:
from transformers import RobertaForMaskedLM,RobertaTokenizerFast

In [36]:
!ls

checkpoint-24075  Logs	MLM.ipynb  OutputDir  Tokenizer


In [37]:
# Reload the fine-tuned model and its tokenizer. Both paths are relative to the
# working directory selected at the top of the notebook, matching the relative
# "Tokenizer/" path the tokenizer was saved to (instead of a hard-coded
# absolute Drive path).
Model = RobertaForMaskedLM.from_pretrained("ModelDir/")
tokenizer = RobertaTokenizerFast.from_pretrained("Tokenizer/")

loading configuration file ModelDir/config.json
Model config RobertaConfig {
  "_name_or_path": "distilroberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.8.2",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading weights file ModelDir/pytorch_model.bin
All model checkpoint weights were used when initializing RobertaForMaskedLM.

All the weights of RobertaForMaskedLM were initialized from the model checkpoint at ModelDir/.
If your task is similar to the task the model of t

In [39]:
sequence = f"Distilled models are smaller than the models they mimic. Using them instead of the large versions would help {tokenizer.mask_token} our carbon footprint."


# encoding the text via tokenizer
# input = tokenizer.encode(sequence, return_tensors="pt",return_attention_mask=True) 

# encoding the text via tokenizer without attention mask ,both fine
# NOTE: the name `input` shadows the builtin; it is kept because later cells reuse it.
input = tokenizer.encode(sequence, return_tensors="pt",return_attention_mask=False) 


num_tokens = len(input[0])
print("Length of input tokens : "+str(num_tokens))

# getting index of mask_token_id (column index inside the single encoded sequence)
mask_token_index = torch.where(input == tokenizer.mask_token_id)[1] 
print(mask_token_index)
# Report the real token count instead of a hard-coded number, so the message
# stays correct when `sequence` is edited.
print("out of "+str(num_tokens)+" token , mask token index is : "+str(mask_token_index))

# Inference only: run on CPU in eval mode and skip gradient tracking.
Model.to("cpu")
Model.eval()
with torch.no_grad():
  token_logits = Model(input).logits


mask_token_logits = token_logits[0, mask_token_index, :] # select the logits of the mask position(s) only

# Two Ways of prediction ==> 
# 1. top 5 higher prob. tokens
print("\nTop 5 predictions\n")
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()
for token in top_5_tokens:
  print(sequence.replace(tokenizer.mask_token, tokenizer.decode([token])))

print("\n\n\n")
# 2 . Only one Higher probability token
print("Only One Prediction")
predict_token_id = torch.argmax(mask_token_logits,dim=1)
print(sequence.replace(tokenizer.mask_token,tokenizer.decode(predict_token_id)))

Length of input tokens : 27
tensor([21])
out of 27 token , mask token index is : tensor([21])

Top 5 predictions

Distilled models are smaller than the models they mimic. Using them instead of the large versions would help  reduce our carbon footprint.
Distilled models are smaller than the models they mimic. Using them instead of the large versions would help  lower our carbon footprint.
Distilled models are smaller than the models they mimic. Using them instead of the large versions would help  decrease our carbon footprint.
Distilled models are smaller than the models they mimic. Using them instead of the large versions would help  cut our carbon footprint.
Distilled models are smaller than the models they mimic. Using them instead of the large versions would help  minimize our carbon footprint.




Only One Prediction
Distilled models are smaller than the models they mimic. Using them instead of the large versions would help  reduce our carbon footprint.


## Model quantization

In [40]:
# Dynamically quantize the model: the torch.nn.Linear modules are quantized
# with dtype qint8; the result is bound to a new name.
quantized_model = torch.quantization.quantize_dynamic(
    Model, {torch.nn.Linear}, dtype=torch.qint8
)

In [41]:
# Run the same encoded sentence through the quantized model. no_grad skips
# gradient tracking, which is not needed for inference.
with torch.no_grad():
  out = quantized_model(input)['logits']

In [42]:
# Logits of the quantized model at the mask position (this rebinds the
# `mask_token_logits` and `predict_token_id` names used in the prediction cell).
mask_token_logits = out[0,mask_token_index,:]

# Fill the mask with the single highest-scoring token.
predict_token_id = torch.argmax(mask_token_logits,dim=1)
print(sequence.replace(tokenizer.mask_token,tokenizer.decode(predict_token_id)))

Distilled models are smaller than the models they mimic. Using them instead of the large versions would help  reduce our carbon footprint.


In [43]:
# Save the quantized model.
# NOTE(review): this passes the whole model object (not a state_dict) to
# torch.save, so the file is presumably tied to the class definitions available
# at load time — confirm this is intended.
torch.save(quantized_model,"QuantizedMLM.pt")

In [46]:
# Load the quantized model object back from disk.
# NOTE(review): torch.load unpickles the file, so only load files from a trusted
# source; newer PyTorch releases may also need an explicit `weights_only`
# setting to load a full model object — verify against the installed version.
QLoaded = torch.load("QuantizedMLM.pt")