In [None]:
!pip install datasets
!pip install tokenizers
!pip install transformers
!pip install seqeval

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.6.1-py3-none-any.whl (441 kB)
[K     |████████████████████████████████| 441 kB 3.9 MB/s 
Collecting dill<0.3.6
  Downloading dill-0.3.5.1-py2.py3-none-any.whl (95 kB)
[K     |████████████████████████████████| 95 kB 3.4 MB/s 
Collecting xxhash
  Downloading xxhash-3.1.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 48.1 MB/s 
Collecting huggingface-hub<1.0.0,>=0.2.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 38.2 MB/s 
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting multiprocess
  Downloading multiprocess-0.70.14-py37-none-any.whl (115 kB)
[K     |████████████████████████████████| 115 kB 44.4 MB/s 
Collecting urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1
  Downloadi

In [None]:
from transformers import BertTokenizer, BertForMaskedLM, AutoTokenizer
import torch
import numpy as np
from pprint import pprint

In [None]:
from datasets import load_dataset
# Load the "mi" configuration of the "wikiann" dataset (train / validation / test splits).
dataset = load_dataset(path="wikiann", name="mi")



  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Class names of the NER tags, read from the "ner_tags" feature of the train split.
ner_tags_feature = dataset["train"].features["ner_tags"]
label_names = ner_tags_feature.feature.names

In [None]:
def tokenize_adjust_labels(all_samples_per_split):
  """Tokenize a batch of pre-split sentences and align NER tags to sub-tokens.

  Intended for `datasets`' batched `map`: the returned encoding is not a
  datasets object, so `map` is what merges its keys (input_ids,
  token_type_ids, attention_mask and the adjusted labels) back into every
  train / validation / test split.

  Args:
    all_samples_per_split: batch mapping with
      "tokens"   - one list of words per sample, and
      "ner_tags" - one list of tag ids per sample, parallel to "tokens".

  Returns:
    The tokenizer's batch encoding with an extra "labels" entry: one list per
    sample, as long as that sample's (padded) token sequence. Positions that
    belong to no word (special tokens, padding) hold -100; every sub-token of
    a word holds the tag id of that word.
  """
  tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
  tokenized_samples = tokenizer.batch_encode_plus(
      all_samples_per_split["tokens"],
      truncation=True,
      return_tensors='pt',
      is_split_into_words=True,
      padding=True,
  )

  total_adjusted_labels = []
  for k in range(len(tokenized_samples["input_ids"])):
    word_ids_list = tokenized_samples.word_ids(batch_index=k)
    existing_label_ids = all_samples_per_split["ner_tags"][k]
    # Index the tags by word id rather than by a running counter: a word that
    # yields no sub-tokens is absent from word_ids, and a counter would then
    # shift every following tag by one position.
    total_adjusted_labels.append(
        [-100 if wid is None else existing_label_ids[wid] for wid in word_ids_list]
    )

  # Attach the labels once, after the whole batch has been processed.
  tokenized_samples["labels"] = total_adjusted_labels
  return tokenized_samples

In [None]:
# Tokenize every split batch-wise; map merges the returned keys into each split.
tokenized_dataset = dataset.map(function=tokenize_adjust_labels, batched=True)
tokenized_train_split = tokenized_dataset["train"]
train_data = tokenized_train_split.data
# Show the number of rows per split as the cell result.
tokenized_dataset.num_rows



{'validation': 100, 'test': 100, 'train': 100}

In [None]:
# Collect the train split's encodings as tensors; "labels" starts out as an
# independent copy of the token ids.
train_split = tokenized_dataset["train"]
inputs = {"input_ids": torch.tensor(np.array(train_split["input_ids"]))}
inputs["labels"] = inputs["input_ids"].clone()
inputs["attention_mask"] = torch.tensor(np.array(train_split["attention_mask"]))

In [None]:
# Seed torch's generator so the same positions are selected on every run.
torch.manual_seed(0)
rand = torch.rand(inputs["input_ids"].shape)
print(rand)
# A position is selected when its random draw is below 0.15 and its token id
# is none of 101, 102 or 0 (presumably [CLS], [SEP] and [PAD] for this
# vocabulary; the model config lists pad_token_id 0 - confirm the other two).
mask_arr = (
    (rand < 0.15)
    & (inputs["input_ids"] != 101)
    & (inputs["input_ids"] != 102)
    & (inputs["input_ids"] != 0)
)
print(mask_arr)

tensor([[0.1475, 0.5720, 0.5303,  ..., 0.9173, 0.0697, 0.4001],
        [0.1583, 0.8456, 0.5768,  ..., 0.0749, 0.0882, 0.1141],
        [0.9448, 0.7798, 0.2674,  ..., 0.2490, 0.0783, 0.6001],
        ...,
        [0.2887, 0.5528, 0.9148,  ..., 0.9742, 0.6613, 0.8543],
        [0.3659, 0.6033, 0.2190,  ..., 0.7473, 0.9848, 0.2365],
        [0.8373, 0.0848, 0.8132,  ..., 0.7098, 0.3557, 0.4549]])
tensor([[False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        ...,
        [False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        [False,  True, False,  ..., False, False, False]])


In [None]:
# Per-row list of the column indices selected for masking.
selection = [torch.flatten(row_mask.nonzero()).tolist() for row_mask in mask_arr]

# Replace every selected position with id 103 (the mask token id used here).
inputs["input_ids"][mask_arr] = 103
# Score only the masked positions: a label of -100 is ignored by the loss, so
# padding and untouched tokens no longer contribute to it.
inputs["labels"][~mask_arr] = -100
inputs["input_ids"]

tensor([[   101,    157,  44756,  ...,      0,      0,      0],
        [   101,  30186,    112,  ...,      0,      0,      0],
        [   101, 103883,  64354,  ...,      0,      0,      0],
        ...,
        [   101,  30186,    112,  ...,      0,      0,      0],
        [   101,  30186,  21452,  ...,      0,      0,      0],
        [   101,    103,  64354,  ...,      0,      0,      0]])

In [None]:
class MyDataset(torch.utils.data.Dataset):
  """Map-style dataset over a dict of parallel encodings.

  `encodings` maps a field name (it must include "input_ids") to an indexable
  container with one entry per sample, either a tensor or nested lists.
  """
  def __init__(self, encodings):
    self.encodings = encodings
  def __getitem__(self, idx):
    """Return sample `idx` as a dict of freshly copied tensors."""
    item = {}
    for key, val in self.encodings.items():
      value = val[idx]
      if isinstance(value, torch.Tensor):
        # Copy without going through torch.tensor(), which emits a
        # copy-construct UserWarning when handed an existing tensor.
        item[key] = value.detach().clone()
      else:
        item[key] = torch.tensor(value)
    return item
  def __len__(self):
    """Number of samples, taken from the "input_ids" field."""
    return len(self.encodings["input_ids"])

In [None]:
# Wrap the masked encodings so they can be served sample by sample.
train_dataset = MyDataset(encodings=inputs)


In [None]:
# Shuffled mini-batches of 8 samples.
loader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=8,
    shuffle=True,
)

In [None]:
# Number of batches, then the element count (rows * columns) of each batch's ids.
print(len(loader))
for batch in loader:
  print(batch["input_ids"].numel())

13
688
688
688
688
688
688
688
688
688
688
688
688
344


  """


In [None]:
# Load the pretrained checkpoint with its masked-language-modelling head.
model = BertForMaskedLM.from_pretrained(
    pretrained_model_name_or_path='bert-base-multilingual-cased'
)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--bert-base-multilingual-cased/snapshots/cf732291d5a8eace7b973ccd13c95ec07b19e734/config.json
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4.24.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 119547
}

loading weights file pytorch_model.bin f

In [None]:
# Use the GPU when one is available, otherwise stay on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Move the model's parameters to that device.
model.to(device)
# Switch to training mode; the call's return value is displayed as the cell result.
model.train()

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=T

In [None]:
from torch.optim import AdamW

# AdamW over all model parameters with a learning rate of 5e-5.
optim = AdamW(params=model.parameters(), lr=5e-5)

In [None]:
from tqdm import tqdm  # for our progress bar

epochs = 2

for epoch in range(epochs):
    # one progress bar per epoch, wrapped around the dataloader
    loop = tqdm(loader, leave=True)
    for batch in loop:
        # move the tensors needed for this step onto the training device
        input_ids, attention_mask, labels = (
            batch[field].to(device)
            for field in ('input_ids', 'attention_mask', 'labels')
        )

        # clear gradients left over from the previous step
        optim.zero_grad()

        # forward pass; passing labels makes the model return a loss
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # backward pass, then parameter update
        loss.backward()
        optim.step()

        # show epoch number and current loss on the progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

  """
Epoch 0: 100%|██████████| 13/13 [05:08<00:00, 23.71s/it, loss=0.903]
Epoch 1: 100%|██████████| 13/13 [05:10<00:00, 23.90s/it, loss=0.395]


In [None]:
from transformers import TrainingArguments

# Settings for the Trainer run: output directory, per-device batch size, epoch count.
training_kwargs = {
    "output_dir": 'out',
    "per_device_train_batch_size": 16,
    "num_train_epochs": 4,
}
args = TrainingArguments(**training_kwargs)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
from transformers import DataCollatorForLanguageModeling, Trainer

# The "labels" column of `tokenized_dataset` holds NER tag ids produced by
# tokenize_adjust_labels. Passing it to BertForMaskedLM would train the model
# to predict those small ids as vocabulary tokens, so drop the column and let
# the MLM collator build input/label pairs instead: it masks a random 15% of
# the tokens per batch and sets every other label to -100 (ignored by the loss).
mlm_dataset = tokenized_dataset.remove_columns(["labels"])
mlm_tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
mlm_collator = DataCollatorForLanguageModeling(
    tokenizer=mlm_tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=mlm_dataset['train'],
    # Evaluate on the held-out validation split rather than on the training data.
    eval_dataset=mlm_dataset['validation'],
    data_collator=mlm_collator,
)

In [None]:
# Run training with the `trainer` object; the returned value is displayed as the cell result.
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: tokens, ner_tags, langs, spans. If tokens, ner_tags, langs, spans are not expected by `BertForMaskedLM.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 100
  Num Epochs = 4
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 28
  Number of trainable parameters = 177974523


Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=28, training_loss=0.38043720381600515, metrics={'train_runtime': 640.0295, 'train_samples_per_second': 0.625, 'train_steps_per_second': 0.044, 'total_flos': 17702448120000.0, 'train_loss': 0.38043720381600515, 'epoch': 4.0})

In [None]:
# Evaluate on the trainer's eval_dataset; the returned metrics dict is displayed as the cell result.
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: tokens, ner_tags, langs, spans. If tokens, ner_tags, langs, spans are not expected by `BertForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 100
  Batch size = 8


{'eval_loss': 0.13851845264434814,
 'eval_runtime': 55.4365,
 'eval_samples_per_second': 1.804,
 'eval_steps_per_second': 0.235,
 'epoch': 4.0}