## Import Libraries

In [1]:
import os
import pandas as pd
import numpy as np

from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import DataCollatorForTokenClassification

import torch
import torch.nn.functional as F
import torch.nn as nn

from torch.utils.data import Dataset, DataLoader

# from torchsummary import summary

  from .autonotebook import tqdm as notebook_tqdm


## Define NER Labels

In [None]:
# NER tag inventory in BIO form: "O" for tokens outside any entity, and a
# B-/I- (begin/inside) pair for each entity type.
# NOTE(review): LEG/NUM/YER/PAS/AYT presumably stand for legal document,
# number, year, article (pasal) and clause (ayat) -- confirm with the data.
# The position of a tag in this list is its integer class id.
label_names = [
    "O",
    "B-LEG", "I-LEG",
    "B-NUM", "I-NUM",
    "B-YER", "I-YER",
    "B-PAS", "I-PAS",
    "B-AYT", "I-AYT",
]
# Integer id -> tag string
id2label = dict(enumerate(label_names))
# Tag string -> integer id (inverse of id2label)
label2id = {name: idx for idx, name in id2label.items()}

## Download Model

In [None]:
# Pretrained IndoBERT checkpoint used for both the tokenizer and the model.
tokenizer = AutoTokenizer.from_pretrained("indolem/indobert-base-uncased")

# Token-classification head sized from the label maps. This must be created
# here: the "Save Model" cell below pickles `model_ner`, so leaving this line
# commented out makes a fresh Restart & Run All fail with a NameError.
model_ner = AutoModelForTokenClassification.from_pretrained(
    "indolem/indobert-base-uncased", id2label=id2label, label2id=label2id
)

# Data collator is used for padding in batch
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

Some weights of the model checkpoint at indolem/indobert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at indolem/indobert-base-uncas

## Save Model Locally - Example

In [None]:
# Directory and file that receive the pickled model.
save_path = 'model/'
# torch.save does not create missing parent directories, so make sure the
# target folder exists before writing.
os.makedirs(save_path, exist_ok=True)

model_ner_path = os.path.join(save_path, "indobert_ner"+".pth")

# Saves the whole module object (not just a state_dict); the matching
# torch.load call therefore needs the model's classes to be importable.
torch.save(model_ner, model_ner_path)

## Load Model - Example

In [None]:
# Location of the model file written by the save example.
save_path = 'model/'
model_ner_path = os.path.join(save_path, "indobert_ner"+".pth")

# Restores the full pickled module saved with torch.save(model, path).
# NOTE(review): torch.load unpickles arbitrary objects, so only load files you
# trust. Recent PyTorch releases may also require weights_only=False to load a
# whole module rather than a state_dict -- confirm against the installed version.
bert_ner = torch.load(model_ner_path)

In [None]:
# Number of output classes of the loaded model's classification head.
bert_ner.config.num_labels

11

In [None]:
# Dump the module tree of the loaded model for inspection.
print(bert_ner)

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(31923, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

## Bert Test Simple Input

In [None]:
# Tokenize one sentence and run it through the loaded model as a batch of one.
# NOTE(review): the name `input` shadows the Python builtin; it is kept here
# only because other cells may refer to it.
input = tokenizer("apa bedanya biru sama merah?")
# Word index of every token; None marks special tokens.
print(input.word_ids())
# Wrapping the id list in another list gives the tensor a leading batch axis.
output = bert_ner(
    torch.tensor([input['input_ids']]),
    return_dict=False,
)
# First element of the returned tuple.
output[0]

[None, 0, 1, 2, 3, 4, 5, None]


tensor([[[ 0.5823, -0.3254],
         [ 0.1367,  0.2392],
         [ 0.1105, -0.1016],
         [ 0.0899, -0.0853],
         [ 0.2738, -0.1482],
         [ 0.2239, -0.1915],
         [ 0.4310, -0.4585],
         [ 0.3281, -0.6050]]], grad_fn=<ViewBackward0>)

# LexID QA 2 Using BERT

## Dataset NER

In [None]:
class Lexidqa2Dataset(Dataset):
  """LexID QA2 dataset: questions, answers and word-level NER tags.

  Every item is built from one row of the wrapped DataFrame, which must
  provide the columns ``question``, ``answer`` and ``ner``.
  """

  def __init__(self, dataframe, tokenizer):
    """
      dataframe (Pandas DataFrame): Dataset in DataFrame format
      tokenizer (Tokenizer) : callable that turns a question string into an
        encoding exposing 'input_ids', 'token_type_ids', 'attention_mask'
        and a ``word_ids()`` method
    """
    self.qa_frame = dataframe
    self.tokenizer = tokenizer

    # Tag string -> class id
    self.label2id = {"O": 0, "B-LEG": 1, "I-LEG": 2,
                "B-NUM": 3, "I-NUM": 4, "B-YER": 5, "I-YER": 6,
                "B-PAS": 7, "I-PAS": 8, "B-AYT": 9, "I-AYT": 10}
    # Class id -> tag string, derived so the two maps cannot drift apart
    self.id2label = {idx: name for name, idx in self.label2id.items()}

  def __len__(self):
    """Number of rows in the wrapped DataFrame."""
    return len(self.qa_frame)

  def __getitem__(self, idx):
    """Return one sample as a dict.

    Keys: 'question' (raw text), 'question_input_ids',
    'question_token_type_ids', 'question_attention_masks' (1-D tensors),
    'answer' (raw value from the frame) and 'ner_labels' (list with one
    entry per token, see ``align_labels_with_tokens``).
    """
    if torch.is_tensor(idx):
      idx = idx.tolist()

    question = self.qa_frame.loc[idx, "question"]
    # Use the tokenizer given to this dataset, not a notebook-level global
    question_tokenized = self.tokenizer(question)
    question_input_ids = torch.tensor(question_tokenized['input_ids'])
    question_token_type_ids = torch.tensor(question_tokenized['token_type_ids'])
    question_attention_masks = torch.tensor(question_tokenized['attention_mask'])

    answer = self.qa_frame.loc[idx, "answer"]

    # NOTE(review): assumes the "ner" cell is a per-word sequence of labels
    # indexable by word position -- confirm how the CSV column is parsed.
    ner_labels = self.qa_frame.loc[idx, "ner"]
    ner_labels = self.align_labels_with_tokens(ner_labels, question_tokenized.word_ids())

    sample = {'question': question,
              'question_input_ids': question_input_ids,
              'question_token_type_ids': question_token_type_ids,
              'question_attention_masks': question_attention_masks,
              'answer': answer,
              # Previously computed but dropped; exposed so training can use it
              'ner_labels': ner_labels}

    return sample

  def one_hot(self, label, label2id):
    """Return a float vector of length ``len(label2id)`` with a 1 at ``label2id[label]``."""
    one_hot_label = torch.zeros(len(label2id))
    one_hot_label[label2id[label]] = 1

    return one_hot_label

  @staticmethod
  def align_labels_with_tokens(labels, word_ids):
    """Expand word-level labels to token level.

    labels: per-word labels, indexed by word position.
    word_ids: for each token, the index of the word it belongs to, or None
      for special tokens.

    Only the first token of each word receives that word's label. Special
    tokens and the remaining sub-word tokens receive -100.

    Declared static because it uses no instance state; it can be called on
    the class or on an instance.
    """
    new_labels = []
    current_word = None
    for word_id in word_ids:
      if word_id is None or word_id == current_word:
        # Special token, or a continuation piece of the current word
        new_labels.append(-100)
      else:
        # First token of a new word
        new_labels.append(labels[word_id])
      current_word = word_id

    return new_labels

## Merging Dataset from Different csv

In [None]:
# Read each dataset part from its own CSV file in the working directory.
# NOTE(review): there is no t9_dataset.csv in this list -- confirm that the
# gap between t8 and t10 is intentional.
t1 = pd.read_csv("t1_dataset.csv")
t2 = pd.read_csv("t2_dataset.csv")
t3 = pd.read_csv("t3_dataset.csv")
t4 = pd.read_csv("t4_dataset.csv")
t5 = pd.read_csv("t5_dataset.csv")
t6 = pd.read_csv("t6_dataset.csv")
t7 = pd.read_csv("t7_dataset.csv")
t8 = pd.read_csv("t8_dataset.csv")
t10 = pd.read_csv("t10_dataset.csv")
t11 = pd.read_csv("t11_dataset.csv")
t12 = pd.read_csv("t12_dataset.csv")

# Stack all parts into one frame; ignore_index=True renumbers the rows 0..n-1
# so the combined frame has a single contiguous index.
t = pd.concat([t1, t2, t3, t4, t5, t6, t7, t8, t10, t11, t12], ignore_index=True)

## Dataset

In [None]:
# Wrap the merged frame `t` and the tokenizer in the dataset class.
qa_dataset = Lexidqa2Dataset(t, tokenizer)

Sample

In [None]:
# Print a few randomly chosen questions as a sanity check.
# The seed is fixed so the same rows are shown on every run.
np.random.seed(12345)
# Draw distinct row positions; cap the count so tiny datasets do not raise.
random_indices = np.random.choice(
    len(qa_dataset), size=min(5, len(qa_dataset)), replace=False
)

# The former `if i == 5: break` compared a dataset index (not a loop counter)
# with 5, so it could only cut the listing short by accident; the sample size
# above already bounds the loop.
for i in random_indices:
  sample = qa_dataset[i]

  print(i, sample['question'])

16250 Pada tanggal berapa Peraturan Presiden Republik Indonesia Nomor 59 Tahun 2005 tersebut diresmikan
69433 Peraturan Menteri Riset Teknologi Dan Pendidikan Tinggi Republik Indonesia Nomor 54 Tahun 2016 diadakan dengan fokus
96674 Peraturan Daerah Kabupaten Aceh Tamiang Nomor 20 Tahun 2011 dibangun dengan landasan hukum apa saja
96162 Peraturan Mahkamah Agung Republik Indonesia Nomor 8 Tahun 2017 dirancang berdasarkan basis apa
22872 Tanggal berapa penerbitan Peraturan Presiden Republik Indonesia Nomor 100 Tahun 2018 ini


## Split, DataLoader

In [None]:
# Fix the global torch RNG so the random split is repeatable.
torch.manual_seed(12345)

# 50% train / 30% validation; the test split takes whatever is left so the
# three sizes always add up to the dataset length.
train_size = int(len(qa_dataset) * 0.5)
val_size = int(len(qa_dataset) * 0.3)
test_size = len(qa_dataset) - (train_size + val_size)

train_dataset, valid_dataset, test_dataset = torch.utils.data.random_split(
    qa_dataset,
    [train_size, val_size, test_size],
)

In [None]:
# Number of samples per batch for all three loaders.
BATCH_SIZE = 64

# Only the training loader shuffles; validation and test keep a fixed order.
# Every loader batches its samples through `data_collator`.
train_instances = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=data_collator,
)
valid_instances = DataLoader(
    valid_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=data_collator,
)
test_instances = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=data_collator,
)

# NER

References:

- https://huggingface.co/learn/nlp-course/chapter7/2?fw=pt

## Model

### Loss Function

> Loss function for multi-class (per-token) classification: `CrossEntropyLoss`

In [None]:
# Cross-entropy loss, averaged over the scored elements (reduction='mean').
# NOTE(review): presumably relies on the loss's default ignore_index to skip
# the -100 labels produced by the dataset's label alignment -- confirm.
loss_func = torch.nn.CrossEntropyLoss(reduction='mean')

### BERT

In [None]:
class BERT_Lexidqa2_ner(nn.Module):
    """Thin wrapper around a token-classification model that returns logits.

    Args:
        bert_for_token_classification: the wrapped model; it is called with
            the input ids, ``attention_mask`` and ``return_dict=False`` and
            must return a tuple whose first element is the logits tensor.
        input_size: accepted for call compatibility; not used.
        output_size: accepted for call compatibility; not used.
    """

    def __init__(self, bert_for_token_classification, input_size = 768, output_size = 17):
        super().__init__()

        self.bert = bert_for_token_classification

    def forward(self, x, attention_mask=None):
        """Run the wrapped model and return its logits tensor.

        Args:
            x: input ids passed as the wrapped model's first argument.
            attention_mask: optional mask forwarded to the wrapped model;
                defaults to None so existing single-argument calls still work.

        Returns:
            The first element of the wrapped model's output tuple. Returning
            the tensor rather than the whole tuple lets callers use ``.shape``
            and feed the result straight into a loss function.
        """
        outputs = self.bert(x, attention_mask=attention_mask, return_dict=False)

        return outputs[0]

In [None]:
# Wrap the loaded token-classification model. The two numeric arguments fill
# the wrapper's input_size / output_size parameters, which its __init__ does
# not use.
model = BERT_Lexidqa2_ner(bert_ner, 768, 17)

In [None]:
# Smoke test: feed the first question's input ids through the model as a
# batch of one (unsqueeze_(0) adds the leading batch axis in place).
# NOTE(review): `.shape` only works if forward() returns a tensor rather than
# a tuple -- confirm against the model class.
model(qa_dataset[0]['question_input_ids'].unsqueeze_(0)).shape

torch.Size([1, 17])

## Training

In [None]:
# Fine-tune the token-classification (NER) model.

LEARNING_RATE = 5e-5
EPOCH = 30
# Write a checkpoint after every this many completed epochs.
CHECKPOINT_EVERY = 5


def _token_logits(model_output):
  """Return the logits whether the model yields a tensor or an output tuple."""
  if isinstance(model_output, (tuple, list)):
    return model_output[0]
  return model_output


def _token_loss(logits, labels):
  """Flatten logits to (tokens, classes) and labels to (tokens,) for the loss.

  NOTE(review): assumes logits are (batch, seq, classes) and labels are
  (batch, seq) -- confirm against the collated batch.
  """
  return loss_func(logits.reshape(-1, logits.shape[-1]), labels.reshape(-1))


# optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr = LEARNING_RATE)

for epoch_i in range(0, EPOCH):
  print(f"Epoch {epoch_i + 1} / {EPOCH}")
  # Reset the total loss for this epoch.
  tr_loss = 0

  # Put the model into training mode.
  model.train()

  # For every batch of the training data
  for data in train_instances:

    # Forward pass.
    # NOTE(review): 'input_ids' / 'labels' are the keys a token-classification
    # collator is expected to produce -- confirm that the dataset items carry
    # them. The attention mask is not passed because forward() is only known
    # to accept the input ids.
    pred = _token_logits(model(data['input_ids']))

    # Categorical cross-entropy over all tokens
    loss = _token_loss(pred, data['labels'])

    # Clear old gradients so they do not accumulate; required before
    # computing new ones with loss.backward()
    optimizer.zero_grad()

    # compute gradients
    loss.backward()

    # update parameters
    optimizer.step()

    # accumulate the loss within this epoch
    tr_loss += loss.item()

  # Each loss value is already a per-batch mean, so average over the number
  # of batches rather than the number of samples.
  avg_loss = tr_loss / max(len(train_instances), 1)

  print(f"Average loss: {avg_loss}")

  ### Evaluate on the validation data
  print("Uji coba di Validation Data ...")

  # eval mode, so layers with random behaviour such as dropout are disabled
  model.eval()

  val_loss = 0

  # do not track gradients: this is evaluation, not training
  with torch.no_grad():
    # For every batch of the validation data
    for data in valid_instances:

      # Forward pass
      pred = _token_logits(model(data['input_ids']))

      # Categorical cross-entropy over all tokens
      loss = _token_loss(pred, data['labels'])

      # accumulate the loss on the validation set
      val_loss += loss.item()

  # average loss per batch
  avg_val_loss = val_loss / max(len(valid_instances), 1)

  # TODO: report token-level accuracy next to the loss.
  print(f"Average loss di validation data: {avg_val_loss}")

  print("")

  # Checkpoint after every CHECKPOINT_EVERY-th completed epoch (5, 10, ...).
  if (epoch_i + 1) % CHECKPOINT_EVERY == 0:
    save_path = 'model/ner_checkpoint/'
    os.makedirs(save_path, exist_ok=True)
    # The file name carries the 1-based epoch number shown in the log above.
    model_ner_path = os.path.join(save_path, f"indobert_ner_checkpoint_{epoch_i + 1}.pth")
    # Save the module being trained (the wrapped token-classification model)
    # rather than the untouched `model_ner` object.
    torch.save(model.bert, model_ner_path)