In [None]:
!pip install numpy protobuf==3.16.0
!pip install onnx
!pip install -q simpletransformers
!pip install -q datasets transformers[sentencepiece] simpletransformers
!pip install onnxruntime
!pip install transformers[onnx]

In [None]:
# Mount Google Drive at /content/drive. The model and tokenizer paths used
# later in this notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Example input: a list of sentences. Each sentence is one string whose words
# are separated by spaces (later cells recover the words with str.split()).
to_predict = ['ไ่ก่ จิก เด็ก ตาย คน _ เกิด บน ปาก โอ่ง']

In [None]:
_POS_TAGS = ["NN", "VV", "PU", "CC", "PS", "AX", "AV", "FX", "NU", "AJ", "CL", "PR", "NG", "PA", "XX", "IJ"]

In [None]:
from transformers.models.camembert import CamembertTokenizer
from onnxruntime import InferenceSession, SessionOptions
import os
import torch
from tqdm.notebook import tqdm
import numpy as np

In [None]:
# Runtime configuration.
# use_cuda:    request GPU execution (checked by the device-selection cell and
#              used to choose the ONNX Runtime execution provider).
# cuda_device: -1 selects the default "cuda" device; any other value selects
#              "cuda:<cuda_device>".
use_cuda = True
cuda_device = -1

In [None]:
# Select the torch device.
# CUDA is meant for running on a local machine; it is not needed when the
# notebook is used on an inference server.
#
# Outcome:
#   use_cuda is False                -> device is the string "cpu"
#   use_cuda is True, CUDA missing   -> ValueError
#   use_cuda is True, cuda_device -1 -> default CUDA device
#   use_cuda is True, otherwise      -> the CUDA device with index cuda_device
if not use_cuda:
    device = "cpu"
elif not torch.cuda.is_available():
    # The two literals are concatenated, so the first one must end with a
    # space to keep the sentences apart in the final message.
    raise ValueError(
        "'use_cuda' set to True when cuda is unavailable. "
        "Make sure CUDA is available or set use_cuda=False."
    )
elif cuda_device == -1:
    device = torch.device("cuda")
else:
    device = torch.device(f"cuda:{cuda_device}")

In [None]:
# Create the ONNX Runtime inference session.

# Execution providers in order of preference. CPUExecutionProvider is always
# listed last so the session still has an explicit provider to fall back to
# when the CUDA provider is requested but not available in this runtime.
onnx_execution_provider = (
    ["CUDAExecutionProvider", "CPUExecutionProvider"]
    if use_cuda
    else ["CPUExecutionProvider"]
)

options = SessionOptions()

# Path to the .onnx file of the ONNX model that was trained for this task.
model_path = '/content/drive/MyDrive/POSTAG/BERT_model/ONNXModel(Noquantize)/onnx_model.onnx'
model = InferenceSession(
    model_path,
    sess_options=options,
    providers=onnx_execution_provider,
)

  "Available providers: '{}'".format(name, ", ".join(available_provider_names)))


In [None]:
# Set up the tokenizer.

# Folder that stores the model and its arguments (point at the whole folder).
model_name = '/content/drive/MyDrive/POSTAG/BERT_model/ONNXModel(Noquantize)'
tokenizer_class = CamembertTokenizer

tokenizer = tokenizer_class.from_pretrained(
    model_name,
    do_lower_case=False,
)

In [None]:
# Encode the input sentences as one padded batch of NumPy arrays.
model_inputs = tokenizer.batch_encode_plus(
    to_predict,
    return_tensors="np",  # "pt" would return PyTorch tensors instead
    padding=True,
    truncation=True,
    is_split_into_words=False,
)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [None]:
# Inspect the encoded batch (the cells below read its "input_ids" and
# "attention_mask" entries).
model_inputs

{'input_ids': array([[    5,    10,  2840,   369,  7711,    10,  6803,  6345,    10,
          288,  2246,    10,   265,    10,   301,    10,   326,    10,
          573,    10, 22751,     6]]), 'attention_mask': array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [None]:
# Pull the two encoded arrays out of the tokenizer result.
input_ids, attention_mask = (
    model_inputs["input_ids"],
    model_inputs["attention_mask"],
)

In [None]:
##### Run the model on the encoded batch #####
# The feed dict is keyed by "input_ids" and "attention_mask".
inputs_onnx = dict(input_ids=input_ids, attention_mask=attention_mask)

output = model.run(None, inputs_onnx)
print(output)

In [None]:
# Keep only the first model output, as an independent array.
# Indexing before converting avoids stacking every element of the model.run()
# result into one array, which fails when a model returns several outputs of
# different shapes. np.array() copies by default, so `output` does not alias
# the original result list.
# NOTE(review): the first output is presumably the per-token tag scores with
# shape (sentences, tokens, tags) - the recorded run printed (1, 22, 16).
output = np.array(output[0])
print(output.shape)

(1, 22, 16)


In [None]:
# Prediction: for every position, take the index of the largest value along
# axis 2 of `output`.
preds = output.argmax(axis=2)
preds

array([[ 3,  0,  0,  1,  1,  1,  1,  0,  1,  1, 10,  2,  2,  1,  1,  4,
         0,  0,  0,  0,  0,  3]])

In [None]:
# Work around a bug in simpletransformers by rebuilding the label-id mask here.
#
# For every sentence this produces one row of length max_len in which
#   * 0 marks the first sub-token of each whitespace-separated word, and
#   * pad_token_label_id marks every other position (remaining sub-tokens of a
#     word, the slot before the first word, the slot after the last word, and
#     right padding).
# Later cells keep only the positions whose value is not pad_token_label_id.
#
# NOTE(review): words are tokenized one at a time here, while the model input
# was produced by encoding whole sentences. This assumes both yield the same
# token layout - verify for the tokenizer in use.

out_input_ids = inputs_onnx["input_ids"]
out_attention_mask = inputs_onnx["attention_mask"]

pad_token_label_id = -100
max_len = len(out_input_ids[0])

out_label_ids = []
for index, sentence in enumerate(to_predict):
    # Leading ignored slot (presumably for the start token added by the tokenizer).
    sentence_label_ids = [pad_token_label_id]

    for word in sentence.split():
        word_tokens = tokenizer.tokenize(word)
        # One real label for the word's first sub-token, ignored for the rest.
        sentence_label_ids.extend(
            [0] + [pad_token_label_id] * (len(word_tokens) - 1)
        )

    # Trailing ignored slot (presumably for the end token added by the tokenizer).
    sentence_label_ids.append(pad_token_label_id)

    # A row longer than the encoded input cannot be aligned with the model
    # output; fail with an explicit message instead of a reshape error.
    if len(sentence_label_ids) > max_len:
        raise ValueError(
            f"Sentence {index} yields {len(sentence_label_ids)} label positions "
            f"but the encoded input has only {max_len} tokens."
        )

    # Right-pad with the ignore id up to the encoded sequence length.
    sentence_label_ids.extend(
        [pad_token_label_id] * (max_len - len(sentence_label_ids))
    )
    out_label_ids.append(sentence_label_ids)

out_label_ids = np.array(out_label_ids).reshape(len(out_label_ids), max_len)

In [None]:
# Label map: class id (position in _POS_TAGS) -> tag string.
label_map = dict(enumerate(_POS_TAGS))

In [None]:
# Show the class-id -> tag mapping.
label_map

{0: 'NN',
 1: 'VV',
 2: 'PU',
 3: 'CC',
 4: 'PS',
 5: 'AX',
 6: 'AV',
 7: 'FX',
 8: 'NU',
 9: 'AJ',
 10: 'CL',
 11: 'PR',
 12: 'NG',
 13: 'PA',
 14: 'XX',
 15: 'IJ'}

In [None]:
# Map each word to its predicted POS/NER tag, as a dict.
#
# Step 1: per sentence, keep only the positions whose label id is not
# pad_token_label_id and translate both the label ids and the predicted class
# ids at those positions into tag strings.
out_label_list = []
preds_list = []
for row_idx, label_row in enumerate(out_label_ids):
    kept_cols = [
        col
        for col, label_id in enumerate(label_row)
        if label_id != pad_token_label_id
    ]
    out_label_list.append([label_map[label_row[col]] for col in kept_cols])
    preds_list.append([label_map[preds[row_idx][col]] for col in kept_cols])

# Step 2: pair every word with its tag. zip() stops at the shorter of the two
# sequences, so surplus words (or tags) are dropped.
# Note: `preds` is rebound here from the class-id array to a list of lists of
# {word: tag} dicts, so this cell cannot be re-run without first re-running the
# cell that computes the class ids.
preds = [
    [{word: tag} for word, tag in zip(sentence.split(), preds_list[row_idx])]
    for row_idx, sentence in enumerate(to_predict)
]

In [None]:
# Show the final result: for each sentence, a list of single-entry
# {word: tag} dicts.
preds

[[{'ไ่ก่': 'NN'},
  {'จิก': 'VV'},
  {'เด็ก': 'NN'},
  {'ตาย': 'VV'},
  {'คน': 'CL'},
  {'_': 'PU'},
  {'เกิด': 'VV'},
  {'บน': 'PS'},
  {'ปาก': 'NN'},
  {'โอ่ง': 'NN'}]]