In [1]:
import numpy as np
import torch
import torch.nn as nn

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = "cpu"
dtype = torch.float32

In [3]:
# 1. 아스키 코드화
string = "a smile cat"
ascii_codes = np.array([ord(char) for char in string])
ascii_codes = torch.tensor(ascii_codes)
print(ascii_codes)  # 출력: [72 101 108 108 111]

tensor([ 97,  32, 115, 109, 105, 108, 101,  32,  99,  97, 116])


In [4]:
# 2. 나머지 '-1'로 채워서, [1,100] 사이즈의 tensor화
arr = torch.full((1, 100), -1)
arr[0, :ascii_codes.size(0)] = ascii_codes
print(arr.shape)
arr

torch.Size([1, 100])


tensor([[ 97,  32, 115, 109, 105, 108, 101,  32,  99,  97, 116,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,
          -1,  -1]])

In [5]:
# 3. 넣기
from simple_tokenizer import SimpleTokenizer as _Tokenizer
_tokenizer = _Tokenizer()

In [6]:
class TokenizerModule(nn.Module):
    def __init__(self, _tokenizer, device):
        super().__init__()
        self.tokenizer = _tokenizer
        
    def forward(self, x):
        texts = [''.join([chr(value) for value in x[x!=-1]])]
        print(texts)
        sot_token = self.tokenizer.encoder["<|startoftext|>"]       # 49407
        eot_token = self.tokenizer.encoder["<|endoftext|>"]         # 49406
        all_tokens = [[sot_token] + _tokenizer.encode(text) + [eot_token] for text in texts]
        print(all_tokens)
        result = torch.zeros(len(all_tokens), 77, dtype=torch.int)
        for i, tokens in enumerate(all_tokens):
            if len(tokens) > 77:
                tokens = tokens[:77]
                tokens[-1] = eot_token
            result[i, :len(tokens)] = torch.tensor(tokens)
        return result
tz = TokenizerModule(_tokenizer, device)
tz(arr)

['a smile cat']
[[49406, 320, 3490, 2368, 49407]]


tensor([[49406,   320,  3490,  2368, 49407,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0]], dtype=torch.int32)

-----
### Conversion

In [7]:
import onnx
import onnxruntime
from onnxruntime.quantization import QuantType
from onnxruntime.quantization.quantize import quantize_dynamic

In [8]:
[''.join([chr(value) for value in arr[arr!=-1]])]

['a smile cat']

In [24]:
class TokenizerModule(nn.Module):
    def __init__(self, _tokenizer, device):
        super().__init__()
        self.tokenizer = _tokenizer
        
    def forward(self, x):
        x = x[x != -1]
        llen = 77 - x.shape[0]
        result = torch.zeros(1, x.shape[0] + llen, dtype=torch.int64)
        re = self.tokenizer.encode(''.join([chr(_i) for _i in x]))
        result[0][0] = 49407
        for i in range(len(re)):
            result[0][i+1] = re[i]
        result[0][i+2] = 49406
        return result

 # onnx conversion
torch.onnx.export(
    model               =   TokenizerModule(_tokenizer, device),                            # 실행될 모델
    args                =   (arr),        # 모델 입력값(tuple or 여러 입력값)
    f                   =   './tokenizer.onnx',                     # 모델 저장 경로
    export_params       =   True,                 # 모델 파일 안에 학습된 모델 가중치 저장 여부
    opset_version       =   14,                   # 모델 변환할 때 사용할 onnx 버전
    do_constant_folding =   True,         # 최적화시 상수폴딩 사용할지 여부
    input_names     =   ['input'],
    output_names    =   ["output"],
    dynamic_axes    =   {
        'input'     : {0 : 'batch_size'},    # 가변적인 길이를 가진 차원
    }
) 
# model quantization
quantize_dynamic(
    model_input     =   './tokenizer.onnx', 
    model_output    =   './tokenizer_quant.onnx', 
    per_channel     =   False,
    reduce_range    =   False,
    weight_type     =   QuantType.QUInt8,
)

FileNotFoundError: [Errno 2] No such file or directory: '../onnx-modules/tokenizer/model.onnx'

---
#### ONNX-Runtime Test

In [181]:
import onnxruntime as ort

# Load the ONNX model
onnx_model_path = './tokenizer.onnx'
session = ort.InferenceSession(onnx_model_path)

# Print the input names and shapes
input_names = [input.name for input in session.get_inputs()]
output_names = [output.name for output in session.get_outputs()]

print("Input names:", input_names)
print("Output names:", output_names)

Input names: ['input']
Output names: ['output']


In [185]:
# test running
ort_inputs  = {'input': np.array(arr)}
ort_outputs = session.run(None, ort_inputs)
print(ort_outputs[0].shape)
print(ort_outputs[0])

(1, 77)
[[49407   320  3490  2368 49406     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0]]
