# model import

In [10]:
from transformers import Wav2Vec2CTCTokenizer

tokenizer = Wav2Vec2CTCTokenizer("./vocab_jamos.json",
                                 unk_token="[UNK]",
                                 pad_token="[PAD]",
                                 word_delimiter_token="|")

In [11]:
from transformers import Wav2Vec2FeatureExtractor

feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1,
                                             sampling_rate=16000,
                                             padding_value=0.0,
                                             do_normalize=True,
                                             return_attention_mask=True)

In [12]:
from transformers import Wav2Vec2Processor

processor = Wav2Vec2Processor(feature_extractor=feature_extractor,
                              tokenizer=tokenizer)

In [25]:
from transformers import Wav2Vec2ForCTC

model = Wav2Vec2ForCTC.from_pretrained(
    ".\service_1\Assets\jamo_base_model",
    attention_dropout=0.1,
    hidden_dropout=0.1,
    feat_proj_dropout=0.0,
    mask_time_prob=0.05,
    layerdrop=0.1,
    ctc_loss_reduction="mean", 
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer)
)

In [26]:
# print(model)

# model predict

In [20]:
import librosa
import pandas as pd
import numpy as np
import torch
import re

In [28]:
array,_ = librosa.load('./service_1/Assets/test_data_1.wav',16000)
print(array.shape)
print(array.min(), array.max())

(66976,)
-0.12826538 0.11135864


In [29]:
array = processor(array, sampling_rate=16000).input_values[0]
print(array.shape)
print(array.min(), array.max())

(66976,)
-7.489833 6.522962


In [30]:
pred = model.forward(torch.from_numpy(array.reshape(1,-1)))

In [31]:
def pred_decode(pred):
    pred_logits = pred['logits'].detach().numpy()
    pred_ids = np.argmax(pred_logits, axis=-1)
    pred_str = processor.batch_decode(pred_ids)
    return pred_str

In [32]:
pred_str = ''.join(pred_decode(pred))
print(pred_str)

ㅂㅗ<pad> ㅇ<pad>ㅣㅆㄴ<pad>ㅡ<pad>ㄴ ㅇ<pad>ㅕㅇ<pad>ㅅ<pad>ㅏ<pad>ㅇ<pad> ㅈ<pad>ㅓ<pad>ㅇ<pad>ㅈ<pad>ㅣ<pad>ㅅ<pad>ㅣ<pad>ㅋ<pad>ㅕ<pad> ㅈㅝ


In [33]:
remove_pad_token = re.sub('<pad>','',pred_str)
print(remove_pad_token)

ㅂㅗ ㅇㅣㅆㄴㅡㄴ ㅇㅕㅇㅅㅏㅇ ㅈㅓㅇㅈㅣㅅㅣㅋㅕ ㅈㅝ


# to onnx

In [34]:

x = torch.randn(1,100000,requires_grad=True)
print(x.dtype)

torch.onnx.export(model,
                  x,
                  "./outputs/jamo_base_model.onnx",
                  input_names=["input"],
                  output_names=["output"],
                  dynamic_axes={
                      "input":{
                          0: "batch",
                          1: "time",
                      },
                      "output":{
                          0: "batch",
                          1: "seqeunce",
                      }
                  },
                  opset_version=11,
                  do_constant_folding=True,
                  )

torch.float32


  if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
  if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):


In [6]:
import onnxruntime
import onnx

In [7]:
sess_options = onnxruntime.SessionOptions()
sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
session = onnxruntime.InferenceSession("./outputs/jamo_base_model.onnx", sess_options)

In [8]:
array,_ = librosa.load('./service_1/Assets/test_data_1.wav',16000)
print(array.shape)
print(array.min(), array.max())

(66976,)
-0.12826538 0.11135864


In [13]:
array = processor(array, sampling_rate=16000).input_values[0]
print(array.shape)
print(array.min(), array.max())

(66976,)
-7.489833 6.522962


In [34]:
results = session.run(["output"],{"input":array.reshape(1,-1)})
# print(results)
print(results[0].shape)
print(np.argmax(results[0],axis=-1))

(1, 146, 111)
[[54 67  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  13 13 13 50 50 86 86  0  0  0  0  0  4  4 85 85  0 22  0  0  0  0  0 78
   0  0 67  0  0  0  0 85  0  0  8  0  4 20 20  0  0  0 67 69 69  0  0  0
   0 12 12  0  8  0  0  0 31  0 67  0  4  4  4 13 13 13 35  0  0  0  0  0
  40  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 52
   4  4]]


In [35]:
pred_ids = np.argmax(results[0],axis=-1)

In [36]:
import json

with open('vocab_jamos.json','r') as f:
    word_to_index = json.load(f)
print(word_to_index)    

{'ㅣ': 8, 'ㄶ': 9, 'V': 10, 'C': 11, 'ㅆ': 12, 'ㅇ': 13, 'k': 14, 't': 15, 'ㄼ': 16, 'ㅋ': 17, 'ㅚ': 18, 'ㄵ': 19, 'ㄴ': 20, 'ㅉ': 21, 'ㅜ': 22, 'b': 23, 'o': 24, 'B': 25, 'ㅛ': 26, 'v': 27, 'I': 28, 'i': 29, 'X': 30, 'ㄱ': 31, 'ㅄ': 32, 'ㅅ': 33, 'ㅠ': 34, 'ㅓ': 35, 'L': 36, 'm': 37, 'Z': 38, 'q': 39, 'ㄸ': 40, 'G': 41, 'ㅕ': 42, 'K': 43, 'd': 44, 'S': 45, 'Y': 46, 'M': 47, 'h': 48, 'w': 49, 'ㅡ': 50, 'ㅍ': 51, 'ㅐ': 52, 'j': 53, 'ㄷ': 54, 'ㄽ': 55, 'p': 56, 'ㄾ': 57, 'e': 58, 'N': 59, 'ㅞ': 60, 'x': 61, 'ㅒ': 62, 'ㅑ': 63, 'H': 64, 'r': 65, 'T': 66, 'ㅏ': 67, 'g': 68, 'ㄹ': 69, 'ㅀ': 70, 'ㄻ': 71, 'J': 72, 'u': 73, 'A': 74, 'ㄿ': 75, 'y': 76, 'F': 77, 'ㄲ': 78, 'c': 79, 'ㅔ': 80, 'ㅎ': 81, 'O': 82, 'ㅌ': 83, 'ㅢ': 84, 'ㅈ': 85, 'ㅁ': 86, 'ㅊ': 87, 'ㅙ': 88, 'E': 89, 'ㅖ': 90, 'P': 91, 'n': 92, 'Q': 93, 'l': 94, 'ㄳ': 95, 'ㅟ': 96, 'z': 97, 'ㅝ': 98, 'D': 99, 's': 100, 'ㅘ': 101, 'ㅃ': 102, 'R': 103, 'f': 104, 'a': 105, 'W': 106, 'ㅗ': 107, 'U': 108, 'ㅂ': 109, 'ㄺ': 110, '<pad>': 0, '<s>': 1, '</s>': 2, '<unk>': 3, '|': 4, '<b>': 5, 

In [37]:
index_to_word = {index:word for word,index in word_to_index.items()}
print(index_to_word)

{8: 'ㅣ', 9: 'ㄶ', 10: 'V', 11: 'C', 12: 'ㅆ', 13: 'ㅇ', 14: 'k', 15: 't', 16: 'ㄼ', 17: 'ㅋ', 18: 'ㅚ', 19: 'ㄵ', 20: 'ㄴ', 21: 'ㅉ', 22: 'ㅜ', 23: 'b', 24: 'o', 25: 'B', 26: 'ㅛ', 27: 'v', 28: 'I', 29: 'i', 30: 'X', 31: 'ㄱ', 32: 'ㅄ', 33: 'ㅅ', 34: 'ㅠ', 35: 'ㅓ', 36: 'L', 37: 'm', 38: 'Z', 39: 'q', 40: 'ㄸ', 41: 'G', 42: 'ㅕ', 43: 'K', 44: 'd', 45: 'S', 46: 'Y', 47: 'M', 48: 'h', 49: 'w', 50: 'ㅡ', 51: 'ㅍ', 52: 'ㅐ', 53: 'j', 54: 'ㄷ', 55: 'ㄽ', 56: 'p', 57: 'ㄾ', 58: 'e', 59: 'N', 60: 'ㅞ', 61: 'x', 62: 'ㅒ', 63: 'ㅑ', 64: 'H', 65: 'r', 66: 'T', 67: 'ㅏ', 68: 'g', 69: 'ㄹ', 70: 'ㅀ', 71: 'ㄻ', 72: 'J', 73: 'u', 74: 'A', 75: 'ㄿ', 76: 'y', 77: 'F', 78: 'ㄲ', 79: 'c', 80: 'ㅔ', 81: 'ㅎ', 82: 'O', 83: 'ㅌ', 84: 'ㅢ', 85: 'ㅈ', 86: 'ㅁ', 87: 'ㅊ', 88: 'ㅙ', 89: 'E', 90: 'ㅖ', 91: 'P', 92: 'n', 93: 'Q', 94: 'l', 95: 'ㄳ', 96: 'ㅟ', 97: 'z', 98: 'ㅝ', 99: 'D', 100: 's', 101: 'ㅘ', 102: 'ㅃ', 103: 'R', 104: 'f', 105: 'a', 106: 'W', 107: 'ㅗ', 108: 'U', 109: 'ㅂ', 110: 'ㄺ', 0: '<pad>', 1: '<s>', 2: '</s>', 3: '<unk>', 4: '|', 5: '<b>', 

In [38]:
pred_str = [index_to_word[idx] for idx in pred_ids.flatten()]
print(pred_str)

['ㄷ', 'ㅏ', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', 'ㅇ', 'ㅇ', 'ㅇ', 'ㅡ', 'ㅡ', 'ㅁ', 'ㅁ', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '|', '|', 'ㅈ', 'ㅈ', '<pad>', 'ㅜ', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', 'ㄲ', '<pad>', '<pad>', 'ㅏ', '<pad>', '<pad>', '<pad>', '<pad>', 'ㅈ', '<pad>', '<pad>', 'ㅣ', '<pad>', '|', 'ㄴ', 'ㄴ', '<pad>', '<pad>', '<pad>', 'ㅏ', 'ㄹ', 'ㄹ', '<pad>', '<pad>', '<pad>', '<pad>', 'ㅆ', 'ㅆ', '<pad>', 'ㅣ', '<pad>', '<pad>', '<pad>', 'ㄱ', '<pad>', 'ㅏ', '<pad>', '|', '|', '|', 'ㅇ', 'ㅇ', 'ㅇ', 'ㅓ', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', 'ㄸ', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pa

In [39]:
remove_pad_token = re.sub('<pad>','',''.join(pred_str))
print(remove_pad_token)

ㄷㅏㅇㅇㅇㅡㅡㅁㅁ||ㅈㅈㅜㄲㅏㅈㅣ|ㄴㄴㅏㄹㄹㅆㅆㅣㄱㅏ|||ㅇㅇㅇㅓㄸㅐ||


In [40]:
ctc = []
tmp = ""
for s in remove_pad_token:
    if s == '|':
        s = " "
    if s == tmp:
        continue
    else:
        ctc.append(s)
    tmp = s
print(ctc)

['ㄷ', 'ㅏ', 'ㅇ', 'ㅡ', 'ㅁ', ' ', 'ㅈ', 'ㅜ', 'ㄲ', 'ㅏ', 'ㅈ', 'ㅣ', ' ', 'ㄴ', 'ㅏ', 'ㄹ', 'ㅆ', 'ㅣ', 'ㄱ', 'ㅏ', ' ', 'ㅇ', 'ㅓ', 'ㄸ', 'ㅐ', ' ']


In [41]:
"".join(ctc)

'ㄷㅏㅇㅡㅁ ㅈㅜㄲㅏㅈㅣ ㄴㅏㄹㅆㅣㄱㅏ ㅇㅓㄸㅐ '

In [42]:
from unicode import join_jamos
join_jamos("".join(ctc).strip())

'다음 주까지 날씨가 어때'

In [43]:
array,_ = librosa.load('./dataset/audio/script1_g_0044-6002-01-01-KSM-F-05-A.wav',16000)
array.shape

(47040,)

In [44]:
array = processor(array, sampling_rate=16000).input_values[0]
results = session.run(["output"],{"input":array.reshape(1,-1)})
pred_ids = np.argmax(results[0],axis=-1)
pred_str = [index_to_word[idx] for idx in pred_ids.flatten()]
remove_pad_token = re.sub('<pad>','',''.join(pred_str))
ctc = []
tmp = ""
for s in remove_pad_token:
    if s == '|':
        s = " "
    if s == tmp:
        continue
    else:
        ctc.append(s)
    tmp = s
join_jamos("".join(ctc).strip())

'다음 주까지 날씨가 어때'

# quantization

In [45]:
from onnxruntime.quantization import quantize_dynamic, QuantType

In [None]:
quantize_dynamic("./outputs/jamo_base_model.onnx",
                 "./outputs/quantized_jamo_base_model.onnx",
                 )

In [47]:
sess_options = onnxruntime.SessionOptions()
sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
session = onnxruntime.InferenceSession("./outputs/quantized_jamo_base_model.onnx", sess_options)

In [48]:
array,_ = librosa.load('./service_1/Assets/test_data_1.wav',16000)
print(array.shape)
print(array.min(), array.max())

(66976,)
-0.12826538 0.11135864


In [49]:
array = processor(array, sampling_rate=16000).input_values[0]
results = session.run(["output"],{"input":array.reshape(1,-1)})
pred_ids = np.argmax(results[0],axis=-1)
pred_str = [index_to_word[idx] for idx in pred_ids.flatten()]
remove_pad_token = re.sub('<pad>','',''.join(pred_str))
ctc = []
tmp = ""
for s in remove_pad_token:
    if s == '|':
        s = " "
    if s == tmp:
        continue
    else:
        ctc.append(s)
    tmp = s
join_jamos("".join(ctc).strip())

'복 있는 영상 정지시켜 줘'