In [5]:
from transformers import Wav2Vec2Processor

processor = Wav2Vec2Processor.from_pretrained("slplab/wav2vec2-xls-r-300m-japanese-hiragana")

vocab_size = processor.tokenizer.vocab_size

vocab_size

preprocessor_config.json:   0%|          | 0.00/256 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/278 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/96.0 [00:00<?, ?B/s]

122

In [6]:
from transformers import HubertForCTC

model = HubertForCTC.from_pretrained('./hubert_ASR/checkpoint-17000/')

In [7]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

HubertForCTC(
  (hubert): HubertModel(
    (feature_extractor): HubertFeatureEncoder(
      (conv_layers): ModuleList(
        (0): HubertLayerNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (1-4): 4 x HubertLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (5-6): 2 x HubertLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): HubertFeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projection): Linear(in

In [60]:
model.config.output_hidden_states=True

with torch.no_grad():
    output = model(torch.rand(1, 16000).to(device))
len(output.hidden_states)
# 跟 HubertModel一樣

25

In [9]:
from datasets import load_dataset
ds = load_dataset("reazon-research/reazonspeech", "tiny", trust_remote_code=True) # smail
# ds in ~/.cache/huggingface/datasets 
# rm -rf ~/.cache/huggingface/
ds

Downloading builder script:   0%|          | 0.00/4.74k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/2.27k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/521k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/605M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['name', 'audio', 'transcription'],
        num_rows: 5323
    })
})

In [45]:
split_datasets = ds["train"].train_test_split(test_size=0.99)

train_dataset = split_datasets["train"]
test_dataset = split_datasets["test"]

train_dataset[0]

{'name': '000/5c02961e1ac71.flac',
 'audio': {'path': '/root/.cache/huggingface/datasets/downloads/extracted/aa46595380aebad1bf1a98a4aaaa63bbd328fec667ea8840e90dc8254550c614/000/5c02961e1ac71.flac',
  'array': array([0.00735474, 0.00683594, 0.00674438, ..., 0.01919556, 0.01898193,
         0.02380371]),
  'sampling_rate': 16000},
 'transcription': 'あのときのこと覚えてる？'}

In [11]:
import re
import MeCab
import pykakasi

CHARS_TO_IGNORE = [",", "?", "¿", ".", "!", "¡", ";", "；", ":", '""', "%", '"', "�", "ʿ", "·", "჻", "~", "՞",
          "؟", "،", "।", "॥", "«", "»", "„", "“", "”", "「", "」", "‘", "’", "《", "》", "(", ")", "[", "]",
          "{", "}", "=", "`", "_", "+", "<", ">", "…", "–", "°", "´", "ʾ", "‹", "›", "©", "®", "—", "→", "。",
          "、", "﹂", "﹁", "‧", "～", "﹏", "，", "｛", "｝", "（", "）", "［", "］", "【", "】", "‥", "〽",
          "『", "』", "〝", "〟", "⟨", "⟩", "〜", "：", "！", "？", "♪", "؛", "/", "\\", "º", "−", "^", "'", "ʻ", "ˆ"]
chars_to_ignore_regex = f"[{re.escape(''.join(CHARS_TO_IGNORE))}]"

wakati = MeCab.Tagger("-Owakati")
kakasi = pykakasi.kakasi()
kakasi.setMode("J","H")
kakasi.setMode("K","H")
kakasi.setMode("r","Hepburn")
conv = kakasi.getConverter()

def prepare_char(batch):
    batch["transcription"] = conv.do(wakati.parse(batch["transcription"]).strip())
    batch["transcription"] = re.sub(chars_to_ignore_regex,'', batch["transcription"]).strip()
    return batch

  kakasi.setMode("J","H")
  kakasi.setMode("K","H")
  kakasi.setMode("r","Hepburn")
  conv = kakasi.getConverter()


In [13]:
def prepare_dataset(batch):
    audio = batch["audio"]

    # batched output is "un-batched" to ensure mapping is correct
    batch["input_values"] = processor(audio["array"], sampling_rate=audio["sampling_rate"]).input_values[0]

    with processor.as_target_processor():
        batch["labels"] = processor(batch["transcription"]).input_ids
    return batch


In [46]:
eval_dataset = test_dataset.map(prepare_char, num_proc=4)
prepare_eval_dataset = eval_dataset.map(prepare_dataset, remove_columns=train_dataset.column_names, num_proc=4)
prepare_eval_dataset

Map (num_proc=4):   0%|          | 0/5270 [00:00<?, ? examples/s]

  batch["transcription"] = conv.do(wakati.parse(batch["transcription"]).strip())
  batch["transcription"] = conv.do(wakati.parse(batch["transcription"]).strip())
  batch["transcription"] = conv.do(wakati.parse(batch["transcription"]).strip())
  batch["transcription"] = conv.do(wakati.parse(batch["transcription"]).strip())


Map (num_proc=4):   0%|          | 0/5270 [00:00<?, ? examples/s]



Dataset({
    features: ['input_values', 'labels'],
    num_rows: 5270
})

In [47]:
from evaluate import load

wer = load("wer")
cer = load("cer")

def evaluate(batch):
    pred_strings = []
    for audio in batch["audio"]:
        inputs = processor(audio["array"], sampling_rate=16000, return_tensors="pt", padding=True)
        inputs = inputs.to(device)
        with torch.no_grad():
            logits = model(inputs.input_values).logits
        pred_ids = torch.argmax(logits, dim=-1)

        pred_ids[pred_ids == -100] = processor.tokenizer.pad_token_id

        pred_string = processor.batch_decode(pred_ids)

        pred_strings.extend(pred_string)

    batch["pred_strings"] = pred_strings
    return batch

columns_to_remove = [column for column in eval_dataset.column_names if column != "transcription"]
result = eval_dataset.map(evaluate, remove_columns=columns_to_remove, batched=True)

wer_result = wer.compute(predictions=result["pred_strings"], references=result["transcription"])
cer_result = cer.compute(predictions=result["pred_strings"], references=result["transcription"])

print("WER: {:2f}%".format(100 * wer_result))
print("CER: {:2f}%".format(100 * cer_result))

Map:   0%|          | 0/5270 [00:00<?, ? examples/s]

WER: 31.953978%
CER: 16.177097%
