In [1]:
import torch
import librosa
from datasets import load_dataset
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
# --- Dataset settings ---
# Language configuration name handed to load_dataset.
LANG_ID = "ja"
# How many rows of the test split to pull (used in the split slice).
SAMPLES = 10

# --- Model settings ---
# Checkpoint identifier used for both the processor and the CTC model.
MODEL_ID = "jonatasgrosman/wav2vec2-large-xlsr-53-japanese"

In [9]:
# Load the first SAMPLES rows of the Common Voice 13.0 test split for LANG_ID.
# NOTE(review): trust_remote_code=True allows the dataset repository's own
# loading script to run locally — keep it only for sources you trust.
test_dataset = load_dataset(
    "mozilla-foundation/common_voice_13_0",
    LANG_ID,
    split=f"test[:{SAMPLES}]",
    trust_remote_code=True,
)

In [4]:
# Load the processor and the CTC model from the same checkpoint (MODEL_ID).
# NOTE(review): loading prints a warning that the checkpoint's
# pos_conv_embed.conv.weight_g / weight_v were not used and that
# parametrizations.weight.original0 / original1 were newly initialized.
# Presumably this is only the old vs. new weight-norm parameter naming —
# confirm the positional-convolution weights are really loaded before
# trusting the transcriptions.
processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)

Some weights of the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-japanese were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-japanese and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']

In [5]:
# Preprocessing the dataset.
# The audio files have to be read from disk as waveform arrays.
def speech_file_to_array_fn(batch, target_sr=16_000):
    """Read one example's audio file into an array and upper-case its transcript.

    Args:
        batch: A single dataset example (dict-like) with a "path" entry naming
            the audio file and a "sentence" entry holding the transcript.
        target_sr: Sample rate forwarded to ``librosa.load`` as ``sr``.
            Defaults to 16_000, the rate this notebook also passes to the
            processor.

    Returns:
        The same ``batch`` object, updated in place: "speech" holds the
        waveform returned by librosa and "sentence" is upper-cased.
    """
    # librosa.load returns (waveform, sample_rate); the rate is the one we
    # asked for, so it is discarded rather than bound to an unused name.
    waveform, _ = librosa.load(batch["path"], sr=target_sr)
    batch["speech"] = waveform
    # str.upper() only changes characters that have a case, so kana and kanji
    # pass through untouched; only embedded Latin letters are affected.
    batch["sentence"] = batch["sentence"].upper()
    return batch

In [10]:
# Run the per-example preprocessing so every row gains a "speech" waveform.
test_dataset = test_dataset.map(speech_file_to_array_fn)

# Turn the whole "speech" column into one padded batch of PyTorch tensors.
inputs = processor(
    test_dataset["speech"],
    sampling_rate=16_000,
    return_tensors="pt",
    padding=True,
)

Map: 100%|██████████| 10/10 [00:00<00:00, 10.40 examples/s]


In [11]:
# Inference only, so run the forward pass without gradient tracking.
with torch.no_grad():
    logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits

# Greedy decoding: pick the highest-scoring id along the last axis, then let
# the processor turn the id sequences back into text.
predicted_ids = torch.argmax(logits, dim=-1)
print("Predicted_ids:", predicted_ids)
predicted_sentences = processor.batch_decode(predicted_ids)

# Pair each prediction with its reference by reading the "sentence" column
# once. Indexing test_dataset[i] inside the loop would fetch a whole row
# (including its "speech" array) on every iteration just to read one string.
for reference, predicted_sentence in zip(test_dataset["sentence"], predicted_sentences):
    print("-" * 100)
    print("Reference:", reference)
    print("Prediction:", predicted_sentence)

Predicted_ids: tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])
----------------------------------------------------------------------------------------------------
Reference: この実写化は原作ファンも納得
Prediction: この実社カ原作山も納得
----------------------------------------------------------------------------------------------------
Reference: 祖母は、おおむね機嫌よく、サイコロをころがしている。
Prediction: 人母は重にきね起くさいがしている
----------------------------------------------------------------------------------------------------
Reference: 映画の見どころはと聞かれて全部としか言わない
Prediction: 映画の所はと聞かれて全部としか言わない
----------------------------------------------------------------------------------------------------
Reference: 財布をなくしたので、交番へ行きます。
Prediction: 財布をなく手さので勾番へ行きます
----------------------------------------------------------------------------------------------------
Reference: 背の高さは一七〇セ