In [1]:
# symbol.txtファイルを読み込んで、_SPECIALSに記号を追加
# https://github.com/reazon-research/ReazonSpeech/blob/master/pkg/_v1/src/data/symbol.txt
with open('symbol.txt', 'r') as f:
    _SPECIALS = {ord(c.rstrip("\n")): "" for c in f.readlines()}

_HAN2ZEN = str.maketrans(
    "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789",
    "ａｂｃｄｅｆｇｈｉｊｋｌｍｎｏｐｑｒｓｔｕｖｗｘｙｚＡＢＣＤＥＦＧＨＩＪＫＬＭＮＯＰＱＲＳＴＵＶＷＸＹＺ０１２３４５６７８９")

def normalize(text):
    """Trim non-phonatory symbols in the text

    Args:
        text(str): A string to process

    Returns:
        A normalized string
    """
    text = text.replace('<sos/eos>', '')
    text = text.replace('<unk>', '')
    return text.translate(_SPECIALS).translate(_HAN2ZEN)


# cerを計測する関数を定義
import editdistance

def calculate_cer(reference_texts, predicted_texts):
    total_errors = 0
    total_chars = 0
    
    for ref, pred in zip(reference_texts, predicted_texts):
        # Levenshtein距離を使ってエディット距離を計算する
        levenshtein_distance = editdistance.eval(ref, normalize(pred))
        total_errors += levenshtein_distance
        total_chars += len(ref)
    
    # CERを計算する
    cer = total_errors / total_chars
    return cer

In [2]:
# データのダウンロード

import pandas as pd
from pathlib import Path

data_dir = Path('/root/datadrive/TEDxJP-10K_v1.1')
wav_dir = data_dir / 'test_dump'
id2text = pd.read_csv(data_dir / 'text', sep=' ', header=None, names=['id', 'text']).set_index('id')['text'].to_dict()
file_id_list = list(id2text.keys())
paths2audio_files = [wav_dir / f'{file_id}.wav' for file_id in file_id_list]

In [3]:
import soundfile as sf
df = pd.read_csv(data_dir / 'text', sep=' ', header=None, names=['id', 'text'])

def calc_duration(file_id):
    wav_path = wav_dir / f'{file_id}.wav'
    audio, sr = sf.read(wav_path)
    return len(audio) / sr

df['id'].map(lambda x: wav_dir / f'{x}.wav')

0       /root/datadrive/TEDxJP-10K_v1.1/test_dump/-6K2...
1       /root/datadrive/TEDxJP-10K_v1.1/test_dump/-6K2...
2       /root/datadrive/TEDxJP-10K_v1.1/test_dump/-6K2...
3       /root/datadrive/TEDxJP-10K_v1.1/test_dump/-6K2...
4       /root/datadrive/TEDxJP-10K_v1.1/test_dump/-6K2...
                              ...                        
9911    /root/datadrive/TEDxJP-10K_v1.1/test_dump/zwW9...
9912    /root/datadrive/TEDxJP-10K_v1.1/test_dump/zwW9...
9913    /root/datadrive/TEDxJP-10K_v1.1/test_dump/zwW9...
9914    /root/datadrive/TEDxJP-10K_v1.1/test_dump/zwW9...
9915    /root/datadrive/TEDxJP-10K_v1.1/test_dump/zwW9...
Name: id, Length: 9916, dtype: object

In [5]:
df['duration'] = df['id'].map(calc_duration)
df['duration'].sum() / 3600

8.820502934027777

In [None]:
import soundfile as sf
for path in paths2audio_files:
    data, sr = sf.read(path)
    duration = len(data) / sr

# ReazonSpeech (NeMo)

```shell
!pip install Cython
!pip install nemo_toolkit['asr']
```

In [3]:
from nemo.collections.asr.models import EncDecRNNTBPEModel
model = EncDecRNNTBPEModel.restore_from('/root/datadrive/reazonspeech-nemo-v2/reazonspeech-nemo-v2.nemo')

[NeMo I 2024-05-31 12:52:14 mixins:172] Tokenizer SentencePieceTokenizer initialized with 3000 tokens


[NeMo W 2024-05-31 12:52:15 modelPT:165] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: dataset/train.json
    sample_rate: 16000
    batch_size: 32
    shuffle: true
    num_workers: 8
    pin_memory: true
    max_duration: 30
    min_duration: 0.1
    use_start_end_token: false
    trim_silence: false
    is_tarred: false
    tarred_audio_filepaths: null
    shuffle_n: 2048
    bucketing_strategy: fully_randomized
    bucketing_batch_size: null
    
[NeMo W 2024-05-31 12:52:15 modelPT:172] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
    manifest_filepath: dataset/valid.json
    sample_rate: 16000
    batch_size: 16
    shuffle: false

[NeMo I 2024-05-31 12:52:15 features:289] PADDING: 0
[NeMo I 2024-05-31 12:52:18 rnnt_models:217] Using RNNT Loss : warprnnt_numba
    Loss warprnnt_numba_kwargs: {'fastemit_lambda': 0.0, 'clamp': -1.0}
[NeMo I 2024-05-31 12:52:20 save_restore_connector:249] Model EncDecRNNTBPEModel was successfully restored from /root/datadrive/reazonspeech-nemo-v2/reazonspeech-nemo-v2.nemo.


In [12]:
import random
import pandas as pd
from pathlib import Path

data_dir = Path('/root/datadrive/TEDxJP-10K_v1.1')
wav_dir = data_dir / 'test_dump'
id2text = pd.read_csv(data_dir / 'text', sep=' ', header=None, names=['id', 'text']).set_index('id')['text'].to_dict()

# 10個サンプルする
file_id_list = random.sample(id2text.keys(), 10)

paths2audio_files = [str(wav_dir / f'{file_id}.wav') for file_id in file_id_list]

    since Python 3.9 and will be removed in a subsequent version.
      file_id_list = random.sample(id2text.keys(), 10)
    


In [18]:
from pprint import pprint
import time
tic = time.perf_counter()
asr_text = model.transcribe(paths2audio_files=paths2audio_files, batch_size=1, return_hypotheses=False)[0]
toc = time.perf_counter() - tic
print(f"transcribe time: {toc:.2f}s")
gt_text = [id2text[file_id] for file_id in file_id_list]
cer = calculate_cer(gt_text, asr_text)
pprint(asr_text)
print(cer)

Transcribing:   0%|          | 0/10 [00:00<?, ?it/s]

Beam search progress:: 100%|██████████| 1/1 [00:00<00:00, 12.37sample/s]
Beam search progress:: 100%|██████████| 1/1 [00:00<00:00, 20.36sample/s]
Beam search progress:: 100%|██████████| 1/1 [00:00<00:00, 16.29sample/s]
Beam search progress:: 100%|██████████| 1/1 [00:00<00:00, 24.62sample/s]
Beam search progress:: 100%|██████████| 1/1 [00:00<00:00, 37.98sample/s]
Beam search progress:: 100%|██████████| 1/1 [00:00<00:00,  8.63sample/s]
Beam search progress:: 100%|██████████| 1/1 [00:00<00:00,  8.57sample/s]
Beam search progress:: 100%|██████████| 1/1 [00:00<00:00, 11.63sample/s]
Beam search progress:: 100%|██████████| 1/1 [00:00<00:00, 15.75sample/s]
Beam search progress:: 100%|██████████| 1/1 [00:00<00:00, 42.66sample/s]

transcribe time: 1.32s
['使い古された技術を使った、',
 '一つの国家に変わってしまうと。',
 '俺しゃべり下手やしな。',
 'コミュニケーションも下手で。',
 '一方で、',
 '本当にだんだん体が動かなくなって。',
 'それと同時に、貧しさや差別。',
 '学校にはですね机と椅子がありませんでした。',
 'な部分があったりとかって結構しますね。',
 '登ったり。']
0.29605263157894735





# Kotoba-Whisper

In [19]:
import torch
from transformers import pipeline
from datasets import load_dataset

# config
model_id = "kotoba-tech/kotoba-whisper-v1.0"
torch_dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
device = "cuda:0" if torch.cuda.is_available() else "cpu"
model_kwargs = {"attn_implementation": "sdpa"} if torch.cuda.is_available() else {}
generate_kwargs = {"language": "japanese", "task": "transcribe"}

# load model
pipe = pipeline(
    "automatic-speech-recognition",
    model=model_id,
    torch_dtype=torch_dtype,
    device=device,
    model_kwargs=model_kwargs
)

asr_text = []
tic = time.perf_counter()
for sample in paths2audio_files:
    # run inference
    result = pipe(sample, generate_kwargs=generate_kwargs)
    asr_text.append(result["text"])
toc = time.perf_counter() - tic
print(f"transcribe time: {toc:.2f}s")
pprint(asr_text)
cer = calculate_cer(gt_text, asr_text)
cer



Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


transcribe time: 2.15s
['使い古された技術を使った',
 '一つの国家に変えてしまうと',
 'でもな俺しゃべり下手やしな',
 'コミュニケーションも下手で',
 'でも一方で',
 'そうしても本当にもうだんだん体がない動かなくなって',
 'それと同時に貧しさや差別',
 'この学校には机と椅子がありませんでした',
 '根拠に曖昧な部分があったりとかって結構しまして',
 '山に登ったり']


0.19078947368421054

In [6]:
import torch
from transformers import pipeline
from datasets import load_dataset

# config
model_id = "kotoba-tech/kotoba-whisper-v1.0"
torch_dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
device = "cuda:0" if torch.cuda.is_available() else "cpu"
model_kwargs = {"attn_implementation": "sdpa"} if torch.cuda.is_available() else {}
generate_kwargs = {"language": "japanese", "task": "transcribe"}

# load model
pipe = pipeline(
    "automatic-speech-recognition",
    model=model_id,
    torch_dtype=torch_dtype,
    device=device,
    model_kwargs=model_kwargs
)

config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.03G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.90k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


preprocessor_config.json:   0%|          | 0.00/340 [00:00<?, ?B/s]

In [None]:
generate_kwargs = {"language": "japanese", "task": "transcribe", "return_timestamp": True}
sample = paths2audio_files[0]
pipe(sample, generate_kwargs=generate_kwargs)

# Nue-ASR

In [20]:
!pip install -q git+https://github.com/rinnakk/nue-asr.git
!pip install deepspeed

[0mCollecting deepspeed
  Downloading deepspeed-0.14.2.tar.gz (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting hjson
  Downloading hjson-3.1.0-py3-none-any.whl (54 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.0/54.0 KB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ninja
  Downloading ninja-1.11.1.1-py2.py3-none-manylinux1_x86_64.manylinux_2_5_x86_64.whl (307 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m307.2/307.2 KB[0m [31m25.8 MB/s[0m eta [36m0:00:00[0m
Collecting py-cpuinfo
  Downloading py_cpuinfo-9.0.0-py3-none-any.whl (22 kB)
Collecting pydantic
  Downloading pydantic-2.7.2-py3-none-any.whl (409 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m409.5/409.5 KB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollect

In [3]:
import nue_asr
import time
from pprint import pprint
from pathlib import Path

paths2audio_files = ['/root/datadrive/TEDxJP-10K_v1.1/test_dump/Tc2--M7NQrA-00096719-00096990.wav', '/root/datadrive/TEDxJP-10K_v1.1/test_dump/saHZ4bN3h28-00039574-00039725.wav', '/root/datadrive/TEDxJP-10K_v1.1/test_dump/KpRpQsojKgc-00083461-00083799.wav', '/root/datadrive/TEDxJP-10K_v1.1/test_dump/sA4Cj96KMi0-00033229-00033405.wav', '/root/datadrive/TEDxJP-10K_v1.1/test_dump/IBrUxfKQdEA-00014000-00014147.wav', '/root/datadrive/TEDxJP-10K_v1.1/test_dump/cZy6z806Lyg-00100804-00101180.wav', '/root/datadrive/TEDxJP-10K_v1.1/test_dump/Yb04vLKSvxQ-00066642-00067042.wav', '/root/datadrive/TEDxJP-10K_v1.1/test_dump/VhFMEJhnTNk-00035949-00036235.wav', '/root/datadrive/TEDxJP-10K_v1.1/test_dump/d0DrguC0Flc-00060864-00061250.wav', '/root/datadrive/TEDxJP-10K_v1.1/test_dump/0jJLyvNn_to-00036300-00036412.wav']
file_id_list = [Path(p).stem for p in paths2audio_files]
gt_text = [id2text[file_id] for file_id in file_id_list]


model = nue_asr.load_model("rinna/nue-asr")
tokenizer = nue_asr.load_tokenizer("rinna/nue-asr")

asr_text = []
tic = time.perf_counter()
for wav_file in paths2audio_files:
    result = nue_asr.transcribe(model, tokenizer, wav_file)
    asr_text.append(result.text)
toc = time.perf_counter() - tic

print(f"transcribe time: {toc:.2f}s")
pprint(asr_text)
cer = calculate_cer(gt_text, asr_text)
print(cer)

Some weights of the model checkpoint at rinna/nue-asr were not used when initializing NueASRModel: ['audio_encoder.encoder.pos_conv_embed.conv.weight_g', 'audio_encoder.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing NueASRModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NueASRModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of NueASRModel were not initialized from the model checkpoint at rinna/nue-asr and are newly initialized: ['audio_encoder.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'audio_encoder.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream tas

transcribe time: 5.38s
['使い古された技術を使った。',
 '1つの国家に変えてしまおうと。',
 'でもな俺しゃべり下手やしな。',
 'コミュニケーションも下手で。',
 'でも一方で。',
 'するともう本当にだんだん体が動かなくなって。',
 'それと同時に貧しさや差別。',
 '学校には机と椅子がありませんでした。',
 '曖昧な部分があったりとかって結構しますね。',
 '山に登ったり。']
0.21052631578947367


In [9]:
sum([p.numel() for p in model.parameters()]) / 1e6 * 2

7417.005312

In [8]:
next(iter(model.parameters())).data.dtype

torch.float16

In [27]:
del pipe
del model
import torch
torch.cuda.empty_cache()

NameError: name 'pipe' is not defined

In [26]:
!nvidia-smi

Fri May 31 00:13:45 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.171.04             Driver Version: 535.171.04   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 3060        Off | 00000000:01:00.0  On |                  N/A |
|  0%   49C    P3              28W / 170W |  10058MiB / 12288MiB |     40%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [28]:
print(paths2audio_files)

['/root/datadrive/TEDxJP-10K_v1.1/test_dump/Tc2--M7NQrA-00096719-00096990.wav', '/root/datadrive/TEDxJP-10K_v1.1/test_dump/saHZ4bN3h28-00039574-00039725.wav', '/root/datadrive/TEDxJP-10K_v1.1/test_dump/KpRpQsojKgc-00083461-00083799.wav', '/root/datadrive/TEDxJP-10K_v1.1/test_dump/sA4Cj96KMi0-00033229-00033405.wav', '/root/datadrive/TEDxJP-10K_v1.1/test_dump/IBrUxfKQdEA-00014000-00014147.wav', '/root/datadrive/TEDxJP-10K_v1.1/test_dump/cZy6z806Lyg-00100804-00101180.wav', '/root/datadrive/TEDxJP-10K_v1.1/test_dump/Yb04vLKSvxQ-00066642-00067042.wav', '/root/datadrive/TEDxJP-10K_v1.1/test_dump/VhFMEJhnTNk-00035949-00036235.wav', '/root/datadrive/TEDxJP-10K_v1.1/test_dump/d0DrguC0Flc-00060864-00061250.wav', '/root/datadrive/TEDxJP-10K_v1.1/test_dump/0jJLyvNn_to-00036300-00036412.wav']
