### Evaluate

In [1]:
import os
import torch

In [2]:
model_path = "./model/wav2vec"

In [3]:
# GPU
os.environ['CUDA_LAUNCH_BLOCKING'] = '0'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if torch.cuda.is_available():
    os.environ["CUDA_VISIBLE_DEVICES"] = '6'

In [4]:
from transformers import Wav2Vec2Processor
from transformers import Wav2Vec2ForCTC

processor = Wav2Vec2Processor.from_pretrained(model_path)
model = Wav2Vec2ForCTC.from_pretrained(model_path).to(device)

  from .autonotebook import tqdm as notebook_tqdm
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
import soundfile as sf
from datasets import load_from_disk

dataset = load_from_disk("./data/datasets/eval")

In [6]:
import re
chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"]'

def map_to_result(batch):
    # print("batch path",batch["path"])
    # audio 처리
    batch["speech"], _ = sf.read(batch["path"])

    # 전사 데이터 처리
    result = re.sub(chars_to_ignore_regex, "", batch["transcription"])
    result = " ".join(result.split())
    result = result.lower() 
    batch["result"] = result

    return batch

In [7]:
dataset = dataset.map(map_to_result,remove_columns=["transcription","audio","path"])

11350ex [02:01, 93.73ex/s] 


In [8]:
dataset

Dataset({
    features: ['speech', 'result'],
    num_rows: 11350
})

In [9]:
def prepare_example(example):
    inputs = processor(example["speech"], sampling_rate=16000, return_tensors="pt", padding="longest")
    input_values = inputs.input_values.to(device)
    
    with torch.no_grad():
        logits = model(input_values).logits
        predicted_ids = torch.argmax(logits, dim=-1)
    
    predicted_ids[predicted_ids == -100] = processor.tokenizer.pad_token_id
    predicted_transcript = processor.tokenizer.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    
    example["predicted_text"] = predicted_transcript
    return example

In [10]:
def prepare_example(batch):
    inputs = processor(batch["speech"], sampling_rate=16000, return_tensors="pt", padding="longest")
    input_values = inputs.input_values.to(device)
    
    with torch.no_grad():
        logits = model(input_values).logits
        predicted_ids = torch.argmax(logits, dim=-1)
    
    predicted_ids[predicted_ids == -100] = processor.tokenizer.pad_token_id
    predicted_transcripts = processor.tokenizer.batch_decode(predicted_ids, skip_special_tokens=True)

    batch["predicted_text"] = predicted_transcripts
    
    return batch

In [11]:
dataset = dataset.map(prepare_example, batched=True, batch_size=16)

  0%|          | 0/710 [00:00<?, ?ba/s]2024-04-26 01:01:58.157615: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-26 01:01:58.186940: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-26 01:01:58.186966: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-26 01:01:58.186985: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-04-26 01:01:58.192966: I

In [15]:
print(dataset)

Dataset({
    features: ['speech', 'result', 'predicted_text'],
    num_rows: 11350
})


In [12]:
# 됐던거를 배치로 고침
# def prepare_example(batch):
#     inputs = processor(batch["speech"], sampling_rate=16000, return_tensors="pt", padding="longest")
#     input_values = inputs.input_values.to(device)
    
#     with torch.no_grad():
#         logits = model(input_values).logits
#         predicted_ids = torch.argmax(logits, dim=-1)
    
#     predicted_ids[predicted_ids == -100] = processor.tokenizer.pad_token_id
#     predicted_transcript = processor.tokenizer.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    
#     batch["predicted_text"] = predicted_transcript
#     return example

In [14]:
dataset = dataset.map(prepare_example)

213ex [00:27,  7.63ex/s]


KeyboardInterrupt: 

In [19]:
from jiwer import wer

print(wer(dataset["result"],dataset["predicted_text"]))

0.16804384485666105


In [None]:
# from jiwer import wer

# WER 계산 및 출력
# for example in dataset:
#     reference = example["result"].lower() 
#     predicted = example["predicted_text"].lower() 
#     print("Reference:", reference)
#     print("Predicted:", predicted)
#     print("WER:", wer(reference, predicted))
#     print("--")

In [None]:
# def prepare_example(batch):
#     batch["speech"], _ = sf.read(batch["path"])
#     result = batch["transcription"].translate(char_translations)
#     result = " ".join(result.split())  # clean up whitespaces
#     batch["result"] = result.lower()
#     return batch

In [None]:
# dataset = dataset.select(range(100)).map(prepare_example, remove_columns=["path","audio"])

# inputs = processor(dataset["speech"], sampling_rate=16000, return_tensors="pt", padding="longest")
# input_values = inputs.input_values.to(device)

# with torch.no_grad():
#   logits = model(input_values).to(device).logits
#   predicted_ids = torch.argmax(logits, dim=-1)

# predicted_ids[predicted_ids == -100] = processor.tokenizer.pad_token_id
# predicted_transcripts = processor.tokenizer.batch_decode(predicted_ids,skip_special_tokens=True)

In [None]:
# from jiwer import wer

# for reference, predicted in zip(dataset["result"], predicted_transcripts):
#     print("reference:", reference)
#     print("predicted:", predicted)
#     print("--")
#     print("WER:", wer(reference, predicted))