### Evaluate

In [1]:
import os
import torch

In [2]:
# Path handed to `from_pretrained` for both the Wav2Vec2 processor and the CTC model.
model_path = "./model/wav2vec"

In [3]:
# GPU configuration.
#
# CUDA reads CUDA_VISIBLE_DEVICES when the runtime is first initialised, and
# torch.cuda.is_available() may trigger that initialisation. The variable is
# therefore exported *before* any torch.cuda query; setting it afterwards (as a
# consequence of the availability check) can leave the restriction without effect.
# Setting it on a machine without CUDA is harmless.
os.environ['CUDA_LAUNCH_BLOCKING'] = '0'
os.environ["CUDA_VISIBLE_DEVICES"] = '6,7'

# Fall back to CPU when no (visible) CUDA device is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
from transformers import Wav2Vec2Processor
from transformers import Wav2Vec2ForCTC

# Load the processor and the CTC model from the same checkpoint path.
processor = Wav2Vec2Processor.from_pretrained(model_path)

# Load the weights first, then move the model onto the selected device.
model = Wav2Vec2ForCTC.from_pretrained(model_path)
model = model.to(device)

  from .autonotebook import tqdm as notebook_tqdm
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
import soundfile as sf
from datasets import load_from_disk

In [6]:
import re
# Character class of punctuation removed from reference transcripts.
# Written as a raw string so the backslashes are passed to the regex engine
# verbatim instead of being parsed as (invalid) Python string escapes, which
# emit DeprecationWarning/SyntaxWarning on recent Python versions.
chars_to_ignore_regex = r'[\,\?\.\!\-\;\:\"]'

def map_to_result(batch):
    """Load one example's audio and normalise its reference transcript.

    Reads ``batch["path"]`` and ``batch["transcription"]``; writes
    ``batch["speech"]`` (the data returned by ``sf.read``) and
    ``batch["result"]`` (the cleaned transcript). Returns the same mapping.
    """
    # Audio: the second value returned by sf.read is discarded.
    speech, _ = sf.read(batch["path"])
    batch["speech"] = speech

    # Transcript: drop ignored punctuation, collapse whitespace runs to a
    # single space, then lower-case.
    stripped = re.sub(chars_to_ignore_regex, "", batch["transcription"])
    batch["result"] = " ".join(stripped.split()).lower()

    return batch

In [7]:
def prepare_example(batch):
    """Transcribe a batch of waveforms with the loaded CTC model.

    Intended for ``Dataset.map(..., batched=True)``.

    Args:
        batch: Mapping whose ``"speech"`` entry holds the waveforms to
            transcribe. The sampling rate is hard-coded to 16000 below;
            the audio is assumed to already be at that rate (not checked here).

    Returns:
        The same mapping with ``"predicted_text"`` set to the decoded
        transcripts, one per input waveform.
    """
    inputs = processor(batch["speech"], sampling_rate=16000, return_tensors="pt", padding="longest")
    input_values = inputs.input_values.to(device)

    # Forward the padding mask when the processor provides one, so that padded
    # samples in a batch are not treated as real audio. When the processor does
    # not emit a mask this stays None, which is the model's default.
    attention_mask = inputs.get("attention_mask")
    if attention_mask is not None:
        attention_mask = attention_mask.to(device)

    with torch.no_grad():
        logits = model(input_values, attention_mask=attention_mask).logits

    # Greedy decoding: most likely token per frame. argmax yields
    # non-negative indices only, so no -100 placeholder handling is needed.
    predicted_ids = torch.argmax(logits, dim=-1)
    batch["predicted_text"] = processor.tokenizer.batch_decode(predicted_ids, skip_special_tokens=True)

    return batch

In [8]:
import random
import pandas as pd
from IPython.display import display, HTML

# Helper that displays a random selection of dataset rows.
def show_random_elements(dataset, num_examples=7):
    """Render ``num_examples`` distinct, randomly chosen rows as an HTML table.

    Args:
        dataset: Object supporting ``len()`` and indexing with a list of
            integer positions, whose result ``pd.DataFrame`` can consume.
        num_examples: Number of distinct rows to show; must not exceed
            ``len(dataset)``.

    Raises:
        AssertionError: If ``num_examples`` is larger than the dataset.
        ValueError: If ``num_examples`` is negative (raised by ``random.sample``).
    """
    assert num_examples <= len(dataset)

    # random.sample draws without replacement, replacing the manual
    # draw-and-retry loop while still returning a list of unique indices.
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    display(HTML(df.to_html()))

In [9]:
from jiwer import wer

def eval_model(dataset_path):
    """Evaluate the loaded model on the dataset stored at ``dataset_path``.

    Loads the dataset from disk, prepares audio and reference text with
    ``map_to_result``, transcribes in batches of 16 with ``prepare_example``,
    displays five random rows, and prints the word error rate of the
    predictions against the references.
    """
    raw = load_from_disk(dataset_path)

    # Per-example preprocessing; the listed source columns are dropped.
    prepared = raw.map(map_to_result, remove_columns=["transcription", "audio", "path"])

    # Batched inference.
    scored = prepared.map(prepare_example, batched=True, batch_size=16)

    show_random_elements(scored, 5)

    references = scored["result"]
    hypotheses = scored["predicted_text"]
    print(wer(references, hypotheses))

In [10]:
# eval_model("./data/datasets/gaussian-10")

In [11]:
# eval_model("./data/datasets/gaussian-20")

In [12]:
# eval_model("./data/datasets/gaussian-30")

In [13]:
# eval_model("./data/datasets/gaussian-40")

0ex [00:00, ?ex/s]


LibsndfileError: Error opening './data/x_eval-gaussian-40/7294/86026/7294-86026-0001.flac': System error.

In [None]:
# eval_model("./data/datasets/gaussian-50")

In [None]:
# Evaluate the remaining "gaussian-<suffix>" datasets in one pass instead of
# one copy-pasted cell per dataset. The path is printed first so each table
# and WER value in the output can be attributed to its dataset.
for suffix in (60, 70, 80, 90, 100):
    dataset_path = f"./data/datasets/gaussian-{suffix}"
    print(dataset_path)
    eval_model(dataset_path)