In [23]:
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## Create Wav2Vec2CTCTokenizer

In [31]:
from datasets import load_dataset
import pandas as pd
import numpy as np

In [32]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [33]:
# huggingface dataset
all_data = load_dataset('csv',data_files='/content/drive/MyDrive/Final/csvfile/KsponSpeech_0002.csv',split='train')

Using custom data configuration default-fffd43577fe70fb3


Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-fffd43577fe70fb3/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-fffd43577fe70fb3/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58. Subsequent calls will reuse this data.


#자음모음으로 변환

In [34]:
__all__ = ["split_syllable_char", "split_syllables",
           "join_jamos", "join_jamos_char",
           "CHAR_INITIALS", "CHAR_MEDIALS", "CHAR_FINALS"]

import itertools

INITIAL = 0x001
MEDIAL = 0x010
FINAL = 0x100
CHAR_LISTS = {
    INITIAL: list(map(chr, [
        0x3131, 0x3132, 0x3134, 0x3137, 0x3138, 0x3139,
        0x3141, 0x3142, 0x3143, 0x3145, 0x3146, 0x3147,
        0x3148, 0x3149, 0x314a, 0x314b, 0x314c, 0x314d,
        0x314e
    ])),
    MEDIAL: list(map(chr, [
        0x314f, 0x3150, 0x3151, 0x3152, 0x3153, 0x3154,
        0x3155, 0x3156, 0x3157, 0x3158, 0x3159, 0x315a,
        0x315b, 0x315c, 0x315d, 0x315e, 0x315f, 0x3160,
        0x3161, 0x3162, 0x3163
    ])),
    FINAL: list(map(chr, [
        0x3131, 0x3132, 0x3133, 0x3134, 0x3135, 0x3136,
        0x3137, 0x3139, 0x313a, 0x313b, 0x313c, 0x313d,
        0x313e, 0x313f, 0x3140, 0x3141, 0x3142, 0x3144,
        0x3145, 0x3146, 0x3147, 0x3148, 0x314a, 0x314b,
        0x314c, 0x314d, 0x314e
    ]))
}
CHAR_INITIALS = CHAR_LISTS[INITIAL]
CHAR_MEDIALS = CHAR_LISTS[MEDIAL]
CHAR_FINALS = CHAR_LISTS[FINAL]
CHAR_SETS = {k: set(v) for k, v in CHAR_LISTS.items()}
CHARSET = set(itertools.chain(*CHAR_SETS.values()))
CHAR_INDICES = {k: {c: i for i, c in enumerate(v)}
                for k, v in CHAR_LISTS.items()}


def is_hangul_syllable(c):
    return 0xac00 <= ord(c) <= 0xd7a3  # Hangul Syllables


def is_hangul_jamo(c):
    return 0x1100 <= ord(c) <= 0x11ff  # Hangul Jamo


def is_hangul_compat_jamo(c):
    return 0x3130 <= ord(c) <= 0x318f  # Hangul Compatibility Jamo


def is_hangul_jamo_exta(c):
    return 0xa960 <= ord(c) <= 0xa97f  # Hangul Jamo Extended-A


def is_hangul_jamo_extb(c):
    return 0xd7b0 <= ord(c) <= 0xd7ff  # Hangul Jamo Extended-B


def is_hangul(c):
    return (is_hangul_syllable(c) or
            is_hangul_jamo(c) or
            is_hangul_compat_jamo(c) or
            is_hangul_jamo_exta(c) or
            is_hangul_jamo_extb(c))


def is_supported_hangul(c):
    return is_hangul_syllable(c) or is_hangul_compat_jamo(c)


def check_hangul(c, jamo_only=False):
    if not ((jamo_only or is_hangul_compat_jamo(c)) or is_supported_hangul(c)):
        raise ValueError(f"'{c}' is not a supported hangul character. "
                         f"'Hangul Syllables' (0xac00 ~ 0xd7a3) and "
                         f"'Hangul Compatibility Jamos' (0x3130 ~ 0x318f) are "
                         f"supported at the moment.")


def get_jamo_type(c):
    check_hangul(c)
    assert is_hangul_compat_jamo(c), f"not a jamo: {ord(c):x}"
    return sum(t for t, s in CHAR_SETS.items() if c in s)


def split_syllable_char(c):
    """
    Splits a given korean syllable into its components. Each component is
    represented by Unicode in 'Hangul Compatibility Jamo' range.

    Arguments:
        c: A Korean character.

    Returns:
        A triple (initial, medial, final) of Hangul Compatibility Jamos.
        If no jamo corresponds to a position, `None` is returned there.

    Example:
        >>> split_syllable_char("안")
        ("ㅇ", "ㅏ", "ㄴ")
        >>> split_syllable_char("고")
        ("ㄱ", "ㅗ", None)
        >>> split_syllable_char("ㅗ")
        (None, "ㅗ", None)
        >>> split_syllable_char("ㅇ")
        ("ㅇ", None, None)
    """
    check_hangul(c)
    if len(c) != 1:
        raise ValueError("Input string must have exactly one character.")

    init, med, final = None, None, None
    if is_hangul_syllable(c):
        offset = ord(c) - 0xac00
        x = (offset - offset % 28) // 28
        init, med, final = x // 21, x % 21, offset % 28
        if not final:
            final = None
        else:
            final -= 1
    else:
        pos = get_jamo_type(c)
        if pos & INITIAL == INITIAL:
            pos = INITIAL
        elif pos & MEDIAL == MEDIAL:
            pos = MEDIAL
        elif pos & FINAL == FINAL:
            pos = FINAL
        idx = CHAR_INDICES[pos][c]
        if pos == INITIAL:
            init = idx
        elif pos == MEDIAL:
            med = idx
        else:
            final = idx
    return tuple(CHAR_LISTS[pos][idx] if idx is not None else None
                 for pos, idx in
                 zip([INITIAL, MEDIAL, FINAL], [init, med, final]))


def split_syllables(s, ignore_err=True, pad=None):
    """
    Performs syllable-split on a string.

    Arguments:
        s (str): A string (possibly mixed with non-Hangul characters).
        ignore_err (bool): If set False, it ensures that all characters in
            the string are Hangul-splittable and throws a ValueError otherwise.
            (default: True)
        pad (str): Pad empty jamo positions (initial, medial, or final) with
            `pad` character. This is useful for cases where fixed-length
            strings are needed. (default: None)

    Returns:
        Hangul-split string

    Example:
        >>> split_syllables("안녕하세요")
        "ㅇㅏㄴㄴㅕㅇㅎㅏㅅㅔㅇㅛ"
        >>> split_syllables("안녕하세요~~", ignore_err=False)
        ValueError: encountered an unsupported character: ~ (0x7e)
        >>> split_syllables("안녕하세요ㅛ", pad="x")
        'ㅇㅏㄴㄴㅕㅇㅎㅏxㅅㅔxㅇㅛxxㅛx'
    """

    def try_split(c):
        try:
            return split_syllable_char(c)
        except ValueError:
            if ignore_err:
                return (c,)
            raise ValueError(f"encountered an unsupported character: "
                             f"{c} (0x{ord(c):x})")

    s = map(try_split, s)
    if pad is not None:
        tuples = map(lambda x: tuple(pad if y is None else y for y in x), s)
    else:
        tuples = map(lambda x: filter(None, x), s)
    return "".join(itertools.chain(*tuples))


def join_jamos_char(init, med, final=None):
    """
    Combines jamos into a single syllable.

    Arguments:
        init (str): Initial jao.
        med (str): Medial jamo.
        final (str): Final jamo. If not supplied, the final syllable is made
            without the final. (default: None)

    Returns:
        A Korean syllable.
    """
    chars = (init, med, final)
    for c in filter(None, chars):
        check_hangul(c, jamo_only=True)

    idx = tuple(CHAR_INDICES[pos][c] if c is not None else c
                for pos, c in zip((INITIAL, MEDIAL, FINAL), chars))
    init_idx, med_idx, final_idx = idx
    # final index must be shifted once as
    # final index with 0 points to syllables without final
    final_idx = 0 if final_idx is None else final_idx + 1
    return chr(0xac00 + 28 * 21 * init_idx + 28 * med_idx + final_idx)


def join_jamos(s, ignore_err=True):
    """
    Combines a sequence of jamos to produce a sequence of syllables.

    Arguments:
        s (str): A string (possible mixed with non-jamo characters).
        ignore_err (bool): If set False, it will ensure that all characters
            will be consumed for the making of syllables. It will throw a
            ValueError when it fails to do so. (default: True)

    Returns:
        A string

    Example:
        >>> join_jamos("ㅇㅏㄴㄴㅕㅇㅎㅏㅅㅔㅇㅛ")
        "안녕하세요"
        >>> join_jamos("ㅇㅏㄴㄴㄴㅕㅇㅎㅏㅅㅔㅇㅛ")
        "안ㄴ녕하세요"
        >>> join_jamos()
    """
    last_t = 0
    queue = []
    new_string = ""

    def flush(n=0):
        new_queue = []
        while len(queue) > n:
            new_queue.append(queue.pop())
        if len(new_queue) == 1:
            if not ignore_err:
                raise ValueError(f"invalid jamo character: {new_queue[0]}")
            result = new_queue[0]
        elif len(new_queue) >= 2:
            try:
                result = join_jamos_char(*new_queue)
            except (ValueError, KeyError):
                # Invalid jamo combination
                if not ignore_err:
                    raise ValueError(f"invalid jamo characters: {new_queue}")
                result = "".join(new_queue)
        else:
            result = None
        return result

    for c in s:
        if c not in CHARSET:
            if queue:
                new_c = flush() + c
            else:
                new_c = c
            last_t = 0
        else:
            t = get_jamo_type(c)
            new_c = None
            if t & FINAL == FINAL:
                if not (last_t == MEDIAL):
                    new_c = flush()
            elif t == INITIAL:
                new_c = flush()
            elif t == MEDIAL:
                if last_t & INITIAL == INITIAL:
                    new_c = flush(1)
                else:
                    new_c = flush()
            last_t = t
            queue.insert(0, c)
        if new_c:
            new_string += new_c
    if queue:
        new_string += flush()
    return new_string

In [35]:
import re

c = '바보는 아니고 c/ l/ b/ *'
split_syllables(c)


chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\”\�]'

def remove_special_characters(batch):
    batch["text"] = re.sub(chars_to_ignore_regex, '', batch["text"]).lower() + " "
    batch['text'] = split_syllables(batch['text'])
    return batch

In [36]:
remove_spectial_char_data = all_data.map(remove_special_characters)

  0%|          | 0/1000 [00:00<?, ?ex/s]

In [37]:
df = pd.DataFrame(remove_spectial_char_data)
df.rename(columns = {'src' : 'filename'}, inplace = True)
df.head()

Unnamed: 0.1,Unnamed: 0,filename,text
0,0,KsponSpeech_001001.wav,n/ ㅇㅏ ㄱㅡㄹㅓㄴㄱㅏㅇㅛ
1,1,KsponSpeech_001002.wav,ㄱㅡㄴㅑㅇ ㅂㅕㄹ ㅇㅕㄹㅅㅣㅁㅎㅣ ㅎㅏㅈㅣ ㅇㅏㄶㅇㅡㄴ ㄱㅓ ㅇㅏㄴㅣㅇㅑ ㅇㅣ ㅇㅕ...
2,2,KsponSpeech_001003.wav,ㅂㅏㄲㅜㄹㄹㅕㅁㅕㄴ ㄱㅡㄹㅓㄴ ㄱㅓㄹ ㅂㅏㄲㅝㅇㅑㅈㅣ
3,3,KsponSpeech_001004.wav,ㅇㅛ ㅇㅛㅈㅡㅁ ㅇㅕㅇㅎㅘ ㅂㅗㄴ ㄱㅓ ㅇㅣㅆㄴㅣ
4,4,KsponSpeech_001005.wav,ㅈㅗㅎㅇㅡㄴ ㄱㅓㄹㅏㄱㅗ ㅅㅐㅇㄱㅏㄱㅇㅣ ㄷㅡㄹㅇㅓㅆㄴㅡㄴㄷㅔ b/ ㅈㅗㄹㅇㅓㅂㅎㅏ...


In [None]:
def extract_all_chars(batch):
  all_text = " ".join(batch["text"])
  vocab = list(set(all_text))
  return {"vocab": [vocab], "all_text": [all_text]}


In [None]:
char_vocab = remove_spectial_char_data.map(
    extract_all_chars,
    batched=True,
    batch_size=-1,
    keep_in_memory=True,
    remove_columns=remove_spectial_char_data.column_names)

  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
char_vocab

Dataset({
    features: ['vocab', 'all_text'],
    num_rows: 1
})

In [None]:
vocab_list = list(set(char_vocab["vocab"][0]))

In [None]:
vocab_dict = {v: k for k, v in enumerate(vocab_list)}
vocab_dict

{' ': 21,
 '*': 52,
 '/': 43,
 '<': 62,
 '>': 55,
 'a': 14,
 'b': 64,
 'c': 11,
 'f': 51,
 'k': 7,
 'l': 39,
 'm': 18,
 'n': 3,
 'p': 49,
 's': 40,
 't': 38,
 'u': 10,
 'ㄱ': 45,
 'ㄲ': 63,
 'ㄴ': 59,
 'ㄵ': 27,
 'ㄶ': 60,
 'ㄷ': 31,
 'ㄸ': 1,
 'ㄹ': 53,
 'ㄺ': 24,
 'ㄻ': 9,
 'ㄼ': 56,
 'ㄾ': 5,
 'ㅀ': 32,
 'ㅁ': 13,
 'ㅂ': 54,
 'ㅃ': 28,
 'ㅄ': 44,
 'ㅅ': 22,
 'ㅆ': 0,
 'ㅇ': 33,
 'ㅈ': 42,
 'ㅉ': 8,
 'ㅊ': 26,
 'ㅋ': 16,
 'ㅌ': 12,
 'ㅍ': 57,
 'ㅎ': 36,
 'ㅏ': 41,
 'ㅐ': 37,
 'ㅑ': 20,
 'ㅒ': 2,
 'ㅓ': 30,
 'ㅔ': 17,
 'ㅕ': 29,
 'ㅖ': 47,
 'ㅗ': 34,
 'ㅘ': 46,
 'ㅙ': 48,
 'ㅚ': 25,
 'ㅛ': 15,
 'ㅜ': 4,
 'ㅝ': 50,
 'ㅞ': 58,
 'ㅟ': 61,
 'ㅠ': 6,
 'ㅡ': 19,
 'ㅢ': 35,
 'ㅣ': 23}

In [None]:
vocab_dict["|"] = vocab_dict[" "]
del vocab_dict[" "]
vocab_dict["<unk>"] = len(vocab_dict)
vocab_dict["<pad>"] = len(vocab_dict)
vocab_dict["c/"] = len(vocab_dict)
vocab_dict["b/"] = len(vocab_dict)
vocab_dict["n/"] = len(vocab_dict)
vocab_dict["l/"] = len(vocab_dict)
vocab_dict["*"] = len(vocab_dict)
len(vocab_dict)

71

In [None]:
import json
with open('vocab.json', 'w') as vocab_file:
    json.dump(vocab_dict, vocab_file)

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from transformers import Wav2Vec2CTCTokenizer

tokenizer = Wav2Vec2CTCTokenizer("./vocab.json",
                                 unk_token="[UNK]",
                                 pad_token="[PAD]",
                                 word_delimiter_token="|")

## Create XLSR-Wav2Vec2 Feature Extractor

In [None]:
import torch

In [None]:
from transformers import Wav2Vec2FeatureExtractor

feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1,
                                             sampling_rate=16000,
                                             padding_value=0.0,
                                             do_normalize=True,
                                             return_attention_mask=True)

In [None]:
from transformers import Wav2Vec2Processor

processor = Wav2Vec2Processor(feature_extractor=feature_extractor,
                              tokenizer=tokenizer)

In [None]:
# from google.colab import drive
# drive.mount('/content/gdrive/')

In [None]:
# processor.save_pretrained("/content/gdrive/MyDrive/wav2vec2-large-xlsr-ready")

## add audio array

In [None]:
import librosa

In [None]:
def load_audio(batch):
    batch['array'],_ = librosa.load('/content/drive/MyDrive/Final/KsponSpeech_0001/'+batch['src'],sr=16000)
#     batch['array'],_ = librosa.load(batch['src'], sr=16000)
    return batch

In [None]:
audio_data = remove_spectial_char_data.map(load_audio)
# all_data = all_data.map(load_audio)

  0%|          | 0/1000 [00:00<?, ?ex/s]

In [None]:
print(len(audio_data[0]['array']))

50368


## Preprocess Data

In [None]:
def prepare_dataset(batch):

    # batched output is "un-batched"
    batch["input_values"] = processor(batch["array"], sampling_rate=16000).input_values[0]
    
    with processor.as_target_processor():
        batch["labels"] = processor(batch["text"]).input_ids
    return batch

In [None]:
audio_data

Dataset({
    features: ['Unnamed: 0', 'src', 'text', 'array'],
    num_rows: 1000
})

In [None]:
order_voice = audio_data.map(
    prepare_dataset,
    remove_columns=remove_spectial_char_data.column_names,
    num_proc=4 #병렬작업
)

     

#0:   0%|          | 0/250 [00:00<?, ?ex/s]

  

#1:   0%|          | 0/250 [00:00<?, ?ex/s]

 

#2:   0%|          | 0/250 [00:00<?, ?ex/s]

#3:   0%|          | 0/250 [00:00<?, ?ex/s]

In [None]:
order_voice

Dataset({
    features: ['array', 'input_values', 'labels'],
    num_rows: 1000
})

In [None]:
df.head()

Unnamed: 0.1,Unnamed: 0,src,text
0,0,KsponSpeech_000001.wav,아 몬 소리야 그건 또
1,1,KsponSpeech_000002.wav,나는 악습은 원래 없어진다 없어져야 된다고 생각하긴 했는데 근데 그 약간 필요악으로...
2,2,KsponSpeech_000003.wav,그래서 지호랑 계단 올라와서 막 위에 운동하는 기구 있대요 그서 그걸로 운...
3,3,KsponSpeech_000004.wav,뭐 정신과 병원도 그 약 타서 먹어보고 그 한동안 연락이 안 된 적이 있었단 말이야...
4,4,KsponSpeech_000005.wav,그게 프로영 점 일 프로 가정의 아이들과 가정의 모습이야


In [None]:
print(len(audio_data[0]['array']))

50368


In [None]:
df = pd.DataFrame(df,columns=['filename','text','array'])
df.head()

Unnamed: 0,src,text,array
0,KsponSpeech_000001.wav,아 몬 소리야 그건 또,
1,KsponSpeech_000002.wav,나는 악습은 원래 없어진다 없어져야 된다고 생각하긴 했는데 근데 그 약간 필요악으로...,
2,KsponSpeech_000003.wav,그래서 지호랑 계단 올라와서 막 위에 운동하는 기구 있대요 그서 그걸로 운...,
3,KsponSpeech_000004.wav,뭐 정신과 병원도 그 약 타서 먹어보고 그 한동안 연락이 안 된 적이 있었단 말이야...,
4,KsponSpeech_000005.wav,그게 프로영 점 일 프로 가정의 아이들과 가정의 모습이야,


In [None]:
df['array'] = audio_data['array']
df.head()

Unnamed: 0,src,text,array
0,KsponSpeech_000001.wav,아 몬 소리야 그건 또,"[-0.0009765625, -0.0009765625, -0.000885009765..."
1,KsponSpeech_000002.wav,나는 악습은 원래 없어진다 없어져야 된다고 생각하긴 했는데 근데 그 약간 필요악으로...,"[0.000885009765625, 0.000885009765625, 0.00088..."
2,KsponSpeech_000003.wav,그래서 지호랑 계단 올라와서 막 위에 운동하는 기구 있대요 그서 그걸로 운...,"[0.0001220703125, 6.103515625e-05, 0.000183105..."
3,KsponSpeech_000004.wav,뭐 정신과 병원도 그 약 타서 먹어보고 그 한동안 연락이 안 된 적이 있었단 말이야...,"[-0.0009765625, -6.103515625e-05, 0.0006713867..."
4,KsponSpeech_000005.wav,그게 프로영 점 일 프로 가정의 아이들과 가정의 모습이야,"[-0.0001220703125, 0.0, 3.0517578125e-05, 3.05..."


# Training

## Set-up Trainer

In [None]:
import torch

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

@dataclass
class DataCollatorCTCWithPadding:
    """
    Data collator that will dynamically pad the inputs received.
    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for proccessing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
              sequence if provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
              different lengths).
        max_length (:obj:`int`, `optional`):
            Maximum length of the ``input_values`` of the returned list and optionally padding length (see above).
        max_length_labels (:obj:`int`, `optional`):
            Maximum length of the ``labels`` returned list and optionally padding length (see above).
        pad_to_multiple_of (:obj:`int`, `optional`):
            If set will pad the sequence to a multiple of the provided value.
            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
            7.5 (Volta).
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.cuda.HalfTensor]]]) -> Dict[str, torch.cuda.HalfTensor]:
        # split inputs and labels since they have to be of different lenghts and need
        # different padding methods
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_features,
                padding=self.padding,
                max_length=self.max_length_labels,
                pad_to_multiple_of=self.pad_to_multiple_of_labels,
                return_tensors="pt",
            )

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch

In [None]:
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [None]:
!pip install jiwer

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from datasets import load_dataset, load_metric, Audio

wer_metric = load_metric("wer")

In [None]:
def compute_metrics(pred):
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)

    pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id

    pred_str = processor.batch_decode(pred_ids)
    # we do not want to group tokens when computing the metrics
    label_str = processor.batch_decode(pred.label_ids, group_tokens=False)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

## Import Model

In [None]:
from transformers import Wav2Vec2ForCTC

model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-large-xlsr-53", 
    attention_dropout=0.1,
    hidden_dropout=0.1,
    feat_proj_dropout=0.0,
    mask_time_prob=0.05,
    layerdrop=0.1,
    ctc_loss_reduction="mean", 
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer)
)

Some weights of the model checkpoint at facebook/wav2vec2-large-xlsr-53 were not used when initializing Wav2Vec2ForCTC: ['quantizer.codevectors', 'quantizer.weight_proj.bias', 'quantizer.weight_proj.weight', 'project_hid.weight', 'project_hid.bias', 'project_q.weight', 'project_q.bias']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-xlsr-53 and are newly initialized: ['lm_head.bias', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to u

XLSR-Wav2Vec2의 첫 번째 구성 요소는 원시 음성 신호에서 음향적으로 의미가 있지만 문맥적으로 독립적인 기능을 추출하는 데 사용되는 CNN 계층 스택으로 구성됩니다.  
모델의 이 부분은 사전 교육 중에 이미 충분히 훈련되었으며 논문에 명시된 바와 같이 더 이상 미세 조정할 필요가 없습니다. 따라서 특징 추출 부분의 모든 파라미터에 대해 require_grad를 False로 설정할 수 있다.

In [None]:
model.freeze_feature_extractor()



메모리를 절약하기 위해 그라데이션 체크포인팅을 활성화

In [None]:
model.gradient_checkpointing_enable()

## TrainingArguments

In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments(
  # output_dir="/content/gdrive/MyDrive/wav2vec2-large-xlsr-turkish-demo",
  output_dir="./wav2vec2-large-xlsr-ko-demo",
  group_by_length=True,
  per_device_train_batch_size=8,
  gradient_accumulation_steps=2,
  evaluation_strategy="steps",
  num_train_epochs=3,
  fp16=True,
  save_steps=100,
  eval_steps=100,
  logging_steps=10,
  learning_rate=3e-4,
  warmup_steps=500,
  save_total_limit=2,
)

In [None]:
from transformers import Trainer
from torch.utils.data import random_split

ds_size = len(order_voice)
train_size = int(ds_size*0.8)
val_size = ds_size - train_size
train_ds, val_ds = random_split(order_voice,[train_size,val_size])

trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=processor.feature_extractor,
)

Using amp half precision backend


## Model Training

In [None]:
torch.cuda.is_available()

True

In [None]:
hist = trainer.train()

***** Running training *****
  Num examples = 800
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 2
  Total optimization steps = 150
The following columns in the training set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: array. If array are not expected by `Wav2Vec2ForCTC.forward`,  you can safely ignore this message.


Step,Training Loss,Validation Loss,Wer
100,26.6008,21.37887,1.0


***** Running Evaluation *****
  Num examples = 200
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: array. If array are not expected by `Wav2Vec2ForCTC.forward`,  you can safely ignore this message.
Saving model checkpoint to ./wav2vec2-large-xlsr-ko-demo/checkpoint-100
Configuration saved in ./wav2vec2-large-xlsr-ko-demo/checkpoint-100/config.json
Model weights saved in ./wav2vec2-large-xlsr-ko-demo/checkpoint-100/pytorch_model.bin
Feature extractor saved in ./wav2vec2-large-xlsr-ko-demo/checkpoint-100/preprocessor_config.json


Training completed. Do not forget to share your model on huggingface.co/models =)




In [None]:
hist = trainer.train()

***** Running training *****
  Num examples = 800
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 2
  Total optimization steps = 150
The following columns in the training set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: array. If array are not expected by `Wav2Vec2ForCTC.forward`,  you can safely ignore this message.


Step,Training Loss,Validation Loss,Wer
100,4.5007,4.456544,1.0


***** Running Evaluation *****
  Num examples = 200
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: array. If array are not expected by `Wav2Vec2ForCTC.forward`,  you can safely ignore this message.
Saving model checkpoint to ./wav2vec2-large-xlsr-ko-demo/checkpoint-100
Configuration saved in ./wav2vec2-large-xlsr-ko-demo/checkpoint-100/config.json
Model weights saved in ./wav2vec2-large-xlsr-ko-demo/checkpoint-100/pytorch_model.bin
Feature extractor saved in ./wav2vec2-large-xlsr-ko-demo/checkpoint-100/preprocessor_config.json


Training completed. Do not forget to share your model on huggingface.co/models =)




In [None]:
hist = trainer.train()

***** Running training *****
  Num examples = 800
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 2
  Total optimization steps = 150
The following columns in the training set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: array. If array are not expected by `Wav2Vec2ForCTC.forward`,  you can safely ignore this message.


Step,Training Loss,Validation Loss,Wer
100,4.4285,4.361781,1.0


***** Running Evaluation *****
  Num examples = 200
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: array. If array are not expected by `Wav2Vec2ForCTC.forward`,  you can safely ignore this message.
Saving model checkpoint to ./wav2vec2-large-xlsr-ko-demo/checkpoint-100
Configuration saved in ./wav2vec2-large-xlsr-ko-demo/checkpoint-100/config.json
Model weights saved in ./wav2vec2-large-xlsr-ko-demo/checkpoint-100/pytorch_model.bin
Feature extractor saved in ./wav2vec2-large-xlsr-ko-demo/checkpoint-100/preprocessor_config.json


Training completed. Do not forget to share your model on huggingface.co/models =)




In [None]:
hist = trainer.train()

***** Running training *****
  Num examples = 800
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 2
  Total optimization steps = 150
The following columns in the training set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: array. If array are not expected by `Wav2Vec2ForCTC.forward`,  you can safely ignore this message.


Step,Training Loss,Validation Loss,Wer
100,4.2634,4.333417,1.0


***** Running Evaluation *****
  Num examples = 200
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: array. If array are not expected by `Wav2Vec2ForCTC.forward`,  you can safely ignore this message.
Saving model checkpoint to ./wav2vec2-large-xlsr-ko-demo/checkpoint-100
Configuration saved in ./wav2vec2-large-xlsr-ko-demo/checkpoint-100/config.json
Model weights saved in ./wav2vec2-large-xlsr-ko-demo/checkpoint-100/pytorch_model.bin
Feature extractor saved in ./wav2vec2-large-xlsr-ko-demo/checkpoint-100/preprocessor_config.json


Training completed. Do not forget to share your model on huggingface.co/models =)




CTC 손실을 사용하여 더 큰 데이터 세트에서 더 큰 모델을 미세 조정하려면 [여기서](https://github.com/huggingface/transformers/tree/master/examples/pytorch/speech-recognition#connectionist-temporal-classification-without-language-model-ctc-wo-lm) 공식 음성 인식 예를 살펴봐야 한다.

## Model predict

In [None]:
model(data_collator([order_voice[0]]))

KeyError: ignored