In [1]:
# 필요한 패키지(라이브러리)를 구글 드라이버에 설치하였는가?
IS_INSTALL_ENV = True
IS_COLAB_ENV = True

## pyTorch 환경설정

In [2]:
import os, sys 
import torch
# True가 나와야 cuda 환경을 사용할 수 있다.
torch.cuda.is_available()

True

In [3]:
if IS_COLAB_ENV == True:
    from google.colab import drive  
    drive.mount('/content/drive')   

    # colab의 os 폴더 위치
    ds_path = '/content/dataset' 

    # Google Drive에서 설치된 파일 위치 - 미리 해당 폴더를 생성해야 한다.
    os.symlink('/content/drive/My Drive/Colab Notebooks/Project/AI 속기사/dataset/', ds_path)  

    # colab의 os 폴더 위치
    pg_path = '/content/packages' 

    # Google Drive에서 설치된 파일 위치 - 미리 해당 폴더를 생성해야 한다.
    os.symlink('/content/drive/My Drive/Colab Notebooks/packages/', pg_path)  
    sys.path.insert(0, pg_path)  

    # 거대한 모델을 위한 경로
    md_path = '/content/wav2vec2-large-xlsr-ko-demo' 
    os.symlink('/content/drive/My Drive/Colab Notebooks/Project/AI 속기사/wav2vec2-large-xlsr-ko-demo/', md_path)  


    # colab에서는 jiwer는 미리 설치해도 계속 설치하라고 나옴 
    !pip install jiwer
    
    if IS_INSTALL_ENV == False:
    # !pip install transformers 대신 영구 설치
    # os.symlink로 걸어놓은 위치에서 pg_path로 연결된 폴더에 영구 설치한다. 1번만 설치하면 2번 설치할 필요 없음.
        !pip install --target=$pg_path datasets
        !pip install --target=$pg_path transformers  


    # /content/drive/My Drive/Colab Notebooks/Project/AI 속기사/dataset/KlecSpeech/

Mounted at /content/drive


## Create Wav2Vec2CTCTokenizer

In [4]:
from datasets import load_dataset
import pandas as pd
import numpy as np
import tqdm
import glob

In [5]:
dir_path = ['./dataset/KlecSpeech/csv/', './dataset/csv/']
file_prefix = ['klecspeech_ko_500', 'order_speech_ko1000']

In [15]:
# 모든 csv 파일 리스트를 가져온다.
def get_all_filelist(dir_path):
    csv_files = []
    for i in range(len(dir_path)):
        csv_list = os.listdir(dir_path[i])
        for file in csv_list:
            csv_files.append(dir_path[i]+file)
    return csv_files

In [16]:
# 결과를 확인한다.
all_files = get_all_filelist(dir_path)

In [17]:
# get_filelist 
# start_idx부터 step_num 건수 만큼 파일리스트 추가
def get_filelist(csv_list, start_idx, step_num):

    end_idx = len(csv_list)
    new_list = []
    if len(csv_list) == 0:
        print("array size is 0")
        return None
    else:
        if len(csv_list) < start_idx + step_num:
            end_idx = len(csv_list)
        else:
            end_idx = start_idx + step_num

        for i in range(start_idx, end_idx):
            print("file appned : ", i, " ",csv_list[i])
            new_list.append(csv_list[i])
        return new_list

In [18]:
def load_datasets(files):
    # files가 list로 되어 있으면 한번에 합쳐서 load해온다.
    dataset = load_dataset('csv',data_files=files, split='train', sep='\t')
    return dataset

In [19]:
import re
chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\”\�]'

def remove_special_characters(batch):
    batch["text"] = re.sub(chars_to_ignore_regex, '', batch["text"]).lower() + " "
    return batch

In [21]:
from transformers import Wav2Vec2CTCTokenizer

tokenizer = Wav2Vec2CTCTokenizer("./vocab.json",
                                 unk_token="[UNK]",
                                 pad_token="[PAD]",
                                 word_delimiter_token="|")

## Create XLSR-Wav2Vec2 Feature Extractor

In [22]:
from transformers import Wav2Vec2FeatureExtractor

feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1,
                                             sampling_rate=16000,
                                             padding_value=0.0,
                                             do_normalize=True,
                                             return_attention_mask=True)

In [23]:
from transformers import Wav2Vec2Processor

processor = Wav2Vec2Processor(feature_extractor=feature_extractor,
                              tokenizer=tokenizer)

## add audio array

In [25]:
# import librosa
# def load_audio(batch):
#     batch['array'], _ = librosa.load(batch['filename'],sr=16000)
#     return batch

# csv에 array를 미리 넣었다.
def load_audio(batch):
    batch['array'] = np.array(batch['array'][1:-1].split(',')).astype(np.float32)
    return batch

In [26]:
def prepare_dataset(batch):

    # batched output is "un-batched"
    batch["input_values"] = processor(batch["array"], sampling_rate=16000).input_values[0]
    
    with processor.as_target_processor():
        batch["labels"] = processor(batch["text"]).input_ids
    return batch

# Training

In [27]:
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

@dataclass
class DataCollatorCTCWithPadding:
    """
    Data collator that will dynamically pad the inputs received.
    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for proccessing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
              sequence if provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
              different lengths).
        max_length (:obj:`int`, `optional`):
            Maximum length of the ``input_values`` of the returned list and optionally padding length (see above).
        max_length_labels (:obj:`int`, `optional`):
            Maximum length of the ``labels`` returned list and optionally padding length (see above).
        pad_to_multiple_of (:obj:`int`, `optional`):
            If set will pad the sequence to a multiple of the provided value.
            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
            7.5 (Volta).
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.cuda.HalfTensor]]]) -> Dict[str, torch.cuda.HalfTensor]:
        # split inputs and labels since they have to be of different lenghts and need
        # different padding methods
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_features,
                padding=self.padding,
                max_length=self.max_length_labels,
                pad_to_multiple_of=self.pad_to_multiple_of_labels,
                return_tensors="pt",
            )

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch

## Set-up Trainer

In [28]:
def compute_metrics(pred):
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)

    pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id

    pred_str = processor.batch_decode(pred_ids)
    # we do not want to group tokens when computing the metrics
    label_str = processor.batch_decode(pred.label_ids, group_tokens=False)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

## TrainingArguments

In [29]:
from transformers import TrainingArguments
from transformers import Trainer

training_args = TrainingArguments(
  # output_dir="/content/gdrive/MyDrive/wav2vec2-large-xlsr-ko-demo",
  output_dir="./wav2vec2-large-xlsr-ko-demo",
  group_by_length=True,
  per_device_train_batch_size=8, # 원래는 16인데 cuda memory가 15G를 요구하기 때문에 절반으로 줄였다.
  gradient_accumulation_steps=2,
  evaluation_strategy="steps",
  num_train_epochs=5,
  fp16=True,
  save_steps=100,
  eval_steps=100,
  logging_steps=10,
  learning_rate=3e-4,
  warmup_steps=500,
  save_total_limit=2,
)

In [39]:
model_pretrain_path = './wav2vec2-large-xlsr-ko-demo/'

# pretrain 폴더에서 가장 마지막 폴더를 가져온다.
def get_last_pretrain_model():
    listdirs = os.listdir(model_pretrain_path)
    model_pretain_list = [folder for folder in listdirs if folder.startswith("checkpoint-")]
    if len(model_pretain_list)==0:
        print("facebook/wav2vec2-large-xlsr-53")
        return "facebook/wav2vec2-large-xlsr-53"
    else:
        path = model_pretrain_path + max(model_pretain_list)
        print(path)
        return path

In [40]:
from datasets import load_dataset, load_metric, Audio
from transformers import Wav2Vec2ForCTC

from torch.utils.data import random_split
import gc

# 한 번 실행에 몇 개의 파일을 묶을 것인가?
num_of_files = 2

# load_metric : 이미 구현된 메트릭을 사용할 수 있는 함수
wer_metric = load_metric("wer")

for i in range(0, len(all_files), num_of_files):
    filelist = get_filelist(all_files, i, num_of_files)
    print(filelist)
    data_csv = load_datasets(filelist)

    remove_spectial_char_data = data_csv.map(remove_special_characters)
    # list를 dataframe으로 변경해주고, dataframe에서 Dataset으로 변경해준다.
    # 기존 'array' type이 str이라서 문제가 발생된다.
    audio_data = remove_spectial_char_data.map(load_audio)

    # Dataset type이 맞는지 확인할 것
    print(type(audio_data))
    print(type(data_csv))

    ## step : processing
    # for문 안에서 num_proc를 1 이상으로 설정하면, raise PicklingError 발생. 원인은 아직 모름.
    # for문 밖에서는 num_proc를 4로 설정해도 이상없음.
    order_voice = audio_data.map(
        prepare_dataset,
        remove_columns=remove_spectial_char_data.column_names,
        num_proc=1 # jupyter notebook에서는 에러발생 #colab 또는 하드웨어 준비된 환경에서 사용 가능한 옵션, 병렬로 함수를 실행할 수있다. 
    )

    data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)
    
    model = Wav2Vec2ForCTC.from_pretrained(
    get_last_pretrain_model(), # colab에서는 처음에 "facebook/wav2vec2-large-xlsr-53", 
    attention_dropout=0.1,
    hidden_dropout=0.1,
    feat_proj_dropout=0.0,
    mask_time_prob=0.05,
    layerdrop=0.1,
    ctc_loss_reduction="mean", 
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer))
    

    # XLSR-Wav2Vec2의 첫 번째 구성 요소는 원시 음성 신호에서 음향적으로 의미가 있지만 
    # 문맥적으로 독립적인 기능을 추출하는 데 사용되는 CNN 계층 스택으로 구성됩니다.
    # 모델의 이 부분은 사전 교육 중에 이미 충분히 훈련되었으며 논문에 명시된 바와 같이 더 이상 미세 조정할 필요가 없습니다. 
    # 따라서 특징 추출 부분의 모든 파라미터에 대해 require_grad를 False로 설정할 수 있다.
    model.freeze_feature_extractor()

    # 메모리를 절약하기 위해 그라데이션 체크포인팅을 활성화
    model.gradient_checkpointing_enable()

    ds_size = len(order_voice)
    train_size = int(ds_size*0.8)
    val_size = ds_size - train_size
    train_ds, val_ds = random_split(order_voice,[train_size,val_size])

    trainer = Trainer(
        model=model,
        data_collator=data_collator,
        args=training_args,
        compute_metrics=compute_metrics,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        tokenizer=processor.feature_extractor,
    )

    # cuda cache를 초기화 한다.
    torch.cuda.empty_cache()
    gc.collect()

    # RuntimeError: CUDA out of memory가 발생한다면, per_device_train_batch_size를 줄여본다.
    # 단 batch_size를 줄이는 경우, 학습이 늦어지고, 학습률에 영향이 있을 수 있다.
    trainer.train()


Using custom data configuration default-6202ba9b52256026
Reusing dataset csv (/root/.cache/huggingface/datasets/csv/default-6202ba9b52256026/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58)


file appned :  0   ./dataset/KlecSpeech/csv/klecspeech_ko_500_000.csv
file appned :  1   ./dataset/KlecSpeech/csv/klecspeech_ko_500_001.csv
['./dataset/KlecSpeech/csv/klecspeech_ko_500_000.csv', './dataset/KlecSpeech/csv/klecspeech_ko_500_001.csv']


  0%|          | 0/20 [00:00<?, ?ex/s]

  0%|          | 0/20 [00:00<?, ?ex/s]

<class 'datasets.arrow_dataset.Dataset'>
<class 'datasets.arrow_dataset.Dataset'>


  0%|          | 0/20 [00:00<?, ?ex/s]

Dataset({
    features: ['input_values', 'labels'],
    num_rows: 20
})
./wav2vec2-large-xlsr-ko-demo/checkpoint-2000


loading configuration file ./wav2vec2-large-xlsr-ko-demo/checkpoint-2000/config.json
Model config Wav2Vec2Config {
  "_name_or_path": "./wav2vec2-large-xlsr-ko-demo/checkpoint-1900",
  "activation_dropout": 0.0,
  "adapter_kernel_size": 3,
  "adapter_stride": 2,
  "add_adapter": false,
  "apply_spec_augment": true,
  "architectures": [
    "Wav2Vec2ForCTC"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 1,
  "classifier_proj_size": 256,
  "codevector_dim": 768,
  "contrastive_logits_temperature": 0.1,
  "conv_bias": true,
  "conv_dim": [
    512,
    512,
    512,
    512,
    512,
    512,
    512
  ],
  "conv_kernel": [
    10,
    3,
    3,
    3,
    3,
    2,
    2
  ],
  "conv_stride": [
    5,
    2,
    2,
    2,
    2,
    2,
    2
  ],
  "ctc_loss_reduction": "mean",
  "ctc_zero_infinity": false,
  "diversity_loss_weight": 0.1,
  "do_stable_layer_norm": true,
  "eos_token_id": 2,
  "feat_extract_activation": "gelu",
  "feat_extract_dropout": 0.0,
  "feat_extract_norm": "la

Step,Training Loss,Validation Loss




Training completed. Do not forget to share your model on huggingface.co/models =)


Using custom data configuration default-ffe45d03b870e6b3
Reusing dataset csv (/root/.cache/huggingface/datasets/csv/default-ffe45d03b870e6b3/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58)
Loading cached processed dataset at /root/.cache/huggingface/datasets/csv/default-ffe45d03b870e6b3/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58/cache-1c80317fa3b1799d.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/csv/default-ffe45d03b870e6b3/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58/cache-bdd640fb06671ad1.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/csv/default-ffe45d03b870e6b3/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58/cache-3eb13b9046685257.arrow


file appned :  2   ./dataset/csv/order_speech_ko1000_000.csv
file appned :  3   ./dataset/csv/order_speech_ko1000_001.csv
['./dataset/csv/order_speech_ko1000_000.csv', './dataset/csv/order_speech_ko1000_001.csv']
<class 'datasets.arrow_dataset.Dataset'>
<class 'datasets.arrow_dataset.Dataset'>
Dataset({
    features: ['input_values', 'labels'],
    num_rows: 9
})


loading configuration file ./wav2vec2-large-xlsr-ko-demo/checkpoint-2000/config.json
Model config Wav2Vec2Config {
  "_name_or_path": "./wav2vec2-large-xlsr-ko-demo/checkpoint-1900",
  "activation_dropout": 0.0,
  "adapter_kernel_size": 3,
  "adapter_stride": 2,
  "add_adapter": false,
  "apply_spec_augment": true,
  "architectures": [
    "Wav2Vec2ForCTC"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 1,
  "classifier_proj_size": 256,
  "codevector_dim": 768,
  "contrastive_logits_temperature": 0.1,
  "conv_bias": true,
  "conv_dim": [
    512,
    512,
    512,
    512,
    512,
    512,
    512
  ],
  "conv_kernel": [
    10,
    3,
    3,
    3,
    3,
    2,
    2
  ],
  "conv_stride": [
    5,
    2,
    2,
    2,
    2,
    2,
    2
  ],
  "ctc_loss_reduction": "mean",
  "ctc_zero_infinity": false,
  "diversity_loss_weight": 0.1,
  "do_stable_layer_norm": true,
  "eos_token_id": 2,
  "feat_extract_activation": "gelu",
  "feat_extract_dropout": 0.0,
  "feat_extract_norm": "la

./wav2vec2-large-xlsr-ko-demo/checkpoint-2000


Step,Training Loss,Validation Loss




Training completed. Do not forget to share your model on huggingface.co/models =)


