## Train
wav2vec 모델을 fine-tuning하는 과정

In [1]:
## Import Library / Package

In [2]:
# pip

In [3]:
# import
import os
import json
import torch
import torchaudio
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.nn as nn

from transformers import Wav2Vec2ForCTC, AutoProcessor,Wav2Vec2CTCTokenizer

  from .autonotebook import tqdm as notebook_tqdm
2024-04-18 18:28:36.694955: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-18 18:28:36.750022: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-18 18:28:36.750067: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-18 18:28:36.750116: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-04-18 18:28:3

#### Define Custom DataSet
로컬에 있는 데이터를 pytorch의 Dataset 클래스를 상속 받아 사용하는 과정

In [4]:
class CustomDataset(Dataset):
    def __init__(self, json_path, processor):
        self.json_path = json_path
        self.processor = processor
        
        with open(json_path, 'r') as f:
            self.data = json.load(f)

        
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sample = self.data[idx]
        try:
            if "path" in sample and "transcription" in sample :
                print("처음")
                file_path = sample["path"]
                transcription = sample["transcription"]
                
                waveform, sampling_rate = torchaudio.load(file_path)
                print("waveform",waveform)
                inputs = self.processor(waveform,sampling_rate=sampling_rate, return_tensors="pt")
                labels = self.processor.tokenizer(transcription, return_tensors="pt").input_ids
                print("자료들",inputs,labels)
                print("끝")
                return {
                    "input_values": inputs,
                    "labels": labels
                }
                
        except AttributeError:
            return None

#### Set Up Config For DataSet
데이터셋을 위한 기본 설정을 셋팅함
현재 메모리 이슈가 있어 batch_size 및 num_workers 설정

num_workers : 일꾼들... gpu * 4

In [5]:
# GPU

# 에러 로깅 가능
os.environ['CUDA_LAUNCH_BLOCKING'] = '0'

# cuda가 볼 수 잇는 GPU => 내가 사용할 GPU
os.environ["CUDA_VISIBLE_DEVICES"] = '1,2,3,4,5,6,7'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 데이터 병렬 처리를 위한..셋팅...
model_name = "facebook/wav2vec2-base-960h"
processor = AutoProcessor.from_pretrained(model_name)
model = Wav2Vec2ForCTC.from_pretrained(model_name).to(device)

if torch.cuda.is_available():
    model = nn.DataParallel(model)
    processor = nn.DataParallel(processor)

Some weights of the model checkpoint at facebook/wav2vec2-base-960h were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You sho

In [6]:
# Setting
learning_rate = 1e-4
num_epoch=2

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.CTCLoss()

In [7]:
# Load Data
num_workers = 16
batch_size = 32
json_data_path = "./data/exist_test/rami_mapping.json"

dataset = CustomDataset(json_data_path, processor)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)

In [8]:
# 모델 평가
model.eval()
total_correct = 0
total_samples = 0
with torch.no_grad():
    for batch in dataloader:
        try:
            waveforms, sample_rates = batch
            waveforms = waveforms.to(device)
            
            # forward pass 및 예측값 얻기
            outputs = model(input_values=waveforms)
            predicted_labels = get_predicted_labels(outputs)
            
            # 정확도 계산
            total_correct += (predicted_labels == true_labels).sum().item()
            total_samples += len(true_labels)
        except TypeError:
            continue

# 전체 테스트 데이터셋에 대한 정확도 계산
accuracy = total_correct / total_samples
print(f"Test Accuracy: {accuracy}")

처음처음처음
처음처음처음처음





waveformwaveformwaveform   waveform tensor([[-0.0013, -0.0014, -0.0013,  ..., -0.0016, -0.0013, -0.0013]])tensor([[ 0.0004, -0.0011, -0.0007,  ..., -0.0002, -0.0008, -0.0013]])tensor([[-0.0031, -0.0037, -0.0023,  ...,  0.0028,  0.0020, -0.0011]])


처음처음waveform처음
tensor([[-0.0008, -0.0007, -0.0002,  ...,  0.0013,  0.0005, -0.0001]])

waveform 
waveform  처음tensor([[-0.0030,  0.0179,  0.0232,  ..., -0.0001, -0.0002, -0.0006]])tensor([[-3.0518e-04, -1.0071e-03, -1.8311e-04,  ..., -1.2207e-04,
         -9.1553e-05,  9.1553e-05]])tensor([[-0.0120, -0.0162, -0.0012,  ..., -0.0005,  0.0005,  0.0024]])waveformwaveform


  
waveform처음tensor([[8.5449e-04, 3.6316e-03, 5.0659e-03,  ..., 6.1035e-05, 0.0000e+00,
         6.1035e-05]])tensor([[ 0.0001,  0.0002,  0.0003,  ..., -0.0009, -0.0011, -0.0009]])처음 


처음tensor([[ 0.0019,  0.0019,  0.0013,  ..., -0.0014, -0.0016, -0.0016]])
처음처음



처음
waveform tensor([[-2.7466e-04, -2.7466e-04, -9.1553e-05,  ...,  2.9602e-03,
         -2.1

TypeError: Caught TypeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/home/haram/.local/lib/python3.10/site-packages/torch/utils/data/_utils/worker.py", line 308, in _worker_loop
    data = fetcher.fetch(index)
  File "/home/haram/.local/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 54, in fetch
    return self.collate_fn(data)
  File "/home/haram/.local/lib/python3.10/site-packages/torch/utils/data/_utils/collate.py", line 277, in default_collate
    return collate(batch, collate_fn_map=default_collate_fn_map)
  File "/home/haram/.local/lib/python3.10/site-packages/torch/utils/data/_utils/collate.py", line 152, in collate
    raise TypeError(default_collate_err_msg_format.format(elem_type))
TypeError: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found <class 'NoneType'>
