In [None]:
import pandas as pd
import os
import librosa
import soundfile as sf
from tqdm import tqdm
import wave
import contextlib
from concurrent.futures import ThreadPoolExecutor, as_completed

In [None]:
# Mount Google Drive at /content/drive; the CSV paths used later in this
# notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

### 1. 경로 재할당

In [None]:
# Dataset domain name; it is substituted into the CSV path below.
domain = 'domain'

# Load the CER-filtered CSV for this domain, using its first column as the index.
csv_path = f"/content/drive/MyDrive/nexochat/NexoChat_share_data/youtube_dataset_30sec/csv_file/{domain}/{domain}_cer_filtered.csv"
df = pd.read_csv(csv_path, index_col=0)

# Rows with CER == 0 are treated as erroneous data (per the original note)
# and are dropped here.
nonzero_cer = df['CER'] != 0
df = df[nonzero_cer]

In [None]:
def modify_path(path):
    """Map a segment-audio path to its resampled counterpart.

    Every occurrence of '/audio/segment/' in *path* is replaced with
    '/audio/resampled/'. A path that does not contain that fragment is
    returned unchanged.

    Args:
        path: Source path string (a value from the 'raw_data' column).

    Returns:
        The path string with the directory fragment swapped.
    """
    segment_part, resampled_part = '/audio/segment/', '/audio/resampled/'
    return path.replace(segment_part, resampled_part)

# Derive the output path for every row and store it in a new column.
resampled_paths = df['raw_data'].map(modify_path)
df['raw_data_resampled'] = resampled_paths

# Write the frame back out; note this is the same path the CSV was read from,
# so the file on Drive is overwritten with the filtered rows plus the new column.
df.to_csv(f"/content/drive/MyDrive/nexochat/NexoChat_share_data/youtube_dataset_30sec/csv_file/{domain}/{domain}_cer_filtered.csv")

### 2. Resampling - 병렬처리

In [None]:
import os
import subprocess
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

def convert_to_16k_mono(input_path, output_path):
    """Convert one audio file to 16 kHz mono using ffmpeg.

    Runs ``ffmpeg -y -i input_path -ac 1 -ar 16000 output_path``:
    ``-ar 16000`` sets the sampling rate, ``-ac 1`` sets a single channel and
    ``-y`` overwrites ``output_path`` if it already exists. The original note
    states the converted files are about 1/6 the size of the inputs
    (presumably 48 kHz stereo sources -- not verified here).

    A failed conversion (non-zero ffmpeg exit status) is reported with
    ``print`` and is NOT raised, so a batch run continues past bad files.

    Args:
        input_path: Path of the source audio file.
        output_path: Path where the converted file is written.

    Returns:
        None.
    """
    command = [
        'ffmpeg', '-y', '-i', input_path, '-ac', '1', '-ar', '16000', output_path
    ]
    try:
        # ffmpeg's stdout is not needed, so discard it instead of buffering it;
        # stderr is captured because it carries the diagnostic text.
        subprocess.run(command, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, check=True)
    except subprocess.CalledProcessError as e:
        # ffmpeg output may contain bytes that are not valid UTF-8 (e.g. file
        # metadata); never let the error report itself raise.
        details = e.stderr.decode(errors='replace')
        # Name the failing file: this runs from many threads at once, so a bare
        # message cannot be traced back to its input.
        print(f"Error converting {input_path}: {details}")

# Create every output directory up front, before the parallel conversion
# starts, so worker threads never race to create the same folder.
# Only the distinct parent directories are visited (not one filesystem check
# per row), and exist_ok=True makes the call safe when the folder is already
# there.
output_dirs = {os.path.dirname(path) for path in df['raw_data_resampled']}
for output_dir in sorted(output_dirs):
    # A bare filename has an empty dirname; os.makedirs('') would raise.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

def process_audio(row):
    """Convert the audio file described by one dataframe row.

    Worker function for the thread pool: reads the source path from
    ``row['raw_data']`` and the destination path from
    ``row['raw_data_resampled']`` and hands both to ``convert_to_16k_mono``.

    Args:
        row: Mapping-like row providing the 'raw_data' and
            'raw_data_resampled' keys.

    Returns:
        None.
    """
    source_path, target_path = row['raw_data'], row['raw_data_resampled']
    convert_to_16k_mono(source_path, target_path)

# Number of worker threads. The original note says this is set to twice the
# Colab CPU core count.
max_workers = 16

# Submit one conversion task per dataframe row, then drain the futures as they
# finish so the progress bar advances in completion order.
with ThreadPoolExecutor(max_workers=max_workers) as executor:
    rows = (row for _, row in df.iterrows())
    futures = [executor.submit(process_audio, row) for row in rows]
    progress = tqdm(as_completed(futures), total=len(futures), desc="Processing audio files")
    for future in progress:
        # result() re-raises any exception that escaped the worker.
        future.result()

print("오디오 파일이 성공적으로 변환되어 저장되었습니다.")


Processing audio files:  18%|█▊        | 3697/21001 [05:49<23:37, 12.21it/s]

### 3. resampling 전후 용량 및 시간 차이 확인

- resample 전후 용량 차이 확인

In [None]:
def get_total_size(file_paths):
    """Return the combined on-disk size, in bytes, of the given paths.

    Paths that do not exist are skipped silently, so the total covers only the
    files actually present.

    Args:
        file_paths: Iterable of filesystem paths.

    Returns:
        Sum of ``os.path.getsize`` over the existing paths (0 if none exist).
    """
    present = (path for path in file_paths if os.path.exists(path))
    return sum(os.path.getsize(path) for path in present)

def format_size(size_in_bytes):
    """Express a byte count in mebibytes and gibibytes.

    Args:
        size_in_bytes: Size in bytes.

    Returns:
        A ``(megabytes, gigabytes)`` tuple of floats, using 1024**2 and
        1024**3 bytes per unit respectively.
    """
    bytes_per_mb = 1024 ** 2
    bytes_per_gb = 1024 ** 3
    return size_in_bytes / bytes_per_mb, size_in_bytes / bytes_per_gb

# Total bytes on disk for the original files and for the resampled files.
raw_data_total_size_bytes = get_total_size(df['raw_data'])
raw_data_resampled_total_size_bytes = get_total_size(df['raw_data_resampled'])

# The same totals expressed in MB and GB for the report below.
raw_data_total_size_mb, raw_data_total_size_gb = format_size(raw_data_total_size_bytes)
raw_data_resampled_total_size_mb, raw_data_resampled_total_size_gb = format_size(raw_data_resampled_total_size_bytes)

print(f"Total size of raw_data files: {raw_data_total_size_bytes} bytes ({raw_data_total_size_mb:.2f} MB, {raw_data_total_size_gb:.2f} GB)")
print(f"Total size of raw_data_resampled files: {raw_data_resampled_total_size_bytes} bytes ({raw_data_resampled_total_size_mb:.2f} MB, {raw_data_resampled_total_size_gb:.2f} GB)")


- resample 전후 시간 차이 확인

In [None]:
def get_audio_length(audio_file):
    """Return the duration of an audio file in seconds.

    Files whose name ends with '.wav' are read with the standard-library
    ``wave`` module; everything else is opened with ``soundfile``. Both
    handles are opened in a ``with`` block so they are closed as soon as the
    header has been read -- this function is called for many files from
    several threads, and an unclosed handle per file would accumulate.

    Args:
        audio_file: Path of the audio file.

    Returns:
        Duration in seconds as a float. If the file cannot be read for any
        reason, the error is printed and 0.0 is returned, so one bad file
        does not abort a batch total.
    """
    try:
        if audio_file.endswith('.wav'):
            with wave.open(audio_file, 'rb') as wav_file:
                frames = wav_file.getnframes()
                rate = wav_file.getframerate()
                return frames / float(rate)
        # Non-WAV input: len() of a SoundFile is its frame count.
        with sf.SoundFile(audio_file) as sound_file:
            return len(sound_file) / sound_file.samplerate
    except Exception as e:
        print(f"Error processing file {audio_file}: {e}")
        return 0.0

def calculate_total_length(audio_files, max_workers=16):
    """Return the summed duration, in seconds, of the given audio files.

    Each file's duration is obtained with ``get_audio_length`` on a thread
    pool, and the results are added up as the tasks complete while a tqdm
    progress bar is shown.

    Args:
        audio_files: Iterable of audio file paths.
        max_workers: Number of worker threads; defaults to 16, the value that
            was previously hard-coded.

    Returns:
        Total duration in seconds as a float (0.0 for an empty input).
    """
    total_length = 0.0
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Only the futures are needed; the file each one belongs to is never
        # read back, so a plain list is enough.
        futures = [executor.submit(get_audio_length, path) for path in audio_files]
        for future in tqdm(as_completed(futures), total=len(futures), desc="Calculating audio lengths"):
            total_length += future.result()
    return total_length

# Materialise both path columns as plain lists.
raw_data_list = list(df['raw_data'])
raw_data_resampled_list = list(df['raw_data_resampled'])

# Total duration in seconds: original files first, then the resampled files.
total_length_raw = calculate_total_length(raw_data_list)
total_length_resampled = calculate_total_length(raw_data_resampled_list)

# Derive minutes from seconds, and hours from minutes.
total_length_raw_minutes = total_length_raw / 60
total_length_resampled_minutes = total_length_resampled / 60

total_length_raw_hours = total_length_raw_minutes / 60
total_length_resampled_hours = total_length_resampled_minutes / 60

# Report: original files, a blank line, then resampled files.
print(f"Total length of all raw audio files: {total_length_raw:.2f} seconds")
print(f"Total length of all raw audio files: {total_length_raw_minutes:.2f} minutes")
print(f"Total length of all raw audio files: {total_length_raw_hours:.2f} hours")

print()

print(f"Total length of all resampled audio files: {total_length_resampled:.2f} seconds")
print(f"Total length of all resampled audio files: {total_length_resampled_minutes:.2f} minutes")
print(f"Total length of all resampled audio files: {total_length_resampled_hours:.2f} hours")