In [None]:
import os
import random
import numpy as np
import pandas as pd
import librosa
import soundfile as sf
from tqdm import tqdm
from scipy.signal import wiener
import torch
import warnings
import pickle
from scipy.stats import skew, kurtosis

warnings.filterwarnings("ignore", message="Trying to estimate tuning from empty frequency set.")

class Config:
    SR = 32000
    N_MFCC = 13
    ROOT_FOLDER = './'
    N_CLASSES = 2
    BATCH_SIZE = 32
    N_EPOCHS = 25
    LR = 3e-4
    SEED = 42

CONFIG = Config()

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed_everything(CONFIG.SEED)

def pad_or_truncate(audio, target_length):
    if len(audio) > target_length:
        return audio[:target_length]
    else:
        return np.pad(audio, (0, target_length - len(audio)), mode='reflect')

def preemphasis(signal, coeff=0.97):
    return np.append(signal[0], signal[1:] - coeff * signal[:-1])

def noise_reduction(signal):
    filtered_signal = wiener(signal)
    return filtered_signal / np.sqrt(np.mean(filtered_signal ** 2))  # RMS 정규화

def energy_vad(signal, frame_length=2048, hop_length=512, threshold=0.1):
    energy = librosa.feature.rms(y=signal, frame_length=frame_length, hop_length=hop_length)[0]
    frames = np.nonzero(energy > threshold)[0]
    vad_signal = []
    for frame in frames:
        start = frame * hop_length
        end = start + frame_length
        vad_signal.extend(signal[start:end])
    return np.array(vad_signal)

def mix_signals(signal1, signal2, sr):
    max_length = max(len(signal1), len(signal2))
    signal1 = np.pad(signal1, (0, max_length - len(signal1)), mode='constant')
    signal2 = np.pad(signal2, (0, max_length - len(signal2)), mode='constant')
    mixed_signal = signal1 + signal2
    return mixed_signal / np.max(np.abs(mixed_signal)), sr  # 정규화

def calculate_statistics(features):
    return np.concatenate([
        np.mean(features, axis=0),
        np.std(features, axis=0),
        np.max(features, axis=0),
        np.min(features, axis=0),
        np.median(features, axis=0),
        np.percentile(features, 25, axis=0),
        np.percentile(features, 75, axis=0),
        skew(features, axis=0),
        kurtosis(features, axis=0)
    ])

def extract_features(audio_path, target_length=5*CONFIG.SR):
    try:
        y, sr = librosa.load(audio_path, sr=None)
        y = librosa.resample(y, orig_sr=sr, target_sr=CONFIG.SR)
        sr = CONFIG.SR

        y = preemphasis(y)
        y = noise_reduction(y)

        vad_signal = energy_vad(y)

        if len(vad_signal) == 0:
            vad_signal = y

        vad_signal = pad_or_truncate(vad_signal, target_length)
        mfcc = librosa.feature.mfcc(y=vad_signal, sr=sr, n_mfcc=CONFIG.N_MFCC)
        mfcc2 = librosa.feature.mfcc(y=vad_signal, sr=sr, n_mfcc=2 * CONFIG.N_MFCC)
        stft = np.abs(librosa.stft(vad_signal))
        chroma = librosa.feature.chroma_stft(y=vad_signal, sr=sr)
        spectral_contrast = librosa.feature.spectral_contrast(y=vad_signal, sr=sr)
        tonnetz = librosa.feature.tonnetz(y=librosa.effects.harmonic(vad_signal), sr=sr)
        mfcc_delta = librosa.feature.delta(mfcc)
        mfcc_delta2 = librosa.feature.delta(mfcc, order=2)

        return np.concatenate([
            calculate_statistics(mfcc.T),
            calculate_statistics(mfcc2.T),
            calculate_statistics(stft.T),
            calculate_statistics(chroma.T),
            calculate_statistics(spectral_contrast.T),
            calculate_statistics(tonnetz.T),
            calculate_statistics(mfcc_delta.T),
            calculate_statistics(mfcc_delta2.T)
        ])
    except Exception as e:
        print(f"Error processing {audio_path}: {e}")
        return np.zeros((CONFIG.N_MFCC * 9 + 2 * CONFIG.N_MFCC * 9 + 1025 * 9 + 12 * 9 + 7 * 9 + 6 * 9 + CONFIG.N_MFCC * 9 + CONFIG.N_MFCC * 9))

def save_features(features, labels, filename):
    with open(filename, 'wb') as f:
        pickle.dump((features, labels if labels is not None else []), f)

os.makedirs(os.path.join(CONFIG.ROOT_FOLDER, 'newFolder'), exist_ok=True)

train_df = pd.read_csv(os.path.join(CONFIG.ROOT_FOLDER, 'train.csv'))

train_audio_paths = train_df.iloc[:, 1].tolist()
train_audio_paths = [os.path.join(CONFIG.ROOT_FOLDER, path) for path in train_audio_paths]
train_labels = [1 if label == 'real' else 0 for label in train_df['label']]
train_features = [extract_features(path) for path in tqdm(train_audio_paths, dynamic_ncols=True, position=0, leave=True)]

real_voices = [path for path, label in zip(train_audio_paths, train_labels) if label == 1]
fake_voices = [path for path, label in zip(train_audio_paths, train_labels) if label == 0]

mixed_audio_paths = []
for real_voice, fake_voice in zip(real_voices, fake_voices):
    try:
        y_real, sr_real = librosa.load(real_voice, sr=CONFIG.SR)
        y_fake, sr_fake = librosa.load(fake_voice, sr=CONFIG.SR)
        mixed_signal, sr = mix_signals(y_real, y_fake, sr_real)
        mixed_audio_path = os.path.join(CONFIG.ROOT_FOLDER, f'newFolder/mixed_{os.path.basename(real_voice)}')
        sf.write(mixed_audio_path, mixed_signal, sr)
        mixed_audio_paths.append(mixed_audio_path)
    except Exception as e:
        print(f"Error processing {real_voice} and {fake_voice}: {e}")

mixed_features = [extract_features(path) for path in tqdm(mixed_audio_paths, dynamic_ncols=True, position=0, leave=True)]
mixed_labels = [1] * len(mixed_features)

train_features.extend(mixed_features)
train_labels.extend(mixed_labels)

save_features(train_features, train_labels, os.path.join(CONFIG.ROOT_FOLDER, 'newFolder/train_features.pkl'))

### 학습

In [12]:
import os
import numpy as np
import pandas as pd
import pickle
from scipy.signal import wiener
from scipy.stats import skew, kurtosis
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score

class Config:
    N_FEATURES = 9
    SEED = 42

CONFIG = Config()

# 데이터 불러오기
with open('./newFolder/train_features.pkl', 'rb') as f:
    features, labels = pickle.load(f)

# NaN 값을 평균값으로 대체
imputer = SimpleImputer(strategy='mean')
features = imputer.fit_transform(features)

# 다중 레이블로 변환
labels = np.array(labels)
y = np.column_stack([(labels == 0).astype(int), (labels == 1).astype(int)])

# 데이터셋을 학습용과 테스트용으로 분리
X_train, X_test, y_train, y_test = train_test_split(features, y, test_size=0.2, random_state=CONFIG.SEED)

# 다중 레이블 분류를 위한 랜덤 포레스트 모델 생성
rf_model = MultiOutputClassifier(RandomForestClassifier(random_state=CONFIG.SEED))

# 모델 학습
rf_model.fit(X_train, y_train)

# 모델 평가
y_pred = rf_model.predict(X_test)
y_pred_proba = rf_model.predict_proba(X_test)

accuracy = accuracy_score(y_test, y_pred)
roc_auc_fake = roc_auc_score(y_test[:, 0], [prob[1] for prob in y_pred_proba[0]])
roc_auc_real = roc_auc_score(y_test[:, 1], [prob[1] for prob in y_pred_proba[1]])

print(f"Accuracy: {accuracy:.4f}")
print(f"ROC AUC Score (Fake): {roc_auc_fake:.4f}")
print(f"ROC AUC Score (Real): {roc_auc_real:.4f}")

# 모델 저장
with open('./newFolder/rf_model.pkl', 'wb') as f:
    pickle.dump(rf_model, f)


Accuracy: 0.9783
ROC AUC Score (Fake): 0.9980
ROC AUC Score (Real): 0.9980


In [15]:
import os
import numpy as np
import pandas as pd
import librosa
import pickle
from tqdm import tqdm
from scipy.signal import wiener
from scipy.stats import skew, kurtosis
from sklearn.impute import SimpleImputer

class Config:
    SR = 32000
    N_MFCC = 13
    ROOT_FOLDER = './'
    N_CLASSES = 2
    BATCH_SIZE = 32
    N_EPOCHS = 50
    LR = 3e-4
    SEED = 42

CONFIG = Config()

def pad_or_truncate(audio, target_length):
    if len(audio) > target_length:
        return audio[:target_length]
    else:
        return np.pad(audio, (0, target_length - len(audio)), mode='reflect')

def preemphasis(signal, coeff=0.97):
    return np.append(signal[0], signal[1:] - coeff * signal[:-1])

def noise_reduction(signal):
    filtered_signal = wiener(signal)
    return filtered_signal / np.sqrt(np.mean(filtered_signal ** 2))  # RMS 정규화

def energy_vad(signal, frame_length=2048, hop_length=512, threshold=0.1):
    energy = librosa.feature.rms(y=signal, frame_length=frame_length, hop_length=hop_length)[0]
    frames = np.nonzero(energy > threshold)[0]
    vad_signal = []
    for frame in frames:
        start = frame * hop_length
        end = start + frame_length
        vad_signal.extend(signal[start:end])
    return np.array(vad_signal)

def calculate_statistics(features):
    return np.concatenate([
        np.mean(features, axis=0),
        np.std(features, axis=0),
        np.max(features, axis=0),
        np.min(features, axis=0),
        np.median(features, axis=0),
        np.percentile(features, 25, axis=0),
        np.percentile(features, 75, axis=0),
        skew(features, axis=0),
        kurtosis(features, axis=0)
    ])

def extract_features(audio_path, target_length=5*CONFIG.SR):
    try:
        y, sr = librosa.load(audio_path, sr=None)
        y = librosa.resample(y, orig_sr=sr, target_sr=CONFIG.SR)
        sr = CONFIG.SR

        y = preemphasis(y)
        y = noise_reduction(y)

        vad_signal = energy_vad(y)

        if len(vad_signal) == 0:
            vad_signal = y

        vad_signal = pad_or_truncate(vad_signal, target_length)
        mfcc = librosa.feature.mfcc(y=vad_signal, sr=sr, n_mfcc=CONFIG.N_MFCC)
        mfcc2 = librosa.feature.mfcc(y=vad_signal, sr=sr, n_mfcc=2 * CONFIG.N_MFCC)
        stft = np.abs(librosa.stft(vad_signal))
        chroma = librosa.feature.chroma_stft(y=vad_signal, sr=sr)
        spectral_contrast = librosa.feature.spectral_contrast(y=vad_signal, sr=sr)
        tonnetz = librosa.feature.tonnetz(y=librosa.effects.harmonic(vad_signal), sr=sr)
        mfcc_delta = librosa.feature.delta(mfcc)
        mfcc_delta2 = librosa.feature.delta(mfcc, order=2)

        return np.concatenate([
            calculate_statistics(mfcc.T),
            calculate_statistics(mfcc2.T),
            calculate_statistics(stft.T),
            calculate_statistics(chroma.T),
            calculate_statistics(spectral_contrast.T),
            calculate_statistics(tonnetz.T),
            calculate_statistics(mfcc_delta.T),
            calculate_statistics(mfcc_delta2.T)
        ])
    except Exception as e:
        print(f"Error processing {audio_path}: {e}")
        return np.zeros((CONFIG.N_MFCC * 9 + 2 * CONFIG.N_MFCC * 9 + 1025 * 9 + 12 * 9 + 7 * 9 + 6 * 9 + CONFIG.N_MFCC * 9 + CONFIG.N_MFCC * 9))

# 모델 불러오기
with open('./newFolder/rf_model.pkl', 'rb') as f:
    rf_model = pickle.load(f)

# test.csv 파일 읽기
test_df = pd.read_csv('test.csv')

# test.csv 파일에 있는 모든 파일에 대해 특징 추출 및 예측 수행
test_features = []
file_ids = []

for file_id in tqdm(test_df['id'], desc="Extracting features from test files"):
    file_path = os.path.join('./test', file_id + '.ogg')
    features = extract_features(file_path)
    test_features.append(features)
    file_ids.append(file_id)

test_features = np.array(test_features)

# NaN 값을 평균값으로 대체
imputer = SimpleImputer(strategy='mean')
test_features = imputer.fit_transform(test_features)

# 예측 수행
y_pred_proba = rf_model.predict_proba(test_features)

# 예측 결과를 DataFrame으로 저장
results = pd.DataFrame({
    'id': file_ids,
    'fake': [prob[1] for prob in y_pred_proba[0]],
    'real': [prob[1] for prob in y_pred_proba[1]]
})

# 결과를 CSV 파일로 저장
results.to_csv('predictions.csv', index=False)


  res *= (1 - noise / lVar)
  res *= (1 - noise / lVar)
  return pitch_tuning(
  skew(features, axis=0),
  kurtosis(features, axis=0)
Extracting features from test files: 100%|██████████| 50000/50000 [8:13:09<00:00,  1.69it/s]  


In [4]:
pip install spleeter

Collecting spleeter
  Downloading spleeter-2.4.0-py3-none-any.whl.metadata (10 kB)
Collecting ffmpeg-python<0.3.0,>=0.2.0 (from spleeter)
  Downloading ffmpeg_python-0.2.0-py3-none-any.whl.metadata (1.7 kB)
Collecting httpx<0.20.0,>=0.19.0 (from httpx[http2]<0.20.0,>=0.19.0->spleeter)
  Downloading httpx-0.19.0-py3-none-any.whl.metadata (45 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.6/45.6 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting norbert<0.3.0,>=0.2.1 (from spleeter)
  Downloading norbert-0.2.1-py2.py3-none-any.whl.metadata (3.8 kB)
Collecting pandas<2.0.0,>=1.3.0 (from spleeter)
  Downloading pandas-1.5.3-cp38-cp38-macosx_11_0_arm64.whl.metadata (11 kB)
INFO: pip is looking at multiple versions of spleeter to determine which version is compatible with other requirements. This could take a while.
Collecting spleeter
  Downloading spleeter-2.3.2-py3-none-any.whl.metadata (11 kB)
Collecting librosa<0.9.0,>=0.8.0 (from spleeter)
  Download

# unlabeled data labeling

In [6]:
import os
import numpy as np
import pandas as pd
import librosa
import pickle
from tqdm import tqdm
from scipy.signal import wiener
from scipy.stats import skew, kurtosis
from sklearn.impute import SimpleImputer

class Config:
    SR = 32000
    N_MFCC = 13
    ROOT_FOLDER = './'
    N_CLASSES = 2
    BATCH_SIZE = 32
    N_EPOCHS = 50
    LR = 3e-4
    SEED = 42

CONFIG = Config()

def pad_or_truncate(audio, target_length):
    if len(audio) > target_length:
        return audio[:target_length]
    else:
        return np.pad(audio, (0, target_length - len(audio)), mode='reflect')

def preemphasis(signal, coeff=0.97):
    return np.append(signal[0], signal[1:] - coeff * signal[:-1])

def noise_reduction(signal):
    filtered_signal = wiener(signal)
    return filtered_signal / np.sqrt(np.mean(filtered_signal ** 2))  # RMS 정규화

def energy_vad(signal, frame_length=2048, hop_length=512, threshold=0.1):
    energy = librosa.feature.rms(y=signal, frame_length=frame_length, hop_length=hop_length)[0]
    frames = np.nonzero(energy > threshold)[0]
    vad_signal = []
    for frame in frames:
        start = frame * hop_length
        end = start + frame_length
        vad_signal.extend(signal[start:end])
    return np.array(vad_signal)

def calculate_statistics(features):
    return np.concatenate([
        np.mean(features, axis=0),
        np.std(features, axis=0),
        np.max(features, axis=0),
        np.min(features, axis=0),
        np.median(features, axis=0),
        np.percentile(features, 25, axis=0),
        np.percentile(features, 75, axis=0),
        skew(features, axis=0),
        kurtosis(features, axis=0)
    ])

def extract_features(audio_path, target_length=5*CONFIG.SR):
    try:
        y, sr = librosa.load(audio_path, sr=None)
        y = librosa.resample(y, orig_sr=sr, target_sr=CONFIG.SR)
        sr = CONFIG.SR

        y = preemphasis(y)
        y = noise_reduction(y)

        vad_signal = energy_vad(y)

        if len(vad_signal) == 0:
            vad_signal = y

        vad_signal = pad_or_truncate(vad_signal, target_length)
        mfcc = librosa.feature.mfcc(y=vad_signal, sr=sr, n_mfcc=CONFIG.N_MFCC)
        mfcc2 = librosa.feature.mfcc(y=vad_signal, sr=sr, n_mfcc=2 * CONFIG.N_MFCC)
        stft = np.abs(librosa.stft(vad_signal))
        chroma = librosa.feature.chroma_stft(y=vad_signal, sr=sr)
        spectral_contrast = librosa.feature.spectral_contrast(y=vad_signal, sr=sr)
        tonnetz = librosa.feature.tonnetz(y=librosa.effects.harmonic(vad_signal), sr=sr)
        mfcc_delta = librosa.feature.delta(mfcc)
        mfcc_delta2 = librosa.feature.delta(mfcc, order=2)

        return np.concatenate([
            calculate_statistics(mfcc.T),
            calculate_statistics(mfcc2.T),
            calculate_statistics(stft.T),
            calculate_statistics(chroma.T),
            calculate_statistics(spectral_contrast.T),
            calculate_statistics(tonnetz.T),
            calculate_statistics(mfcc_delta.T),
            calculate_statistics(mfcc_delta2.T)
        ])
    except Exception as e:
        print(f"Error processing {audio_path}: {e}")
        return np.zeros((CONFIG.N_MFCC * 9 + 2 * CONFIG.N_MFCC * 9 + 1025 * 9 + 12 * 9 + 7 * 9 + 6 * 9 + CONFIG.N_MFCC * 9 + CONFIG.N_MFCC * 9))

# 모델 불러오기
with open('./newFolder/rf_model.pkl', 'rb') as f:
    rf_model = pickle.load(f)

# unlabeled_data 폴더 내 모든 파일 처리
unlabeled_folder = './unlabeled_data'
audio_files = [f for f in os.listdir(unlabeled_folder) if os.path.isfile(os.path.join(unlabeled_folder, f))]

test_features = []
file_paths = []

for audio_file in tqdm(audio_files, desc="Extracting features from unlabeled files"):
    file_path = os.path.join(unlabeled_folder, audio_file)
    features = extract_features(file_path)
    test_features.append(features)
    file_paths.append(file_path)

test_features = np.array(test_features)

# NaN 값을 평균값으로 대체
imputer = SimpleImputer(strategy='mean')
test_features = imputer.fit_transform(test_features)

# 예측 수행
y_pred_proba = rf_model.predict_proba(test_features)
y_pred = (y_pred_proba[:, 1] >= 0.5).astype(int)  # 진짜 목소리일 확률이 높으면 1, 가짜 목소리일 확률이 높으면 0

# 예측 결과를 DataFrame으로 저장
results = pd.DataFrame({
    'path': file_paths,
    'label': y_pred
})

# 결과를 CSV 파일로 저장
results.to_csv('unlabeled_data_predictions.csv', index=False)


  res *= (1 - noise / lVar)
  res *= (1 - noise / lVar)
Extracting features from unlabeled files: 100%|██████████| 1264/1264 [11:32<00:00,  1.83it/s]


TypeError: list indices must be integers or slices, not tuple

# 음성 분리

In [5]:
import os
import numpy as np
import pandas as pd
import librosa
import pickle
from tqdm import tqdm
from scipy.signal import wiener
from scipy.stats import skew, kurtosis
from sklearn.impute import SimpleImputer
from spleeter.separator import Separator

class Config:
    SR = 32000
    N_MFCC = 13
    ROOT_FOLDER = './'
    N_CLASSES = 2
    BATCH_SIZE = 32
    N_EPOCHS = 50
    LR = 3e-4
    SEED = 42

CONFIG = Config()

def pad_or_truncate(audio, target_length):
    if len(audio) > target_length:
        return audio[:target_length]
    else:
        return np.pad(audio, (0, target_length - len(audio)), mode='reflect')

def preemphasis(signal, coeff=0.97):
    return np.append(signal[0], signal[1:] - coeff * signal[:-1])

def noise_reduction(signal):
    filtered_signal = wiener(signal)
    return filtered_signal / np.sqrt(np.mean(filtered_signal ** 2))  # RMS 정규화

def energy_vad(signal, frame_length=2048, hop_length=512, threshold=0.1):
    energy = librosa.feature.rms(y=signal, frame_length=frame_length, hop_length=hop_length)[0]
    frames = np.nonzero(energy > threshold)[0]
    vad_signal = []
    for frame in frames:
        start = frame * hop_length
        end = start + frame_length
        vad_signal.extend(signal[start:end])
    return np.array(vad_signal)

def calculate_statistics(features):
    return np.concatenate([
        np.mean(features, axis=0),
        np.std(features, axis=0),
        np.max(features, axis=0),
        np.min(features, axis=0),
        np.median(features, axis=0),
        np.percentile(features, 25, axis=0),
        np.percentile(features, 75, axis=0),
        skew(features, axis=0),
        kurtosis(features, axis=0)
    ])

def extract_features(y, sr, target_length=5*CONFIG.SR):
    try:
        y = librosa.resample(y, orig_sr=sr, target_sr=CONFIG.SR)
        sr = CONFIG.SR

        y = preemphasis(y)
        y = noise_reduction(y)

        vad_signal = energy_vad(y)

        if len(vad_signal) == 0:
            vad_signal = y

        vad_signal = pad_or_truncate(vad_signal, target_length)
        mfcc = librosa.feature.mfcc(y=vad_signal, sr=sr, n_mfcc=CONFIG.N_MFCC)
        mfcc2 = librosa.feature.mfcc(y=vad_signal, sr=sr, n_mfcc=2 * CONFIG.N_MFCC)
        stft = np.abs(librosa.stft(vad_signal))
        chroma = librosa.feature.chroma_stft(y=vad_signal, sr=sr)
        spectral_contrast = librosa.feature.spectral_contrast(y=vad_signal, sr=sr)
        tonnetz = librosa.feature.tonnetz(y=librosa.effects.harmonic(vad_signal), sr=sr)
        mfcc_delta = librosa.feature.delta(mfcc)
        mfcc_delta2 = librosa.feature.delta(mfcc, order=2)

        return np.concatenate([
            calculate_statistics(mfcc.T),
            calculate_statistics(mfcc2.T),
            calculate_statistics(stft.T),
            calculate_statistics(chroma.T),
            calculate_statistics(spectral_contrast.T),
            calculate_statistics(tonnetz.T),
            calculate_statistics(mfcc_delta.T),
            calculate_statistics(mfcc_delta2.T)
        ])
    except Exception as e:
        print(f"Error processing: {e}")
        return np.zeros((CONFIG.N_MFCC * 9 + 2 * CONFIG.N_MFCC * 9 + 1025 * 9 + 12 * 9 + 7 * 9 + 6 * 9 + CONFIG.N_MFCC * 9 + CONFIG.N_MFCC * 9))

def separate_voices(audio_path):
    separator = Separator('spleeter:2stems')
    output_dir = './output'
    separator.separate_to_file(audio_path, output_dir)
    return os.path.join(output_dir, 'vocals.wav'), os.path.join(output_dir, 'accompaniment.wav')

# 모델 불러오기
with open('./newFolder/rf_model.pkl', 'rb') as f:
    rf_model = pickle.load(f)

# test.csv 파일 읽기
test_df = pd.read_csv('test.csv')

# test.csv 파일에 있는 모든 파일에 대해 특징 추출 및 예측 수행
test_features = []
file_ids = []

for file_id in tqdm(test_df['id'], desc="Extracting features from test files"):
    file_path = os.path.join('./test', file_id + '.ogg')
    vocal_path, accompaniment_path = separate_voices(file_path)
    
    y_vocal, sr_vocal = librosa.load(vocal_path, sr=None)
    y_accompaniment, sr_accompaniment = librosa.load(accompaniment_path, sr=None)
    
    features_vocal = extract_features(y_vocal, sr_vocal)
    features_accompaniment = extract_features(y_accompaniment, sr_accompaniment)
    
    combined_features = np.concatenate([features_vocal, features_accompaniment])
    test_features.append(combined_features)
    file_ids.append(file_id)

test_features = np.array(test_features)

# NaN 값을 평균값으로 대체
imputer = SimpleImputer(strategy='mean')
test_features = imputer.fit_transform(test_features)

# 예측 수행
y_pred_proba = rf_model.predict_proba(test_features)

# 예측 결과를 DataFrame으로 저장
results = pd.DataFrame({
    'id': file_ids,
    'fake': [prob[1] for prob in y_pred_proba[:, 0]],
    'real': [prob[1] for prob in y_pred_proba[:, 1]]
})

# 결과를 CSV 파일로 저장
results.to_csv('predictions.csv', index=False)


Extracting features from test files:   0%|          | 0/50000 [00:00<?, ?it/s]

INFO:tensorflow:Apply unet for vocals_spectrogram
Instructions for updating:
Colocations handled automatically by placer.
INFO:tensorflow:Apply unet for accompaniment_spectrogram
INFO:spleeter:Downloading model archive https://github.com/deezer/spleeter/releases/download/v1.4.0/2stems.tar.gz
INFO:spleeter:Validating archive checksum
INFO:spleeter:Extracting downloaded 2stems archive
INFO:spleeter:2stems model file(s) extracted
INFO:tensorflow:Restoring parameters from pretrained_models/2stems/model


Extracting features from test files:   0%|          | 0/50000 [00:28<?, ?it/s]

INFO:spleeter:File ./output/TEST_00000/accompaniment.wav written succesfully
INFO:spleeter:File ./output/TEST_00000/vocals.wav written succesfully





FileNotFoundError: [Errno 2] No such file or directory: './output/vocals.wav'