In [1]:
# Install gdown, a helper for downloading files shared through Google Drive.
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install -q gdown
# Download the shared zip archive straight into the notebook's working directory.
# gdown accepts a bare file ID; the old --id flag is deprecated.
!gdown 1dCMUGl1sNj0hOZkuBXlk3ounl_QkD5Xz -O data.zip
# Extract data.zip into the data/ folder. -o overwrites existing files without
# prompting, so re-running this cell does not stall on an interactive question.
!unzip -q -o data.zip -d data
# Confirm that data.zip was downloaded and the data/ folder was created.
!ls -al

Collecting gdown
  Downloading gdown-5.2.0-py3-none-any.whl.metadata (5.8 kB)
Downloading gdown-5.2.0-py3-none-any.whl (18 kB)
Installing collected packages: gdown
Successfully installed gdown-5.2.0
Downloading...
From (original): https://drive.google.com/uc?id=1dCMUGl1sNj0hOZkuBXlk3ounl_QkD5Xz
From (redirected): https://drive.google.com/uc?id=1dCMUGl1sNj0hOZkuBXlk3ounl_QkD5Xz&confirm=t&uuid=565bc6e6-ed70-4895-95b4-f058592555d4
To: /kaggle/working/data.zip
100%|██████████████████████████████████████| 7.25G/7.25G [02:37<00:00, 45.9MB/s]
total 7078444
drwxr-xr-x 4 root root       4096 Nov 25 14:47 .
drwxr-xr-x 5 root root       4096 Nov 25 14:42 ..
drwxr-xr-x 2 root root       4096 Nov 25 14:42 .virtual_documents
drwxr-xr-x 4 root root       4096 Nov 25 14:48 data
-rw-r--r-- 1 root root 7248304333 Nov 25 10:11 data.zip


In [2]:
import os
import numpy as np
import pandas as pd
import librosa
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
import warnings
# Install a global "ignore" filter: every Python warning raised after this
# point is silenced, including deprecation notices from the libraries above.
warnings.filterwarnings('ignore')

In [3]:
# Install the analysis dependencies into the running kernel's environment.
# The explicit %pip magic does not depend on IPython's automagic setting.
# NOTE(review): this cell runs after the import cell above; on a fresh
# environment it has to be executed before those imports can succeed.
%pip install librosa numpy pandas scikit-learn tqdm

Note: you may need to restart the kernel to use updated packages.


In [4]:
import os
import numpy as np
import pandas as pd
import librosa
from tqdm import tqdm
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# Dataset locations: the extracted archive root and its train/test sub-folders.
BASE_PATH = "/kaggle/working/data"
TRAIN_PATH, TEST_PATH = (
    os.path.join(BASE_PATH, split_name) for split_name in ("train", "test")
)

class VADDataset(Dataset):
    """Fixed-length acoustic features (and speech-centre labels) for one data split.

    The split is read from ``<base_path>/train`` or ``<base_path>/test``:
    audio lives in the ``wav160`` sub-folder and, for the training split,
    one label file per clip lives in the ``text`` sub-folder.

    Each item is a float32 tensor of shape ``(TARGET_FRAMES, 15)`` holding
    13 MFCCs, RMS energy and zero-crossing rate per frame. Training items are
    ``(features, target)`` pairs where ``target`` is a 1-element float tensor:
    the midpoint of the labelled interval for speech clips, ``-1.0`` otherwise.

    Args:
        base_path: Root folder containing the ``train`` and ``test`` folders.
        is_train: Select the labelled training split (True) or the test split.
    """

    SAMPLE_RATE = 16000   # audio is resampled to 16 kHz on load
    N_MFCC = 13
    N_MELS = 40
    FRAME_LENGTH = 400    # 25 ms at 16 kHz
    HOP_LENGTH = 160      # 10 ms at 16 kHz
    TARGET_FRAMES = 400   # 4 s at 100 frames per second

    def __init__(self, base_path, is_train=True):
        self.is_train = is_train

        # Derive the split folder from the argument instead of module globals,
        # so the dataset can be pointed at any copy of the data.
        split_dir = os.path.join(base_path, 'train' if is_train else 'test')
        self.wav_path = os.path.join(split_dir, 'wav160')
        # Only the training split ships labels; None marks "no labels".
        self.text_path = os.path.join(split_dir, 'text') if is_train else None

        # Keep audio files only: stray entries (hidden files, checkpoints)
        # would otherwise fail at load time deep inside a DataLoader worker.
        self.file_list = sorted(
            name for name in os.listdir(self.wav_path)
            if name.lower().endswith('.wav')
        )

        print(f"{'Train' if is_train else 'Test'} WAV path: {self.wav_path}")
        if is_train:
            print(f"Train TEXT path: {self.text_path}")
        print(f"Number of files: {len(self.file_list)}")

    def __len__(self):
        """Return the number of audio clips in the split."""
        return len(self.file_list)

    def __getitem__(self, idx):
        """Return ``features`` (test) or ``(features, target)`` (train) for clip ``idx``."""
        wav_file = self.file_list[idx]
        features = self._extract_features(os.path.join(self.wav_path, wav_file))

        if not self.is_train:
            return features

        center = self._read_center(wav_file)
        return features, torch.FloatTensor([center])

    def _extract_features(self, wav_path):
        """Load one clip and return its (TARGET_FRAMES, 15) float feature tensor."""
        audio, sr = librosa.load(wav_path, sr=self.SAMPLE_RATE)

        # All three extractors share frame and hop length so their frame
        # counts line up for the concatenation below.
        mfcc = librosa.feature.mfcc(
            y=audio,
            sr=sr,
            n_mfcc=self.N_MFCC,
            n_fft=self.FRAME_LENGTH,
            hop_length=self.HOP_LENGTH,
            n_mels=self.N_MELS
        ).T  # (time, 13)

        rms = librosa.feature.rms(
            y=audio,
            frame_length=self.FRAME_LENGTH,
            hop_length=self.HOP_LENGTH
        ).T  # (time, 1)

        zcr = librosa.feature.zero_crossing_rate(
            y=audio,
            frame_length=self.FRAME_LENGTH,
            hop_length=self.HOP_LENGTH
        ).T  # (time, 1)

        features = np.concatenate([mfcc, rms, zcr], axis=1)  # (time, 15)

        # Zero-pad short clips and truncate long ones to a fixed frame count
        # so that samples can be stacked into batches.
        missing = self.TARGET_FRAMES - features.shape[0]
        if missing > 0:
            features = np.pad(features, ((0, missing), (0, 0)), mode='constant')
        else:
            features = features[:self.TARGET_FRAMES]

        return torch.FloatTensor(features)

    def _read_center(self, wav_file):
        """Return the regression target stored in the label file of ``wav_file``.

        The label file is expected to hold ``onset<TAB>offset<TAB>class``.
        Onset/offset are presumably in seconds -- TODO confirm with the data
        description.

        Raises:
            ValueError: If the label file does not contain exactly three
                tab-separated fields.
        """
        # splitext swaps only the real extension, whatever its casing;
        # str.replace would touch every '.wav' occurrence in the name.
        label_file = os.path.splitext(wav_file)[0] + '.txt'
        label_path = os.path.join(self.text_path, label_file)

        with open(label_path, 'r') as f:
            fields = f.read().strip().split('\t')

        if len(fields) != 3:
            raise ValueError(
                f"Expected 'onset<TAB>offset<TAB>class' in {label_path}, "
                f"found {len(fields)} field(s)"
            )
        onset, offset, sound_class = fields

        if 'speech' in sound_class:
            return (float(onset) + float(offset)) / 2
        # Clips without speech get a sentinel target.
        return -1.0

class VADModel(nn.Module):
    """Bidirectional LSTM regressor with sigmoid-gated pooling over time.

    The network maps a ``(batch, time, input_size)`` feature sequence to one
    scalar per sequence, shape ``(batch, 1)``.

    Args:
        input_size: Number of features per frame.
        hidden_size: Hidden units per LSTM direction (default 64).
        num_layers: Number of stacked LSTM layers (default 2).
        dropout: Dropout probability used between LSTM layers and inside the
            regression head (default 0.3).

    With the default arguments the module names and parameter shapes are the
    same as before these arguments were introduced, so existing checkpoints
    still load.
    """

    def __init__(self, input_size, hidden_size=64, num_layers=2, dropout=0.3):
        super().__init__()

        # A bidirectional LSTM concatenates both directions per time step.
        lstm_out_size = 2 * hidden_size

        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
            # Inter-layer dropout is meaningless (and warns) for one layer.
            dropout=dropout if num_layers > 1 else 0.0
        )

        # One gate value in (0, 1) per time step.
        self.attention = nn.Sequential(
            nn.Linear(lstm_out_size, 1),
            nn.Sigmoid()
        )

        self.fc = nn.Sequential(
            nn.Linear(lstm_out_size, 64),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(32, 1)
        )

    def forward(self, x):
        """Return one prediction per sequence, shape ``(batch, 1)``."""
        lstm_out, _ = self.lstm(x)  # (batch, time, 2 * hidden_size)

        # Gate every time step, then sum over time. The gates are independent
        # sigmoids (not a softmax), so they are not normalised across time.
        attention_weights = self.attention(lstm_out)  # (batch, time, 1)
        context = (lstm_out * attention_weights).sum(dim=1)  # (batch, 2 * hidden_size)

        return self.fc(context)

def train_model(model, train_loader, criterion, optimizer, device):
    """Run one optimisation pass over ``train_loader``.

    Args:
        model: Network to train; it is switched to training mode.
        train_loader: Iterable of ``(features, targets)`` batches with ``len()``.
        criterion: Loss called as ``criterion(outputs, targets)``.
        optimizer: Optimizer that updates ``model``'s parameters.
        device: Device the batches are moved to before the forward pass.

    Returns:
        The mean of the per-batch loss values.
    """
    model.train()
    batch_losses = []

    for inputs, labels in tqdm(train_loader):
        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        batch_loss = criterion(model(inputs), labels)
        batch_loss.backward()
        # Cap the global gradient norm at 1.0 before applying the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        batch_losses.append(batch_loss.item())

    return sum(batch_losses) / len(train_loader)

def predict(model, test_loader, device):
    """Run ``model`` over every batch of ``test_loader`` without gradients.

    Args:
        model: Trained network; it is switched to evaluation mode.
        test_loader: Iterable of feature batches. Batches of the form
            ``[features, targets]`` / ``(features, targets)`` are accepted
            too; only the features are used.
        device: Device the batches are moved to before the forward pass.

    Returns:
        A NumPy array with one row per input sample, in loader order.
    """
    model.eval()
    predictions = []

    with torch.no_grad():
        for features in tqdm(test_loader):
            # The default DataLoader collate returns a *list* for samples that
            # are tuples, so both containers must be unwrapped here.
            if isinstance(features, (list, tuple)):
                features = features[0]
            features = features.to(device)
            outputs = model(features)
            predictions.extend(outputs.cpu().numpy())

    return np.array(predictions)

def main():
    """Train the VAD regressor, predict on the test split and write submission.csv.

    Steps: seed the RNGs, build the train/test loaders, train for a fixed
    number of epochs while checkpointing the epoch with the lowest training
    loss, reload that checkpoint, and save the test predictions as a CSV with
    the columns ``Id`` and ``Center``.
    """
    # Seed NumPy and PyTorch so weight initialisation and shuffling repeat
    # from run to run.
    seed = 42
    np.random.seed(seed)
    torch.manual_seed(seed)

    # Settings
    use_cuda = torch.cuda.is_available()
    device = torch.device('cuda' if use_cuda else 'cpu')
    print(f"Using device: {device}")

    batch_size = 32
    num_epochs = 5
    learning_rate = 0.001
    checkpoint_path = 'best_model.pth'

    # Datasets and data loaders
    train_dataset = VADDataset(BASE_PATH, is_train=True)
    test_dataset = VADDataset(BASE_PATH, is_train=False)

    # Pinned host memory only helps when batches are copied to a GPU.
    loader_kwargs = dict(batch_size=batch_size, num_workers=4, pin_memory=use_cuda)
    train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
    # The test order must stay fixed: row i of the predictions becomes Id i.
    test_loader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)

    # Model, loss and optimisation
    input_size = 15  # MFCC(13) + RMS(1) + ZCR(1)
    model = VADModel(input_size).to(device)
    criterion = nn.MSELoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)
    # The `verbose` argument is deprecated and rejected by newer PyTorch
    # releases; learning-rate changes are reported manually in the loop below.
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='min', factor=0.5, patience=2
    )

    # Training
    print("\nStarting training...")
    best_loss = float('inf')

    for epoch in range(num_epochs):
        train_loss = train_model(model, train_loader, criterion, optimizer, device)
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {train_loss:.4f}')

        # Halve the learning rate when the training loss stops improving.
        previous_lr = optimizer.param_groups[0]['lr']
        scheduler.step(train_loss)
        current_lr = optimizer.param_groups[0]['lr']
        if current_lr != previous_lr:
            print(f'Learning rate reduced to {current_lr:.6f}')

        # Keep the weights of the best epoch so far.
        # NOTE(review): selection uses the training loss; there is no
        # held-out validation split, so this cannot detect overfitting.
        if train_loss < best_loss:
            best_loss = train_loss
            torch.save(model.state_dict(), checkpoint_path)

    # Reload the best checkpoint onto the device in use.
    model.load_state_dict(torch.load(checkpoint_path, map_location=device))

    # Predict on the test split
    print("\nGenerating predictions...")
    predictions = predict(model, test_loader, device)

    # Build the submission file.
    # NOTE(review): Id is the position in the dataset's sorted file list --
    # confirm this matches the Id convention expected by the competition.
    submission_df = pd.DataFrame({
        'Id': np.arange(len(predictions)),
        'Center': predictions.flatten()
    })

    submission_path = 'submission.csv'
    submission_df.to_csv(submission_path, index=False)
    print(f"\nSubmission file saved to: {submission_path}")
    print("First few predictions:", submission_df['Center'].head())

if __name__ == '__main__':
    main()

Using device: cuda
Train WAV path: /kaggle/working/data/train/wav160
Train TEXT path: /kaggle/working/data/train/text
Number of files: 50000
Test WAV path: /kaggle/working/data/test/wav160
Number of files: 10000

Starting training...


100%|██████████| 1563/1563 [19:43<00:00,  1.32it/s]


Epoch 1/5, Loss: 3.0112


100%|██████████| 1563/1563 [19:48<00:00,  1.32it/s]


Epoch 2/5, Loss: 2.7240


100%|██████████| 1563/1563 [19:57<00:00,  1.30it/s]


Epoch 3/5, Loss: 2.0417


100%|██████████| 1563/1563 [19:22<00:00,  1.34it/s]


Epoch 4/5, Loss: 1.7379


100%|██████████| 1563/1563 [19:27<00:00,  1.34it/s]


Epoch 5/5, Loss: 1.5522

Generating predictions...


100%|██████████| 313/313 [03:50<00:00,  1.36it/s]


Submission file saved to: submission.csv
First few predictions: 0   -0.635486
1    0.822275
2   -0.743630
3   -0.789553
4    0.351603
Name: Center, dtype: float32



