In [1]:
import torch

# Report whether PyTorch can see a CUDA device and, if so, the name of device 0.
gpu_ready = torch.cuda.is_available()
if not gpu_ready:
    print("GPU is not available")
else:
    print("GPU is available")
    print(f"GPU device name: {torch.cuda.get_device_name(0)}")

GPU is available
GPU device name: Tesla T4


## Pretrained HuBERT Fine-Tuning

In [2]:
import os
import numpy as np
import pandas as pd
import librosa
import torch
import torch.nn as nn
import torch.nn.functional as F
import pytorch_lightning as pl

from tqdm.auto import tqdm
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import StratifiedKFold
from transformers.optimization import AdamW, get_constant_schedule_with_warmup
from pytorch_lightning import seed_everything
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint, StochasticWeightAveraging
from transformers import AutoModel, AutoModelForSequenceClassification, AutoTokenizer, AutoFeatureExtractor, HubertForSequenceClassification, AutoConfig

In [3]:
def accuracy(preds, labels):
    """Return the fraction of positions where ``preds`` equals ``labels``.

    The element-wise comparison is cast to float and averaged, so the result
    is a 0-d float tensor.
    """
    is_correct = preds == labels
    hit_rate = is_correct.float().mean()
    return hit_rate

In [4]:
class MyLitModel(pl.LightningModule):
    """LightningModule that fine-tunes a HuBERT sequence classifier.

    The backbone's logits are folded into six scores by summing fixed pairs of
    columns (see ``_LOGIT_PAIRS``); loss and accuracy are computed on those six
    scores.

    Args:
        audio_model_name: Hub id or path handed to ``from_pretrained``.
        num_labels: Stored on the instance for reference only. The number of
            output scores is fixed by ``_LOGIT_PAIRS``, not by this value.
        n_layers: How many of the last encoder layers to re-initialise.
        projector: Re-initialise ``audio_model.projector`` when true.
        classifier: Re-initialise ``audio_model.classifier`` when true.
        dropout: Value written to every dropout field of the backbone config.
        lr_decay: Layer-wise learning-rate decay factor (1 disables decay).
    """

    # Pairs of backbone logit columns that are summed into each output score,
    # listed in output order.
    # NOTE(review): presumably each pair merges two variants of the same emotion
    # in the pretrained label set - confirm against the checkpoint's id2label.
    _LOGIT_PAIRS = ((0, 7), (2, 9), (5, 12), (1, 8), (4, 11), (3, 10))

    def __init__(self, audio_model_name, num_labels, n_layers=1, projector=True, classifier=True, dropout=0.07, lr_decay=1):
        super().__init__()
        self.num_labels = num_labels
        self.config = AutoConfig.from_pretrained(audio_model_name)
        # Apply the same dropout value to every dropout field on the config.
        for dropout_field in (
            'activation_dropout',
            'attention_dropout',
            'final_dropout',
            'hidden_dropout',
            'hidden_dropout_prob',
        ):
            setattr(self.config, dropout_field, dropout)
        self.audio_model = HubertForSequenceClassification.from_pretrained(audio_model_name, config=self.config)
        self.lr_decay = lr_decay
        self._do_reinit(n_layers, projector, classifier)

    def forward(self, audio_values, audio_attn_mask):
        """Run the backbone and return the merged logits, one column per pair in ``_LOGIT_PAIRS``."""
        raw_logits = self.audio_model(input_values=audio_values, attention_mask=audio_attn_mask).logits
        merged = [raw_logits[:, first] + raw_logits[:, second] for first, second in self._LOGIT_PAIRS]
        return torch.stack(merged, dim=-1)

    def _shared_step(self, batch):
        """Compute cross-entropy loss and accuracy for one batch (used by train and validation)."""
        labels = batch['label']
        logits = self(batch['audio_values'], batch['audio_attn_mask'])
        loss = F.cross_entropy(logits, labels)
        acc = accuracy(torch.argmax(logits, dim=1), labels)
        return loss, acc

    def training_step(self, batch, batch_idx):
        """Log per-step and per-epoch training loss/accuracy and return the loss."""
        loss, acc = self._shared_step(batch)

        self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        self.log('train_acc', acc, on_step=True, on_epoch=True, prog_bar=True, logger=True)

        return loss

    def validation_step(self, batch, batch_idx):
        """Log epoch-level validation loss/accuracy and return the loss."""
        loss, acc = self._shared_step(batch)

        self.log('val_loss', loss, on_step=False, on_epoch=True, prog_bar=True, logger=True)
        self.log('val_acc', acc, on_step=False, on_epoch=True, prog_bar=True, logger=True)

        return loss

    def predict_step(self, batch, batch_idx, dataloader_idx=None):
        """Return the arg-max class index for every item in the batch."""
        logits = self(batch['audio_values'], batch['audio_attn_mask'])
        return torch.argmax(logits, dim=1)

    def configure_optimizers(self):
        """Build AdamW over the layer-wise learning-rate parameter groups."""
        lr = 1e-5
        weight_decay = 0.01
        llrd_params = self._get_llrd_params(lr=lr, layer_decay=self.lr_decay, weight_decay=weight_decay)
        return AdamW(llrd_params)

    @staticmethod
    def _encoder_layer_index(name):
        """Return the encoder layer number embedded in a parameter name, or None.

        Matches the dotted components ``encoder.layers.<N>`` (and the singular
        ``encoder.layer.<N>``). Whole components are compared, so layer 1 is
        never confused with layers 10, 11, ...
        """
        parts = name.split('.')
        for pos in range(1, len(parts) - 1):
            if parts[pos - 1] == 'encoder' and parts[pos] in ('layer', 'layers') and parts[pos + 1].isdigit():
                return int(parts[pos + 1])
        return None

    def _get_llrd_params(self, lr, layer_decay, weight_decay):
        """Return one optimizer parameter group per parameter, with layer-wise LR decay.

        Grouping rules, checked in this order:
          * names containing ``bias`` or ``layer_norm``: base ``lr``, no weight decay;
          * names containing ``emb`` or ``feature``: ``lr * layer_decay ** (n_layers + 1)``;
          * weights of encoder layer ``i``: ``lr * layer_decay ** (n_layers - i)``;
          * everything else (e.g. projector / classifier): base ``lr``.

        Every parameter lands in exactly one group. The encoder test used to
        look for ``encoder.layer.{i}`` while the parameters are named
        ``encoder.layers.{i}.…``, so encoder weights matched no group and were
        never handed to the optimizer.
        """
        n_layers = self.audio_model.config.num_hidden_layers
        llrd_params = []
        for name, value in self.named_parameters():
            if ('bias' in name) or ('layer_norm' in name):
                group_lr, group_wd = lr, 0.0
            elif ('emb' in name) or ('feature' in name):
                group_lr, group_wd = lr * (layer_decay ** (n_layers + 1)), weight_decay
            else:
                layer_idx = self._encoder_layer_index(name)
                if layer_idx is None:
                    group_lr, group_wd = lr, weight_decay
                else:
                    # Exponent = distance from the head: the top encoder layer gets 1
                    # and layer 0 gets n_layers, one step above the n_layers + 1 used
                    # for the embeddings / feature encoder.
                    # NOTE(review): the earlier (never-executed) formula used
                    # layer_idx + 1, which gives the top layer almost the smallest
                    # rate; confirm the monotonic schedule is what is wanted.
                    group_lr, group_wd = lr * (layer_decay ** (n_layers - layer_idx)), weight_decay
            llrd_params.append({"params": value, "lr": group_lr, "weight_decay": group_wd})
        return llrd_params

    def _do_reinit(self, n_layers=0, projector=True, classifier=True):
        """Re-initialise the projector, the classifier and the last ``n_layers`` encoder layers as requested."""
        if projector:
            self.audio_model.projector.apply(self._init_weight_and_bias)
        if classifier:
            self.audio_model.classifier.apply(self._init_weight_and_bias)

        for n in range(n_layers):
            # Index from the end: n = 0 is the final encoder layer.
            self.audio_model.hubert.encoder.layers[-(n + 1)].apply(self._init_weight_and_bias)

    def _init_weight_and_bias(self, module):
        """Reset Linear weights to N(0, initializer_range) with zero bias, and LayerNorm to weight 1 / bias 0."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.audio_model.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

In [6]:
import os
import numpy as np
import pandas as pd
import librosa
import torch
import torch.nn as nn
import torch.nn.functional as F
import pytorch_lightning as pl
import soundfile as sf
import warnings

from tqdm.auto import tqdm
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import StratifiedKFold, train_test_split
from transformers.optimization import AdamW, get_constant_schedule_with_warmup
from pytorch_lightning import seed_everything
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint, StochasticWeightAveraging
from transformers import AutoModel, AutoModelForSequenceClassification, AutoTokenizer, AutoFeatureExtractor, HubertForSequenceClassification, AutoConfig
from sklearn.metrics import accuracy_score

# Helper function for loading audio
def load_audio(file_path):
    """Read an audio file and return a mono waveform sampled at ``SAMPLING_RATE``.

    Args:
        file_path: Path of the audio file to read.

    Returns:
        A 1-D array of samples.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    speech, rate = sf.read(file_path)
    if speech.ndim > 1:
        # soundfile returns (frames, channels) for multi-channel files. Average the
        # channels so that resampling below runs along the time axis and the
        # feature extractor receives a single waveform.
        speech = speech.mean(axis=1)
    if rate != SAMPLING_RATE:  # Resample when the file's rate differs from the target
        speech = librosa.resample(speech, orig_sr=rate, target_sr=SAMPLING_RATE)
    return speech

# Dataset Class to load audio files and their respective labels
class SpeechDataset(Dataset):
    """Dataset of ``.wav`` files paired with an emotion label looked up in a DataFrame.

    Each file name is expected to end in ``_<number>.wav``; ``<number>`` is
    matched against the ``연번`` column of ``df`` and the ``감정`` value of the
    first matching row is mapped through the module-level ``label2id``. Files
    without a matching row are skipped.

    Args:
        df: DataFrame with the ``연번`` and ``감정`` columns.
        base_directory: Directory that contains the folders in ``folders``.
        folders: Names of the sub-folders to scan.
    """

    def __init__(self, df, base_directory, folders):
        self.base_directory = base_directory
        self.folders = folders
        self.data = []

        # Build the id -> emotion lookup once instead of filtering the whole frame
        # for every file; keep the first row per id, as a positional lookup would.
        first_rows = df.drop_duplicates(subset='연번', keep='first')
        emotion_by_number = dict(zip(first_rows['연번'], first_rows['감정']))

        # Iterate through the folders to match CSV information
        for folder_name in folders:
            folder_path = os.path.join(base_directory, folder_name)
            # os.listdir order is arbitrary; sort by file name so that sample
            # indices (and any index-based split of this dataset) are reproducible.
            for file_name in sorted(os.listdir(folder_path)):
                if not file_name.endswith(".wav"):
                    continue
                file_number = int(file_name.split('_')[-1].replace(".wav", ""))
                if file_number not in emotion_by_number:
                    continue
                self.data.append({
                    'file_path': os.path.join(folder_path, file_name),
                    'label': label2id[emotion_by_number[file_number]]
                })

    def __len__(self):
        """Number of labelled audio files found."""
        return len(self.data)

    def __getitem__(self, idx):
        """Load one file and run it through the module-level feature extractor.

        Returns a dict with ``input_values``, ``attention_mask`` (``None`` when
        the extractor does not return one) and ``labels`` (0-d long tensor).
        """
        file_path = self.data[idx]['file_path']
        label = self.data[idx]['label']
        speech = load_audio(file_path)
        inputs = audio_feature_extractor(speech, sampling_rate=SAMPLING_RATE, return_tensors="pt", padding=True)

        return {
            "input_values": inputs.input_values[0],
            "attention_mask": inputs.attention_mask[0] if "attention_mask" in inputs else None,
            "labels": torch.tensor(label, dtype=torch.long),
        }

# Collate function to pad sequences
def collate_fn(batch):
    """Pad a list of dataset items into one batch.

    Args:
        batch: List of dicts, each with a 1-D ``input_values`` tensor and a
            scalar ``labels`` entry.

    Returns:
        Dict with ``audio_values`` (zero-padded, batch first), ``audio_attn_mask``
        (long tensor, 1 for real samples and 0 for padding) and ``label``
        (long tensor of labels).
    """
    input_values = [item["input_values"] for item in batch]
    labels = torch.tensor([int(item["labels"]) for item in batch], dtype=torch.long)
    input_values_padded = pad_sequence(input_values, batch_first=True, padding_value=0.0)

    # Derive the mask from the true lengths. Comparing the padded values with 0
    # would also mask genuine samples whose value happens to be exactly zero.
    device = input_values_padded.device
    lengths = torch.tensor([values.shape[0] for values in input_values], dtype=torch.long, device=device)
    positions = torch.arange(input_values_padded.shape[1], device=device)
    attention_mask = (positions.unsqueeze(0) < lengths.unsqueeze(1)).long()

    return {
        "audio_values": input_values_padded,
        "audio_attn_mask": attention_mask,
        "label": labels,
    }

# Updated Constants and Configuration
DATA_DIR = './data'
PREPROC_DIR = './preproc'
SUBMISSION_DIR = './submission'
MODEL_DIR = './model'
SAMPLING_RATE = 16000
SEED=0
BATCH_SIZE=8
NUM_LABELS = 6

seed_everything(SEED)

# Defining emotions and mapping them to IDs
csv_file_path = './data/csv/labeling.csv'
df = pd.read_csv(csv_file_path).dropna(subset=['감정'])
# Label ids follow the order in which each emotion first appears in the CSV.
emotion_labels = df['감정'].unique().tolist()
if len(emotion_labels) > NUM_LABELS:
    # Fail here with a readable message rather than later with an
    # out-of-range target inside the loss computation.
    raise ValueError(
        f"Found {len(emotion_labels)} distinct emotions in {csv_file_path}, "
        f"but NUM_LABELS is {NUM_LABELS}"
    )
label2id = {str(emotion): int(i) for i, emotion in enumerate(emotion_labels)}
id2label = {int(i): str(emotion) for i, emotion in enumerate(emotion_labels)}

# Extract folders and split train/test
audio_directory = "/mnt/gcs-bucket/data/"
all_folders = [folder for folder in os.listdir(audio_directory) if os.path.isdir(os.path.join(audio_directory, folder)) and folder.startswith("F")]
# Sort so the folder-level split below does not depend on directory listing order.
all_folders = sorted(all_folders)
train_folders, test_folders = train_test_split(all_folders, test_size=0.2, random_state=42)

# Initialize feature extractor
audio_model_name = 'Rajaram1996/Hubert_emotion'
audio_feature_extractor = AutoFeatureExtractor.from_pretrained(audio_model_name)
audio_feature_extractor.return_attention_mask=True

# Prepare datasets
train_dataset = SpeechDataset(df, audio_directory, train_folders)
test_dataset = SpeechDataset(df, audio_directory, test_folders)

# Split dataset into smaller subsets
def split_dataset(dataset, num_splits):
    """Partition ``dataset`` into ``num_splits`` contiguous index ranges.

    Every split holds ``len(dataset) // num_splits`` items; the last split also
    receives the remainder. All splits are ``torch.utils.data.Subset`` objects.

    Args:
        dataset: Any object supporting ``len()`` and integer indexing.
        num_splits: Number of parts to create; must be at least 1.

    Returns:
        List of ``num_splits`` subsets, in index order.

    Raises:
        ValueError: If ``num_splits`` is smaller than 1.
    """
    if num_splits < 1:
        raise ValueError(f"num_splits must be a positive integer, got {num_splits}")

    total = len(dataset)
    split_size = total // num_splits
    splits = []
    for part in range(num_splits):
        start = part * split_size
        # The final part runs to the end so the remainder is not dropped.
        stop = total if part == num_splits - 1 else start + split_size
        splits.append(torch.utils.data.Subset(dataset, list(range(start, stop))))
    return splits

# Train/Test dataset split into subsets
train_subsets = split_dataset(train_dataset, 2)
test_subsets = split_dataset(test_dataset, 2)

# Training and evaluating each subset
# zip() stops at the shorter list, so that is the real number of rounds.
num_rounds = min(len(train_subsets), len(test_subsets))
for i, (train_subset, test_subset) in enumerate(zip(train_subsets, test_subsets)):
    warnings.filterwarnings("ignore")  # Hide all warnings
    torch.cuda.empty_cache()
    print(f"Training on subset {i + 1}/{num_rounds}")
    train_loader = DataLoader(train_subset, batch_size=BATCH_SIZE, collate_fn=collate_fn, shuffle=True)
    # NOTE(review): the held-out subset is used as the validation loader during fit.
    test_loader = DataLoader(test_subset, batch_size=BATCH_SIZE, collate_fn=collate_fn)

    # Initialize model
    # NOTE(review): a fresh model is created for every subset, so each round starts
    # again from the pretrained weights - confirm this is intended.
    my_lit_model = MyLitModel(
        audio_model_name=audio_model_name,
        num_labels=NUM_LABELS,
        n_layers=1, projector=True, classifier=True, dropout=0.07, lr_decay=0.8
    )

    # Trainer configuration
    trainer = pl.Trainer(
        accelerator='cuda', 
        max_epochs=3,
        precision=16,
        val_check_interval=0.1,  # run validation every 10% of a training epoch
    )

    # Training model
    trainer.fit(my_lit_model, train_loader, test_loader)
    del my_lit_model


Seed set to 0


Training on subset 1/36


Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at Rajaram1996/Hubert_emotion and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Trainer will use only 1 of 2 GPUs because it is running inside an interactive / notebook environment. You may try to set `Trainer(devices=2)` but please note that multi-GPU inside interactive / notebook environments is considered experimental and unstable. Your mileage may vary.
Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
2024-10-22 09:38:42.800437: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-

Sanity Checking: |                                                                                            …

Training: |                                                                                                   …

Validation: |                                                                                                 …

Validation: |                                                                                                 …

Validation: |                                                                                                 …

Validation: |                                                                                                 …

Validation: |                                                                                                 …

Validation: |                                                                                                 …

Validation: |                                                                                                 …

Validation: |                                                                                                 …

Validation: |                                                                                                 …

Validation: |                                                                                                 …

Validation: |                                                                                                 …

Validation: |                                                                                                 …

Validation: |                                                                                                 …

Validation: |                                                                                                 …

Validation: |                                                                                                 …

Validation: |                                                                                                 …

Validation: |                                                                                                 …

Validation: |                                                                                                 …

Validation: |                                                                                                 …

Validation: |                                                                                                 …

Validation: |                                                                                                 …

Validation: |                                                                                                 …

Validation: |                                                                                                 …

Validation: |                                                                                                 …

Validation: |                                                                                                 …

Validation: |                                                                                                 …

Validation: |                                                                                                 …

Validation: |                                                                                                 …

Validation: |                                                                                                 …

Validation: |                                                                                                 …

`Trainer.fit` stopped: `max_epochs=3` reached.


Training on subset 2/36


Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at Rajaram1996/Hubert_emotion and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Trainer will use only 1 of 2 GPUs because it is running inside an interactive / notebook environment. You may try to set `Trainer(devices=2)` but please note that multi-GPU inside interactive / notebook environments is considered experimental and unstable. Your mileage may vary.
Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type                            | Params | Mode
-----------------------------------------------------------------------
0 | audio_model | HubertForSequence

Sanity Checking: |                                                                                            …

Training: |                                                                                                   …

Validation: |                                                                                                 …

Validation: |                                                                                                 …

Validation: |                                                                                                 …

Validation: |                                                                                                 …

Validation: |                                                                                                 …

Validation: |                                                                                                 …

Validation: |                                                                                                 …

Validation: |                                                                                                 …

Validation: |                                                                                                 …

Validation: |                                                                                                 …

Validation: |                                                                                                 …

Validation: |                                                                                                 …

Validation: |                                                                                                 …

Validation: |                                                                                                 …

Validation: |                                                                                                 …

Validation: |                                                                                                 …

Validation: |                                                                                                 …

Validation: |                                                                                                 …

Validation: |                                                                                                 …

Validation: |                                                                                                 …

Validation: |                                                                                                 …

Validation: |                                                                                                 …

Validation: |                                                                                                 …

Validation: |                                                                                                 …

Validation: |                                                                                                 …

Validation: |                                                                                                 …

Validation: |                                                                                                 …

Validation: |                                                                                                 …

Validation: |                                                                                                 …

Validation: |                                                                                                 …

`Trainer.fit` stopped: `max_epochs=3` reached.


In [6]:
class MyLitModel(pl.LightningModule):
    """Evaluation-only module that mirrors the training model's parameter layout.

    The training-time ``MyLitModel`` keeps its backbone in ``self.audio_model``,
    builds it with the checkpoint's own label count, and sums fixed pairs of
    logit columns into six scores. This class does the same, so that a
    checkpoint written during training loads tensor-for-tensor. With a
    different attribute name or head size, ``strict=False`` loading restores
    nothing and raises no error.

    Args:
        audio_model_name: Hub id or path handed to ``from_pretrained``.
        num_labels: Stored for reference; the output width is fixed by ``_LOGIT_PAIRS``.
        n_layers, projector, classifier: Accepted for signature compatibility; unused.
        dropout: Probability for the (unused) ``self.dropout`` layer.
        lr_decay: Stored on the instance; not used for evaluation.
    """

    # Backbone logit column pairs summed into each output score; must match the
    # merge performed by the training-time model.
    _LOGIT_PAIRS = ((0, 7), (2, 9), (5, 12), (1, 8), (4, 11), (3, 10))

    def __init__(self, audio_model_name, num_labels, n_layers, projector, classifier, dropout, lr_decay):
        super().__init__()
        self.num_labels = num_labels
        # Do not override num_labels here: the classifier must keep the shape it
        # had during training, or its weights cannot be loaded.
        self.audio_model = HubertForSequenceClassification.from_pretrained(audio_model_name)
        self.dropout = torch.nn.Dropout(dropout)  # Dropout layer (not applied in forward)
        self.lr_decay = lr_decay  # Learning-rate decay parameter, kept in case it is needed

    @property
    def hubert_model(self):
        """Backward-compatible alias for ``audio_model``."""
        return self.audio_model

    def forward(self, batch):
        """Return ``(loss, logits)`` for a batch dict with ``audio_values``, ``audio_attn_mask`` and ``label``."""
        input_values = batch['audio_values']
        attention_mask = batch['audio_attn_mask']
        labels = batch['label']

        # Pass the inputs through the backbone, then merge the logit columns.
        raw_logits = self.audio_model(input_values=input_values, attention_mask=attention_mask).logits
        logits = torch.stack(
            [raw_logits[:, first] + raw_logits[:, second] for first, second in self._LOGIT_PAIRS],
            dim=-1,
        )
        loss = F.cross_entropy(logits, labels)
        return loss, logits

    def test_step(self, batch, batch_idx):
        """Log the accuracy of the arg-max predictions for one test batch as ``test_acc``."""
        loss, logits = self(batch)
        preds = torch.argmax(logits, dim=-1)
        acc = accuracy_score(batch['label'].cpu(), preds.cpu())
        self.log('test_acc', acc)

In [10]:
from torch.utils.data import DataLoader

# Load the model from the checkpoint file
checkpoint_path = './lightning_logs/version_1/checkpoints/epoch=2-step=4476.ckpt'
model = MyLitModel.load_from_checkpoint(
    checkpoint_path, 
    audio_model_name='Rajaram1996/Hubert_emotion', 
    num_labels=6, 
    n_layers=1, 
    projector=True, 
    classifier=True, 
    dropout=0.07, 
    lr_decay=0.8,
    strict=False
)

# strict=False skips, without any error, every checkpoint tensor whose name is not
# present in the model (and leaves unmatched model tensors at their initial
# values). Compare the two key sets so a silent partial load is visible.
checkpoint_keys = set(torch.load(checkpoint_path, map_location='cpu', weights_only=False)['state_dict'])
model_keys = set(model.state_dict())
keys_not_loaded = checkpoint_keys - model_keys
keys_not_in_checkpoint = model_keys - checkpoint_keys
if keys_not_loaded or keys_not_in_checkpoint:
    print(
        f"WARNING: checkpoint/model parameter names differ: "
        f"{len(keys_not_loaded)} checkpoint tensors were not loaded, "
        f"{len(keys_not_in_checkpoint)} model tensors were not restored "
        f"(of {len(model_keys)} model tensors)."
    )

Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at Rajaram1996/Hubert_emotion and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# Save the weights of the model loaded from the checkpoint
weights_path = './model/model_weights.pth'
# torch.save does not create missing parent directories.
os.makedirs(os.path.dirname(weights_path), exist_ok=True)
torch.save(model.state_dict(), weights_path)