In [None]:
# Colab-only: prompt for files from the local machine and upload them into the
# runtime. The return value is kept in `uploaded` for later inspection.
from google.colab import files
uploaded = files.upload()  # Choose files from your system


Saving updated_iemocap_metadata.csv to updated_iemocap_metadata.csv


In [None]:
# List the working directory to confirm the uploaded CSV is present.
!ls

sample_data  updated_iemocap_metadata.csv


In [None]:
import pandas as pd

# Where the uploaded metadata lives and where the Colab-ready copy is written.
csv_path = '/content/updated_iemocap_metadata.csv'
updated_csv_path = '/content/sample_data/updated_iemocap_metadata_colab.csv'

# Read the metadata and swap the Kaggle audio prefix for the Colab one.
# The replacement is a literal substring match (regex=False).
df = pd.read_csv(csv_path).assign(
    filepath=lambda frame: frame['filepath'].str.replace(
        '/kaggle/input/iemocap/iemocap_audio/', '/content/iemocap/', regex=False
    )
)

# Write the rewritten metadata without the index column.
df.to_csv(updated_csv_path, index=False)

print(f"✅ Updated CSV saved to: {updated_csv_path}")


✅ Updated CSV saved to: /content/sample_data/updated_iemocap_metadata_colab.csv


In [None]:
# Spot-check the first few rewritten audio paths.
print(df['filepath'].head())

0    /content/iemocap/Ses01F_impro01_F000.wav
1    /content/iemocap/Ses01F_impro01_F001.wav
2    /content/iemocap/Ses01F_impro01_F002.wav
3    /content/iemocap/Ses01F_impro01_F003.wav
4    /content/iemocap/Ses01F_impro01_F004.wav
Name: filepath, dtype: object


In [None]:
# Colab-only: flush and unmount any existing Drive mount, then mount Google
# Drive at /content/drive so the audio archive can be read from it.
from google.colab import drive
drive.flush_and_unmount()
drive.mount('/content/drive')


Drive not mounted, so nothing to flush and unmount.
Mounted at /content/drive


In [None]:
import zipfile
import os

# Archive on the mounted Drive, and the folder the metadata CSV paths point into
zip_path = '/content/drive/MyDrive/iemocap_audio.zip'
extract_to = '/content/iemocap/'

# Create the target folder (no error if it already exists)
os.makedirs(extract_to, exist_ok=True)

# Extract every member of the archive into the target folder.
# NOTE(review): the rewritten CSV expects the .wav files directly under
# /content/iemocap/ — confirm the archive has no extra top-level folder.
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_to)

print(f"✅ Unzipped to {extract_to}")


✅ Unzipped to /content/iemocap/


In [None]:
import pandas as pd
import librosa
from IPython.display import Audio

# Load the Colab-path metadata CSV written by the path-rewrite cell
csv_path = '/content/sample_data/updated_iemocap_metadata_colab.csv'
df = pd.read_csv(csv_path)

# Path of the first audio file listed in the metadata
audio_path = df['filepath'].iloc[0]

# Decode the audio; sr=None is intended to keep the file's own sampling rate
# instead of resampling (per librosa's load API — confirm).
waveform, sr = librosa.load(audio_path, sr=None)

# Last expression of the cell: renders an inline audio player as the output
Audio(waveform, rate=sr)


In [None]:
# Step 2: Install dependencies
!pip install numpy==1.26.4
!pip install pandas==2.2.2
!pip install torch==2.0.0 torchvision==0.15.1
!pip install transformers==4.31.0
!pip install torchaudio==2.0.1
!pip install tqdm==4.66.2
!pip install nlpaug==1.1.11



In [None]:
import numpy
import pandas
import torch
import transformers
import torchaudio
import tqdm
import nlpaug

# Print the installed version of every pinned package, in install order.
# Expected values: NumPy 1.26.4, Pandas 2.2.2, Torch 2.0.0,
# Transformers 4.31.0, Torchaudio 2.0.1, Tqdm 4.66.2, Nlpaug 1.1.11.
for _label, _module in (
    ("NumPy", numpy),
    ("Pandas", pandas),
    ("Torch", torch),
    ("Transformers", transformers),
    ("Torchaudio", torchaudio),
    ("Tqdm", tqdm),
    ("Nlpaug", nlpaug),
):
    print(f"{_label}: {_module.__version__}")

# Should be True on a GPU runtime.
print(f"CUDA available: {torch.cuda.is_available()}")

NumPy: 1.26.4
Pandas: 2.2.2
Torch: 2.0.0+cu117
Transformers: 4.31.0
Torchaudio: 2.0.1+cu117
Tqdm: 4.66.2
Nlpaug: 1.1.11
CUDA available: False


In [None]:
# Download the NLTK resources: the English averaged-perceptron tagger, WordNet
# and the Open Multilingual WordNet data.
# NOTE(review): presumably required by the nlpaug synonym augmenter used for
# text augmentation — confirm against nlpaug's requirements.
import nltk
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
from torch.nn.utils.rnn import pad_sequence
from transformers import HubertModel, Wav2Vec2FeatureExtractor, BertModel, BertTokenizer
import torchaudio
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
import nlpaug.augmenter.word as naw
import random

# Dataset with fixed augmentation
class MultimodalIEMOCAPDataset(Dataset):
    """Paired audio/transcript samples from a metadata CSV, for valence regression.

    Each item is ``(inputs, label)``:

    * ``inputs['audio_values']`` - feature-extractor output for a mono 16 kHz
      waveform that has been truncated / zero-padded to ``max_audio_samples``.
    * ``inputs['input_ids']`` / ``inputs['attention_mask']`` - tokenized
      transcript (truncated to 512 tokens, no padding).
    * ``label`` - the row's ``EmoVal`` as a float32 scalar tensor.

    Args:
        csv_file: CSV with ``filepath``, ``transcription`` and ``EmoVal`` columns.
        audio_feature_extractor: callable applied to the numpy waveform; its
            result must expose ``['input_values']``.
        text_tokenizer: callable applied to the transcript; its result must
            expose ``['input_ids']`` and ``['attention_mask']``.
        augment: when True, randomly applies noise + speed perturbation to the
            audio (p=0.5) and synonym replacement to the text (p=0.3).
        max_audio_samples: fixed waveform length in samples at 16 kHz.

    Raises (from ``__getitem__``):
        FileNotFoundError: the audio path does not exist.
        RuntimeError: the audio cannot be decoded or is (near-)silent.
    """

    def __init__(self, csv_file, audio_feature_extractor, text_tokenizer, augment=True, max_audio_samples=128000):
        self.df = pd.read_csv(csv_file)
        self.audio_feature_extractor = audio_feature_extractor
        self.text_tokenizer = text_tokenizer
        self.augment = augment
        self.text_augmenter = naw.SynonymAug(aug_p=0.3) if augment else None
        self.max_audio_samples = max_audio_samples

    def __len__(self):
        return len(self.df)

    def _fit_to_length(self, audio):
        """Truncate or right-pad a 1-D waveform with zeros to ``max_audio_samples``."""
        num_samples = audio.size(0)
        if num_samples > self.max_audio_samples:
            return audio[:self.max_audio_samples]
        if num_samples < self.max_audio_samples:
            return torch.nn.functional.pad(audio, (0, self.max_audio_samples - num_samples))
        return audio

    def __getitem__(self, idx):
        # One row lookup instead of three separate .iloc calls.
        row = self.df.iloc[idx]
        audio_path = row['filepath']
        transcript = row['transcription']
        # np.float32(...) accepts both numpy and plain Python scalars.
        vad_label = np.float32(row['EmoVal'])

        if not os.path.exists(audio_path):
            raise FileNotFoundError(f"Audio file missing: {audio_path}")

        try:
            audio, sr = torchaudio.load(audio_path)
            if audio.abs().mean() < 1e-5:
                raise ValueError(f"Silent audio: {audio_path}")
        except Exception as e:
            # Decode failures and silent clips are both surfaced as RuntimeError;
            # the original exception is kept as the explicit cause.
            raise RuntimeError(f"Error loading audio {audio_path}: {str(e)}") from e

        if sr != 16000:
            audio = torchaudio.transforms.Resample(sr, 16000)(audio)

        audio = audio.squeeze(0)  # Remove channel dim if mono
        if audio.dim() > 1:
            audio = audio[0]  # Take first channel if stereo

        # NOTE(review): zero padding happens before the feature extractor runs; if
        # the extractor normalizes the waveform, the padded region will no longer
        # be exactly zero afterwards — confirm downstream masking assumptions.
        audio = self._fit_to_length(audio)

        if self.augment and random.random() < 0.5:
            # Add noise
            noise = torch.randn_like(audio) * 0.005
            audio = audio + noise

            # Speed augmentation
            try:
                speed_factor = random.uniform(0.9, 1.1)
                # `audio` is already a tensor: add the channel dim directly instead
                # of re-wrapping it with torch.tensor(), which emits a UserWarning.
                augmented_audio, _ = torchaudio.sox_effects.apply_effects_tensor(
                    audio.unsqueeze(0),  # [1, samples]
                    sample_rate=16000,
                    effects=[["speed", str(speed_factor)], ["rate", "16000"]]
                )
                # Speed change alters the length, so fit to the fixed size again.
                audio = self._fit_to_length(augmented_audio.squeeze(0))
            except Exception as e:
                # Best effort: keep the (noisy) un-sped audio.
                print(f"Speed augmentation failed for {audio_path}: {e}")

        audio = audio.numpy()

        if self.augment and self.text_augmenter and random.random() < 0.3:
            try:
                transcript = self.text_augmenter.augment(transcript)[0]
            except Exception as e:
                # Best effort: keep the original transcript.
                print(f"Text augmentation failed for {audio_path}: {e}")

        audio_inputs = self.audio_feature_extractor(
            audio,
            sampling_rate=16000,
            return_tensors="pt",
            padding=False,
            truncation=False
        )

        text_inputs = self.text_tokenizer(
            transcript,
            padding=False,
            truncation=True,
            max_length=512,
            return_tensors="pt"
        )

        # Drop the leading batch dimension added by return_tensors="pt".
        return {
            'audio_values': audio_inputs['input_values'].squeeze(0),
            'input_ids': text_inputs['input_ids'].squeeze(0),
            'attention_mask': text_inputs['attention_mask'].squeeze(0)
        }, torch.tensor(vad_label)

# Dynamic collation
def dynamic_collate_fn(batch):
    """Collate ``(inputs, label)`` samples into one right-padded batch.

    Every tensor field is padded along its first dimension up to the longest
    sample in the batch (audio with 0.0, token ids and attention mask with 0).
    The scalar labels are stacked into a 1-D tensor.

    Args:
        batch: sequence of ``(dict, label_tensor)`` pairs; each dict holds
            ``'audio_values'``, ``'input_ids'`` and ``'attention_mask'``.

    Returns:
        ``(dict_of_padded_tensors, stacked_labels)``.
    """
    samples, targets = zip(*batch)

    def _pad_field(key, fill):
        # Gather one field across the batch and pad it to a common length.
        return pad_sequence(
            [sample[key] for sample in samples],
            batch_first=True,
            padding_value=fill,
        )

    collated = {
        'audio_values': _pad_field('audio_values', 0.0),
        'input_ids': _pad_field('input_ids', 0),
        'attention_mask': _pad_field('attention_mask', 0),
    }
    return collated, torch.stack(targets)

# Label smoothing loss
class SmoothMSELoss(nn.Module):
    """Mean-squared error against targets shrunk toward a fixed center value.

    The target used in the loss is
    ``target * (1 - smoothing) + center * smoothing``.

    Args:
        smoothing: interpolation weight toward ``center``; 0 gives plain MSE.
        center: value the targets are pulled toward. Defaults to 3.0, the
            value that was previously hard-coded here.
    """

    def __init__(self, smoothing=0.1, center=3.0):
        super().__init__()
        self.smoothing = smoothing
        self.center = center
        self.mse = nn.MSELoss()

    def forward(self, pred, target):
        """Return the MSE between ``pred`` and the smoothed ``target``."""
        smooth_target = target * (1 - self.smoothing) + self.center * self.smoothing
        return self.mse(pred, smooth_target)

# Transformer model
class ValenceRegressor(nn.Module):
    """Audio + text valence regressor with cross-modal attention and gated fusion.

    Pipeline implemented by ``forward``:

    1. Audio frame features pass through a stack of Transformer encoder layers
       and a LayerNorm.
    2. The transcript is encoded with ``bert-base-uncased`` (token-level
       ``last_hidden_state``).
    3. Both streams are projected to ``hidden_dim`` and attend to each other
       (audio->text and text->audio multi-head attention).
    4. Each stream is gated by a sigmoid computed from itself concatenated with
       what it attended to, then pooled (attention pooling for audio, masked
       mean for text).
    5. The pooled vectors are fused and mapped to one value squashed into
       the open interval (1, 5).

    Args:
        audio_dim: feature size of the incoming audio frames.
        text_dim: hidden size of the text encoder output.
        hidden_dim: shared projection size for both modalities.
        num_heads: heads in the audio Transformer layers; the cross-attention
            blocks use ``num_heads // 2``.
        num_layers: number of audio Transformer encoder layers.
        dropout: dropout probability used throughout.
    """

    def __init__(self, audio_dim=768, text_dim=768, hidden_dim=192, num_heads=6, num_layers=2, dropout=0.5):
        super().__init__()

        self.audio_transformer = nn.ModuleList([
            nn.TransformerEncoderLayer(
                d_model=audio_dim,
                nhead=num_heads,
                dim_feedforward=hidden_dim * 4,
                dropout=dropout,
                batch_first=True
            ) for _ in range(num_layers)
        ])

        self.audio_layer_norm = nn.LayerNorm(audio_dim)

        # Scores each (projected, gated) audio frame for attention pooling.
        self.audio_attention_pool = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim * 2),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim * 2, 1)
        )

        self.text_encoder = BertModel.from_pretrained("bert-base-uncased")
        # Freeze the text encoder, then re-enable only its last two parameters.
        # NOTE(review): for BertModel the last two parameters are presumably the
        # pooler's dense weight/bias, which forward() never reads (it only uses
        # last_hidden_state) — confirm this is the intended set to fine-tune.
        for param in self.text_encoder.parameters():
            param.requires_grad = False
        for param in list(self.text_encoder.parameters())[-2:]:
            param.requires_grad = True

        self.audio_projection = nn.Linear(audio_dim, hidden_dim)
        self.text_projection = nn.Linear(text_dim, hidden_dim)

        self.audio_to_text_attention = nn.MultiheadAttention(
            embed_dim=hidden_dim,
            num_heads=num_heads//2,
            dropout=dropout,
            batch_first=True
        )

        self.text_to_audio_attention = nn.MultiheadAttention(
            embed_dim=hidden_dim,
            num_heads=num_heads//2,
            dropout=dropout,
            batch_first=True
        )

        self.audio_gate = nn.Sequential(
            nn.Linear(hidden_dim*2, hidden_dim),
            nn.Sigmoid()
        )

        self.text_gate = nn.Sequential(
            nn.Linear(hidden_dim*2, hidden_dim),
            nn.Sigmoid()
        )

        self.fusion_layer = nn.Sequential(
            nn.Linear(hidden_dim*2, hidden_dim*2),
            nn.LayerNorm(hidden_dim*2),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim*2, hidden_dim)
        )

        self.shared_fc = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.LayerNorm(hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout)
        )

        self.output_branch = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim//2),
            nn.LayerNorm(hidden_dim//2),
            nn.GELU(),
            nn.Dropout(dropout * 0.5),
            nn.Linear(hidden_dim//2, 1)
        )

    def audio_attention_pooling(self, x, audio_mask=None):
        """Pool a frame sequence into one vector with learned softmax weights.

        Args:
            x: ``[batch, frames, hidden_dim]`` frame features.
            audio_mask: optional ``[batch, frames]`` mask, truthy for valid
                frames; masked-out frames get zero weight.

        Returns:
            ``[batch, hidden_dim]`` weighted sum over frames.
        """
        weights = self.audio_attention_pool(x)
        if audio_mask is not None:
            # -inf before the softmax gives masked frames exactly zero weight.
            weights = weights.masked_fill(~audio_mask.bool().unsqueeze(-1), float('-inf'))
        weights = torch.softmax(weights, dim=1)
        output = torch.bmm(weights.transpose(1, 2), x)
        return output.squeeze(1)

    def forward(self, audio_features, input_ids, attention_mask):
        """Predict valence from audio frame features and a tokenized transcript.

        Args:
            audio_features: ``[batch, frames, audio_dim]`` audio encoder output.
            input_ids: ``[batch, tokens]`` token ids.
            attention_mask: ``[batch, tokens]`` with 1 for real tokens, 0 for padding.

        Returns:
            ``[batch, 1]`` predictions in the open interval (1, 5).
        """
        # A frame counts as valid when its feature vector is not (numerically) all zero.
        audio_mask = (audio_features.abs().sum(dim=-1) > 1e-6)
        # Boolean key-padding mask (True = ignore this frame). It must stay
        # boolean: a float mask is *added* to the attention logits, so the former
        # `.float()` cast boosted padded frames by +1.0 instead of masking them.
        audio_padding_mask = ~audio_mask

        audio_repr = audio_features
        for layer in self.audio_transformer:
            audio_repr = layer(audio_repr, src_key_padding_mask=audio_padding_mask)

        audio_repr = self.audio_layer_norm(audio_repr)

        text_outputs = self.text_encoder(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        text_repr = text_outputs.last_hidden_state

        audio_proj = self.audio_projection(audio_repr)
        text_proj = self.text_projection(text_repr)

        # Audio frames query the text tokens (padding tokens are ignored).
        audio_attended_text, _ = self.audio_to_text_attention(
            query=audio_proj,
            key=text_proj,
            value=text_proj,
            key_padding_mask=(1 - attention_mask).bool()
        )

        # Text tokens query the audio frames (invalid frames are ignored).
        text_attended_audio, _ = self.text_to_audio_attention(
            query=text_proj,
            key=audio_proj,
            value=audio_proj,
            key_padding_mask=audio_padding_mask
        )

        audio_concat = torch.cat([audio_proj, audio_attended_text], dim=-1)
        text_concat = torch.cat([text_proj, text_attended_audio], dim=-1)

        audio_gate_value = self.audio_gate(audio_concat)
        text_gate_value = self.text_gate(text_concat)

        gated_audio = audio_proj * audio_gate_value
        gated_text = text_proj * text_gate_value

        pooled_audio = self.audio_attention_pooling(gated_audio, audio_mask)
        # Masked mean over real tokens; clamp avoids division by zero.
        text_sum = torch.sum(gated_text * attention_mask.unsqueeze(-1), dim=1)
        text_count = torch.sum(attention_mask, dim=1, keepdim=True).clamp(min=1)
        pooled_text = text_sum / text_count

        fused = torch.cat([pooled_audio, pooled_text], dim=1)
        joint_repr = self.fusion_layer(fused)

        shared = self.shared_fc(joint_repr)

        output = self.output_branch(shared)
        # Map the unbounded output onto the 1-5 range.
        scaled_output = 1.0 + 4.0 * torch.sigmoid(output)

        return scaled_output

# Training function
def train_valence_model(model, train_loader, val_loader, audio_model,
                        num_epochs=15, lr=5e-5, max_norm=0.5,
                        save_path='/content/best_valence_regressor.pth'):
    """Train ``model`` on features from a frozen audio encoder, with early stopping.

    For every batch the audio encoder runs under ``no_grad`` and its
    ``last_hidden_state`` is fed to ``model`` together with the text inputs.
    After each epoch the model is validated; whenever the validation loss
    improves, the state dict is written to ``save_path``.

    Args:
        model: regressor called as ``model(audio_features, input_ids, attention_mask)``
            and returning ``[batch, 1]`` predictions.
        train_loader: yields ``(inputs_dict, labels)`` training batches.
        val_loader: yields ``(inputs_dict, labels)`` validation batches.
        audio_model: audio encoder whose output exposes ``last_hidden_state``.
        num_epochs: maximum number of epochs.
        lr: AdamW learning rate.
        max_norm: gradient-norm clipping threshold.
        save_path: where the best state dict is saved.

    Returns:
        The best (lowest) validation loss observed; ``inf`` if no validation
        batch ever succeeded.
    """
    # Prefer the GPU but fall back to CPU instead of crashing when CUDA is absent.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    audio_model = audio_model.to(device)

    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-2)
    criterion = SmoothMSELoss(smoothing=0.1)
    scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(
        optimizer, T_0=3, T_mult=2, eta_min=1e-6
    )

    best_val_loss = float('inf')
    patience_counter = 0
    max_patience = 7

    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0
        num_train_batches = 0

        train_pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs} [Train Valence]")

        for batch_inputs, labels in train_pbar:
            try:
                audio_values = batch_inputs['audio_values'].to(device)
                input_ids = batch_inputs['input_ids'].to(device)
                attention_mask = batch_inputs['attention_mask'].to(device)
                labels = labels.to(device)

                optimizer.zero_grad()

                # The audio encoder is a fixed feature extractor: no graph is built.
                with torch.no_grad():
                    hubert_out = audio_model(audio_values).last_hidden_state

                outputs = model(hubert_out, input_ids, attention_mask)
                # squeeze(-1) keeps the batch dimension even for a batch of one,
                # so predictions and labels always have matching shapes.
                loss = criterion(outputs.squeeze(-1), labels)

                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=max_norm)
                optimizer.step()

                train_loss += loss.item()
                num_train_batches += 1

                train_pbar.set_postfix({'loss': f"{loss.item():.4f}"})

            except Exception as e:
                # Best effort: report the failing batch and keep training.
                print(f"Error in training batch: {e}")
                continue

        scheduler.step()

        if num_train_batches > 0:
            train_loss /= num_train_batches

        model.eval()
        val_loss = 0.0
        num_val_batches = 0

        val_pbar = tqdm(val_loader, desc=f"Epoch {epoch+1}/{num_epochs} [Val Valence]")

        with torch.no_grad():
            for batch_inputs, labels in val_pbar:
                try:
                    audio_values = batch_inputs['audio_values'].to(device)
                    input_ids = batch_inputs['input_ids'].to(device)
                    attention_mask = batch_inputs['attention_mask'].to(device)
                    labels = labels.to(device)

                    hubert_out = audio_model(audio_values).last_hidden_state

                    outputs = model(hubert_out, input_ids, attention_mask)
                    loss = criterion(outputs.squeeze(-1), labels)

                    val_loss += loss.item()
                    num_val_batches += 1

                    val_pbar.set_postfix({'loss': f"{loss.item():.4f}"})

                except Exception as e:
                    print(f"Error in validation batch: {e}")
                    continue

        if num_val_batches > 0:
            val_loss /= num_val_batches
        else:
            # No validation batch succeeded: do not let the initial 0.0 pass as a
            # perfect score and get checkpointed as the "best" model.
            val_loss = float('inf')

        print(f"Epoch {epoch+1}/{num_epochs} [Valence] Results:")
        print(f"  Train Loss: {train_loss:.4f}")
        print(f"  Val Loss: {val_loss:.4f}")

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), save_path)
            print(f"  Saved best model (val_loss: {val_loss:.4f})")
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= max_patience:
                print(f"Early stopping after {epoch+1} epochs without improvement")
                break

    return best_val_loss

# Evaluation function
def evaluate_valence_model(model, test_loader, audio_model, device):
    """Evaluate ``model`` on ``test_loader`` and report loss and Pearson correlation.

    Args:
        model: regressor called as ``model(audio_features, input_ids, attention_mask)``.
        test_loader: yields ``(inputs_dict, labels)`` batches.
        audio_model: audio encoder whose output exposes ``last_hidden_state``.
        device: device the batches are moved to; ``model`` and ``audio_model``
            are expected to already live there.

    Returns:
        ``(test_loss, pcc)``. ``test_loss`` is the mean per-batch
        ``SmoothMSELoss(smoothing=0.1)``, i.e. measured against smoothed targets,
        not a raw MSE. ``pcc`` is the Pearson correlation between predictions and
        the unsmoothed labels. If no batch succeeds the result is ``(inf, 0.0)``;
        with fewer than two samples ``pcc`` is ``0.0``.
    """
    model.eval()
    criterion = SmoothMSELoss(smoothing=0.1)
    test_loss = 0.0
    num_batches = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch_inputs, labels in test_loader:
            try:
                audio_values = batch_inputs['audio_values'].to(device)
                input_ids = batch_inputs['input_ids'].to(device)
                attention_mask = batch_inputs['attention_mask'].to(device)
                labels = labels.to(device)

                hubert_out = audio_model(audio_values).last_hidden_state

                outputs = model(hubert_out, input_ids, attention_mask)
                # squeeze(-1) keeps predictions 1-D even for a batch of one; a 0-d
                # array would make np.concatenate below raise.
                preds = outputs.squeeze(-1)
                loss = criterion(preds, labels)

                test_loss += loss.item()
                num_batches += 1

                all_preds.append(preds.cpu().numpy())
                all_labels.append(labels.cpu().numpy())

            except Exception as e:
                # Best effort: report the failing batch and continue.
                print(f"Error in test batch: {e}")
                continue

    if num_batches > 0:
        test_loss /= num_batches
        all_preds = np.concatenate(all_preds)
        all_labels = np.concatenate(all_labels)
        if all_preds.size >= 2:
            from scipy.stats import pearsonr
            pcc, _ = pearsonr(all_preds, all_labels)
        else:
            # Correlation is undefined for fewer than two points.
            pcc = 0.0
    else:
        test_loss = float('inf')
        pcc = 0.0

    return test_loss, pcc

# Main execution
def main():
    """Train the valence regressor on the IEMOCAP metadata CSV and report test metrics.

    Steps: load the HuBERT feature extractor/encoder and the BERT tokenizer,
    build an augmented dataset for training and a non-augmented one for
    validation/testing, split them 80/10/10 with a fixed seed, train with early
    stopping, reload the best checkpoint and print test loss and Pearson
    correlation.
    """
    # Prefer the GPU but fall back to CPU instead of crashing when CUDA is absent.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    use_cuda = device.type == "cuda"

    # Configuration
    csv_file = "/content/sample_data/updated_iemocap_metadata_colab.csv"
    checkpoint_path = '/content/best_valence_regressor.pth'
    audio_feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/hubert-base-ls960")
    text_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    # Frozen audio encoder used purely as a feature extractor.
    hubert_model = HubertModel.from_pretrained("facebook/hubert-base-ls960")
    for param in hubert_model.parameters():
        param.requires_grad = False
    hubert_model.eval()
    hubert_model = hubert_model.to(device)

    # Two views of the same CSV: augmented for training, plain for val/test.
    augmented_source = MultimodalIEMOCAPDataset(
        csv_file=csv_file,
        audio_feature_extractor=audio_feature_extractor,
        text_tokenizer=text_tokenizer,
        augment=True,
        max_audio_samples=128000
    )
    plain_source = MultimodalIEMOCAPDataset(
        csv_file=csv_file,
        audio_feature_extractor=audio_feature_extractor,
        text_tokenizer=text_tokenizer,
        augment=False,
        max_audio_samples=128000
    )

    train_size = int(0.8 * len(augmented_source))
    val_size = int(0.1 * len(augmented_source))
    test_size = len(augmented_source) - train_size - val_size

    # One seeded split defines the row indices of all three partitions; the
    # val/test indices are then applied to the non-augmented view. This yields
    # the same partitions as splitting each view separately with the same seed.
    train_dataset, val_part, test_part = random_split(
        augmented_source, [train_size, val_size, test_size],
        generator=torch.Generator().manual_seed(42)
    )
    val_dataset = torch.utils.data.Subset(plain_source, val_part.indices)
    test_dataset = torch.utils.data.Subset(plain_source, test_part.indices)

    def make_loader(dataset, shuffle):
        # Shared DataLoader settings; pinned memory only helps with a GPU.
        return DataLoader(
            dataset,
            batch_size=8,
            shuffle=shuffle,
            num_workers=0,
            pin_memory=use_cuda,
            collate_fn=dynamic_collate_fn
        )

    train_loader = make_loader(train_dataset, shuffle=True)
    val_loader = make_loader(val_dataset, shuffle=False)
    test_loader = make_loader(test_dataset, shuffle=False)

    # Model
    model = ValenceRegressor(
        audio_dim=768,
        text_dim=768,
        hidden_dim=192,
        num_heads=6,
        num_layers=2,
        dropout=0.5
    ).to(device)

    # Training
    print("Training Valence model...")
    best_val_loss = train_valence_model(
        model,
        train_loader,
        val_loader,
        hubert_model,
        num_epochs=15,
        lr=5e-5,
        max_norm=0.5
    )

    # Evaluation: reload the best checkpoint written during training.
    # map_location lets a checkpoint saved on GPU load on a CPU-only runtime.
    model.load_state_dict(torch.load(checkpoint_path, map_location=device))
    test_loss, test_pcc = evaluate_valence_model(model, test_loader, hubert_model, device)

    print("\nTest Results:")
    print(f"  Valence MSE: {test_loss:.4f}")
    print(f"  Valence PCC: {test_pcc:.4f}")

# Run the full train/evaluate pipeline when this cell is executed.
if __name__ == "__main__":
    main()



Training Valence model...


  effect = torch.tensor(audio).unsqueeze(0)  # [1, samples]
Epoch 1/15 [Train Valence]: 100%|██████████| 1004/1004 [09:21<00:00,  1.79it/s, loss=0.9520]
  return torch._transformer_encoder_layer_fwd(
Epoch 1/15 [Val Valence]: 100%|██████████| 126/126 [00:45<00:00,  2.76it/s, loss=0.4118]


Epoch 1/15 [Valence] Results:
  Train Loss: 0.6754
  Val Loss: 0.4813
  Saved best model (val_loss: 0.4813)


Epoch 2/15 [Train Valence]: 100%|██████████| 1004/1004 [09:08<00:00,  1.83it/s, loss=0.3010]
Epoch 2/15 [Val Valence]: 100%|██████████| 126/126 [00:44<00:00,  2.82it/s, loss=0.1743]


Epoch 2/15 [Valence] Results:
  Train Loss: 0.4885
  Val Loss: 0.3815
  Saved best model (val_loss: 0.3815)


Epoch 3/15 [Train Valence]: 100%|██████████| 1004/1004 [09:10<00:00,  1.82it/s, loss=0.4766]
Epoch 3/15 [Val Valence]: 100%|██████████| 126/126 [00:44<00:00,  2.83it/s, loss=0.0935]


Epoch 3/15 [Valence] Results:
  Train Loss: 0.4110
  Val Loss: 0.4085


Epoch 4/15 [Train Valence]: 100%|██████████| 1004/1004 [09:10<00:00,  1.82it/s, loss=0.2069]
Epoch 4/15 [Val Valence]: 100%|██████████| 126/126 [00:44<00:00,  2.82it/s, loss=0.4076]


Epoch 4/15 [Valence] Results:
  Train Loss: 0.4181
  Val Loss: 0.4480


Epoch 5/15 [Train Valence]: 100%|██████████| 1004/1004 [09:07<00:00,  1.83it/s, loss=0.6245]
Epoch 5/15 [Val Valence]: 100%|██████████| 126/126 [00:44<00:00,  2.83it/s, loss=0.0390]


Epoch 5/15 [Valence] Results:
  Train Loss: 0.3867
  Val Loss: 0.3726
  Saved best model (val_loss: 0.3726)


Epoch 6/15 [Train Valence]: 100%|██████████| 1004/1004 [09:10<00:00,  1.83it/s, loss=0.5646]
Epoch 6/15 [Val Valence]: 100%|██████████| 126/126 [00:44<00:00,  2.82it/s, loss=0.2800]


Epoch 6/15 [Valence] Results:
  Train Loss: 0.3553
  Val Loss: 0.3403
  Saved best model (val_loss: 0.3403)


Epoch 7/15 [Train Valence]: 100%|██████████| 1004/1004 [09:07<00:00,  1.83it/s, loss=0.1587]
Epoch 7/15 [Val Valence]: 100%|██████████| 126/126 [00:44<00:00,  2.82it/s, loss=0.1156]


Epoch 7/15 [Valence] Results:
  Train Loss: 0.3271
  Val Loss: 0.3369
  Saved best model (val_loss: 0.3369)


Epoch 8/15 [Train Valence]: 100%|██████████| 1004/1004 [09:07<00:00,  1.83it/s, loss=0.1090]
Epoch 8/15 [Val Valence]: 100%|██████████| 126/126 [00:44<00:00,  2.84it/s, loss=0.1195]


Epoch 8/15 [Valence] Results:
  Train Loss: 0.3029
  Val Loss: 0.3290
  Saved best model (val_loss: 0.3290)


Epoch 9/15 [Train Valence]: 100%|██████████| 1004/1004 [09:03<00:00,  1.85it/s, loss=0.2170]
Epoch 9/15 [Val Valence]: 100%|██████████| 126/126 [00:44<00:00,  2.84it/s, loss=0.1045]


Epoch 9/15 [Valence] Results:
  Train Loss: 0.2851
  Val Loss: 0.3294


Epoch 10/15 [Train Valence]: 100%|██████████| 1004/1004 [09:02<00:00,  1.85it/s, loss=0.4141]
Epoch 10/15 [Val Valence]: 100%|██████████| 126/126 [00:44<00:00,  2.84it/s, loss=0.2042]


Epoch 10/15 [Valence] Results:
  Train Loss: 0.3131
  Val Loss: 0.3676


Epoch 11/15 [Train Valence]: 100%|██████████| 1004/1004 [09:01<00:00,  1.85it/s, loss=0.1772]
Epoch 11/15 [Val Valence]: 100%|██████████| 126/126 [00:44<00:00,  2.84it/s, loss=0.1814]


Epoch 11/15 [Valence] Results:
  Train Loss: 0.2970
  Val Loss: 0.3376


Epoch 12/15 [Train Valence]: 100%|██████████| 1004/1004 [09:06<00:00,  1.84it/s, loss=0.1734]
Epoch 12/15 [Val Valence]: 100%|██████████| 126/126 [00:44<00:00,  2.84it/s, loss=0.1563]


Epoch 12/15 [Valence] Results:
  Train Loss: 0.2860
  Val Loss: 0.3289
  Saved best model (val_loss: 0.3289)


Epoch 13/15 [Train Valence]: 100%|██████████| 1004/1004 [09:06<00:00,  1.84it/s, loss=0.0873]
Epoch 13/15 [Val Valence]: 100%|██████████| 126/126 [00:44<00:00,  2.84it/s, loss=0.0209]


Epoch 13/15 [Valence] Results:
  Train Loss: 0.2726
  Val Loss: 0.3717


Epoch 14/15 [Train Valence]: 100%|██████████| 1004/1004 [09:00<00:00,  1.86it/s, loss=0.5535]
Epoch 14/15 [Val Valence]: 100%|██████████| 126/126 [00:44<00:00,  2.84it/s, loss=0.0028]


Epoch 14/15 [Valence] Results:
  Train Loss: 0.2543
  Val Loss: 0.3121
  Saved best model (val_loss: 0.3121)


Epoch 15/15 [Train Valence]:   1%|          | 8/1004 [00:04<07:50,  2.11it/s, loss=0.2920]

In [None]:
# List the contents of the working directory.
!ls

sample_data


In [None]:
import torch

# Load a saved checkpoint from Drive onto the CPU.
# NOTE(review): if this file was written with torch.save(model.state_dict()), as
# the training loop does for its own checkpoint, `model` here is a state dict
# rather than an nn.Module — confirm before using it as a model.
model_path = '/content/drive/MyDrive/best_valence_regressor_8.pth'
model = torch.load(model_path, map_location='cpu')


In [None]:
import torch
import torchaudio
from transformers import HubertModel, Wav2Vec2FeatureExtractor, BertModel, BertTokenizer
import pandas as pd
import numpy as np
import os

def predict_valence_from_csv(csv_file, row_idx, model_path, device='cpu', max_audio_samples=128000):
    """Predict valence for one row of the metadata CSV with a trained checkpoint.

    All cheap checks (row index, audio file presence, decodability, silence)
    run before any pretrained model is loaded, so bad input fails fast.

    Args:
        csv_file: CSV with ``filepath``, ``transcription`` and ``EmoVal`` columns.
        row_idx: zero-based row to predict.
        model_path: path to a ``ValenceRegressor`` state dict.
        device: device string or ``torch.device`` to run inference on.
        max_audio_samples: fixed waveform length (samples at 16 kHz) the audio
            is truncated / zero-padded to; should match the value used in training.

    Returns:
        ``(predicted_valence, ground_truth_valence)`` - a Python float in (1, 5)
        and the row's ``EmoVal`` as ``np.float32``.

    Raises:
        ValueError: ``row_idx`` is outside the CSV.
        FileNotFoundError: the row's audio file does not exist.
        RuntimeError: the audio cannot be decoded or is (near-)silent.
    """
    # Load the CSV file
    df = pd.read_csv(csv_file)
    if row_idx < 0 or row_idx >= len(df):
        raise ValueError(f"Row index {row_idx} is out of range [0, {len(df)-1}]")

    # Extract audio path, transcription and label from the specified row
    row = df.iloc[row_idx]
    audio_path = row['filepath']
    transcription = row['transcription']
    # np.float32(...) accepts both numpy and plain Python scalars.
    ground_truth_valence = np.float32(row['EmoVal'])

    # Load and preprocess audio before touching any heavyweight model
    if not os.path.exists(audio_path):
        raise FileNotFoundError(f"Audio file missing: {audio_path}")

    try:
        audio, sr = torchaudio.load(audio_path)
        if audio.abs().mean() < 1e-5:
            raise ValueError(f"Silent audio: {audio_path}")
    except Exception as e:
        # Decode failures and silent clips are both surfaced as RuntimeError;
        # the original exception is kept as the explicit cause.
        raise RuntimeError(f"Error loading audio {audio_path}: {str(e)}") from e

    if sr != 16000:
        audio = torchaudio.transforms.Resample(sr, 16000)(audio)

    audio = audio.squeeze(0)  # Remove channel dim if mono
    if audio.dim() > 1:
        audio = audio[0]  # Take first channel if stereo

    # Truncate or right-pad with zeros to the fixed length
    if audio.size(0) > max_audio_samples:
        audio = audio[:max_audio_samples]
    elif audio.size(0) < max_audio_samples:
        audio = torch.nn.functional.pad(audio, (0, max_audio_samples - audio.size(0)))

    audio = audio.numpy()

    # Load the trained regressor; map_location remaps the checkpoint's tensors
    # onto the requested device (e.g. a GPU-trained checkpoint onto CPU).
    device = torch.device(device)
    model = ValenceRegressor(audio_dim=768, text_dim=768, hidden_dim=192, num_heads=6, num_layers=2, dropout=0.5)
    model.load_state_dict(torch.load(model_path, map_location=device))
    model = model.to(device)
    model.eval()

    # Load feature extractor, tokenizer and the frozen audio encoder
    audio_feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/hubert-base-ls960")
    text_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    hubert_model = HubertModel.from_pretrained("facebook/hubert-base-ls960")
    hubert_model.eval()
    hubert_model = hubert_model.to(device)
    for param in hubert_model.parameters():
        param.requires_grad = False

    audio_inputs = audio_feature_extractor(
        audio,
        sampling_rate=16000,
        return_tensors="pt",
        padding=False,
        truncation=False
    )
    audio_values = audio_inputs['input_values'].to(device)

    # Process transcription
    text_inputs = text_tokenizer(
        transcription,
        padding=False,
        truncation=True,
        max_length=512,
        return_tensors="pt"
    )
    input_ids = text_inputs['input_ids'].to(device)
    attention_mask = text_inputs['attention_mask'].to(device)

    with torch.no_grad():
        # Frame-level audio features from the frozen encoder
        hubert_out = hubert_model(audio_values).last_hidden_state
        # Predict valence
        output = model(hubert_out, input_ids, attention_mask)

    predicted_valence = output.squeeze().item()  # Scale is 1.0 to 5.0

    return predicted_valence, ground_truth_valence

# Transformer model definition (same as in training code)
class ValenceRegressor(nn.Module):
    """Bimodal (audio + text) regressor producing one valence score per utterance.

    Processing steps, as implemented in ``forward``:
      1. pre-extracted audio frame features go through a stack of Transformer
         encoder layers followed by a LayerNorm;
      2. token ids are encoded with ``bert-base-uncased`` (every BERT parameter
         is frozen except the last two parameter tensors);
      3. both streams are projected to ``hidden_dim`` and attend to each other
         through two cross-attention modules;
      4. a sigmoid gate per stream re-weights the projected features;
      5. audio is attention-pooled over time, text is mask-aware mean-pooled;
      6. the pooled vectors are fused and mapped to a scalar in (1, 5).

    Submodule attribute names are also the ``state_dict`` keys of saved
    checkpoints, so they are intentionally left unchanged.
    """

    def __init__(self, audio_dim=768, text_dim=768, hidden_dim=192, num_heads=6, num_layers=2, dropout=0.5):
        """Build all submodules.

        Args:
            audio_dim: feature size of the incoming audio frames.
            text_dim: feature size of the BERT hidden states.
            hidden_dim: shared width used after projection and in the heads.
            num_heads: heads in the audio Transformer; the cross-attention
                modules use ``num_heads // 2``.
            num_layers: number of audio Transformer encoder layers.
            dropout: dropout probability (halved inside the output branch).
        """
        super().__init__()

        self.audio_transformer = nn.ModuleList([
            nn.TransformerEncoderLayer(
                d_model=audio_dim,
                nhead=num_heads,
                dim_feedforward=hidden_dim * 4,
                dropout=dropout,
                batch_first=True
            ) for _ in range(num_layers)
        ])

        self.audio_layer_norm = nn.LayerNorm(audio_dim)

        # Scores each (projected) audio frame; used by audio_attention_pooling.
        self.audio_attention_pool = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim * 2),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim * 2, 1)
        )

        # Freeze BERT, then re-enable gradients for its last two parameter tensors.
        self.text_encoder = BertModel.from_pretrained("bert-base-uncased")
        for param in self.text_encoder.parameters():
            param.requires_grad = False
        for param in list(self.text_encoder.parameters())[-2:]:
            param.requires_grad = True

        self.audio_projection = nn.Linear(audio_dim, hidden_dim)
        self.text_projection = nn.Linear(text_dim, hidden_dim)

        self.audio_to_text_attention = nn.MultiheadAttention(
            embed_dim=hidden_dim,
            num_heads=num_heads//2,
            dropout=dropout,
            batch_first=True
        )

        self.text_to_audio_attention = nn.MultiheadAttention(
            embed_dim=hidden_dim,
            num_heads=num_heads//2,
            dropout=dropout,
            batch_first=True
        )

        self.audio_gate = nn.Sequential(
            nn.Linear(hidden_dim*2, hidden_dim),
            nn.Sigmoid()
        )

        self.text_gate = nn.Sequential(
            nn.Linear(hidden_dim*2, hidden_dim),
            nn.Sigmoid()
        )

        self.fusion_layer = nn.Sequential(
            nn.Linear(hidden_dim*2, hidden_dim*2),
            nn.LayerNorm(hidden_dim*2),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim*2, hidden_dim)
        )

        self.shared_fc = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.LayerNorm(hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout)
        )

        self.output_branch = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim//2),
            nn.LayerNorm(hidden_dim//2),
            nn.GELU(),
            nn.Dropout(dropout * 0.5),
            nn.Linear(hidden_dim//2, 1)
        )

    def audio_attention_pooling(self, x, audio_mask=None):
        """Collapse the time axis of ``x`` with learned softmax attention weights.

        Args:
            x: tensor of shape (batch, frames, hidden_dim).
            audio_mask: optional (batch, frames) mask, truthy for real frames.

        Returns:
            Tensor of shape (batch, hidden_dim). A sample without a single
            valid frame yields an all-zero vector instead of NaN.
        """
        weights = self.audio_attention_pool(x)
        if audio_mask is None:
            weights = torch.softmax(weights, dim=1)
        else:
            padded = ~audio_mask.bool().unsqueeze(-1)
            weights = torch.softmax(weights.masked_fill(padded, float('-inf')), dim=1)
            # softmax over a row made only of -inf is NaN; padded frames must
            # contribute exactly zero weight in every case.
            weights = weights.masked_fill(padded, 0.0)
        output = torch.bmm(weights.transpose(1, 2), x)
        return output.squeeze(1)

    def forward(self, audio_features, input_ids, attention_mask):
        """Predict valence from audio features and a tokenized transcript.

        Args:
            audio_features: (batch, frames, audio_dim) frame features.
            input_ids: (batch, tokens) BERT token ids.
            attention_mask: (batch, tokens) mask, 1 for real tokens, 0 for padding.

        Returns:
            Tensor of shape (batch, 1) with values in the open interval (1, 5).
        """
        # Frames whose feature vector is (numerically) all zero count as padding.
        audio_mask = (audio_features.abs().sum(dim=-1) > 1e-6)
        # Boolean key-padding mask (True = ignore). A float mask would be *added*
        # to the attention scores on the training path instead of masking them,
        # and triggers a dtype-conversion warning on the inference fast path.
        audio_key_padding_mask = ~audio_mask

        audio_repr = audio_features
        for layer in self.audio_transformer:
            audio_repr = layer(audio_repr, src_key_padding_mask=audio_key_padding_mask)

        audio_repr = self.audio_layer_norm(audio_repr)

        text_outputs = self.text_encoder(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        text_repr = text_outputs.last_hidden_state

        audio_proj = self.audio_projection(audio_repr)
        text_proj = self.text_projection(text_repr)

        # Audio frames query the transcript; padded tokens are ignored as keys.
        audio_attended_text, _ = self.audio_to_text_attention(
            query=audio_proj,
            key=text_proj,
            value=text_proj,
            key_padding_mask=(1 - attention_mask).bool()
        )

        # Tokens query the audio; padded frames are ignored as keys.
        text_attended_audio, _ = self.text_to_audio_attention(
            query=text_proj,
            key=audio_proj,
            value=audio_proj,
            key_padding_mask=audio_key_padding_mask
        )

        audio_concat = torch.cat([audio_proj, audio_attended_text], dim=-1)
        text_concat = torch.cat([text_proj, text_attended_audio], dim=-1)

        audio_gate_value = self.audio_gate(audio_concat)
        text_gate_value = self.text_gate(text_concat)

        gated_audio = audio_proj * audio_gate_value
        gated_text = text_proj * text_gate_value

        pooled_audio = self.audio_attention_pooling(gated_audio, audio_mask)
        # Mean over real tokens only; clamp avoids division by zero.
        text_sum = torch.sum(gated_text * attention_mask.unsqueeze(-1), dim=1)
        text_count = torch.sum(attention_mask, dim=1, keepdim=True).clamp(min=1)
        pooled_text = text_sum / text_count

        fused = torch.cat([pooled_audio, pooled_text], dim=1)
        joint_repr = self.fusion_layer(fused)

        shared = self.shared_fc(joint_repr)

        output = self.output_branch(shared)
        # Squash the raw score into (1, 5).
        scaled_output = 1.0 + 4.0 * torch.sigmoid(output)

        return scaled_output

# Example usage
if __name__ == "__main__":
    # Demo run: score one metadata row with a trained checkpoint.
    # Edit the three values below to point at your own data / model.
    model_path = "/content/drive/MyDrive/best_valence_regressor_8.pth"
    row_idx = 10021  # Change to desired row number
    csv_file = "/content/sample_data/updated_iemocap_metadata_colab.csv"

    predicted_valence, ground_truth_valence = predict_valence_from_csv(csv_file, row_idx, model_path)

    # Report prediction and reference with four decimals each.
    for caption, score in (
        ("Predicted Valence Score", predicted_valence),
        ("Ground Truth Valence", ground_truth_valence),
    ):
        print(f"{caption}: {score:.4f}")

  return torch._transformer_encoder_layer_fwd(


Predicted Valence Score: 1.7735
Ground Truth Valence: 1.5000


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
from torch.nn.utils.rnn import pad_sequence
from transformers import HubertModel, Wav2Vec2FeatureExtractor, BertModel, BertTokenizer, get_linear_schedule_with_warmup
import torchaudio
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
import nlpaug.augmenter.word as naw
import random

# Dataset with fixed tensor copy
class MultimodalIEMOCAPDataset(Dataset):
    """Map-style dataset yielding (audio, text) inputs with arousal/dominance labels.

    Each row of the metadata CSV must provide the columns ``filepath``,
    ``transcription``, ``EmoAct`` and ``EmoDom``. Audio is brought to 16 kHz,
    reduced to one channel and forced to exactly ``max_audio_samples`` samples.
    When ``augment`` is on, noise/speed perturbation and synonym replacement
    are applied at random.
    """

    def __init__(self, csv_file, audio_feature_extractor, text_tokenizer, augment=True, max_audio_samples=128000):
        """Read the metadata CSV and keep the preprocessing callables.

        Args:
            csv_file: path to the metadata CSV.
            audio_feature_extractor: callable applied to the raw waveform.
            text_tokenizer: callable applied to the transcript.
            augment: enable random audio and text augmentation.
            max_audio_samples: fixed waveform length in samples.
        """
        self.df = pd.read_csv(csv_file)
        self.audio_feature_extractor = audio_feature_extractor
        self.text_tokenizer = text_tokenizer
        self.augment = augment
        # The synonym augmenter is only instantiated when augmentation is on.
        if augment:
            self.text_augmenter = naw.SynonymAug(aug_p=0.3)
        else:
            self.text_augmenter = None
        self.max_audio_samples = max_audio_samples

    def __len__(self):
        """Number of metadata rows."""
        return self.df.shape[0]

    def _fit_length(self, audio):
        """Truncate or right-pad a 1-D waveform to ``max_audio_samples``."""
        target = self.max_audio_samples
        current = audio.size(0)
        if current > target:
            return audio[:target]
        if current < target:
            return torch.nn.functional.pad(audio, (0, target - current))
        return audio

    def __getitem__(self, idx):
        """Load, preprocess and (optionally) augment the sample at ``idx``.

        Returns:
            A pair ``(inputs, labels)`` where ``inputs`` holds ``audio_values``,
            ``input_ids`` and ``attention_mask`` and ``labels`` holds
            ``arousal`` and ``dominance`` scalar tensors.

        Raises:
            FileNotFoundError: the audio path does not exist.
            RuntimeError: the audio could not be read or is (near) silent.
        """
        row = self.df.iloc[idx]
        audio_path = row['filepath']
        transcript = row['transcription']

        # Regression targets, stored as float32.
        arousal_label = row['EmoAct'].astype(np.float32)
        dominance_label = row['EmoDom'].astype(np.float32)

        if not os.path.exists(audio_path):
            raise FileNotFoundError(f"Audio file missing: {audio_path}")

        try:
            audio, sr = torchaudio.load(audio_path)
            if audio.abs().mean() < 1e-5:
                raise ValueError(f"Silent audio: {audio_path}")
        except Exception as err:
            # Any failure above (including the silence check) surfaces as RuntimeError.
            raise RuntimeError(f"Error loading audio {audio_path}: {str(err)}")

        if sr != 16000:
            resampler = torchaudio.transforms.Resample(sr, 16000)
            audio = resampler(audio)

        # Drop the channel axis for mono input; keep channel 0 otherwise.
        audio = audio.squeeze(0)
        if audio.dim() > 1:
            audio = audio[0]

        audio = self._fit_length(audio)

        if self.augment and random.random() < 0.5:
            # Additive Gaussian noise.
            audio = audio + torch.randn_like(audio) * 0.005

            # Random speed change, resampled back to 16 kHz.
            try:
                speed_factor = random.uniform(0.9, 1.1)
                source = audio.clone().detach().unsqueeze(0)  # [1, samples]
                sped_up, new_sr = torchaudio.sox_effects.apply_effects_tensor(
                    source,
                    sample_rate=16000,
                    effects=[["speed", str(speed_factor)], ["rate", "16000"]]
                )
                audio = sped_up.squeeze(0)
                # The effect changes the duration, so restore the fixed length.
                audio = self._fit_length(audio)
            except Exception as err:
                # Best effort: fall back to the (noisy) waveform we already have.
                print(f"Speed augmentation failed for {audio_path}: {err}")

        audio = audio.numpy()

        if self.augment and self.text_augmenter and random.random() < 0.3:
            try:
                transcript = self.text_augmenter.augment(transcript)[0]
            except Exception as err:
                # Best effort: keep the original transcript.
                print(f"Text augmentation failed for {audio_path}: {err}")

        audio_inputs = self.audio_feature_extractor(
            audio,
            sampling_rate=16000,
            return_tensors="pt",
            padding=False,
            truncation=False
        )

        text_inputs = self.text_tokenizer(
            transcript,
            padding=False,
            truncation=True,
            max_length=512,
            return_tensors="pt"
        )

        # Strip the leading batch axis added by return_tensors="pt".
        inputs = {
            'audio_values': audio_inputs['input_values'].squeeze(0),
            'input_ids': text_inputs['input_ids'].squeeze(0),
            'attention_mask': text_inputs['attention_mask'].squeeze(0)
        }
        targets = {
            'arousal': torch.tensor(arousal_label),
            'dominance': torch.tensor(dominance_label)
        }
        return inputs, targets

# Dynamic collation
def dynamic_collate_fn(batch):
    """Collate ``(inputs, labels)`` samples into one right-padded batch.

    Variable-length ``audio_values``, ``input_ids`` and ``attention_mask``
    tensors are padded with zeros to the longest item in the batch; the scalar
    ``arousal`` and ``dominance`` labels are stacked into 1-D tensors.

    Returns:
        ``(inputs, labels)`` dictionaries of batched tensors.
    """
    feature_dicts, label_dicts = zip(*batch)

    def _pad(key, fill):
        # Right-pad every sample's `key` tensor to the batch maximum length.
        sequences = [sample[key] for sample in feature_dicts]
        return pad_sequence(sequences, batch_first=True, padding_value=fill)

    batched_inputs = {
        'audio_values': _pad('audio_values', 0.0),
        'input_ids': _pad('input_ids', 0),
        'attention_mask': _pad('attention_mask', 0)
    }
    batched_labels = {
        task: torch.stack([sample[task] for sample in label_dicts])
        for task in ('arousal', 'dominance')
    }
    return batched_inputs, batched_labels

# MultimodalArousalDominanceModel with fixed audio_attention_pool
class MultimodalArousalDominanceModel(nn.Module):
    """Bimodal (audio + text) network with separate arousal and dominance heads.

    Processing steps, as implemented in ``forward``:
      1. pre-extracted audio frame features go through a stack of Transformer
         encoder layers followed by a LayerNorm;
      2. token ids are encoded with ``bert-base-uncased`` (every BERT parameter
         is frozen except the last two parameter tensors);
      3. both streams are projected to ``hidden_dim`` and attend to each other
         through two cross-attention modules;
      4. a sigmoid gate per stream re-weights the projected features;
      5. audio is attention-pooled over time, text is mask-aware mean-pooled;
      6. the concatenated pooled vector feeds two independent
         fusion -> fc -> output stacks, one per task, each squashed into (1, 5).

    Submodule attribute names are also the ``state_dict`` keys of saved
    checkpoints, so they are intentionally left unchanged.
    """

    def __init__(self, audio_dim=768, text_dim=768, hidden_dim=192, num_heads=6, num_layers=2, dropout=0.5):
        """Build all submodules.

        Args:
            audio_dim: feature size of the incoming audio frames.
            text_dim: feature size of the BERT hidden states.
            hidden_dim: shared width used after projection and in the heads.
            num_heads: heads in the audio Transformer; the cross-attention
                modules use ``num_heads // 2``.
            num_layers: number of audio Transformer encoder layers.
            dropout: dropout probability (halved inside the output branches).
        """
        super().__init__()

        self.audio_transformer = nn.ModuleList([
            nn.TransformerEncoderLayer(
                d_model=audio_dim,
                nhead=num_heads,
                dim_feedforward=hidden_dim * 4,
                dropout=dropout,
                batch_first=True
            ) for _ in range(num_layers)
        ])

        self.audio_layer_norm = nn.LayerNorm(audio_dim)

        # Scores each projected audio frame, so its input width is hidden_dim.
        self.audio_attention_pool = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim * 2),  # 192 -> 384
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim * 2, 1)  # 384 -> 1
        )

        # Freeze BERT, then re-enable gradients for its last two parameter tensors.
        self.text_encoder = BertModel.from_pretrained("bert-base-uncased")
        for param in self.text_encoder.parameters():
            param.requires_grad = False
        for param in list(self.text_encoder.parameters())[-2:]:
            param.requires_grad = True

        self.audio_projection = nn.Linear(audio_dim, hidden_dim)
        self.text_projection = nn.Linear(text_dim, hidden_dim)

        self.audio_to_text_attention = nn.MultiheadAttention(
            embed_dim=hidden_dim,
            num_heads=num_heads//2,
            dropout=dropout,
            batch_first=True
        )

        self.text_to_audio_attention = nn.MultiheadAttention(
            embed_dim=hidden_dim,
            num_heads=num_heads//2,
            dropout=dropout,
            batch_first=True
        )

        self.audio_gate = nn.Sequential(
            nn.Linear(hidden_dim*2, hidden_dim),
            nn.Sigmoid()
        )

        self.text_gate = nn.Sequential(
            nn.Linear(hidden_dim*2, hidden_dim),
            nn.Sigmoid()
        )

        # Task-Specific Fusion Layers
        self.fusion_layer_arousal = nn.Sequential(
            nn.Linear(hidden_dim*2, hidden_dim*2),
            nn.LayerNorm(hidden_dim*2),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim*2, hidden_dim)
        )

        self.fusion_layer_dominance = nn.Sequential(
            nn.Linear(hidden_dim*2, hidden_dim*2),
            nn.LayerNorm(hidden_dim*2),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim*2, hidden_dim)
        )

        # Task-Specific FC Layers
        self.shared_fc_arousal = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.LayerNorm(hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout)
        )

        self.shared_fc_dominance = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.LayerNorm(hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout)
        )

        # Task-Specific Output Branches
        self.output_branch_arousal = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim//2),
            nn.LayerNorm(hidden_dim//2),
            nn.GELU(),
            nn.Dropout(dropout * 0.5),
            nn.Linear(hidden_dim//2, 1)
        )

        self.output_branch_dominance = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim//2),
            nn.LayerNorm(hidden_dim//2),
            nn.GELU(),
            nn.Dropout(dropout * 0.5),
            nn.Linear(hidden_dim//2, 1)
        )

    def audio_attention_pooling(self, x, audio_mask=None):
        """Collapse the time axis of ``x`` with learned softmax attention weights.

        Args:
            x: tensor of shape (batch, frames, hidden_dim).
            audio_mask: optional (batch, frames) mask, truthy for real frames.

        Returns:
            Tensor of shape (batch, hidden_dim). A sample without a single
            valid frame yields an all-zero vector instead of NaN.
        """
        weights = self.audio_attention_pool(x)
        if audio_mask is None:
            weights = torch.softmax(weights, dim=1)
        else:
            padded = ~audio_mask.bool().unsqueeze(-1)
            weights = torch.softmax(weights.masked_fill(padded, float('-inf')), dim=1)
            # softmax over a row made only of -inf is NaN; padded frames must
            # contribute exactly zero weight in every case.
            weights = weights.masked_fill(padded, 0.0)
        output = torch.bmm(weights.transpose(1, 2), x)
        return output.squeeze(1)

    def forward(self, audio_features, input_ids, attention_mask):
        """Predict arousal and dominance from audio features and a transcript.

        Args:
            audio_features: (batch, frames, audio_dim) frame features.
            input_ids: (batch, tokens) BERT token ids.
            attention_mask: (batch, tokens) mask, 1 for real tokens, 0 for padding.

        Returns:
            ``(arousal, dominance)``, each of shape (batch, 1) with values in
            the open interval (1, 5).
        """
        # Frames whose feature vector is (numerically) all zero count as padding.
        audio_mask = (audio_features.abs().sum(dim=-1) > 1e-6)
        # Boolean key-padding mask (True = ignore). A float mask would be *added*
        # to the attention scores on the training path instead of masking them,
        # and triggers a dtype-conversion warning on the inference fast path.
        audio_key_padding_mask = ~audio_mask

        audio_repr = audio_features
        for layer in self.audio_transformer:
            audio_repr = layer(audio_repr, src_key_padding_mask=audio_key_padding_mask)

        audio_repr = self.audio_layer_norm(audio_repr)

        text_outputs = self.text_encoder(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        text_repr = text_outputs.last_hidden_state

        audio_proj = self.audio_projection(audio_repr)
        text_proj = self.text_projection(text_repr)

        # Audio frames query the transcript; padded tokens are ignored as keys.
        audio_attended_text, _ = self.audio_to_text_attention(
            query=audio_proj,
            key=text_proj,
            value=text_proj,
            key_padding_mask=(1 - attention_mask).bool()
        )

        # Tokens query the audio; padded frames are ignored as keys.
        text_attended_audio, _ = self.text_to_audio_attention(
            query=text_proj,
            key=audio_proj,
            value=audio_proj,
            key_padding_mask=audio_key_padding_mask
        )

        audio_concat = torch.cat([audio_proj, audio_attended_text], dim=-1)
        text_concat = torch.cat([text_proj, text_attended_audio], dim=-1)

        audio_gate_value = self.audio_gate(audio_concat)
        text_gate_value = self.text_gate(text_concat)

        gated_audio = audio_proj * audio_gate_value
        gated_text = text_proj * text_gate_value

        pooled_audio = self.audio_attention_pooling(gated_audio, audio_mask)
        # Mean over real tokens only; clamp avoids division by zero.
        text_sum = torch.sum(gated_text * attention_mask.unsqueeze(-1), dim=1)
        text_count = torch.sum(attention_mask, dim=1, keepdim=True).clamp(min=1)
        pooled_text = text_sum / text_count

        fused = torch.cat([pooled_audio, pooled_text], dim=1)

        # Task-Specific Processing
        joint_repr_arousal = self.fusion_layer_arousal(fused)
        joint_repr_dominance = self.fusion_layer_dominance(fused)

        shared_arousal = self.shared_fc_arousal(joint_repr_arousal)
        shared_dominance = self.shared_fc_dominance(joint_repr_dominance)

        output_arousal = self.output_branch_arousal(shared_arousal)
        output_dominance = self.output_branch_dominance(shared_dominance)

        # Squash raw scores into (1, 5)
        scaled_arousal = 1.0 + 4.0 * torch.sigmoid(output_arousal)
        scaled_dominance = 1.0 + 4.0 * torch.sigmoid(output_dominance)

        return scaled_arousal, scaled_dominance

# Training function with backpropagation optimizations
def train_emotion_model(model, train_loader, val_loader, audio_model, num_epochs=15, lr=5e-5, max_norm=1.0,
                        accumulation_steps=2, save_path='/content/best_arousal_dominance_model.pth'):
    """Train the arousal/dominance model and checkpoint the best validation epoch.

    Args:
        model: network exposing a ``text_encoder`` submodule and returning
            ``(arousal, dominance)`` predictions from
            ``model(audio_features, input_ids, attention_mask)``.
        train_loader: iterable of ``(inputs, labels)`` batches; ``inputs`` has
            ``audio_values``, ``input_ids`` and ``attention_mask``, ``labels``
            has ``arousal`` and ``dominance``.
        val_loader: validation batches in the same format.
        audio_model: feature extractor whose output exposes
            ``last_hidden_state``; it is only run under ``torch.no_grad()``.
        num_epochs: maximum number of epochs.
        lr: learning rate of the parameters that are neither part of the text
            encoder nor of a dominance-specific module.
        max_norm: gradient-clipping threshold.
        accumulation_steps: micro-batches accumulated per optimizer step.
        save_path: file that receives the best ``state_dict``.

    Returns:
        The lowest validation loss observed.
    """
    # Prefer the GPU but stay runnable on CPU-only machines.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    audio_model = audio_model.to(device)
    accumulation_steps = max(1, int(accumulation_steps))

    # Differential learning rates
    optimizer = optim.AdamW([
        {"params": model.text_encoder.parameters(), "lr": 5e-6},
        {"params": [p for n, p in model.named_parameters() if "dominance" in n], "lr": 1e-4},  # Higher for dominance
        {"params": [p for n, p in model.named_parameters() if "text_encoder" not in n and "dominance" not in n], "lr": lr}
    ], weight_decay=1e-2)

    criterion = nn.HuberLoss()  # Robust to outliers
    # NOTE(review): this scheduler is stepped once per batch below, so T_0=3
    # counts batches, not epochs - confirm that this is the intended schedule.
    scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(
        optimizer, T_0=3, T_mult=2, eta_min=1e-6
    )
    warmup_scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=500, num_training_steps=len(train_loader) * num_epochs)

    best_val_loss = float('inf')
    patience_counter = 0
    max_patience = 10
    arousal_weight = 1.0
    dominance_weight = 1.0

    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0
        train_arousal_loss = 0.0
        train_dominance_loss = 0.0
        num_train_batches = 0

        # Gradients are cleared only here and right after each optimizer step.
        # Clearing them at the top of every batch would throw away the
        # micro-batches that gradient accumulation is supposed to combine.
        optimizer.zero_grad(set_to_none=True)
        pending_micro_batches = 0

        train_pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs} [Train]")

        for i, (batch_inputs, labels) in enumerate(train_pbar):
            try:
                audio_values = batch_inputs['audio_values'].to(device)
                input_ids = batch_inputs['input_ids'].to(device)
                attention_mask = batch_inputs['attention_mask'].to(device)
                arousal_labels = labels['arousal'].to(device)
                dominance_labels = labels['dominance'].to(device)

                with torch.no_grad():
                    hubert_out = audio_model(audio_values).last_hidden_state

                arousal_outputs, dominance_outputs = model(hubert_out, input_ids, attention_mask)

                # squeeze(-1) keeps the batch axis even when the batch holds one sample.
                arousal_loss = criterion(arousal_outputs.squeeze(-1), arousal_labels)
                dominance_loss = criterion(dominance_outputs.squeeze(-1), dominance_labels)

                total_loss = (arousal_weight * arousal_loss + dominance_weight * dominance_loss) / accumulation_steps

                total_loss.backward()
                pending_micro_batches += 1

                # Monitor gradients every 100 batches (before a step clears them)
                if i % 100 == 0:
                    for name, param in model.named_parameters():
                        if param.grad is not None and ("arousal" in name or "dominance" in name):
                            print(f"{name}: Grad Norm = {param.grad.norm().item():.4f}")

                if pending_micro_batches >= accumulation_steps:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=max_norm)
                    optimizer.step()
                    optimizer.zero_grad(set_to_none=True)
                    pending_micro_batches = 0

                # Warmup for first 500 steps, then cosine annealing
                if i + epoch * len(train_loader) < 500:
                    warmup_scheduler.step()
                else:
                    scheduler.step()

                train_loss += total_loss.item() * accumulation_steps
                train_arousal_loss += arousal_loss.item()
                train_dominance_loss += dominance_loss.item()
                num_train_batches += 1

                train_pbar.set_postfix({
                    'loss': f"{total_loss.item() * accumulation_steps:.4f}",
                    'a_loss': f"{arousal_loss.item():.4f}",
                    'd_loss': f"{dominance_loss.item():.4f}"
                })

            except Exception as e:
                print(f"Error in training batch: {e}")
                # Discard possibly half-written gradients of the failed window.
                optimizer.zero_grad(set_to_none=True)
                pending_micro_batches = 0
                continue

        # Apply gradients of an incomplete accumulation window at epoch end.
        if pending_micro_batches > 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=max_norm)
            optimizer.step()
            optimizer.zero_grad(set_to_none=True)
            pending_micro_batches = 0

        if num_train_batches > 0:
            train_loss /= num_train_batches
            train_arousal_loss /= num_train_batches
            train_dominance_loss /= num_train_batches

            # Re-balance the task weights from this epoch's average losses;
            # the weights are normalised and clamped to [0.25, 0.75].
            total_task_loss = train_arousal_loss + train_dominance_loss
            if total_task_loss > 0:
                arousal_weight = 0.5 * (train_arousal_loss / total_task_loss)
                dominance_weight = 0.5 * (train_dominance_loss / total_task_loss)
                if train_dominance_loss > train_arousal_loss * 1.2:
                    dominance_weight *= 1.5
                    arousal_weight *= 0.5
                total_weight = arousal_weight + dominance_weight
                arousal_weight /= total_weight
                dominance_weight /= total_weight
                arousal_weight = max(0.25, min(0.75, arousal_weight))
                dominance_weight = max(0.25, min(0.75, dominance_weight))

        model.eval()
        val_loss = 0.0
        val_arousal_loss = 0.0
        val_dominance_loss = 0.0
        num_val_batches = 0

        val_pbar = tqdm(val_loader, desc=f"Epoch {epoch+1}/{num_epochs} [Val]")

        with torch.no_grad():
            for batch_inputs, labels in val_pbar:
                try:
                    audio_values = batch_inputs['audio_values'].to(device)
                    input_ids = batch_inputs['input_ids'].to(device)
                    attention_mask = batch_inputs['attention_mask'].to(device)
                    arousal_labels = labels['arousal'].to(device)
                    dominance_labels = labels['dominance'].to(device)

                    hubert_out = audio_model(audio_values).last_hidden_state

                    arousal_outputs, dominance_outputs = model(hubert_out, input_ids, attention_mask)

                    arousal_loss = criterion(arousal_outputs.squeeze(-1), arousal_labels)
                    dominance_loss = criterion(dominance_outputs.squeeze(-1), dominance_labels)
                    # Validation always uses equal task weights.
                    total_loss = 0.5 * arousal_loss + 0.5 * dominance_loss

                    val_loss += total_loss.item()
                    val_arousal_loss += arousal_loss.item()
                    val_dominance_loss += dominance_loss.item()
                    num_val_batches += 1

                    val_pbar.set_postfix({
                        'loss': f"{total_loss.item():.4f}",
                        'a_loss': f"{arousal_loss.item():.4f}",
                        'd_loss': f"{dominance_loss.item():.4f}"
                    })

                except Exception as e:
                    print(f"Error in validation batch: {e}")
                    continue

        if num_val_batches > 0:
            val_loss /= num_val_batches
            val_arousal_loss /= num_val_batches
            val_dominance_loss /= num_val_batches
        else:
            # Without a single evaluated batch the loss is unknown; a 0.0 here
            # would be recorded as a perfect score and checkpointed as "best".
            val_loss = float('inf')
            val_arousal_loss = float('inf')
            val_dominance_loss = float('inf')

        print(f"Epoch {epoch+1}/{num_epochs} Results:")
        print(f"  Train Loss: {train_loss:.4f} (Arousal: {train_arousal_loss:.4f}, Dominance: {train_dominance_loss:.4f})")
        print(f"  Val Loss: {val_loss:.4f} (Arousal: {val_arousal_loss:.4f}, Dominance: {val_dominance_loss:.4f})")
        print(f"  Loss Weights - Arousal: {arousal_weight:.4f}, Dominance: {dominance_weight:.4f}")

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), save_path)
            print(f"  Saved best model (val_loss: {val_loss:.4f})")
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= max_patience:
                print(f"Early stopping after {epoch+1} epochs without improvement")
                break

    return best_val_loss

# Evaluation function (completed)
def evaluate_emotion_model(model, test_loader, audio_model, device):
    """Evaluate the model on ``test_loader`` with Huber loss and Pearson correlation.

    Args:
        model: network returning ``(arousal, dominance)`` predictions from
            ``model(audio_features, input_ids, attention_mask)``.
        test_loader: iterable of ``(inputs, labels)`` batches; ``inputs`` has
            ``audio_values``, ``input_ids`` and ``attention_mask``, ``labels``
            has ``arousal`` and ``dominance``.
        audio_model: feature extractor whose output exposes ``last_hidden_state``.
        device: device the batches are moved to.

    Returns:
        ``(test_loss, test_arousal_loss, test_dominance_loss, arousal_pcc,
        dominance_pcc)``. Losses are averages over the successfully processed
        batches; when no batch succeeds the losses are ``inf`` and both
        correlations are ``0.0``.
    """
    model.eval()
    criterion = nn.HuberLoss()
    test_loss = 0.0
    test_arousal_loss = 0.0
    test_dominance_loss = 0.0
    num_batches = 0

    all_arousal_preds = []
    all_arousal_labels = []
    all_dominance_preds = []
    all_dominance_labels = []

    with torch.no_grad():
        for batch_inputs, labels in test_loader:
            try:
                audio_values = batch_inputs['audio_values'].to(device)
                input_ids = batch_inputs['input_ids'].to(device)
                attention_mask = batch_inputs['attention_mask'].to(device)
                arousal_labels = labels['arousal'].to(device)
                dominance_labels = labels['dominance'].to(device)

                hubert_out = audio_model(audio_values).last_hidden_state

                arousal_outputs, dominance_outputs = model(hubert_out, input_ids, attention_mask)

                # Drop only the trailing size-1 axis. A bare squeeze() also
                # removes the batch axis for a batch of one, which yields 0-d
                # arrays that np.concatenate below cannot join.
                arousal_preds = arousal_outputs.squeeze(-1)
                dominance_preds = dominance_outputs.squeeze(-1)

                arousal_loss = criterion(arousal_preds, arousal_labels)
                dominance_loss = criterion(dominance_preds, dominance_labels)

                # Equal task weights for the combined score.
                total_loss = 0.5 * arousal_loss + 0.5 * dominance_loss

                test_loss += total_loss.item()
                test_arousal_loss += arousal_loss.item()
                test_dominance_loss += dominance_loss.item()
                num_batches += 1

                all_arousal_preds.append(arousal_preds.cpu().numpy())
                all_arousal_labels.append(arousal_labels.cpu().numpy())
                all_dominance_preds.append(dominance_preds.cpu().numpy())
                all_dominance_labels.append(dominance_labels.cpu().numpy())

            except Exception as e:
                # Best effort: report the failing batch and keep evaluating.
                print(f"Error in test batch: {e}")
                continue

    if num_batches > 0:
        test_loss /= num_batches
        test_arousal_loss /= num_batches
        test_dominance_loss /= num_batches

        from scipy.stats import pearsonr
        all_arousal_preds = np.concatenate(all_arousal_preds)
        all_arousal_labels = np.concatenate(all_arousal_labels)
        all_dominance_preds = np.concatenate(all_dominance_preds)
        all_dominance_labels = np.concatenate(all_dominance_labels)

        arousal_pcc, _ = pearsonr(all_arousal_preds, all_arousal_labels)
        dominance_pcc, _ = pearsonr(all_dominance_preds, all_dominance_labels)
    else:
        test_loss = float('inf')
        test_arousal_loss = float('inf')
        test_dominance_loss = float('inf')
        arousal_pcc = 0.0
        dominance_pcc = 0.0

    return test_loss, test_arousal_loss, test_dominance_loss, arousal_pcc, dominance_pcc

# Main execution
def main():
    """Train the arousal/dominance model on IEMOCAP and print test metrics.

    Pipeline:
      1. Load the HuBERT feature extractor, the BERT tokenizer and a frozen
         HuBERT encoder.
      2. Build one augmented dataset (training) and one non-augmented dataset
         (validation / test) from the same CSV.
      3. Split both with the same seeded 80/10/10 partition so the three
         subsets refer to disjoint index ranges of the same permutation.
      4. Train, reload the saved checkpoint, evaluate on the test subset and
         print the results.

    Takes no arguments and returns None; all results are printed.
    """
    # Prefer the GPU, but fall back to CPU instead of failing on the first
    # `.to(device)` when no CUDA device is available.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    on_gpu = device.type == "cuda"

    # Configuration
    csv_file = "/content/sample_data/updated_iemocap_metadata_colab.csv"
    # NOTE(review): presumably the path train_emotion_model saves its best
    # weights to -- confirm the two stay in sync.
    checkpoint_path = '/content/best_arousal_dominance_model.pth'
    hubert_name = "facebook/hubert-base-ls960"
    max_audio_samples = 128000
    batch_size = 8
    split_seed = 42

    audio_feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(hubert_name)
    text_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    # HuBERT is used as a fixed feature extractor: no gradients, eval mode.
    hubert_model = HubertModel.from_pretrained(hubert_name)
    for param in hubert_model.parameters():
        param.requires_grad = False
    hubert_model.eval()
    hubert_model = hubert_model.to(device)

    def build_dataset(augment):
        """Create a dataset over the full CSV with augmentation on or off."""
        return MultimodalIEMOCAPDataset(
            csv_file=csv_file,
            audio_feature_extractor=audio_feature_extractor,
            text_tokenizer=text_tokenizer,
            augment=augment,
            max_audio_samples=max_audio_samples
        )

    # Dataset and loaders
    # Two views of the same data: augmentation must only touch training samples.
    augmented_dataset = build_dataset(augment=True)
    plain_dataset = build_dataset(augment=False)

    total = len(augmented_dataset)
    train_size = int(0.8 * total)
    val_size = int(0.1 * total)
    test_size = total - train_size - val_size
    split_sizes = [train_size, val_size, test_size]

    def seeded_split(dataset):
        """Split with a freshly seeded generator so every call yields the same indices."""
        return random_split(
            dataset, split_sizes,
            generator=torch.Generator().manual_seed(split_seed)
        )

    # Same seed + same sizes => identical permutation for both datasets, so
    # the train subset never overlaps the val/test subsets.
    train_dataset, _, _ = seeded_split(augmented_dataset)
    _, val_dataset, test_dataset = seeded_split(plain_dataset)

    def build_loader(dataset, shuffle):
        """Wrap a subset in a DataLoader with the shared batching settings."""
        return DataLoader(
            dataset,
            batch_size=batch_size,
            shuffle=shuffle,
            num_workers=0,
            # Pinned memory only helps host->GPU copies; it warns on CPU-only runs.
            pin_memory=on_gpu,
            collate_fn=dynamic_collate_fn
        )

    train_loader = build_loader(train_dataset, shuffle=True)
    val_loader = build_loader(val_dataset, shuffle=False)
    test_loader = build_loader(test_dataset, shuffle=False)

    # Initialize the model
    model = MultimodalArousalDominanceModel(
        audio_dim=768,
        text_dim=768,
        hidden_dim=192,
        num_heads=6,
        num_layers=2,
        dropout=0.5
    ).to(device)

    # Training
    print("Training Arousal and Dominance model...")
    # The return value is not needed here; the checkpoint on disk is reloaded below.
    # NOTE(review): train_emotion_model is not given `device`; confirm it
    # selects the same device as this function.
    train_emotion_model(
        model,
        train_loader,
        val_loader,
        hubert_model,
        num_epochs=15,
        lr=5e-5,
        max_norm=1.0
    )

    # Evaluation
    # map_location lets a checkpoint saved on GPU be restored on whatever
    # device is active now.
    model.load_state_dict(torch.load(checkpoint_path, map_location=device))
    test_loss, test_arousal_loss, test_dominance_loss, arousal_pcc, dominance_pcc = evaluate_emotion_model(
        model, test_loader, hubert_model, device
    )

    print("\nTest Results:")
    print(f"  Overall Test Loss: {test_loss:.4f}")
    print(f"  Arousal MSE: {test_arousal_loss:.4f}, PCC: {arousal_pcc:.4f}")
    print(f"  Dominance MSE: {test_dominance_loss:.4f}, PCC: {dominance_pcc:.4f}")

# Run the train/evaluate pipeline only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()



Training Arousal and Dominance model...


Epoch 1/15 [Train]:   0%|          | 1/1004 [00:00<10:42,  1.56it/s, loss=0.7697, a_loss=0.3017, d_loss=0.4680]

fusion_layer_arousal.0.weight: Grad Norm = 1.7369
fusion_layer_arousal.0.bias: Grad Norm = 0.7337
fusion_layer_arousal.1.weight: Grad Norm = 0.0602
fusion_layer_arousal.1.bias: Grad Norm = 0.0610
fusion_layer_arousal.4.weight: Grad Norm = 1.8725
fusion_layer_arousal.4.bias: Grad Norm = 0.1073
fusion_layer_dominance.0.weight: Grad Norm = 1.5990
fusion_layer_dominance.0.bias: Grad Norm = 0.6528
fusion_layer_dominance.1.weight: Grad Norm = 0.0603
fusion_layer_dominance.1.bias: Grad Norm = 0.0520
fusion_layer_dominance.4.weight: Grad Norm = 1.7725
fusion_layer_dominance.4.bias: Grad Norm = 0.1039
shared_fc_arousal.0.weight: Grad Norm = 1.3586
shared_fc_arousal.0.bias: Grad Norm = 0.1959
shared_fc_arousal.1.weight: Grad Norm = 0.0633
shared_fc_arousal.1.bias: Grad Norm = 0.0611
shared_fc_dominance.0.weight: Grad Norm = 1.3562
shared_fc_dominance.0.bias: Grad Norm = 0.1830
shared_fc_dominance.1.weight: Grad Norm = 0.0766
shared_fc_dominance.1.bias: Grad Norm = 0.0637
output_branch_arousal.0.

Epoch 1/15 [Train]:  10%|█         | 101/1004 [00:56<08:01,  1.88it/s, loss=0.5750, a_loss=0.1902, d_loss=0.3848]

fusion_layer_arousal.0.weight: Grad Norm = 1.2496
fusion_layer_arousal.0.bias: Grad Norm = 0.4401
fusion_layer_arousal.1.weight: Grad Norm = 0.0436
fusion_layer_arousal.1.bias: Grad Norm = 0.0412
fusion_layer_arousal.4.weight: Grad Norm = 1.4112
fusion_layer_arousal.4.bias: Grad Norm = 0.0748
fusion_layer_dominance.0.weight: Grad Norm = 1.7919
fusion_layer_dominance.0.bias: Grad Norm = 0.6621
fusion_layer_dominance.1.weight: Grad Norm = 0.0606
fusion_layer_dominance.1.bias: Grad Norm = 0.0579
fusion_layer_dominance.4.weight: Grad Norm = 1.9217
fusion_layer_dominance.4.bias: Grad Norm = 0.1040
shared_fc_arousal.0.weight: Grad Norm = 0.9900
shared_fc_arousal.0.bias: Grad Norm = 0.1323
shared_fc_arousal.1.weight: Grad Norm = 0.0522
shared_fc_arousal.1.bias: Grad Norm = 0.0392
shared_fc_dominance.0.weight: Grad Norm = 1.4174
shared_fc_dominance.0.bias: Grad Norm = 0.1832
shared_fc_dominance.1.weight: Grad Norm = 0.0801
shared_fc_dominance.1.bias: Grad Norm = 0.0604
output_branch_arousal.0.

Epoch 1/15 [Train]:  20%|██        | 201/1004 [01:51<07:00,  1.91it/s, loss=0.6419, a_loss=0.2649, d_loss=0.3770]

fusion_layer_arousal.0.weight: Grad Norm = 1.8727
fusion_layer_arousal.0.bias: Grad Norm = 0.6892
fusion_layer_arousal.1.weight: Grad Norm = 0.0682
fusion_layer_arousal.1.bias: Grad Norm = 0.0650
fusion_layer_arousal.4.weight: Grad Norm = 2.0849
fusion_layer_arousal.4.bias: Grad Norm = 0.1332
fusion_layer_dominance.0.weight: Grad Norm = 1.6503
fusion_layer_dominance.0.bias: Grad Norm = 0.5803
fusion_layer_dominance.1.weight: Grad Norm = 0.0613
fusion_layer_dominance.1.bias: Grad Norm = 0.0551
fusion_layer_dominance.4.weight: Grad Norm = 1.6863
fusion_layer_dominance.4.bias: Grad Norm = 0.0879
shared_fc_arousal.0.weight: Grad Norm = 1.4763
shared_fc_arousal.0.bias: Grad Norm = 0.2251
shared_fc_arousal.1.weight: Grad Norm = 0.0709
shared_fc_arousal.1.bias: Grad Norm = 0.0658
shared_fc_dominance.0.weight: Grad Norm = 1.1860
shared_fc_dominance.0.bias: Grad Norm = 0.1485
shared_fc_dominance.1.weight: Grad Norm = 0.0570
shared_fc_dominance.1.bias: Grad Norm = 0.0456
output_branch_arousal.0.

Epoch 1/15 [Train]:  30%|██▉       | 301/1004 [02:47<06:46,  1.73it/s, loss=0.4837, a_loss=0.1158, d_loss=0.3680]

fusion_layer_arousal.0.weight: Grad Norm = 0.9406
fusion_layer_arousal.0.bias: Grad Norm = 0.2809
fusion_layer_arousal.1.weight: Grad Norm = 0.0334
fusion_layer_arousal.1.bias: Grad Norm = 0.0305
fusion_layer_arousal.4.weight: Grad Norm = 1.0644
fusion_layer_arousal.4.bias: Grad Norm = 0.0515
fusion_layer_dominance.0.weight: Grad Norm = 1.4085
fusion_layer_dominance.0.bias: Grad Norm = 0.4642
fusion_layer_dominance.1.weight: Grad Norm = 0.0526
fusion_layer_dominance.1.bias: Grad Norm = 0.0461
fusion_layer_dominance.4.weight: Grad Norm = 1.5657
fusion_layer_dominance.4.bias: Grad Norm = 0.0815
shared_fc_arousal.0.weight: Grad Norm = 0.7740
shared_fc_arousal.0.bias: Grad Norm = 0.0944
shared_fc_arousal.1.weight: Grad Norm = 0.0360
shared_fc_arousal.1.bias: Grad Norm = 0.0310
shared_fc_dominance.0.weight: Grad Norm = 1.1188
shared_fc_dominance.0.bias: Grad Norm = 0.1356
shared_fc_dominance.1.weight: Grad Norm = 0.0498
shared_fc_dominance.1.bias: Grad Norm = 0.0478
output_branch_arousal.0.

Epoch 1/15 [Train]:  40%|███▉      | 401/1004 [03:42<05:30,  1.83it/s, loss=0.3776, a_loss=0.1970, d_loss=0.1806]

fusion_layer_arousal.0.weight: Grad Norm = 0.9827
fusion_layer_arousal.0.bias: Grad Norm = 0.2859
fusion_layer_arousal.1.weight: Grad Norm = 0.0336
fusion_layer_arousal.1.bias: Grad Norm = 0.0335
fusion_layer_arousal.4.weight: Grad Norm = 1.0304
fusion_layer_arousal.4.bias: Grad Norm = 0.0588
fusion_layer_dominance.0.weight: Grad Norm = 0.8897
fusion_layer_dominance.0.bias: Grad Norm = 0.2505
fusion_layer_dominance.1.weight: Grad Norm = 0.0357
fusion_layer_dominance.1.bias: Grad Norm = 0.0311
fusion_layer_dominance.4.weight: Grad Norm = 1.0702
fusion_layer_dominance.4.bias: Grad Norm = 0.0596
shared_fc_arousal.0.weight: Grad Norm = 0.7759
shared_fc_arousal.0.bias: Grad Norm = 0.1031
shared_fc_arousal.1.weight: Grad Norm = 0.0365
shared_fc_arousal.1.bias: Grad Norm = 0.0333
shared_fc_dominance.0.weight: Grad Norm = 0.8061
shared_fc_dominance.0.bias: Grad Norm = 0.0982
shared_fc_dominance.1.weight: Grad Norm = 0.0434
shared_fc_dominance.1.bias: Grad Norm = 0.0365
output_branch_arousal.0.

Epoch 1/15 [Train]:  50%|████▉     | 501/1004 [04:38<04:50,  1.73it/s, loss=0.4709, a_loss=0.2376, d_loss=0.2333]

fusion_layer_arousal.0.weight: Grad Norm = 1.0019
fusion_layer_arousal.0.bias: Grad Norm = 0.2920
fusion_layer_arousal.1.weight: Grad Norm = 0.0373
fusion_layer_arousal.1.bias: Grad Norm = 0.0345
fusion_layer_arousal.4.weight: Grad Norm = 1.3378
fusion_layer_arousal.4.bias: Grad Norm = 0.0831
fusion_layer_dominance.0.weight: Grad Norm = 0.9114
fusion_layer_dominance.0.bias: Grad Norm = 0.2487
fusion_layer_dominance.1.weight: Grad Norm = 0.0346
fusion_layer_dominance.1.bias: Grad Norm = 0.0303
fusion_layer_dominance.4.weight: Grad Norm = 1.1492
fusion_layer_dominance.4.bias: Grad Norm = 0.0546
shared_fc_arousal.0.weight: Grad Norm = 1.0382
shared_fc_arousal.0.bias: Grad Norm = 0.1424
shared_fc_arousal.1.weight: Grad Norm = 0.0551
shared_fc_arousal.1.bias: Grad Norm = 0.0504
shared_fc_dominance.0.weight: Grad Norm = 0.9348
shared_fc_dominance.0.bias: Grad Norm = 0.0977
shared_fc_dominance.1.weight: Grad Norm = 0.0423
shared_fc_dominance.1.bias: Grad Norm = 0.0431
output_branch_arousal.0.

Epoch 1/15 [Train]:  60%|█████▉    | 601/1004 [05:35<03:46,  1.78it/s, loss=0.2712, a_loss=0.1630, d_loss=0.1082]

fusion_layer_arousal.0.weight: Grad Norm = 0.8459
fusion_layer_arousal.0.bias: Grad Norm = 0.2063
fusion_layer_arousal.1.weight: Grad Norm = 0.0303
fusion_layer_arousal.1.bias: Grad Norm = 0.0282
fusion_layer_arousal.4.weight: Grad Norm = 0.9970
fusion_layer_arousal.4.bias: Grad Norm = 0.0488
fusion_layer_dominance.0.weight: Grad Norm = 0.4691
fusion_layer_dominance.0.bias: Grad Norm = 0.1260
fusion_layer_dominance.1.weight: Grad Norm = 0.0184
fusion_layer_dominance.1.bias: Grad Norm = 0.0160
fusion_layer_dominance.4.weight: Grad Norm = 0.6359
fusion_layer_dominance.4.bias: Grad Norm = 0.0342
shared_fc_arousal.0.weight: Grad Norm = 0.7464
shared_fc_arousal.0.bias: Grad Norm = 0.0849
shared_fc_arousal.1.weight: Grad Norm = 0.0351
shared_fc_arousal.1.bias: Grad Norm = 0.0293
shared_fc_dominance.0.weight: Grad Norm = 0.5080
shared_fc_dominance.0.bias: Grad Norm = 0.0565
shared_fc_dominance.1.weight: Grad Norm = 0.0328
shared_fc_dominance.1.bias: Grad Norm = 0.0256
output_branch_arousal.0.

Epoch 1/15 [Train]:  70%|██████▉   | 701/1004 [06:30<02:44,  1.84it/s, loss=0.4041, a_loss=0.2000, d_loss=0.2040]

fusion_layer_arousal.0.weight: Grad Norm = 1.1270
fusion_layer_arousal.0.bias: Grad Norm = 0.3469
fusion_layer_arousal.1.weight: Grad Norm = 0.0462
fusion_layer_arousal.1.bias: Grad Norm = 0.0425
fusion_layer_arousal.4.weight: Grad Norm = 1.3907
fusion_layer_arousal.4.bias: Grad Norm = 0.0858
fusion_layer_dominance.0.weight: Grad Norm = 1.0122
fusion_layer_dominance.0.bias: Grad Norm = 0.3085
fusion_layer_dominance.1.weight: Grad Norm = 0.0399
fusion_layer_dominance.1.bias: Grad Norm = 0.0370
fusion_layer_dominance.4.weight: Grad Norm = 1.1837
fusion_layer_dominance.4.bias: Grad Norm = 0.0791
shared_fc_arousal.0.weight: Grad Norm = 1.0830
shared_fc_arousal.0.bias: Grad Norm = 0.1424
shared_fc_arousal.1.weight: Grad Norm = 0.0472
shared_fc_arousal.1.bias: Grad Norm = 0.0550
shared_fc_dominance.0.weight: Grad Norm = 0.9954
shared_fc_dominance.0.bias: Grad Norm = 0.1347
shared_fc_dominance.1.weight: Grad Norm = 0.0549
shared_fc_dominance.1.bias: Grad Norm = 0.0559
output_branch_arousal.0.

Epoch 1/15 [Train]:  80%|███████▉  | 801/1004 [07:26<01:48,  1.88it/s, loss=0.4453, a_loss=0.1864, d_loss=0.2589]

fusion_layer_arousal.0.weight: Grad Norm = 0.8670
fusion_layer_arousal.0.bias: Grad Norm = 0.2299
fusion_layer_arousal.1.weight: Grad Norm = 0.0336
fusion_layer_arousal.1.bias: Grad Norm = 0.0299
fusion_layer_arousal.4.weight: Grad Norm = 0.9297
fusion_layer_arousal.4.bias: Grad Norm = 0.0564
fusion_layer_dominance.0.weight: Grad Norm = 0.6259
fusion_layer_dominance.0.bias: Grad Norm = 0.1581
fusion_layer_dominance.1.weight: Grad Norm = 0.0273
fusion_layer_dominance.1.bias: Grad Norm = 0.0224
fusion_layer_dominance.4.weight: Grad Norm = 0.7651
fusion_layer_dominance.4.bias: Grad Norm = 0.0378
shared_fc_arousal.0.weight: Grad Norm = 0.7399
shared_fc_arousal.0.bias: Grad Norm = 0.0961
shared_fc_arousal.1.weight: Grad Norm = 0.0426
shared_fc_arousal.1.bias: Grad Norm = 0.0355
shared_fc_dominance.0.weight: Grad Norm = 0.6065
shared_fc_dominance.0.bias: Grad Norm = 0.0682
shared_fc_dominance.1.weight: Grad Norm = 0.0393
shared_fc_dominance.1.bias: Grad Norm = 0.0281
output_branch_arousal.0.

Epoch 1/15 [Train]:  90%|████████▉ | 901/1004 [08:23<00:55,  1.86it/s, loss=0.4665, a_loss=0.2380, d_loss=0.2285]

fusion_layer_arousal.0.weight: Grad Norm = 0.8576
fusion_layer_arousal.0.bias: Grad Norm = 0.2205
fusion_layer_arousal.1.weight: Grad Norm = 0.0362
fusion_layer_arousal.1.bias: Grad Norm = 0.0294
fusion_layer_arousal.4.weight: Grad Norm = 1.0463
fusion_layer_arousal.4.bias: Grad Norm = 0.0575
fusion_layer_dominance.0.weight: Grad Norm = 0.5904
fusion_layer_dominance.0.bias: Grad Norm = 0.1507
fusion_layer_dominance.1.weight: Grad Norm = 0.0263
fusion_layer_dominance.1.bias: Grad Norm = 0.0214
fusion_layer_dominance.4.weight: Grad Norm = 0.8024
fusion_layer_dominance.4.bias: Grad Norm = 0.0408
shared_fc_arousal.0.weight: Grad Norm = 0.8188
shared_fc_arousal.0.bias: Grad Norm = 0.0982
shared_fc_arousal.1.weight: Grad Norm = 0.0406
shared_fc_arousal.1.bias: Grad Norm = 0.0373
shared_fc_dominance.0.weight: Grad Norm = 0.6192
shared_fc_dominance.0.bias: Grad Norm = 0.0637
shared_fc_dominance.1.weight: Grad Norm = 0.0325
shared_fc_dominance.1.bias: Grad Norm = 0.0297
output_branch_arousal.0.

Epoch 1/15 [Train]: 100%|█████████▉| 1001/1004 [09:17<00:01,  1.97it/s, loss=0.3570, a_loss=0.1981, d_loss=0.1588]

fusion_layer_arousal.0.weight: Grad Norm = 0.8663
fusion_layer_arousal.0.bias: Grad Norm = 0.1892
fusion_layer_arousal.1.weight: Grad Norm = 0.0355
fusion_layer_arousal.1.bias: Grad Norm = 0.0314
fusion_layer_arousal.4.weight: Grad Norm = 1.1058
fusion_layer_arousal.4.bias: Grad Norm = 0.0638
fusion_layer_dominance.0.weight: Grad Norm = 0.4606
fusion_layer_dominance.0.bias: Grad Norm = 0.0924
fusion_layer_dominance.1.weight: Grad Norm = 0.0207
fusion_layer_dominance.1.bias: Grad Norm = 0.0169
fusion_layer_dominance.4.weight: Grad Norm = 0.5999
fusion_layer_dominance.4.bias: Grad Norm = 0.0302
shared_fc_arousal.0.weight: Grad Norm = 0.8540
shared_fc_arousal.0.bias: Grad Norm = 0.1040
shared_fc_arousal.1.weight: Grad Norm = 0.0418
shared_fc_arousal.1.bias: Grad Norm = 0.0436
shared_fc_dominance.0.weight: Grad Norm = 0.4576
shared_fc_dominance.0.bias: Grad Norm = 0.0521
shared_fc_dominance.1.weight: Grad Norm = 0.0290
shared_fc_dominance.1.bias: Grad Norm = 0.0226
output_branch_arousal.0.

Epoch 1/15 [Train]: 100%|██████████| 1004/1004 [09:19<00:00,  1.79it/s, loss=0.3073, a_loss=0.1535, d_loss=0.1538]
  return torch._transformer_encoder_layer_fwd(
Epoch 1/15 [Val]: 100%|██████████| 126/126 [00:45<00:00,  2.75it/s, loss=0.3696, a_loss=0.2295, d_loss=0.5096]


Epoch 1/15 Results:
  Train Loss: 0.5051 (Arousal: 0.2214, Dominance: 0.2837)
  Val Loss: 0.1893 (Arousal: 0.1610, Dominance: 0.2176)
  Loss Weights - Arousal: 0.2500, Dominance: 0.7500
  Saved best model (val_loss: 0.1893)


Epoch 2/15 [Train]:   0%|          | 1/1004 [00:00<11:36,  1.44it/s, loss=0.2484, a_loss=0.3076, d_loss=0.2287]

fusion_layer_arousal.0.weight: Grad Norm = 0.2406
fusion_layer_arousal.0.bias: Grad Norm = 0.0571
fusion_layer_arousal.1.weight: Grad Norm = 0.0097
fusion_layer_arousal.1.bias: Grad Norm = 0.0086
fusion_layer_arousal.4.weight: Grad Norm = 0.2845
fusion_layer_arousal.4.bias: Grad Norm = 0.0167
fusion_layer_dominance.0.weight: Grad Norm = 0.4924
fusion_layer_dominance.0.bias: Grad Norm = 0.1173
fusion_layer_dominance.1.weight: Grad Norm = 0.0227
fusion_layer_dominance.1.bias: Grad Norm = 0.0189
fusion_layer_dominance.4.weight: Grad Norm = 0.6278
fusion_layer_dominance.4.bias: Grad Norm = 0.0321
shared_fc_arousal.0.weight: Grad Norm = 0.2075
shared_fc_arousal.0.bias: Grad Norm = 0.0252
shared_fc_arousal.1.weight: Grad Norm = 0.0095
shared_fc_arousal.1.bias: Grad Norm = 0.0098
shared_fc_dominance.0.weight: Grad Norm = 0.4778
shared_fc_dominance.0.bias: Grad Norm = 0.0538
shared_fc_dominance.1.weight: Grad Norm = 0.0273
shared_fc_dominance.1.bias: Grad Norm = 0.0245
output_branch_arousal.0.

Epoch 2/15 [Train]:  10%|█         | 101/1004 [00:56<08:24,  1.79it/s, loss=0.2441, a_loss=0.2177, d_loss=0.2530]

fusion_layer_arousal.0.weight: Grad Norm = 0.1926
fusion_layer_arousal.0.bias: Grad Norm = 0.0393
fusion_layer_arousal.1.weight: Grad Norm = 0.0071
fusion_layer_arousal.1.bias: Grad Norm = 0.0068
fusion_layer_arousal.4.weight: Grad Norm = 0.2253
fusion_layer_arousal.4.bias: Grad Norm = 0.0128
fusion_layer_dominance.0.weight: Grad Norm = 0.4792
fusion_layer_dominance.0.bias: Grad Norm = 0.1158
fusion_layer_dominance.1.weight: Grad Norm = 0.0225
fusion_layer_dominance.1.bias: Grad Norm = 0.0183
fusion_layer_dominance.4.weight: Grad Norm = 0.6068
fusion_layer_dominance.4.bias: Grad Norm = 0.0282
shared_fc_arousal.0.weight: Grad Norm = 0.1739
shared_fc_arousal.0.bias: Grad Norm = 0.0212
shared_fc_arousal.1.weight: Grad Norm = 0.0089
shared_fc_arousal.1.bias: Grad Norm = 0.0084
shared_fc_dominance.0.weight: Grad Norm = 0.5048
shared_fc_dominance.0.bias: Grad Norm = 0.0521
shared_fc_dominance.1.weight: Grad Norm = 0.0267
shared_fc_dominance.1.bias: Grad Norm = 0.0244
output_branch_arousal.0.

Epoch 2/15 [Train]:  20%|██        | 201/1004 [01:51<07:18,  1.83it/s, loss=0.1395, a_loss=0.0856, d_loss=0.1574]

fusion_layer_arousal.0.weight: Grad Norm = 0.1559
fusion_layer_arousal.0.bias: Grad Norm = 0.0427
fusion_layer_arousal.1.weight: Grad Norm = 0.0062
fusion_layer_arousal.1.bias: Grad Norm = 0.0058
fusion_layer_arousal.4.weight: Grad Norm = 0.1628
fusion_layer_arousal.4.bias: Grad Norm = 0.0104
fusion_layer_dominance.0.weight: Grad Norm = 0.5399
fusion_layer_dominance.0.bias: Grad Norm = 0.1462
fusion_layer_dominance.1.weight: Grad Norm = 0.0226
fusion_layer_dominance.1.bias: Grad Norm = 0.0202
fusion_layer_dominance.4.weight: Grad Norm = 0.6511
fusion_layer_dominance.4.bias: Grad Norm = 0.0350
shared_fc_arousal.0.weight: Grad Norm = 0.1226
shared_fc_arousal.0.bias: Grad Norm = 0.0167
shared_fc_arousal.1.weight: Grad Norm = 0.0062
shared_fc_arousal.1.bias: Grad Norm = 0.0065
shared_fc_dominance.0.weight: Grad Norm = 0.4982
shared_fc_dominance.0.bias: Grad Norm = 0.0616
shared_fc_dominance.1.weight: Grad Norm = 0.0236
shared_fc_dominance.1.bias: Grad Norm = 0.0252
output_branch_arousal.0.

Epoch 2/15 [Train]:  30%|██▉       | 301/1004 [02:45<05:51,  2.00it/s, loss=0.1545, a_loss=0.2809, d_loss=0.1124]

fusion_layer_arousal.0.weight: Grad Norm = 0.1596
fusion_layer_arousal.0.bias: Grad Norm = 0.0420
fusion_layer_arousal.1.weight: Grad Norm = 0.0064
fusion_layer_arousal.1.bias: Grad Norm = 0.0056
fusion_layer_arousal.4.weight: Grad Norm = 0.2026
fusion_layer_arousal.4.bias: Grad Norm = 0.0114
fusion_layer_dominance.0.weight: Grad Norm = 0.2767
fusion_layer_dominance.0.bias: Grad Norm = 0.0779
fusion_layer_dominance.1.weight: Grad Norm = 0.0137
fusion_layer_dominance.1.bias: Grad Norm = 0.0110
fusion_layer_dominance.4.weight: Grad Norm = 0.3800
fusion_layer_dominance.4.bias: Grad Norm = 0.0190
shared_fc_arousal.0.weight: Grad Norm = 0.1557
shared_fc_arousal.0.bias: Grad Norm = 0.0194
shared_fc_arousal.1.weight: Grad Norm = 0.0077
shared_fc_arousal.1.bias: Grad Norm = 0.0076
shared_fc_dominance.0.weight: Grad Norm = 0.3467
shared_fc_dominance.0.bias: Grad Norm = 0.0336
shared_fc_dominance.1.weight: Grad Norm = 0.0204
shared_fc_dominance.1.bias: Grad Norm = 0.0191
output_branch_arousal.0.

Epoch 2/15 [Train]:  40%|███▉      | 401/1004 [03:40<05:20,  1.88it/s, loss=0.1935, a_loss=0.1613, d_loss=0.2043]

fusion_layer_arousal.0.weight: Grad Norm = 0.1637
fusion_layer_arousal.0.bias: Grad Norm = 0.0409
fusion_layer_arousal.1.weight: Grad Norm = 0.0070
fusion_layer_arousal.1.bias: Grad Norm = 0.0057
fusion_layer_arousal.4.weight: Grad Norm = 0.1879
fusion_layer_arousal.4.bias: Grad Norm = 0.0105
fusion_layer_dominance.0.weight: Grad Norm = 0.3213
fusion_layer_dominance.0.bias: Grad Norm = 0.0816
fusion_layer_dominance.1.weight: Grad Norm = 0.0148
fusion_layer_dominance.1.bias: Grad Norm = 0.0126
fusion_layer_dominance.4.weight: Grad Norm = 0.4482
fusion_layer_dominance.4.bias: Grad Norm = 0.0213
shared_fc_arousal.0.weight: Grad Norm = 0.1545
shared_fc_arousal.0.bias: Grad Norm = 0.0206
shared_fc_arousal.1.weight: Grad Norm = 0.0086
shared_fc_arousal.1.bias: Grad Norm = 0.0079
shared_fc_dominance.0.weight: Grad Norm = 0.3308
shared_fc_dominance.0.bias: Grad Norm = 0.0349
shared_fc_dominance.1.weight: Grad Norm = 0.0174
shared_fc_dominance.1.bias: Grad Norm = 0.0165
output_branch_arousal.0.

Epoch 2/15 [Train]:  50%|████▉     | 501/1004 [04:35<04:18,  1.95it/s, loss=0.2337, a_loss=0.0943, d_loss=0.2801]

fusion_layer_arousal.0.weight: Grad Norm = 0.1428
fusion_layer_arousal.0.bias: Grad Norm = 0.0420
fusion_layer_arousal.1.weight: Grad Norm = 0.0048
fusion_layer_arousal.1.bias: Grad Norm = 0.0049
fusion_layer_arousal.4.weight: Grad Norm = 0.1599
fusion_layer_arousal.4.bias: Grad Norm = 0.0085
fusion_layer_dominance.0.weight: Grad Norm = 0.3319
fusion_layer_dominance.0.bias: Grad Norm = 0.0889
fusion_layer_dominance.1.weight: Grad Norm = 0.0170
fusion_layer_dominance.1.bias: Grad Norm = 0.0128
fusion_layer_dominance.4.weight: Grad Norm = 0.4505
fusion_layer_dominance.4.bias: Grad Norm = 0.0224
shared_fc_arousal.0.weight: Grad Norm = 0.1336
shared_fc_arousal.0.bias: Grad Norm = 0.0157
shared_fc_arousal.1.weight: Grad Norm = 0.0066
shared_fc_arousal.1.bias: Grad Norm = 0.0064
shared_fc_dominance.0.weight: Grad Norm = 0.3853
shared_fc_dominance.0.bias: Grad Norm = 0.0378
shared_fc_dominance.1.weight: Grad Norm = 0.0224
shared_fc_dominance.1.bias: Grad Norm = 0.0195
output_branch_arousal.0.

Epoch 2/15 [Train]:  60%|█████▉    | 601/1004 [05:29<03:34,  1.88it/s, loss=0.3600, a_loss=0.1785, d_loss=0.4205]

fusion_layer_arousal.0.weight: Grad Norm = 0.1360
fusion_layer_arousal.0.bias: Grad Norm = 0.0330
fusion_layer_arousal.1.weight: Grad Norm = 0.0061
fusion_layer_arousal.1.bias: Grad Norm = 0.0048
fusion_layer_arousal.4.weight: Grad Norm = 0.1631
fusion_layer_arousal.4.bias: Grad Norm = 0.0087
fusion_layer_dominance.0.weight: Grad Norm = 0.3692
fusion_layer_dominance.0.bias: Grad Norm = 0.0971
fusion_layer_dominance.1.weight: Grad Norm = 0.0191
fusion_layer_dominance.1.bias: Grad Norm = 0.0146
fusion_layer_dominance.4.weight: Grad Norm = 0.5342
fusion_layer_dominance.4.bias: Grad Norm = 0.0259
shared_fc_arousal.0.weight: Grad Norm = 0.1312
shared_fc_arousal.0.bias: Grad Norm = 0.0163
shared_fc_arousal.1.weight: Grad Norm = 0.0075
shared_fc_arousal.1.bias: Grad Norm = 0.0065
shared_fc_dominance.0.weight: Grad Norm = 0.5101
shared_fc_dominance.0.bias: Grad Norm = 0.0480
shared_fc_dominance.1.weight: Grad Norm = 0.0308
shared_fc_dominance.1.bias: Grad Norm = 0.0281
output_branch_arousal.0.

Epoch 2/15 [Train]:  70%|██████▉   | 701/1004 [06:25<02:45,  1.83it/s, loss=0.1873, a_loss=0.1609, d_loss=0.1961]

fusion_layer_arousal.0.weight: Grad Norm = 0.1467
fusion_layer_arousal.0.bias: Grad Norm = 0.0378
fusion_layer_arousal.1.weight: Grad Norm = 0.0064
fusion_layer_arousal.1.bias: Grad Norm = 0.0054
fusion_layer_arousal.4.weight: Grad Norm = 0.1774
fusion_layer_arousal.4.bias: Grad Norm = 0.0108
fusion_layer_dominance.0.weight: Grad Norm = 0.3876
fusion_layer_dominance.0.bias: Grad Norm = 0.1097
fusion_layer_dominance.1.weight: Grad Norm = 0.0226
fusion_layer_dominance.1.bias: Grad Norm = 0.0152
fusion_layer_dominance.4.weight: Grad Norm = 0.5155
fusion_layer_dominance.4.bias: Grad Norm = 0.0267
shared_fc_arousal.0.weight: Grad Norm = 0.1497
shared_fc_arousal.0.bias: Grad Norm = 0.0194
shared_fc_arousal.1.weight: Grad Norm = 0.0082
shared_fc_arousal.1.bias: Grad Norm = 0.0079
shared_fc_dominance.0.weight: Grad Norm = 0.4070
shared_fc_dominance.0.bias: Grad Norm = 0.0430
shared_fc_dominance.1.weight: Grad Norm = 0.0193
shared_fc_dominance.1.bias: Grad Norm = 0.0212
output_branch_arousal.0.

Epoch 2/15 [Train]:  80%|███████▉  | 801/1004 [07:19<01:52,  1.81it/s, loss=0.1172, a_loss=0.2786, d_loss=0.0634]

fusion_layer_arousal.0.weight: Grad Norm = 0.1741
fusion_layer_arousal.0.bias: Grad Norm = 0.0335
fusion_layer_arousal.1.weight: Grad Norm = 0.0076
fusion_layer_arousal.1.bias: Grad Norm = 0.0058
fusion_layer_arousal.4.weight: Grad Norm = 0.2214
fusion_layer_arousal.4.bias: Grad Norm = 0.0105
fusion_layer_dominance.0.weight: Grad Norm = 0.1657
fusion_layer_dominance.0.bias: Grad Norm = 0.0358
fusion_layer_dominance.1.weight: Grad Norm = 0.0089
fusion_layer_dominance.1.bias: Grad Norm = 0.0064
fusion_layer_dominance.4.weight: Grad Norm = 0.2175
fusion_layer_dominance.4.bias: Grad Norm = 0.0098
shared_fc_arousal.0.weight: Grad Norm = 0.1830
shared_fc_arousal.0.bias: Grad Norm = 0.0202
shared_fc_arousal.1.weight: Grad Norm = 0.0096
shared_fc_arousal.1.bias: Grad Norm = 0.0089
shared_fc_dominance.0.weight: Grad Norm = 0.1804
shared_fc_dominance.0.bias: Grad Norm = 0.0157
shared_fc_dominance.1.weight: Grad Norm = 0.0108
shared_fc_dominance.1.bias: Grad Norm = 0.0094
output_branch_arousal.0.

Epoch 2/15 [Train]:  90%|████████▉ | 901/1004 [08:13<00:53,  1.92it/s, loss=0.1846, a_loss=0.1307, d_loss=0.2026]

fusion_layer_arousal.0.weight: Grad Norm = 0.1509
fusion_layer_arousal.0.bias: Grad Norm = 0.0299
fusion_layer_arousal.1.weight: Grad Norm = 0.0056
fusion_layer_arousal.1.bias: Grad Norm = 0.0052
fusion_layer_arousal.4.weight: Grad Norm = 0.1770
fusion_layer_arousal.4.bias: Grad Norm = 0.0094
fusion_layer_dominance.0.weight: Grad Norm = 0.2631
fusion_layer_dominance.0.bias: Grad Norm = 0.0618
fusion_layer_dominance.1.weight: Grad Norm = 0.0127
fusion_layer_dominance.1.bias: Grad Norm = 0.0107
fusion_layer_dominance.4.weight: Grad Norm = 0.3778
fusion_layer_dominance.4.bias: Grad Norm = 0.0208
shared_fc_arousal.0.weight: Grad Norm = 0.1381
shared_fc_arousal.0.bias: Grad Norm = 0.0179
shared_fc_arousal.1.weight: Grad Norm = 0.0071
shared_fc_arousal.1.bias: Grad Norm = 0.0067
shared_fc_dominance.0.weight: Grad Norm = 0.3438
shared_fc_dominance.0.bias: Grad Norm = 0.0357
shared_fc_dominance.1.weight: Grad Norm = 0.0232
shared_fc_dominance.1.bias: Grad Norm = 0.0199
output_branch_arousal.0.

Epoch 2/15 [Train]: 100%|█████████▉| 1001/1004 [09:08<00:01,  1.86it/s, loss=0.1843, a_loss=0.1111, d_loss=0.2086]

fusion_layer_arousal.0.weight: Grad Norm = 0.1662
fusion_layer_arousal.0.bias: Grad Norm = 0.0486
fusion_layer_arousal.1.weight: Grad Norm = 0.0071
fusion_layer_arousal.1.bias: Grad Norm = 0.0064
fusion_layer_arousal.4.weight: Grad Norm = 0.1971
fusion_layer_arousal.4.bias: Grad Norm = 0.0123
fusion_layer_dominance.0.weight: Grad Norm = 0.3394
fusion_layer_dominance.0.bias: Grad Norm = 0.0958
fusion_layer_dominance.1.weight: Grad Norm = 0.0169
fusion_layer_dominance.1.bias: Grad Norm = 0.0137
fusion_layer_dominance.4.weight: Grad Norm = 0.4451
fusion_layer_dominance.4.bias: Grad Norm = 0.0254
shared_fc_arousal.0.weight: Grad Norm = 0.1492
shared_fc_arousal.0.bias: Grad Norm = 0.0200
shared_fc_arousal.1.weight: Grad Norm = 0.0072
shared_fc_arousal.1.bias: Grad Norm = 0.0077
shared_fc_dominance.0.weight: Grad Norm = 0.3891
shared_fc_dominance.0.bias: Grad Norm = 0.0416
shared_fc_dominance.1.weight: Grad Norm = 0.0241
shared_fc_dominance.1.bias: Grad Norm = 0.0232
output_branch_arousal.0.

Epoch 2/15 [Train]: 100%|██████████| 1004/1004 [09:09<00:00,  1.83it/s, loss=0.1755, a_loss=0.1477, d_loss=0.1848]
Epoch 2/15 [Val]: 100%|██████████| 126/126 [00:44<00:00,  2.82it/s, loss=0.2787, a_loss=0.2486, d_loss=0.3087]


Epoch 2/15 Results:
  Train Loss: 0.2093 (Arousal: 0.1664, Dominance: 0.2236)
  Val Loss: 0.1594 (Arousal: 0.1294, Dominance: 0.1895)
  Loss Weights - Arousal: 0.2500, Dominance: 0.7500
  Saved best model (val_loss: 0.1594)


Epoch 3/15 [Train]:   0%|          | 1/1004 [00:00<09:05,  1.84it/s, loss=0.2278, a_loss=0.0716, d_loss=0.2799]

fusion_layer_arousal.0.weight: Grad Norm = 0.0956
fusion_layer_arousal.0.bias: Grad Norm = 0.0232
fusion_layer_arousal.1.weight: Grad Norm = 0.0033
fusion_layer_arousal.1.bias: Grad Norm = 0.0035
fusion_layer_arousal.4.weight: Grad Norm = 0.1196
fusion_layer_arousal.4.bias: Grad Norm = 0.0073
fusion_layer_dominance.0.weight: Grad Norm = 0.3256
fusion_layer_dominance.0.bias: Grad Norm = 0.0771
fusion_layer_dominance.1.weight: Grad Norm = 0.0161
fusion_layer_dominance.1.bias: Grad Norm = 0.0129
fusion_layer_dominance.4.weight: Grad Norm = 0.5042
fusion_layer_dominance.4.bias: Grad Norm = 0.0238
shared_fc_arousal.0.weight: Grad Norm = 0.0952
shared_fc_arousal.0.bias: Grad Norm = 0.0121
shared_fc_arousal.1.weight: Grad Norm = 0.0055
shared_fc_arousal.1.bias: Grad Norm = 0.0049
shared_fc_dominance.0.weight: Grad Norm = 0.4144
shared_fc_dominance.0.bias: Grad Norm = 0.0389
shared_fc_dominance.1.weight: Grad Norm = 0.0229
shared_fc_dominance.1.bias: Grad Norm = 0.0238
output_branch_arousal.0.

Epoch 3/15 [Train]:  10%|█         | 101/1004 [00:55<07:11,  2.09it/s, loss=0.2027, a_loss=0.3010, d_loss=0.1699]

fusion_layer_arousal.0.weight: Grad Norm = 0.1441
fusion_layer_arousal.0.bias: Grad Norm = 0.0319
fusion_layer_arousal.1.weight: Grad Norm = 0.0055
fusion_layer_arousal.1.bias: Grad Norm = 0.0050
fusion_layer_arousal.4.weight: Grad Norm = 0.2090
fusion_layer_arousal.4.bias: Grad Norm = 0.0109
fusion_layer_dominance.0.weight: Grad Norm = 0.2972
fusion_layer_dominance.0.bias: Grad Norm = 0.0712
fusion_layer_dominance.1.weight: Grad Norm = 0.0150
fusion_layer_dominance.1.bias: Grad Norm = 0.0113
fusion_layer_dominance.4.weight: Grad Norm = 0.4277
fusion_layer_dominance.4.bias: Grad Norm = 0.0198
shared_fc_arousal.0.weight: Grad Norm = 0.1622
shared_fc_arousal.0.bias: Grad Norm = 0.0185
shared_fc_arousal.1.weight: Grad Norm = 0.0090
shared_fc_arousal.1.bias: Grad Norm = 0.0082
shared_fc_dominance.0.weight: Grad Norm = 0.3625
shared_fc_dominance.0.bias: Grad Norm = 0.0339
shared_fc_dominance.1.weight: Grad Norm = 0.0207
shared_fc_dominance.1.bias: Grad Norm = 0.0187
output_branch_arousal.0.

Epoch 3/15 [Train]:  20%|██        | 201/1004 [01:51<06:36,  2.02it/s, loss=0.1526, a_loss=0.1708, d_loss=0.1465]

fusion_layer_arousal.0.weight: Grad Norm = 0.1453
fusion_layer_arousal.0.bias: Grad Norm = 0.0272
fusion_layer_arousal.1.weight: Grad Norm = 0.0062
fusion_layer_arousal.1.bias: Grad Norm = 0.0050
fusion_layer_arousal.4.weight: Grad Norm = 0.1521
fusion_layer_arousal.4.bias: Grad Norm = 0.0077
fusion_layer_dominance.0.weight: Grad Norm = 0.2580
fusion_layer_dominance.0.bias: Grad Norm = 0.0569
fusion_layer_dominance.1.weight: Grad Norm = 0.0125
fusion_layer_dominance.1.bias: Grad Norm = 0.0105
fusion_layer_dominance.4.weight: Grad Norm = 0.3702
fusion_layer_dominance.4.bias: Grad Norm = 0.0182
shared_fc_arousal.0.weight: Grad Norm = 0.1225
shared_fc_arousal.0.bias: Grad Norm = 0.0134
shared_fc_arousal.1.weight: Grad Norm = 0.0072
shared_fc_arousal.1.bias: Grad Norm = 0.0058
shared_fc_dominance.0.weight: Grad Norm = 0.3158
shared_fc_dominance.0.bias: Grad Norm = 0.0310
shared_fc_dominance.1.weight: Grad Norm = 0.0204
shared_fc_dominance.1.bias: Grad Norm = 0.0176
output_branch_arousal.0.

Epoch 3/15 [Train]:  30%|██▉       | 301/1004 [02:46<06:32,  1.79it/s, loss=0.1488, a_loss=0.0352, d_loss=0.1867]

fusion_layer_arousal.0.weight: Grad Norm = 0.0663
fusion_layer_arousal.0.bias: Grad Norm = 0.0143
fusion_layer_arousal.1.weight: Grad Norm = 0.0028
fusion_layer_arousal.1.bias: Grad Norm = 0.0024
fusion_layer_arousal.4.weight: Grad Norm = 0.0725
fusion_layer_arousal.4.bias: Grad Norm = 0.0041
fusion_layer_dominance.0.weight: Grad Norm = 0.2755
fusion_layer_dominance.0.bias: Grad Norm = 0.0623
fusion_layer_dominance.1.weight: Grad Norm = 0.0139
fusion_layer_dominance.1.bias: Grad Norm = 0.0118
fusion_layer_dominance.4.weight: Grad Norm = 0.4007
fusion_layer_dominance.4.bias: Grad Norm = 0.0218
shared_fc_arousal.0.weight: Grad Norm = 0.0588
shared_fc_arousal.0.bias: Grad Norm = 0.0067
shared_fc_arousal.1.weight: Grad Norm = 0.0029
shared_fc_arousal.1.bias: Grad Norm = 0.0029
shared_fc_dominance.0.weight: Grad Norm = 0.3522
shared_fc_dominance.0.bias: Grad Norm = 0.0335
shared_fc_dominance.1.weight: Grad Norm = 0.0235
shared_fc_dominance.1.bias: Grad Norm = 0.0209
output_branch_arousal.0.

Epoch 3/15 [Train]:  40%|███▉      | 401/1004 [03:40<06:02,  1.66it/s, loss=0.2226, a_loss=0.1722, d_loss=0.2393]

fusion_layer_arousal.0.weight: Grad Norm = 0.1123
fusion_layer_arousal.0.bias: Grad Norm = 0.0310
fusion_layer_arousal.1.weight: Grad Norm = 0.0042
fusion_layer_arousal.1.bias: Grad Norm = 0.0040
fusion_layer_arousal.4.weight: Grad Norm = 0.1355
fusion_layer_arousal.4.bias: Grad Norm = 0.0073
fusion_layer_dominance.0.weight: Grad Norm = 0.2696
fusion_layer_dominance.0.bias: Grad Norm = 0.0714
fusion_layer_dominance.1.weight: Grad Norm = 0.0146
fusion_layer_dominance.1.bias: Grad Norm = 0.0115
fusion_layer_dominance.4.weight: Grad Norm = 0.3767
fusion_layer_dominance.4.bias: Grad Norm = 0.0188
shared_fc_arousal.0.weight: Grad Norm = 0.1059
shared_fc_arousal.0.bias: Grad Norm = 0.0119
shared_fc_arousal.1.weight: Grad Norm = 0.0055
shared_fc_arousal.1.bias: Grad Norm = 0.0053
shared_fc_dominance.0.weight: Grad Norm = 0.3079
shared_fc_dominance.0.bias: Grad Norm = 0.0311
shared_fc_dominance.1.weight: Grad Norm = 0.0186
shared_fc_dominance.1.bias: Grad Norm = 0.0165
output_branch_arousal.0.

Epoch 3/15 [Train]:  50%|████▉     | 501/1004 [04:33<05:03,  1.66it/s, loss=0.1525, a_loss=0.0832, d_loss=0.1756]

fusion_layer_arousal.0.weight: Grad Norm = 0.0811
fusion_layer_arousal.0.bias: Grad Norm = 0.0219
fusion_layer_arousal.1.weight: Grad Norm = 0.0034
fusion_layer_arousal.1.bias: Grad Norm = 0.0029
fusion_layer_arousal.4.weight: Grad Norm = 0.1036
fusion_layer_arousal.4.bias: Grad Norm = 0.0051
fusion_layer_dominance.0.weight: Grad Norm = 0.2050
fusion_layer_dominance.0.bias: Grad Norm = 0.0353
fusion_layer_dominance.1.weight: Grad Norm = 0.0113
fusion_layer_dominance.1.bias: Grad Norm = 0.0090
fusion_layer_dominance.4.weight: Grad Norm = 0.3131
fusion_layer_dominance.4.bias: Grad Norm = 0.0141
shared_fc_arousal.0.weight: Grad Norm = 0.0823
shared_fc_arousal.0.bias: Grad Norm = 0.0087
shared_fc_arousal.1.weight: Grad Norm = 0.0042
shared_fc_arousal.1.bias: Grad Norm = 0.0041
shared_fc_dominance.0.weight: Grad Norm = 0.2727
shared_fc_dominance.0.bias: Grad Norm = 0.0248
shared_fc_dominance.1.weight: Grad Norm = 0.0155
shared_fc_dominance.1.bias: Grad Norm = 0.0151
output_branch_arousal.0.

Epoch 3/15 [Train]:  60%|█████▉    | 601/1004 [05:29<04:09,  1.62it/s, loss=0.2961, a_loss=0.1345, d_loss=0.3500]

fusion_layer_arousal.0.weight: Grad Norm = 0.1169
fusion_layer_arousal.0.bias: Grad Norm = 0.0297
fusion_layer_arousal.1.weight: Grad Norm = 0.0046
fusion_layer_arousal.1.bias: Grad Norm = 0.0041
fusion_layer_arousal.4.weight: Grad Norm = 0.1291
fusion_layer_arousal.4.bias: Grad Norm = 0.0071
fusion_layer_dominance.0.weight: Grad Norm = 0.2253
fusion_layer_dominance.0.bias: Grad Norm = 0.0576
fusion_layer_dominance.1.weight: Grad Norm = 0.0107
fusion_layer_dominance.1.bias: Grad Norm = 0.0095
fusion_layer_dominance.4.weight: Grad Norm = 0.3470
fusion_layer_dominance.4.bias: Grad Norm = 0.0169
shared_fc_arousal.0.weight: Grad Norm = 0.1037
shared_fc_arousal.0.bias: Grad Norm = 0.0125
shared_fc_arousal.1.weight: Grad Norm = 0.0049
shared_fc_arousal.1.bias: Grad Norm = 0.0050
shared_fc_dominance.0.weight: Grad Norm = 0.3223
shared_fc_dominance.0.bias: Grad Norm = 0.0291
shared_fc_dominance.1.weight: Grad Norm = 0.0222
shared_fc_dominance.1.bias: Grad Norm = 0.0195
output_branch_arousal.0.

Epoch 3/15 [Train]:  70%|██████▉   | 701/1004 [06:25<02:41,  1.87it/s, loss=0.2472, a_loss=0.2428, d_loss=0.2487]

fusion_layer_arousal.0.weight: Grad Norm = 0.1566
fusion_layer_arousal.0.bias: Grad Norm = 0.0342
fusion_layer_arousal.1.weight: Grad Norm = 0.0060
fusion_layer_arousal.1.bias: Grad Norm = 0.0056
fusion_layer_arousal.4.weight: Grad Norm = 0.1832
fusion_layer_arousal.4.bias: Grad Norm = 0.0093
fusion_layer_dominance.0.weight: Grad Norm = 0.2086
fusion_layer_dominance.0.bias: Grad Norm = 0.0496
fusion_layer_dominance.1.weight: Grad Norm = 0.0111
fusion_layer_dominance.1.bias: Grad Norm = 0.0089
fusion_layer_dominance.4.weight: Grad Norm = 0.3116
fusion_layer_dominance.4.bias: Grad Norm = 0.0138
shared_fc_arousal.0.weight: Grad Norm = 0.1528
shared_fc_arousal.0.bias: Grad Norm = 0.0166
shared_fc_arousal.1.weight: Grad Norm = 0.0077
shared_fc_arousal.1.bias: Grad Norm = 0.0079
shared_fc_dominance.0.weight: Grad Norm = 0.2755
shared_fc_dominance.0.bias: Grad Norm = 0.0247
shared_fc_dominance.1.weight: Grad Norm = 0.0185
shared_fc_dominance.1.bias: Grad Norm = 0.0156
output_branch_arousal.0.

Epoch 3/15 [Train]:  80%|███████▉  | 801/1004 [07:18<01:56,  1.75it/s, loss=0.0717, a_loss=0.0386, d_loss=0.0828]

fusion_layer_arousal.0.weight: Grad Norm = 0.0471
fusion_layer_arousal.0.bias: Grad Norm = 0.0103
fusion_layer_arousal.1.weight: Grad Norm = 0.0017
fusion_layer_arousal.1.bias: Grad Norm = 0.0017
fusion_layer_arousal.4.weight: Grad Norm = 0.0595
fusion_layer_arousal.4.bias: Grad Norm = 0.0028
fusion_layer_dominance.0.weight: Grad Norm = 0.1366
fusion_layer_dominance.0.bias: Grad Norm = 0.0299
fusion_layer_dominance.1.weight: Grad Norm = 0.0069
fusion_layer_dominance.1.bias: Grad Norm = 0.0062
fusion_layer_dominance.4.weight: Grad Norm = 0.1932
fusion_layer_dominance.4.bias: Grad Norm = 0.0098
shared_fc_arousal.0.weight: Grad Norm = 0.0483
shared_fc_arousal.0.bias: Grad Norm = 0.0046
shared_fc_arousal.1.weight: Grad Norm = 0.0033
shared_fc_arousal.1.bias: Grad Norm = 0.0025
shared_fc_dominance.0.weight: Grad Norm = 0.1676
shared_fc_dominance.0.bias: Grad Norm = 0.0157
shared_fc_dominance.1.weight: Grad Norm = 0.0098
shared_fc_dominance.1.bias: Grad Norm = 0.0100
output_branch_arousal.0.

Epoch 3/15 [Train]:  90%|████████▉ | 901/1004 [08:13<00:59,  1.75it/s, loss=0.1788, a_loss=0.2013, d_loss=0.1713]

fusion_layer_arousal.0.weight: Grad Norm = 0.1316
fusion_layer_arousal.0.bias: Grad Norm = 0.0246
fusion_layer_arousal.1.weight: Grad Norm = 0.0051
fusion_layer_arousal.1.bias: Grad Norm = 0.0046
fusion_layer_arousal.4.weight: Grad Norm = 0.1524
fusion_layer_arousal.4.bias: Grad Norm = 0.0093
fusion_layer_dominance.0.weight: Grad Norm = 0.2094
fusion_layer_dominance.0.bias: Grad Norm = 0.0492
fusion_layer_dominance.1.weight: Grad Norm = 0.0121
fusion_layer_dominance.1.bias: Grad Norm = 0.0094
fusion_layer_dominance.4.weight: Grad Norm = 0.2592
fusion_layer_dominance.4.bias: Grad Norm = 0.0150
shared_fc_arousal.0.weight: Grad Norm = 0.1165
shared_fc_arousal.0.bias: Grad Norm = 0.0143
shared_fc_arousal.1.weight: Grad Norm = 0.0064
shared_fc_arousal.1.bias: Grad Norm = 0.0060
shared_fc_dominance.0.weight: Grad Norm = 0.2193
shared_fc_dominance.0.bias: Grad Norm = 0.0229
shared_fc_dominance.1.weight: Grad Norm = 0.0135
shared_fc_dominance.1.bias: Grad Norm = 0.0128
output_branch_arousal.0.

Epoch 3/15 [Train]: 100%|█████████▉| 1001/1004 [09:08<00:01,  1.92it/s, loss=0.1944, a_loss=0.2009, d_loss=0.1922]

fusion_layer_arousal.0.weight: Grad Norm = 0.1267
fusion_layer_arousal.0.bias: Grad Norm = 0.0327
fusion_layer_arousal.1.weight: Grad Norm = 0.0054
fusion_layer_arousal.1.bias: Grad Norm = 0.0049
fusion_layer_arousal.4.weight: Grad Norm = 0.1570
fusion_layer_arousal.4.bias: Grad Norm = 0.0094
fusion_layer_dominance.0.weight: Grad Norm = 0.2967
fusion_layer_dominance.0.bias: Grad Norm = 0.0836
fusion_layer_dominance.1.weight: Grad Norm = 0.0164
fusion_layer_dominance.1.bias: Grad Norm = 0.0128
fusion_layer_dominance.4.weight: Grad Norm = 0.3590
fusion_layer_dominance.4.bias: Grad Norm = 0.0186
shared_fc_arousal.0.weight: Grad Norm = 0.1247
shared_fc_arousal.0.bias: Grad Norm = 0.0156
shared_fc_arousal.1.weight: Grad Norm = 0.0080
shared_fc_arousal.1.bias: Grad Norm = 0.0072
shared_fc_dominance.0.weight: Grad Norm = 0.3231
shared_fc_dominance.0.bias: Grad Norm = 0.0317
shared_fc_dominance.1.weight: Grad Norm = 0.0161
shared_fc_dominance.1.bias: Grad Norm = 0.0181
output_branch_arousal.0.

Epoch 3/15 [Train]: 100%|██████████| 1004/1004 [09:09<00:00,  1.83it/s, loss=0.1696, a_loss=0.0682, d_loss=0.2034]
Epoch 3/15 [Val]: 100%|██████████| 126/126 [00:44<00:00,  2.81it/s, loss=0.3748, a_loss=0.2896, d_loss=0.4600]


Epoch 3/15 Results:
  Train Loss: 0.1974 (Arousal: 0.1564, Dominance: 0.2110)
  Val Loss: 0.1586 (Arousal: 0.1281, Dominance: 0.1891)
  Loss Weights - Arousal: 0.2500, Dominance: 0.7500
  Saved best model (val_loss: 0.1586)


Epoch 4/15 [Train]:   0%|          | 1/1004 [00:00<11:42,  1.43it/s, loss=0.2031, a_loss=0.0894, d_loss=0.2410]

fusion_layer_arousal.0.weight: Grad Norm = 0.0948
fusion_layer_arousal.0.bias: Grad Norm = 0.0222
fusion_layer_arousal.1.weight: Grad Norm = 0.0040
fusion_layer_arousal.1.bias: Grad Norm = 0.0035
fusion_layer_arousal.4.weight: Grad Norm = 0.1168
fusion_layer_arousal.4.bias: Grad Norm = 0.0065
fusion_layer_dominance.0.weight: Grad Norm = 0.2511
fusion_layer_dominance.0.bias: Grad Norm = 0.0616
fusion_layer_dominance.1.weight: Grad Norm = 0.0130
fusion_layer_dominance.1.bias: Grad Norm = 0.0114
fusion_layer_dominance.4.weight: Grad Norm = 0.3134
fusion_layer_dominance.4.bias: Grad Norm = 0.0188
shared_fc_arousal.0.weight: Grad Norm = 0.0929
shared_fc_arousal.0.bias: Grad Norm = 0.0111
shared_fc_arousal.1.weight: Grad Norm = 0.0062
shared_fc_arousal.1.bias: Grad Norm = 0.0050
shared_fc_dominance.0.weight: Grad Norm = 0.2687
shared_fc_dominance.0.bias: Grad Norm = 0.0282
shared_fc_dominance.1.weight: Grad Norm = 0.0188
shared_fc_dominance.1.bias: Grad Norm = 0.0168
output_branch_arousal.0.

Epoch 4/15 [Train]:  10%|█         | 101/1004 [00:54<09:22,  1.60it/s, loss=0.2306, a_loss=0.0751, d_loss=0.2824]

fusion_layer_arousal.0.weight: Grad Norm = 0.0984
fusion_layer_arousal.0.bias: Grad Norm = 0.0180
fusion_layer_arousal.1.weight: Grad Norm = 0.0043
fusion_layer_arousal.1.bias: Grad Norm = 0.0036
fusion_layer_arousal.4.weight: Grad Norm = 0.1148
fusion_layer_arousal.4.bias: Grad Norm = 0.0066
fusion_layer_dominance.0.weight: Grad Norm = 0.2417
fusion_layer_dominance.0.bias: Grad Norm = 0.0476
fusion_layer_dominance.1.weight: Grad Norm = 0.0113
fusion_layer_dominance.1.bias: Grad Norm = 0.0097
fusion_layer_dominance.4.weight: Grad Norm = 0.3156
fusion_layer_dominance.4.bias: Grad Norm = 0.0161
shared_fc_arousal.0.weight: Grad Norm = 0.0904
shared_fc_arousal.0.bias: Grad Norm = 0.0116
shared_fc_arousal.1.weight: Grad Norm = 0.0056
shared_fc_arousal.1.bias: Grad Norm = 0.0047
shared_fc_dominance.0.weight: Grad Norm = 0.2861
shared_fc_dominance.0.bias: Grad Norm = 0.0271
shared_fc_dominance.1.weight: Grad Norm = 0.0166
shared_fc_dominance.1.bias: Grad Norm = 0.0159
output_branch_arousal.0.

Epoch 4/15 [Train]:  20%|██        | 201/1004 [01:49<08:10,  1.64it/s, loss=0.2142, a_loss=0.0501, d_loss=0.2689]

fusion_layer_arousal.0.weight: Grad Norm = 0.0739
fusion_layer_arousal.0.bias: Grad Norm = 0.0164
fusion_layer_arousal.1.weight: Grad Norm = 0.0031
fusion_layer_arousal.1.bias: Grad Norm = 0.0026
fusion_layer_arousal.4.weight: Grad Norm = 0.0798
fusion_layer_arousal.4.bias: Grad Norm = 0.0041
fusion_layer_dominance.0.weight: Grad Norm = 0.3034
fusion_layer_dominance.0.bias: Grad Norm = 0.0783
fusion_layer_dominance.1.weight: Grad Norm = 0.0159
fusion_layer_dominance.1.bias: Grad Norm = 0.0140
fusion_layer_dominance.4.weight: Grad Norm = 0.3671
fusion_layer_dominance.4.bias: Grad Norm = 0.0247
shared_fc_arousal.0.weight: Grad Norm = 0.0624
shared_fc_arousal.0.bias: Grad Norm = 0.0069
shared_fc_arousal.1.weight: Grad Norm = 0.0033
shared_fc_arousal.1.bias: Grad Norm = 0.0030
shared_fc_dominance.0.weight: Grad Norm = 0.3286
shared_fc_dominance.0.bias: Grad Norm = 0.0389
shared_fc_dominance.1.weight: Grad Norm = 0.0242
shared_fc_dominance.1.bias: Grad Norm = 0.0208
output_branch_arousal.0.

Epoch 4/15 [Train]:  30%|██▉       | 301/1004 [02:44<07:30,  1.56it/s, loss=0.1598, a_loss=0.1785, d_loss=0.1535]

fusion_layer_arousal.0.weight: Grad Norm = 0.1608
fusion_layer_arousal.0.bias: Grad Norm = 0.0451
fusion_layer_arousal.1.weight: Grad Norm = 0.0060
fusion_layer_arousal.1.bias: Grad Norm = 0.0060
fusion_layer_arousal.4.weight: Grad Norm = 0.1967
fusion_layer_arousal.4.bias: Grad Norm = 0.0135
fusion_layer_dominance.0.weight: Grad Norm = 0.1835
fusion_layer_dominance.0.bias: Grad Norm = 0.0413
fusion_layer_dominance.1.weight: Grad Norm = 0.0104
fusion_layer_dominance.1.bias: Grad Norm = 0.0080
fusion_layer_dominance.4.weight: Grad Norm = 0.2727
fusion_layer_dominance.4.bias: Grad Norm = 0.0134
shared_fc_arousal.0.weight: Grad Norm = 0.1425
shared_fc_arousal.0.bias: Grad Norm = 0.0187
shared_fc_arousal.1.weight: Grad Norm = 0.0086
shared_fc_arousal.1.bias: Grad Norm = 0.0078
shared_fc_dominance.0.weight: Grad Norm = 0.2504
shared_fc_dominance.0.bias: Grad Norm = 0.0238
shared_fc_dominance.1.weight: Grad Norm = 0.0172
shared_fc_dominance.1.bias: Grad Norm = 0.0147
output_branch_arousal.0.

Epoch 4/15 [Train]:  40%|███▉      | 401/1004 [03:39<05:30,  1.83it/s, loss=0.1655, a_loss=0.1320, d_loss=0.1766]

fusion_layer_arousal.0.weight: Grad Norm = 0.1155
fusion_layer_arousal.0.bias: Grad Norm = 0.0297
fusion_layer_arousal.1.weight: Grad Norm = 0.0051
fusion_layer_arousal.1.bias: Grad Norm = 0.0043
fusion_layer_arousal.4.weight: Grad Norm = 0.1379
fusion_layer_arousal.4.bias: Grad Norm = 0.0073
fusion_layer_dominance.0.weight: Grad Norm = 0.2175
fusion_layer_dominance.0.bias: Grad Norm = 0.0462
fusion_layer_dominance.1.weight: Grad Norm = 0.0119
fusion_layer_dominance.1.bias: Grad Norm = 0.0093
fusion_layer_dominance.4.weight: Grad Norm = 0.3146
fusion_layer_dominance.4.bias: Grad Norm = 0.0146
shared_fc_arousal.0.weight: Grad Norm = 0.1127
shared_fc_arousal.0.bias: Grad Norm = 0.0121
shared_fc_arousal.1.weight: Grad Norm = 0.0058
shared_fc_arousal.1.bias: Grad Norm = 0.0059
shared_fc_dominance.0.weight: Grad Norm = 0.2822
shared_fc_dominance.0.bias: Grad Norm = 0.0264
shared_fc_dominance.1.weight: Grad Norm = 0.0160
shared_fc_dominance.1.bias: Grad Norm = 0.0165
output_branch_arousal.0.

Epoch 4/15 [Train]:  50%|████▉     | 501/1004 [04:35<05:00,  1.68it/s, loss=0.2485, a_loss=0.1303, d_loss=0.2878]

fusion_layer_arousal.0.weight: Grad Norm = 0.1353
fusion_layer_arousal.0.bias: Grad Norm = 0.0382
fusion_layer_arousal.1.weight: Grad Norm = 0.0063
fusion_layer_arousal.1.bias: Grad Norm = 0.0051
fusion_layer_arousal.4.weight: Grad Norm = 0.1522
fusion_layer_arousal.4.bias: Grad Norm = 0.0083
fusion_layer_dominance.0.weight: Grad Norm = 0.2580
fusion_layer_dominance.0.bias: Grad Norm = 0.0694
fusion_layer_dominance.1.weight: Grad Norm = 0.0137
fusion_layer_dominance.1.bias: Grad Norm = 0.0112
fusion_layer_dominance.4.weight: Grad Norm = 0.3337
fusion_layer_dominance.4.bias: Grad Norm = 0.0163
shared_fc_arousal.0.weight: Grad Norm = 0.1213
shared_fc_arousal.0.bias: Grad Norm = 0.0145
shared_fc_arousal.1.weight: Grad Norm = 0.0079
shared_fc_arousal.1.bias: Grad Norm = 0.0064
shared_fc_dominance.0.weight: Grad Norm = 0.2988
shared_fc_dominance.0.bias: Grad Norm = 0.0287
shared_fc_dominance.1.weight: Grad Norm = 0.0189
shared_fc_dominance.1.bias: Grad Norm = 0.0173
output_branch_arousal.0.

Epoch 4/15 [Train]:  60%|█████▉    | 601/1004 [05:29<03:41,  1.82it/s, loss=0.1089, a_loss=0.1527, d_loss=0.0943]

fusion_layer_arousal.0.weight: Grad Norm = 0.1098
fusion_layer_arousal.0.bias: Grad Norm = 0.0259
fusion_layer_arousal.1.weight: Grad Norm = 0.0045
fusion_layer_arousal.1.bias: Grad Norm = 0.0039
fusion_layer_arousal.4.weight: Grad Norm = 0.1390
fusion_layer_arousal.4.bias: Grad Norm = 0.0071
fusion_layer_dominance.0.weight: Grad Norm = 0.1636
fusion_layer_dominance.0.bias: Grad Norm = 0.0439
fusion_layer_dominance.1.weight: Grad Norm = 0.0096
fusion_layer_dominance.1.bias: Grad Norm = 0.0074
fusion_layer_dominance.4.weight: Grad Norm = 0.2144
fusion_layer_dominance.4.bias: Grad Norm = 0.0114
shared_fc_arousal.0.weight: Grad Norm = 0.1098
shared_fc_arousal.0.bias: Grad Norm = 0.0124
shared_fc_arousal.1.weight: Grad Norm = 0.0059
shared_fc_arousal.1.bias: Grad Norm = 0.0057
shared_fc_dominance.0.weight: Grad Norm = 0.2026
shared_fc_dominance.0.bias: Grad Norm = 0.0180
shared_fc_dominance.1.weight: Grad Norm = 0.0129
shared_fc_dominance.1.bias: Grad Norm = 0.0127
output_branch_arousal.0.

Epoch 4/15 [Train]:  70%|██████▉   | 701/1004 [06:23<02:39,  1.90it/s, loss=0.2665, a_loss=0.0792, d_loss=0.3290]

fusion_layer_arousal.0.weight: Grad Norm = 0.0860
fusion_layer_arousal.0.bias: Grad Norm = 0.0215
fusion_layer_arousal.1.weight: Grad Norm = 0.0042
fusion_layer_arousal.1.bias: Grad Norm = 0.0031
fusion_layer_arousal.4.weight: Grad Norm = 0.1078
fusion_layer_arousal.4.bias: Grad Norm = 0.0057
fusion_layer_dominance.0.weight: Grad Norm = 0.2442
fusion_layer_dominance.0.bias: Grad Norm = 0.0545
fusion_layer_dominance.1.weight: Grad Norm = 0.0128
fusion_layer_dominance.1.bias: Grad Norm = 0.0103
fusion_layer_dominance.4.weight: Grad Norm = 0.3408
fusion_layer_dominance.4.bias: Grad Norm = 0.0147
shared_fc_arousal.0.weight: Grad Norm = 0.0837
shared_fc_arousal.0.bias: Grad Norm = 0.0094
shared_fc_arousal.1.weight: Grad Norm = 0.0053
shared_fc_arousal.1.bias: Grad Norm = 0.0043
shared_fc_dominance.0.weight: Grad Norm = 0.2768
shared_fc_dominance.0.bias: Grad Norm = 0.0242
shared_fc_dominance.1.weight: Grad Norm = 0.0158
shared_fc_dominance.1.bias: Grad Norm = 0.0152
output_branch_arousal.0.

Epoch 4/15 [Train]:  80%|███████▉  | 801/1004 [07:18<01:55,  1.76it/s, loss=0.0777, a_loss=0.0733, d_loss=0.0791]

fusion_layer_arousal.0.weight: Grad Norm = 0.0803
fusion_layer_arousal.0.bias: Grad Norm = 0.0192
fusion_layer_arousal.1.weight: Grad Norm = 0.0034
fusion_layer_arousal.1.bias: Grad Norm = 0.0029
fusion_layer_arousal.4.weight: Grad Norm = 0.0976
fusion_layer_arousal.4.bias: Grad Norm = 0.0048
fusion_layer_dominance.0.weight: Grad Norm = 0.1276
fusion_layer_dominance.0.bias: Grad Norm = 0.0323
fusion_layer_dominance.1.weight: Grad Norm = 0.0078
fusion_layer_dominance.1.bias: Grad Norm = 0.0056
fusion_layer_dominance.4.weight: Grad Norm = 0.1747
fusion_layer_dominance.4.bias: Grad Norm = 0.0083
shared_fc_arousal.0.weight: Grad Norm = 0.0812
shared_fc_arousal.0.bias: Grad Norm = 0.0086
shared_fc_arousal.1.weight: Grad Norm = 0.0047
shared_fc_arousal.1.bias: Grad Norm = 0.0040
shared_fc_dominance.0.weight: Grad Norm = 0.1558
shared_fc_dominance.0.bias: Grad Norm = 0.0128
shared_fc_dominance.1.weight: Grad Norm = 0.0092
shared_fc_dominance.1.bias: Grad Norm = 0.0089
output_branch_arousal.0.

Epoch 4/15 [Train]:  90%|████████▉ | 901/1004 [08:13<00:57,  1.80it/s, loss=0.1689, a_loss=0.1159, d_loss=0.1866]

fusion_layer_arousal.0.weight: Grad Norm = 0.0989
fusion_layer_arousal.0.bias: Grad Norm = 0.0224
fusion_layer_arousal.1.weight: Grad Norm = 0.0041
fusion_layer_arousal.1.bias: Grad Norm = 0.0035
fusion_layer_arousal.4.weight: Grad Norm = 0.1220
fusion_layer_arousal.4.bias: Grad Norm = 0.0062
fusion_layer_dominance.0.weight: Grad Norm = 0.1472
fusion_layer_dominance.0.bias: Grad Norm = 0.0283
fusion_layer_dominance.1.weight: Grad Norm = 0.0084
fusion_layer_dominance.1.bias: Grad Norm = 0.0065
fusion_layer_dominance.4.weight: Grad Norm = 0.2306
fusion_layer_dominance.4.bias: Grad Norm = 0.0104
shared_fc_arousal.0.weight: Grad Norm = 0.1002
shared_fc_arousal.0.bias: Grad Norm = 0.0104
shared_fc_arousal.1.weight: Grad Norm = 0.0057
shared_fc_arousal.1.bias: Grad Norm = 0.0050
shared_fc_dominance.0.weight: Grad Norm = 0.1988
shared_fc_dominance.0.bias: Grad Norm = 0.0189
shared_fc_dominance.1.weight: Grad Norm = 0.0117
shared_fc_dominance.1.bias: Grad Norm = 0.0111
output_branch_arousal.0.

Epoch 4/15 [Train]: 100%|█████████▉| 1001/1004 [09:07<00:01,  1.84it/s, loss=0.0990, a_loss=0.0668, d_loss=0.1097]

fusion_layer_arousal.0.weight: Grad Norm = 0.0877
fusion_layer_arousal.0.bias: Grad Norm = 0.0130
fusion_layer_arousal.1.weight: Grad Norm = 0.0037
fusion_layer_arousal.1.bias: Grad Norm = 0.0032
fusion_layer_arousal.4.weight: Grad Norm = 0.0963
fusion_layer_arousal.4.bias: Grad Norm = 0.0057
fusion_layer_dominance.0.weight: Grad Norm = 0.2074
fusion_layer_dominance.0.bias: Grad Norm = 0.0347
fusion_layer_dominance.1.weight: Grad Norm = 0.0103
fusion_layer_dominance.1.bias: Grad Norm = 0.0095
fusion_layer_dominance.4.weight: Grad Norm = 0.2563
fusion_layer_dominance.4.bias: Grad Norm = 0.0152
shared_fc_arousal.0.weight: Grad Norm = 0.0790
shared_fc_arousal.0.bias: Grad Norm = 0.0086
shared_fc_arousal.1.weight: Grad Norm = 0.0051
shared_fc_arousal.1.bias: Grad Norm = 0.0043
shared_fc_dominance.0.weight: Grad Norm = 0.2234
shared_fc_dominance.0.bias: Grad Norm = 0.0236
shared_fc_dominance.1.weight: Grad Norm = 0.0146
shared_fc_dominance.1.bias: Grad Norm = 0.0139
output_branch_arousal.0.

Epoch 4/15 [Train]: 100%|██████████| 1004/1004 [09:09<00:00,  1.83it/s, loss=0.2937, a_loss=0.0904, d_loss=0.3615]
Epoch 4/15 [Val]: 100%|██████████| 126/126 [00:44<00:00,  2.81it/s, loss=0.4390, a_loss=0.3600, d_loss=0.5180]


Epoch 4/15 Results:
  Train Loss: 0.1842 (Arousal: 0.1485, Dominance: 0.1961)
  Val Loss: 0.1729 (Arousal: 0.1413, Dominance: 0.2045)
  Loss Weights - Arousal: 0.2500, Dominance: 0.7500


Epoch 5/15 [Train]:   0%|          | 1/1004 [00:00<08:34,  1.95it/s, loss=0.2605, a_loss=0.0652, d_loss=0.3256]

fusion_layer_arousal.0.weight: Grad Norm = 0.0635
fusion_layer_arousal.0.bias: Grad Norm = 0.0149
fusion_layer_arousal.1.weight: Grad Norm = 0.0026
fusion_layer_arousal.1.bias: Grad Norm = 0.0023
fusion_layer_arousal.4.weight: Grad Norm = 0.0720
fusion_layer_arousal.4.bias: Grad Norm = 0.0041
fusion_layer_dominance.0.weight: Grad Norm = 0.2378
fusion_layer_dominance.0.bias: Grad Norm = 0.0542
fusion_layer_dominance.1.weight: Grad Norm = 0.0131
fusion_layer_dominance.1.bias: Grad Norm = 0.0113
fusion_layer_dominance.4.weight: Grad Norm = 0.2841
fusion_layer_dominance.4.bias: Grad Norm = 0.0152
shared_fc_arousal.0.weight: Grad Norm = 0.0623
shared_fc_arousal.0.bias: Grad Norm = 0.0070
shared_fc_arousal.1.weight: Grad Norm = 0.0035
shared_fc_arousal.1.bias: Grad Norm = 0.0034
shared_fc_dominance.0.weight: Grad Norm = 0.2454
shared_fc_dominance.0.bias: Grad Norm = 0.0264
shared_fc_dominance.1.weight: Grad Norm = 0.0153
shared_fc_dominance.1.bias: Grad Norm = 0.0143
output_branch_arousal.0.

Epoch 5/15 [Train]:  10%|█         | 101/1004 [00:55<08:00,  1.88it/s, loss=0.2570, a_loss=0.2067, d_loss=0.2737]

fusion_layer_arousal.0.weight: Grad Norm = 0.1173
fusion_layer_arousal.0.bias: Grad Norm = 0.0248
fusion_layer_arousal.1.weight: Grad Norm = 0.0053
fusion_layer_arousal.1.bias: Grad Norm = 0.0044
fusion_layer_arousal.4.weight: Grad Norm = 0.1478
fusion_layer_arousal.4.bias: Grad Norm = 0.0081
fusion_layer_dominance.0.weight: Grad Norm = 0.2660
fusion_layer_dominance.0.bias: Grad Norm = 0.0581
fusion_layer_dominance.1.weight: Grad Norm = 0.0152
fusion_layer_dominance.1.bias: Grad Norm = 0.0128
fusion_layer_dominance.4.weight: Grad Norm = 0.3172
fusion_layer_dominance.4.bias: Grad Norm = 0.0193
shared_fc_arousal.0.weight: Grad Norm = 0.1186
shared_fc_arousal.0.bias: Grad Norm = 0.0139
shared_fc_arousal.1.weight: Grad Norm = 0.0066
shared_fc_arousal.1.bias: Grad Norm = 0.0063
shared_fc_dominance.0.weight: Grad Norm = 0.2747
shared_fc_dominance.0.bias: Grad Norm = 0.0284
shared_fc_dominance.1.weight: Grad Norm = 0.0147
shared_fc_dominance.1.bias: Grad Norm = 0.0171
output_branch_arousal.0.

Epoch 5/15 [Train]:  20%|██        | 201/1004 [01:49<08:02,  1.67it/s, loss=0.2247, a_loss=0.3705, d_loss=0.1761]

fusion_layer_arousal.0.weight: Grad Norm = 0.1388
fusion_layer_arousal.0.bias: Grad Norm = 0.0255
fusion_layer_arousal.1.weight: Grad Norm = 0.0057
fusion_layer_arousal.1.bias: Grad Norm = 0.0047
fusion_layer_arousal.4.weight: Grad Norm = 0.1561
fusion_layer_arousal.4.bias: Grad Norm = 0.0078
fusion_layer_dominance.0.weight: Grad Norm = 0.1832
fusion_layer_dominance.0.bias: Grad Norm = 0.0355
fusion_layer_dominance.1.weight: Grad Norm = 0.0090
fusion_layer_dominance.1.bias: Grad Norm = 0.0079
fusion_layer_dominance.4.weight: Grad Norm = 0.2608
fusion_layer_dominance.4.bias: Grad Norm = 0.0124
shared_fc_arousal.0.weight: Grad Norm = 0.1293
shared_fc_arousal.0.bias: Grad Norm = 0.0132
shared_fc_arousal.1.weight: Grad Norm = 0.0067
shared_fc_arousal.1.bias: Grad Norm = 0.0061
shared_fc_dominance.0.weight: Grad Norm = 0.2358
shared_fc_dominance.0.bias: Grad Norm = 0.0213
shared_fc_dominance.1.weight: Grad Norm = 0.0153
shared_fc_dominance.1.bias: Grad Norm = 0.0141
output_branch_arousal.0.

Epoch 5/15 [Train]:  30%|██▉       | 301/1004 [02:44<06:15,  1.87it/s, loss=0.1506, a_loss=0.0398, d_loss=0.1875]

fusion_layer_arousal.0.weight: Grad Norm = 0.0590
fusion_layer_arousal.0.bias: Grad Norm = 0.0108
fusion_layer_arousal.1.weight: Grad Norm = 0.0024
fusion_layer_arousal.1.bias: Grad Norm = 0.0021
fusion_layer_arousal.4.weight: Grad Norm = 0.0644
fusion_layer_arousal.4.bias: Grad Norm = 0.0038
fusion_layer_dominance.0.weight: Grad Norm = 0.2125
fusion_layer_dominance.0.bias: Grad Norm = 0.0386
fusion_layer_dominance.1.weight: Grad Norm = 0.0112
fusion_layer_dominance.1.bias: Grad Norm = 0.0093
fusion_layer_dominance.4.weight: Grad Norm = 0.2598
fusion_layer_dominance.4.bias: Grad Norm = 0.0136
shared_fc_arousal.0.weight: Grad Norm = 0.0512
shared_fc_arousal.0.bias: Grad Norm = 0.0057
shared_fc_arousal.1.weight: Grad Norm = 0.0025
shared_fc_arousal.1.bias: Grad Norm = 0.0027
shared_fc_dominance.0.weight: Grad Norm = 0.2211
shared_fc_dominance.0.bias: Grad Norm = 0.0238
shared_fc_dominance.1.weight: Grad Norm = 0.0126
shared_fc_dominance.1.bias: Grad Norm = 0.0129
output_branch_arousal.0.

Epoch 5/15 [Train]:  40%|███▉      | 401/1004 [03:40<06:24,  1.57it/s, loss=0.3145, a_loss=0.1712, d_loss=0.3623]

fusion_layer_arousal.0.weight: Grad Norm = 0.1350
fusion_layer_arousal.0.bias: Grad Norm = 0.0290
fusion_layer_arousal.1.weight: Grad Norm = 0.0056
fusion_layer_arousal.1.bias: Grad Norm = 0.0049
fusion_layer_arousal.4.weight: Grad Norm = 0.1614
fusion_layer_arousal.4.bias: Grad Norm = 0.0080
fusion_layer_dominance.0.weight: Grad Norm = 0.2401
fusion_layer_dominance.0.bias: Grad Norm = 0.0516
fusion_layer_dominance.1.weight: Grad Norm = 0.0128
fusion_layer_dominance.1.bias: Grad Norm = 0.0110
fusion_layer_dominance.4.weight: Grad Norm = 0.3108
fusion_layer_dominance.4.bias: Grad Norm = 0.0156
shared_fc_arousal.0.weight: Grad Norm = 0.1331
shared_fc_arousal.0.bias: Grad Norm = 0.0146
shared_fc_arousal.1.weight: Grad Norm = 0.0072
shared_fc_arousal.1.bias: Grad Norm = 0.0068
shared_fc_dominance.0.weight: Grad Norm = 0.2814
shared_fc_dominance.0.bias: Grad Norm = 0.0254
shared_fc_dominance.1.weight: Grad Norm = 0.0191
shared_fc_dominance.1.bias: Grad Norm = 0.0177
output_branch_arousal.0.

Epoch 5/15 [Train]:  50%|████▉     | 501/1004 [04:34<05:05,  1.65it/s, loss=0.2067, a_loss=0.1326, d_loss=0.2314]

fusion_layer_arousal.0.weight: Grad Norm = 0.1270
fusion_layer_arousal.0.bias: Grad Norm = 0.0288
fusion_layer_arousal.1.weight: Grad Norm = 0.0053
fusion_layer_arousal.1.bias: Grad Norm = 0.0046
fusion_layer_arousal.4.weight: Grad Norm = 0.1245
fusion_layer_arousal.4.bias: Grad Norm = 0.0079
fusion_layer_dominance.0.weight: Grad Norm = 0.1845
fusion_layer_dominance.0.bias: Grad Norm = 0.0348
fusion_layer_dominance.1.weight: Grad Norm = 0.0104
fusion_layer_dominance.1.bias: Grad Norm = 0.0087
fusion_layer_dominance.4.weight: Grad Norm = 0.2655
fusion_layer_dominance.4.bias: Grad Norm = 0.0140
shared_fc_arousal.0.weight: Grad Norm = 0.1051
shared_fc_arousal.0.bias: Grad Norm = 0.0127
shared_fc_arousal.1.weight: Grad Norm = 0.0067
shared_fc_arousal.1.bias: Grad Norm = 0.0058
shared_fc_dominance.0.weight: Grad Norm = 0.2332
shared_fc_dominance.0.bias: Grad Norm = 0.0226
shared_fc_dominance.1.weight: Grad Norm = 0.0165
shared_fc_dominance.1.bias: Grad Norm = 0.0150
output_branch_arousal.0.

Epoch 5/15 [Train]:  60%|█████▉    | 601/1004 [05:28<04:00,  1.67it/s, loss=0.1360, a_loss=0.0224, d_loss=0.1739]

fusion_layer_arousal.0.weight: Grad Norm = 0.0543
fusion_layer_arousal.0.bias: Grad Norm = 0.0140
fusion_layer_arousal.1.weight: Grad Norm = 0.0024
fusion_layer_arousal.1.bias: Grad Norm = 0.0020
fusion_layer_arousal.4.weight: Grad Norm = 0.0603
fusion_layer_arousal.4.bias: Grad Norm = 0.0036
fusion_layer_dominance.0.weight: Grad Norm = 0.1802
fusion_layer_dominance.0.bias: Grad Norm = 0.0460
fusion_layer_dominance.1.weight: Grad Norm = 0.0100
fusion_layer_dominance.1.bias: Grad Norm = 0.0086
fusion_layer_dominance.4.weight: Grad Norm = 0.2597
fusion_layer_dominance.4.bias: Grad Norm = 0.0151
shared_fc_arousal.0.weight: Grad Norm = 0.0512
shared_fc_arousal.0.bias: Grad Norm = 0.0057
shared_fc_arousal.1.weight: Grad Norm = 0.0030
shared_fc_arousal.1.bias: Grad Norm = 0.0027
shared_fc_dominance.0.weight: Grad Norm = 0.2375
shared_fc_dominance.0.bias: Grad Norm = 0.0236
shared_fc_dominance.1.weight: Grad Norm = 0.0151
shared_fc_dominance.1.bias: Grad Norm = 0.0161
output_branch_arousal.0.

Epoch 5/15 [Train]:  70%|██████▉   | 701/1004 [06:24<02:49,  1.79it/s, loss=0.1601, a_loss=0.0512, d_loss=0.1965]

fusion_layer_arousal.0.weight: Grad Norm = 0.0603
fusion_layer_arousal.0.bias: Grad Norm = 0.0135
fusion_layer_arousal.1.weight: Grad Norm = 0.0023
fusion_layer_arousal.1.bias: Grad Norm = 0.0022
fusion_layer_arousal.4.weight: Grad Norm = 0.0758
fusion_layer_arousal.4.bias: Grad Norm = 0.0046
fusion_layer_dominance.0.weight: Grad Norm = 0.1876
fusion_layer_dominance.0.bias: Grad Norm = 0.0388
fusion_layer_dominance.1.weight: Grad Norm = 0.0101
fusion_layer_dominance.1.bias: Grad Norm = 0.0082
fusion_layer_dominance.4.weight: Grad Norm = 0.2576
fusion_layer_dominance.4.bias: Grad Norm = 0.0132
shared_fc_arousal.0.weight: Grad Norm = 0.0610
shared_fc_arousal.0.bias: Grad Norm = 0.0073
shared_fc_arousal.1.weight: Grad Norm = 0.0040
shared_fc_arousal.1.bias: Grad Norm = 0.0034
shared_fc_dominance.0.weight: Grad Norm = 0.2231
shared_fc_dominance.0.bias: Grad Norm = 0.0207
shared_fc_dominance.1.weight: Grad Norm = 0.0136
shared_fc_dominance.1.bias: Grad Norm = 0.0139
output_branch_arousal.0.

Epoch 5/15 [Train]:  80%|███████▉  | 801/1004 [07:19<01:51,  1.82it/s, loss=0.1294, a_loss=0.1366, d_loss=0.1270]

fusion_layer_arousal.0.weight: Grad Norm = 0.1160
fusion_layer_arousal.0.bias: Grad Norm = 0.0246
fusion_layer_arousal.1.weight: Grad Norm = 0.0048
fusion_layer_arousal.1.bias: Grad Norm = 0.0044
fusion_layer_arousal.4.weight: Grad Norm = 0.1187
fusion_layer_arousal.4.bias: Grad Norm = 0.0065
fusion_layer_dominance.0.weight: Grad Norm = 0.1452
fusion_layer_dominance.0.bias: Grad Norm = 0.0317
fusion_layer_dominance.1.weight: Grad Norm = 0.0082
fusion_layer_dominance.1.bias: Grad Norm = 0.0069
fusion_layer_dominance.4.weight: Grad Norm = 0.1966
fusion_layer_dominance.4.bias: Grad Norm = 0.0099
shared_fc_arousal.0.weight: Grad Norm = 0.1028
shared_fc_arousal.0.bias: Grad Norm = 0.0122
shared_fc_arousal.1.weight: Grad Norm = 0.0071
shared_fc_arousal.1.bias: Grad Norm = 0.0056
shared_fc_dominance.0.weight: Grad Norm = 0.1688
shared_fc_dominance.0.bias: Grad Norm = 0.0155
shared_fc_dominance.1.weight: Grad Norm = 0.0107
shared_fc_dominance.1.bias: Grad Norm = 0.0103
output_branch_arousal.0.

Epoch 5/15 [Train]:  90%|████████▉ | 901/1004 [08:13<00:52,  1.95it/s, loss=0.1433, a_loss=0.0525, d_loss=0.1736]

fusion_layer_arousal.0.weight: Grad Norm = 0.0514
fusion_layer_arousal.0.bias: Grad Norm = 0.0112
fusion_layer_arousal.1.weight: Grad Norm = 0.0023
fusion_layer_arousal.1.bias: Grad Norm = 0.0019
fusion_layer_arousal.4.weight: Grad Norm = 0.0592
fusion_layer_arousal.4.bias: Grad Norm = 0.0032
fusion_layer_dominance.0.weight: Grad Norm = 0.1414
fusion_layer_dominance.0.bias: Grad Norm = 0.0331
fusion_layer_dominance.1.weight: Grad Norm = 0.0092
fusion_layer_dominance.1.bias: Grad Norm = 0.0064
fusion_layer_dominance.4.weight: Grad Norm = 0.1917
fusion_layer_dominance.4.bias: Grad Norm = 0.0074
shared_fc_arousal.0.weight: Grad Norm = 0.0508
shared_fc_arousal.0.bias: Grad Norm = 0.0054
shared_fc_arousal.1.weight: Grad Norm = 0.0026
shared_fc_arousal.1.bias: Grad Norm = 0.0027
shared_fc_dominance.0.weight: Grad Norm = 0.1677
shared_fc_dominance.0.bias: Grad Norm = 0.0135
shared_fc_dominance.1.weight: Grad Norm = 0.0104
shared_fc_dominance.1.bias: Grad Norm = 0.0098
output_branch_arousal.0.

Epoch 5/15 [Train]: 100%|█████████▉| 1001/1004 [09:08<00:01,  1.94it/s, loss=0.1598, a_loss=0.0852, d_loss=0.1846]

fusion_layer_arousal.0.weight: Grad Norm = 0.0794
fusion_layer_arousal.0.bias: Grad Norm = 0.0180
fusion_layer_arousal.1.weight: Grad Norm = 0.0030
fusion_layer_arousal.1.bias: Grad Norm = 0.0029
fusion_layer_arousal.4.weight: Grad Norm = 0.0870
fusion_layer_arousal.4.bias: Grad Norm = 0.0050
fusion_layer_dominance.0.weight: Grad Norm = 0.2141
fusion_layer_dominance.0.bias: Grad Norm = 0.0466
fusion_layer_dominance.1.weight: Grad Norm = 0.0121
fusion_layer_dominance.1.bias: Grad Norm = 0.0099
fusion_layer_dominance.4.weight: Grad Norm = 0.2505
fusion_layer_dominance.4.bias: Grad Norm = 0.0126
shared_fc_arousal.0.weight: Grad Norm = 0.0694
shared_fc_arousal.0.bias: Grad Norm = 0.0073
shared_fc_arousal.1.weight: Grad Norm = 0.0046
shared_fc_arousal.1.bias: Grad Norm = 0.0037
shared_fc_dominance.0.weight: Grad Norm = 0.2143
shared_fc_dominance.0.bias: Grad Norm = 0.0192
shared_fc_dominance.1.weight: Grad Norm = 0.0123
shared_fc_dominance.1.bias: Grad Norm = 0.0133
output_branch_arousal.0.

Epoch 5/15 [Train]: 100%|██████████| 1004/1004 [09:09<00:00,  1.83it/s, loss=0.2476, a_loss=0.1780, d_loss=0.2708]
Epoch 5/15 [Val]: 100%|██████████| 126/126 [00:44<00:00,  2.81it/s, loss=0.2817, a_loss=0.2688, d_loss=0.2946]


Epoch 5/15 Results:
  Train Loss: 0.1815 (Arousal: 0.1436, Dominance: 0.1941)
  Val Loss: 0.1504 (Arousal: 0.1204, Dominance: 0.1804)
  Loss Weights - Arousal: 0.2500, Dominance: 0.7500
  Saved best model (val_loss: 0.1504)


Epoch 6/15 [Train]:   0%|          | 1/1004 [00:00<10:10,  1.64it/s, loss=0.1683, a_loss=0.2026, d_loss=0.1569]

fusion_layer_arousal.0.weight: Grad Norm = 0.1055
fusion_layer_arousal.0.bias: Grad Norm = 0.0197
fusion_layer_arousal.1.weight: Grad Norm = 0.0045
fusion_layer_arousal.1.bias: Grad Norm = 0.0037
fusion_layer_arousal.4.weight: Grad Norm = 0.1248
fusion_layer_arousal.4.bias: Grad Norm = 0.0069
fusion_layer_dominance.0.weight: Grad Norm = 0.1510
fusion_layer_dominance.0.bias: Grad Norm = 0.0310
fusion_layer_dominance.1.weight: Grad Norm = 0.0086
fusion_layer_dominance.1.bias: Grad Norm = 0.0065
fusion_layer_dominance.4.weight: Grad Norm = 0.1941
fusion_layer_dominance.4.bias: Grad Norm = 0.0092
shared_fc_arousal.0.weight: Grad Norm = 0.1030
shared_fc_arousal.0.bias: Grad Norm = 0.0112
shared_fc_arousal.1.weight: Grad Norm = 0.0056
shared_fc_arousal.1.bias: Grad Norm = 0.0052
shared_fc_dominance.0.weight: Grad Norm = 0.1820
shared_fc_dominance.0.bias: Grad Norm = 0.0166
shared_fc_dominance.1.weight: Grad Norm = 0.0122
shared_fc_dominance.1.bias: Grad Norm = 0.0114
output_branch_arousal.0.

Epoch 6/15 [Train]:  10%|█         | 101/1004 [00:54<08:42,  1.73it/s, loss=0.0962, a_loss=0.0797, d_loss=0.1016]

fusion_layer_arousal.0.weight: Grad Norm = 0.0974
fusion_layer_arousal.0.bias: Grad Norm = 0.0208
fusion_layer_arousal.1.weight: Grad Norm = 0.0036
fusion_layer_arousal.1.bias: Grad Norm = 0.0034
fusion_layer_arousal.4.weight: Grad Norm = 0.1021
fusion_layer_arousal.4.bias: Grad Norm = 0.0058
fusion_layer_dominance.0.weight: Grad Norm = 0.1184
fusion_layer_dominance.0.bias: Grad Norm = 0.0236
fusion_layer_dominance.1.weight: Grad Norm = 0.0071
fusion_layer_dominance.1.bias: Grad Norm = 0.0053
fusion_layer_dominance.4.weight: Grad Norm = 0.1680
fusion_layer_dominance.4.bias: Grad Norm = 0.0074
shared_fc_arousal.0.weight: Grad Norm = 0.0786
shared_fc_arousal.0.bias: Grad Norm = 0.0093
shared_fc_arousal.1.weight: Grad Norm = 0.0040
shared_fc_arousal.1.bias: Grad Norm = 0.0042
shared_fc_dominance.0.weight: Grad Norm = 0.1415
shared_fc_dominance.0.bias: Grad Norm = 0.0128
shared_fc_dominance.1.weight: Grad Norm = 0.0101
shared_fc_dominance.1.bias: Grad Norm = 0.0087
output_branch_arousal.0.

Epoch 6/15 [Train]:  20%|██        | 201/1004 [01:49<07:59,  1.68it/s, loss=0.1311, a_loss=0.2150, d_loss=0.1031]

fusion_layer_arousal.0.weight: Grad Norm = 0.0988
fusion_layer_arousal.0.bias: Grad Norm = 0.0167
fusion_layer_arousal.1.weight: Grad Norm = 0.0042
fusion_layer_arousal.1.bias: Grad Norm = 0.0035
fusion_layer_arousal.4.weight: Grad Norm = 0.0985
fusion_layer_arousal.4.bias: Grad Norm = 0.0053
fusion_layer_dominance.0.weight: Grad Norm = 0.1194
fusion_layer_dominance.0.bias: Grad Norm = 0.0247
fusion_layer_dominance.1.weight: Grad Norm = 0.0055
fusion_layer_dominance.1.bias: Grad Norm = 0.0052
fusion_layer_dominance.4.weight: Grad Norm = 0.1405
fusion_layer_dominance.4.bias: Grad Norm = 0.0066
shared_fc_arousal.0.weight: Grad Norm = 0.0884
shared_fc_arousal.0.bias: Grad Norm = 0.0101
shared_fc_arousal.1.weight: Grad Norm = 0.0056
shared_fc_arousal.1.bias: Grad Norm = 0.0047
shared_fc_dominance.0.weight: Grad Norm = 0.1179
shared_fc_dominance.0.bias: Grad Norm = 0.0110
shared_fc_dominance.1.weight: Grad Norm = 0.0074
shared_fc_dominance.1.bias: Grad Norm = 0.0071
output_branch_arousal.0.

Epoch 6/15 [Train]:  30%|██▉       | 301/1004 [02:45<07:15,  1.61it/s, loss=0.0955, a_loss=0.1013, d_loss=0.0936]

fusion_layer_arousal.0.weight: Grad Norm = 0.0913
fusion_layer_arousal.0.bias: Grad Norm = 0.0252
fusion_layer_arousal.1.weight: Grad Norm = 0.0042
fusion_layer_arousal.1.bias: Grad Norm = 0.0032
fusion_layer_arousal.4.weight: Grad Norm = 0.1064
fusion_layer_arousal.4.bias: Grad Norm = 0.0047
fusion_layer_dominance.0.weight: Grad Norm = 0.1616
fusion_layer_dominance.0.bias: Grad Norm = 0.0378
fusion_layer_dominance.1.weight: Grad Norm = 0.0088
fusion_layer_dominance.1.bias: Grad Norm = 0.0079
fusion_layer_dominance.4.weight: Grad Norm = 0.1774
fusion_layer_dominance.4.bias: Grad Norm = 0.0126
shared_fc_arousal.0.weight: Grad Norm = 0.0857
shared_fc_arousal.0.bias: Grad Norm = 0.0087
shared_fc_arousal.1.weight: Grad Norm = 0.0045
shared_fc_arousal.1.bias: Grad Norm = 0.0045
shared_fc_dominance.0.weight: Grad Norm = 0.1498
shared_fc_dominance.0.bias: Grad Norm = 0.0177
shared_fc_dominance.1.weight: Grad Norm = 0.0101
shared_fc_dominance.1.bias: Grad Norm = 0.0103
output_branch_arousal.0.

Epoch 6/15 [Train]:  40%|███▉      | 401/1004 [03:39<05:29,  1.83it/s, loss=0.1037, a_loss=0.0833, d_loss=0.1105]

fusion_layer_arousal.0.weight: Grad Norm = 0.0758
fusion_layer_arousal.0.bias: Grad Norm = 0.0153
fusion_layer_arousal.1.weight: Grad Norm = 0.0029
fusion_layer_arousal.1.bias: Grad Norm = 0.0028
fusion_layer_arousal.4.weight: Grad Norm = 0.0942
fusion_layer_arousal.4.bias: Grad Norm = 0.0054
fusion_layer_dominance.0.weight: Grad Norm = 0.1520
fusion_layer_dominance.0.bias: Grad Norm = 0.0288
fusion_layer_dominance.1.weight: Grad Norm = 0.0091
fusion_layer_dominance.1.bias: Grad Norm = 0.0072
fusion_layer_dominance.4.weight: Grad Norm = 0.1919
fusion_layer_dominance.4.bias: Grad Norm = 0.0116
shared_fc_arousal.0.weight: Grad Norm = 0.0779
shared_fc_arousal.0.bias: Grad Norm = 0.0081
shared_fc_arousal.1.weight: Grad Norm = 0.0045
shared_fc_arousal.1.bias: Grad Norm = 0.0042
shared_fc_dominance.0.weight: Grad Norm = 0.1659
shared_fc_dominance.0.bias: Grad Norm = 0.0174
shared_fc_dominance.1.weight: Grad Norm = 0.0113
shared_fc_dominance.1.bias: Grad Norm = 0.0112
output_branch_arousal.0.

Epoch 6/15 [Train]:  50%|████▉     | 501/1004 [04:34<05:05,  1.65it/s, loss=0.2223, a_loss=0.1593, d_loss=0.2433]

fusion_layer_arousal.0.weight: Grad Norm = 0.1086
fusion_layer_arousal.0.bias: Grad Norm = 0.0286
fusion_layer_arousal.1.weight: Grad Norm = 0.0043
fusion_layer_arousal.1.bias: Grad Norm = 0.0040
fusion_layer_arousal.4.weight: Grad Norm = 0.1207
fusion_layer_arousal.4.bias: Grad Norm = 0.0066
fusion_layer_dominance.0.weight: Grad Norm = 0.2389
fusion_layer_dominance.0.bias: Grad Norm = 0.0490
fusion_layer_dominance.1.weight: Grad Norm = 0.0133
fusion_layer_dominance.1.bias: Grad Norm = 0.0109
fusion_layer_dominance.4.weight: Grad Norm = 0.2840
fusion_layer_dominance.4.bias: Grad Norm = 0.0143
shared_fc_arousal.0.weight: Grad Norm = 0.0943
shared_fc_arousal.0.bias: Grad Norm = 0.0112
shared_fc_arousal.1.weight: Grad Norm = 0.0054
shared_fc_arousal.1.bias: Grad Norm = 0.0052
shared_fc_dominance.0.weight: Grad Norm = 0.2266
shared_fc_dominance.0.bias: Grad Norm = 0.0218
shared_fc_dominance.1.weight: Grad Norm = 0.0139
shared_fc_dominance.1.bias: Grad Norm = 0.0136
output_branch_arousal.0.

Epoch 6/15 [Train]:  60%|█████▉    | 601/1004 [05:30<03:43,  1.80it/s, loss=0.2118, a_loss=0.3446, d_loss=0.1675]

fusion_layer_arousal.0.weight: Grad Norm = 0.1465
fusion_layer_arousal.0.bias: Grad Norm = 0.0341
fusion_layer_arousal.1.weight: Grad Norm = 0.0060
fusion_layer_arousal.1.bias: Grad Norm = 0.0054
fusion_layer_arousal.4.weight: Grad Norm = 0.1550
fusion_layer_arousal.4.bias: Grad Norm = 0.0079
fusion_layer_dominance.0.weight: Grad Norm = 0.1261
fusion_layer_dominance.0.bias: Grad Norm = 0.0282
fusion_layer_dominance.1.weight: Grad Norm = 0.0067
fusion_layer_dominance.1.bias: Grad Norm = 0.0060
fusion_layer_dominance.4.weight: Grad Norm = 0.1599
fusion_layer_dominance.4.bias: Grad Norm = 0.0090
shared_fc_arousal.0.weight: Grad Norm = 0.1254
shared_fc_arousal.0.bias: Grad Norm = 0.0136
shared_fc_arousal.1.weight: Grad Norm = 0.0079
shared_fc_arousal.1.bias: Grad Norm = 0.0064
shared_fc_dominance.0.weight: Grad Norm = 0.1403
shared_fc_dominance.0.bias: Grad Norm = 0.0130
shared_fc_dominance.1.weight: Grad Norm = 0.0080
shared_fc_dominance.1.bias: Grad Norm = 0.0096
output_branch_arousal.0.

Epoch 6/15 [Train]:  70%|██████▉   | 701/1004 [06:25<02:43,  1.85it/s, loss=0.3446, a_loss=0.1435, d_loss=0.4116]

fusion_layer_arousal.0.weight: Grad Norm = 0.0790
fusion_layer_arousal.0.bias: Grad Norm = 0.0179
fusion_layer_arousal.1.weight: Grad Norm = 0.0033
fusion_layer_arousal.1.bias: Grad Norm = 0.0028
fusion_layer_arousal.4.weight: Grad Norm = 0.0966
fusion_layer_arousal.4.bias: Grad Norm = 0.0048
fusion_layer_dominance.0.weight: Grad Norm = 0.1208
fusion_layer_dominance.0.bias: Grad Norm = 0.0301
fusion_layer_dominance.1.weight: Grad Norm = 0.0072
fusion_layer_dominance.1.bias: Grad Norm = 0.0058
fusion_layer_dominance.4.weight: Grad Norm = 0.1791
fusion_layer_dominance.4.bias: Grad Norm = 0.0086
shared_fc_arousal.0.weight: Grad Norm = 0.0844
shared_fc_arousal.0.bias: Grad Norm = 0.0088
shared_fc_arousal.1.weight: Grad Norm = 0.0052
shared_fc_arousal.1.bias: Grad Norm = 0.0044
shared_fc_dominance.0.weight: Grad Norm = 0.1730
shared_fc_dominance.0.bias: Grad Norm = 0.0151
shared_fc_dominance.1.weight: Grad Norm = 0.0129
shared_fc_dominance.1.bias: Grad Norm = 0.0111
output_branch_arousal.0.

Epoch 6/15 [Train]:  80%|███████▉  | 801/1004 [07:19<01:42,  1.98it/s, loss=0.2434, a_loss=0.2012, d_loss=0.2574]

fusion_layer_arousal.0.weight: Grad Norm = 0.0897
fusion_layer_arousal.0.bias: Grad Norm = 0.0190
fusion_layer_arousal.1.weight: Grad Norm = 0.0039
fusion_layer_arousal.1.bias: Grad Norm = 0.0033
fusion_layer_arousal.4.weight: Grad Norm = 0.1028
fusion_layer_arousal.4.bias: Grad Norm = 0.0057
fusion_layer_dominance.0.weight: Grad Norm = 0.1892
fusion_layer_dominance.0.bias: Grad Norm = 0.0440
fusion_layer_dominance.1.weight: Grad Norm = 0.0105
fusion_layer_dominance.1.bias: Grad Norm = 0.0092
fusion_layer_dominance.4.weight: Grad Norm = 0.2348
fusion_layer_dominance.4.bias: Grad Norm = 0.0119
shared_fc_arousal.0.weight: Grad Norm = 0.0884
shared_fc_arousal.0.bias: Grad Norm = 0.0097
shared_fc_arousal.1.weight: Grad Norm = 0.0059
shared_fc_arousal.1.bias: Grad Norm = 0.0049
shared_fc_dominance.0.weight: Grad Norm = 0.2048
shared_fc_dominance.0.bias: Grad Norm = 0.0187
shared_fc_dominance.1.weight: Grad Norm = 0.0146
shared_fc_dominance.1.bias: Grad Norm = 0.0127
output_branch_arousal.0.

Epoch 6/15 [Train]:  90%|████████▉ | 901/1004 [08:14<00:53,  1.93it/s, loss=0.2546, a_loss=0.1061, d_loss=0.3041]

fusion_layer_arousal.0.weight: Grad Norm = 0.0823
fusion_layer_arousal.0.bias: Grad Norm = 0.0183
fusion_layer_arousal.1.weight: Grad Norm = 0.0035
fusion_layer_arousal.1.bias: Grad Norm = 0.0030
fusion_layer_arousal.4.weight: Grad Norm = 0.0956
fusion_layer_arousal.4.bias: Grad Norm = 0.0049
fusion_layer_dominance.0.weight: Grad Norm = 0.2100
fusion_layer_dominance.0.bias: Grad Norm = 0.0466
fusion_layer_dominance.1.weight: Grad Norm = 0.0110
fusion_layer_dominance.1.bias: Grad Norm = 0.0096
fusion_layer_dominance.4.weight: Grad Norm = 0.2607
fusion_layer_dominance.4.bias: Grad Norm = 0.0122
shared_fc_arousal.0.weight: Grad Norm = 0.0794
shared_fc_arousal.0.bias: Grad Norm = 0.0085
shared_fc_arousal.1.weight: Grad Norm = 0.0045
shared_fc_arousal.1.bias: Grad Norm = 0.0043
shared_fc_dominance.0.weight: Grad Norm = 0.2317
shared_fc_dominance.0.bias: Grad Norm = 0.0207
shared_fc_dominance.1.weight: Grad Norm = 0.0153
shared_fc_dominance.1.bias: Grad Norm = 0.0142
output_branch_arousal.0.

Epoch 6/15 [Train]: 100%|█████████▉| 1001/1004 [09:08<00:01,  2.01it/s, loss=0.0543, a_loss=0.1471, d_loss=0.0234]

fusion_layer_arousal.0.weight: Grad Norm = 0.0956
fusion_layer_arousal.0.bias: Grad Norm = 0.0213
fusion_layer_arousal.1.weight: Grad Norm = 0.0034
fusion_layer_arousal.1.bias: Grad Norm = 0.0034
fusion_layer_arousal.4.weight: Grad Norm = 0.1130
fusion_layer_arousal.4.bias: Grad Norm = 0.0067
fusion_layer_dominance.0.weight: Grad Norm = 0.0719
fusion_layer_dominance.0.bias: Grad Norm = 0.0177
fusion_layer_dominance.1.weight: Grad Norm = 0.0040
fusion_layer_dominance.1.bias: Grad Norm = 0.0035
fusion_layer_dominance.4.weight: Grad Norm = 0.0885
fusion_layer_dominance.4.bias: Grad Norm = 0.0042
shared_fc_arousal.0.weight: Grad Norm = 0.0891
shared_fc_arousal.0.bias: Grad Norm = 0.0099
shared_fc_arousal.1.weight: Grad Norm = 0.0047
shared_fc_arousal.1.bias: Grad Norm = 0.0049
shared_fc_dominance.0.weight: Grad Norm = 0.0783
shared_fc_dominance.0.bias: Grad Norm = 0.0066
shared_fc_dominance.1.weight: Grad Norm = 0.0054
shared_fc_dominance.1.bias: Grad Norm = 0.0047
output_branch_arousal.0.

Epoch 6/15 [Train]: 100%|██████████| 1004/1004 [09:10<00:00,  1.82it/s, loss=0.1644, a_loss=0.1186, d_loss=0.1797]
Epoch 6/15 [Val]: 100%|██████████| 126/126 [00:44<00:00,  2.80it/s, loss=0.3242, a_loss=0.2792, d_loss=0.3692]


Epoch 6/15 Results:
  Train Loss: 0.1701 (Arousal: 0.1343, Dominance: 0.1821)
  Val Loss: 0.1501 (Arousal: 0.1198, Dominance: 0.1804)
  Loss Weights - Arousal: 0.2500, Dominance: 0.7500
  Saved best model (val_loss: 0.1501)


Epoch 7/15 [Train]:   0%|          | 1/1004 [00:00<09:45,  1.71it/s, loss=0.0776, a_loss=0.0823, d_loss=0.0761]

fusion_layer_arousal.0.weight: Grad Norm = 0.0849
fusion_layer_arousal.0.bias: Grad Norm = 0.0168
fusion_layer_arousal.1.weight: Grad Norm = 0.0032
fusion_layer_arousal.1.bias: Grad Norm = 0.0031
fusion_layer_arousal.4.weight: Grad Norm = 0.0971
fusion_layer_arousal.4.bias: Grad Norm = 0.0058
fusion_layer_dominance.0.weight: Grad Norm = 0.0910
fusion_layer_dominance.0.bias: Grad Norm = 0.0187
fusion_layer_dominance.1.weight: Grad Norm = 0.0045
fusion_layer_dominance.1.bias: Grad Norm = 0.0042
fusion_layer_dominance.4.weight: Grad Norm = 0.1319
fusion_layer_dominance.4.bias: Grad Norm = 0.0069
shared_fc_arousal.0.weight: Grad Norm = 0.0819
shared_fc_arousal.0.bias: Grad Norm = 0.0093
shared_fc_arousal.1.weight: Grad Norm = 0.0054
shared_fc_arousal.1.bias: Grad Norm = 0.0048
shared_fc_dominance.0.weight: Grad Norm = 0.1199
shared_fc_dominance.0.bias: Grad Norm = 0.0112
shared_fc_dominance.1.weight: Grad Norm = 0.0087
shared_fc_dominance.1.bias: Grad Norm = 0.0077
output_branch_arousal.0.

Epoch 7/15 [Train]:  10%|█         | 101/1004 [00:55<08:39,  1.74it/s, loss=0.1010, a_loss=0.1491, d_loss=0.0850]

fusion_layer_arousal.0.weight: Grad Norm = 0.1145
fusion_layer_arousal.0.bias: Grad Norm = 0.0311
fusion_layer_arousal.1.weight: Grad Norm = 0.0049
fusion_layer_arousal.1.bias: Grad Norm = 0.0044
fusion_layer_arousal.4.weight: Grad Norm = 0.1313
fusion_layer_arousal.4.bias: Grad Norm = 0.0078
fusion_layer_dominance.0.weight: Grad Norm = 0.1089
fusion_layer_dominance.0.bias: Grad Norm = 0.0223
fusion_layer_dominance.1.weight: Grad Norm = 0.0052
fusion_layer_dominance.1.bias: Grad Norm = 0.0047
fusion_layer_dominance.4.weight: Grad Norm = 0.1384
fusion_layer_dominance.4.bias: Grad Norm = 0.0068
shared_fc_arousal.0.weight: Grad Norm = 0.1036
shared_fc_arousal.0.bias: Grad Norm = 0.0123
shared_fc_arousal.1.weight: Grad Norm = 0.0067
shared_fc_arousal.1.bias: Grad Norm = 0.0059
shared_fc_dominance.0.weight: Grad Norm = 0.1196
shared_fc_dominance.0.bias: Grad Norm = 0.0114
shared_fc_dominance.1.weight: Grad Norm = 0.0081
shared_fc_dominance.1.bias: Grad Norm = 0.0077
output_branch_arousal.0.

Epoch 7/15 [Train]:  20%|██        | 201/1004 [01:50<07:14,  1.85it/s, loss=0.1679, a_loss=0.1274, d_loss=0.1814]

fusion_layer_arousal.0.weight: Grad Norm = 0.1026
fusion_layer_arousal.0.bias: Grad Norm = 0.0198
fusion_layer_arousal.1.weight: Grad Norm = 0.0037
fusion_layer_arousal.1.bias: Grad Norm = 0.0035
fusion_layer_arousal.4.weight: Grad Norm = 0.1051
fusion_layer_arousal.4.bias: Grad Norm = 0.0052
fusion_layer_dominance.0.weight: Grad Norm = 0.2145
fusion_layer_dominance.0.bias: Grad Norm = 0.0561
fusion_layer_dominance.1.weight: Grad Norm = 0.0114
fusion_layer_dominance.1.bias: Grad Norm = 0.0098
fusion_layer_dominance.4.weight: Grad Norm = 0.2664
fusion_layer_dominance.4.bias: Grad Norm = 0.0135
shared_fc_arousal.0.weight: Grad Norm = 0.0828
shared_fc_arousal.0.bias: Grad Norm = 0.0091
shared_fc_arousal.1.weight: Grad Norm = 0.0044
shared_fc_arousal.1.bias: Grad Norm = 0.0042
shared_fc_dominance.0.weight: Grad Norm = 0.2289
shared_fc_dominance.0.bias: Grad Norm = 0.0218
shared_fc_dominance.1.weight: Grad Norm = 0.0150
shared_fc_dominance.1.bias: Grad Norm = 0.0150
output_branch_arousal.0.

Epoch 7/15 [Train]:  30%|██▉       | 301/1004 [02:44<05:45,  2.04it/s, loss=0.0825, a_loss=0.0809, d_loss=0.0830]

fusion_layer_arousal.0.weight: Grad Norm = 0.0557
fusion_layer_arousal.0.bias: Grad Norm = 0.0135
fusion_layer_arousal.1.weight: Grad Norm = 0.0023
fusion_layer_arousal.1.bias: Grad Norm = 0.0021
fusion_layer_arousal.4.weight: Grad Norm = 0.0625
fusion_layer_arousal.4.bias: Grad Norm = 0.0035
fusion_layer_dominance.0.weight: Grad Norm = 0.1557
fusion_layer_dominance.0.bias: Grad Norm = 0.0391
fusion_layer_dominance.1.weight: Grad Norm = 0.0089
fusion_layer_dominance.1.bias: Grad Norm = 0.0077
fusion_layer_dominance.4.weight: Grad Norm = 0.1775
fusion_layer_dominance.4.bias: Grad Norm = 0.0111
shared_fc_arousal.0.weight: Grad Norm = 0.0495
shared_fc_arousal.0.bias: Grad Norm = 0.0056
shared_fc_arousal.1.weight: Grad Norm = 0.0030
shared_fc_arousal.1.bias: Grad Norm = 0.0027
shared_fc_dominance.0.weight: Grad Norm = 0.1596
shared_fc_dominance.0.bias: Grad Norm = 0.0156
shared_fc_dominance.1.weight: Grad Norm = 0.0106
shared_fc_dominance.1.bias: Grad Norm = 0.0113
output_branch_arousal.0.

Epoch 7/15 [Train]:  40%|███▉      | 401/1004 [03:40<05:31,  1.82it/s, loss=0.1311, a_loss=0.0801, d_loss=0.1481]

fusion_layer_arousal.0.weight: Grad Norm = 0.0835
fusion_layer_arousal.0.bias: Grad Norm = 0.0170
fusion_layer_arousal.1.weight: Grad Norm = 0.0035
fusion_layer_arousal.1.bias: Grad Norm = 0.0029
fusion_layer_arousal.4.weight: Grad Norm = 0.0920
fusion_layer_arousal.4.bias: Grad Norm = 0.0048
fusion_layer_dominance.0.weight: Grad Norm = 0.2498
fusion_layer_dominance.0.bias: Grad Norm = 0.0735
fusion_layer_dominance.1.weight: Grad Norm = 0.0142
fusion_layer_dominance.1.bias: Grad Norm = 0.0126
fusion_layer_dominance.4.weight: Grad Norm = 0.2971
fusion_layer_dominance.4.bias: Grad Norm = 0.0176
shared_fc_arousal.0.weight: Grad Norm = 0.0743
shared_fc_arousal.0.bias: Grad Norm = 0.0081
shared_fc_arousal.1.weight: Grad Norm = 0.0038
shared_fc_arousal.1.bias: Grad Norm = 0.0038
shared_fc_dominance.0.weight: Grad Norm = 0.2574
shared_fc_dominance.0.bias: Grad Norm = 0.0266
shared_fc_dominance.1.weight: Grad Norm = 0.0162
shared_fc_dominance.1.bias: Grad Norm = 0.0173
output_branch_arousal.0.

Epoch 7/15 [Train]:  50%|████▉     | 501/1004 [04:36<04:28,  1.87it/s, loss=0.1907, a_loss=0.0844, d_loss=0.2261]

fusion_layer_arousal.0.weight: Grad Norm = 0.0712
fusion_layer_arousal.0.bias: Grad Norm = 0.0168
fusion_layer_arousal.1.weight: Grad Norm = 0.0030
fusion_layer_arousal.1.bias: Grad Norm = 0.0026
fusion_layer_arousal.4.weight: Grad Norm = 0.0813
fusion_layer_arousal.4.bias: Grad Norm = 0.0045
fusion_layer_dominance.0.weight: Grad Norm = 0.2343
fusion_layer_dominance.0.bias: Grad Norm = 0.0584
fusion_layer_dominance.1.weight: Grad Norm = 0.0119
fusion_layer_dominance.1.bias: Grad Norm = 0.0107
fusion_layer_dominance.4.weight: Grad Norm = 0.2929
fusion_layer_dominance.4.bias: Grad Norm = 0.0159
shared_fc_arousal.0.weight: Grad Norm = 0.0687
shared_fc_arousal.0.bias: Grad Norm = 0.0079
shared_fc_arousal.1.weight: Grad Norm = 0.0034
shared_fc_arousal.1.bias: Grad Norm = 0.0037
shared_fc_dominance.0.weight: Grad Norm = 0.2566
shared_fc_dominance.0.bias: Grad Norm = 0.0253
shared_fc_dominance.1.weight: Grad Norm = 0.0166
shared_fc_dominance.1.bias: Grad Norm = 0.0161
output_branch_arousal.0.

Epoch 7/15 [Train]:  60%|█████▉    | 601/1004 [05:31<03:34,  1.87it/s, loss=0.2368, a_loss=0.0944, d_loss=0.2843]

fusion_layer_arousal.0.weight: Grad Norm = 0.0759
fusion_layer_arousal.0.bias: Grad Norm = 0.0200
fusion_layer_arousal.1.weight: Grad Norm = 0.0028
fusion_layer_arousal.1.bias: Grad Norm = 0.0029
fusion_layer_arousal.4.weight: Grad Norm = 0.0846
fusion_layer_arousal.4.bias: Grad Norm = 0.0048
fusion_layer_dominance.0.weight: Grad Norm = 0.2177
fusion_layer_dominance.0.bias: Grad Norm = 0.0494
fusion_layer_dominance.1.weight: Grad Norm = 0.0109
fusion_layer_dominance.1.bias: Grad Norm = 0.0099
fusion_layer_dominance.4.weight: Grad Norm = 0.2839
fusion_layer_dominance.4.bias: Grad Norm = 0.0158
shared_fc_arousal.0.weight: Grad Norm = 0.0687
shared_fc_arousal.0.bias: Grad Norm = 0.0077
shared_fc_arousal.1.weight: Grad Norm = 0.0043
shared_fc_arousal.1.bias: Grad Norm = 0.0040
shared_fc_dominance.0.weight: Grad Norm = 0.2549
shared_fc_dominance.0.bias: Grad Norm = 0.0255
shared_fc_dominance.1.weight: Grad Norm = 0.0167
shared_fc_dominance.1.bias: Grad Norm = 0.0168
output_branch_arousal.0.

Epoch 7/15 [Train]:  70%|██████▉   | 701/1004 [06:26<02:45,  1.83it/s, loss=0.1779, a_loss=0.1439, d_loss=0.1893]

fusion_layer_arousal.0.weight: Grad Norm = 0.1438
fusion_layer_arousal.0.bias: Grad Norm = 0.0345
fusion_layer_arousal.1.weight: Grad Norm = 0.0057
fusion_layer_arousal.1.bias: Grad Norm = 0.0054
fusion_layer_arousal.4.weight: Grad Norm = 0.1632
fusion_layer_arousal.4.bias: Grad Norm = 0.0104
fusion_layer_dominance.0.weight: Grad Norm = 0.1291
fusion_layer_dominance.0.bias: Grad Norm = 0.0258
fusion_layer_dominance.1.weight: Grad Norm = 0.0058
fusion_layer_dominance.1.bias: Grad Norm = 0.0055
fusion_layer_dominance.4.weight: Grad Norm = 0.1652
fusion_layer_dominance.4.bias: Grad Norm = 0.0081
shared_fc_arousal.0.weight: Grad Norm = 0.1243
shared_fc_arousal.0.bias: Grad Norm = 0.0157
shared_fc_arousal.1.weight: Grad Norm = 0.0063
shared_fc_arousal.1.bias: Grad Norm = 0.0073
shared_fc_dominance.0.weight: Grad Norm = 0.1515
shared_fc_dominance.0.bias: Grad Norm = 0.0131
shared_fc_dominance.1.weight: Grad Norm = 0.0097
shared_fc_dominance.1.bias: Grad Norm = 0.0099
output_branch_arousal.0.

Epoch 7/15 [Train]:  80%|███████▉  | 801/1004 [07:22<01:49,  1.86it/s, loss=0.1640, a_loss=0.2367, d_loss=0.1398]

fusion_layer_arousal.0.weight: Grad Norm = 0.0810
fusion_layer_arousal.0.bias: Grad Norm = 0.0193
fusion_layer_arousal.1.weight: Grad Norm = 0.0036
fusion_layer_arousal.1.bias: Grad Norm = 0.0030
fusion_layer_arousal.4.weight: Grad Norm = 0.1048
fusion_layer_arousal.4.bias: Grad Norm = 0.0053
fusion_layer_dominance.0.weight: Grad Norm = 0.2351
fusion_layer_dominance.0.bias: Grad Norm = 0.0589
fusion_layer_dominance.1.weight: Grad Norm = 0.0136
fusion_layer_dominance.1.bias: Grad Norm = 0.0108
fusion_layer_dominance.4.weight: Grad Norm = 0.2740
fusion_layer_dominance.4.bias: Grad Norm = 0.0164
shared_fc_arousal.0.weight: Grad Norm = 0.0925
shared_fc_arousal.0.bias: Grad Norm = 0.0101
shared_fc_arousal.1.weight: Grad Norm = 0.0051
shared_fc_arousal.1.bias: Grad Norm = 0.0052
shared_fc_dominance.0.weight: Grad Norm = 0.2317
shared_fc_dominance.0.bias: Grad Norm = 0.0228
shared_fc_dominance.1.weight: Grad Norm = 0.0155
shared_fc_dominance.1.bias: Grad Norm = 0.0156
output_branch_arousal.0.

Epoch 7/15 [Train]:  90%|████████▉ | 901/1004 [08:16<01:00,  1.71it/s, loss=0.2370, a_loss=0.1239, d_loss=0.2746]

fusion_layer_arousal.0.weight: Grad Norm = 0.0805
fusion_layer_arousal.0.bias: Grad Norm = 0.0208
fusion_layer_arousal.1.weight: Grad Norm = 0.0035
fusion_layer_arousal.1.bias: Grad Norm = 0.0031
fusion_layer_arousal.4.weight: Grad Norm = 0.0874
fusion_layer_arousal.4.bias: Grad Norm = 0.0045
fusion_layer_dominance.0.weight: Grad Norm = 0.2699
fusion_layer_dominance.0.bias: Grad Norm = 0.0748
fusion_layer_dominance.1.weight: Grad Norm = 0.0160
fusion_layer_dominance.1.bias: Grad Norm = 0.0130
fusion_layer_dominance.4.weight: Grad Norm = 0.2847
fusion_layer_dominance.4.bias: Grad Norm = 0.0168
shared_fc_arousal.0.weight: Grad Norm = 0.0695
shared_fc_arousal.0.bias: Grad Norm = 0.0077
shared_fc_arousal.1.weight: Grad Norm = 0.0040
shared_fc_arousal.1.bias: Grad Norm = 0.0038
shared_fc_dominance.0.weight: Grad Norm = 0.2415
shared_fc_dominance.0.bias: Grad Norm = 0.0240
shared_fc_dominance.1.weight: Grad Norm = 0.0146
shared_fc_dominance.1.bias: Grad Norm = 0.0152
output_branch_arousal.0.

Epoch 7/15 [Train]: 100%|█████████▉| 1001/1004 [09:10<00:01,  1.87it/s, loss=0.2145, a_loss=0.1373, d_loss=0.2402]

fusion_layer_arousal.0.weight: Grad Norm = 0.0744
fusion_layer_arousal.0.bias: Grad Norm = 0.0161
fusion_layer_arousal.1.weight: Grad Norm = 0.0031
fusion_layer_arousal.1.bias: Grad Norm = 0.0026
fusion_layer_arousal.4.weight: Grad Norm = 0.0922
fusion_layer_arousal.4.bias: Grad Norm = 0.0048
fusion_layer_dominance.0.weight: Grad Norm = 0.1548
fusion_layer_dominance.0.bias: Grad Norm = 0.0296
fusion_layer_dominance.1.weight: Grad Norm = 0.0075
fusion_layer_dominance.1.bias: Grad Norm = 0.0065
fusion_layer_dominance.4.weight: Grad Norm = 0.2105
fusion_layer_dominance.4.bias: Grad Norm = 0.0105
shared_fc_arousal.0.weight: Grad Norm = 0.0707
shared_fc_arousal.0.bias: Grad Norm = 0.0075
shared_fc_arousal.1.weight: Grad Norm = 0.0034
shared_fc_arousal.1.bias: Grad Norm = 0.0036
shared_fc_dominance.0.weight: Grad Norm = 0.1882
shared_fc_dominance.0.bias: Grad Norm = 0.0172
shared_fc_dominance.1.weight: Grad Norm = 0.0133
shared_fc_dominance.1.bias: Grad Norm = 0.0123
output_branch_arousal.0.

Epoch 7/15 [Train]: 100%|██████████| 1004/1004 [09:12<00:00,  1.82it/s, loss=0.3454, a_loss=0.3585, d_loss=0.3411]
Epoch 7/15 [Val]: 100%|██████████| 126/126 [00:44<00:00,  2.80it/s, loss=0.3740, a_loss=0.3214, d_loss=0.4266]


Epoch 7/15 Results:
  Train Loss: 0.1681 (Arousal: 0.1341, Dominance: 0.1794)
  Val Loss: 0.1599 (Arousal: 0.1296, Dominance: 0.1902)
  Loss Weights - Arousal: 0.2500, Dominance: 0.7500


Epoch 8/15 [Train]:   0%|          | 1/1004 [00:00<09:54,  1.69it/s, loss=0.3105, a_loss=0.1634, d_loss=0.3596]

fusion_layer_arousal.0.weight: Grad Norm = 0.1056
fusion_layer_arousal.0.bias: Grad Norm = 0.0235
fusion_layer_arousal.1.weight: Grad Norm = 0.0043
fusion_layer_arousal.1.bias: Grad Norm = 0.0038
fusion_layer_arousal.4.weight: Grad Norm = 0.1165
fusion_layer_arousal.4.bias: Grad Norm = 0.0063
fusion_layer_dominance.0.weight: Grad Norm = 0.2130
fusion_layer_dominance.0.bias: Grad Norm = 0.0492
fusion_layer_dominance.1.weight: Grad Norm = 0.0115
fusion_layer_dominance.1.bias: Grad Norm = 0.0096
fusion_layer_dominance.4.weight: Grad Norm = 0.2677
fusion_layer_dominance.4.bias: Grad Norm = 0.0144
shared_fc_arousal.0.weight: Grad Norm = 0.0961
shared_fc_arousal.0.bias: Grad Norm = 0.0112
shared_fc_arousal.1.weight: Grad Norm = 0.0055
shared_fc_arousal.1.bias: Grad Norm = 0.0051
shared_fc_dominance.0.weight: Grad Norm = 0.2261
shared_fc_dominance.0.bias: Grad Norm = 0.0226
shared_fc_dominance.1.weight: Grad Norm = 0.0131
shared_fc_dominance.1.bias: Grad Norm = 0.0146
output_branch_arousal.0.

Epoch 8/15 [Train]:  10%|█         | 101/1004 [00:54<07:35,  1.98it/s, loss=0.0685, a_loss=0.0221, d_loss=0.0840]

fusion_layer_arousal.0.weight: Grad Norm = 0.0375
fusion_layer_arousal.0.bias: Grad Norm = 0.0089
fusion_layer_arousal.1.weight: Grad Norm = 0.0014
fusion_layer_arousal.1.bias: Grad Norm = 0.0014
fusion_layer_arousal.4.weight: Grad Norm = 0.0421
fusion_layer_arousal.4.bias: Grad Norm = 0.0026
fusion_layer_dominance.0.weight: Grad Norm = 0.0870
fusion_layer_dominance.0.bias: Grad Norm = 0.0180
fusion_layer_dominance.1.weight: Grad Norm = 0.0050
fusion_layer_dominance.1.bias: Grad Norm = 0.0043
fusion_layer_dominance.4.weight: Grad Norm = 0.1195
fusion_layer_dominance.4.bias: Grad Norm = 0.0055
shared_fc_arousal.0.weight: Grad Norm = 0.0366
shared_fc_arousal.0.bias: Grad Norm = 0.0042
shared_fc_arousal.1.weight: Grad Norm = 0.0022
shared_fc_arousal.1.bias: Grad Norm = 0.0021
shared_fc_dominance.0.weight: Grad Norm = 0.1131
shared_fc_dominance.0.bias: Grad Norm = 0.0091
shared_fc_dominance.1.weight: Grad Norm = 0.0081
shared_fc_dominance.1.bias: Grad Norm = 0.0073
output_branch_arousal.0.

Epoch 8/15 [Train]:  20%|██        | 201/1004 [01:49<07:04,  1.89it/s, loss=0.1821, a_loss=0.1237, d_loss=0.2015]

fusion_layer_arousal.0.weight: Grad Norm = 0.0697
fusion_layer_arousal.0.bias: Grad Norm = 0.0170
fusion_layer_arousal.1.weight: Grad Norm = 0.0033
fusion_layer_arousal.1.bias: Grad Norm = 0.0027
fusion_layer_arousal.4.weight: Grad Norm = 0.0782
fusion_layer_arousal.4.bias: Grad Norm = 0.0045
fusion_layer_dominance.0.weight: Grad Norm = 0.2493
fusion_layer_dominance.0.bias: Grad Norm = 0.0648
fusion_layer_dominance.1.weight: Grad Norm = 0.0147
fusion_layer_dominance.1.bias: Grad Norm = 0.0129
fusion_layer_dominance.4.weight: Grad Norm = 0.2884
fusion_layer_dominance.4.bias: Grad Norm = 0.0183
shared_fc_arousal.0.weight: Grad Norm = 0.0647
shared_fc_arousal.0.bias: Grad Norm = 0.0071
shared_fc_arousal.1.weight: Grad Norm = 0.0042
shared_fc_arousal.1.bias: Grad Norm = 0.0038
shared_fc_dominance.0.weight: Grad Norm = 0.2739
shared_fc_dominance.0.bias: Grad Norm = 0.0266
shared_fc_dominance.1.weight: Grad Norm = 0.0220
shared_fc_dominance.1.bias: Grad Norm = 0.0206
output_branch_arousal.0.

Epoch 8/15 [Train]:  30%|██▉       | 301/1004 [02:44<06:13,  1.88it/s, loss=0.1612, a_loss=0.0785, d_loss=0.1887]

fusion_layer_arousal.0.weight: Grad Norm = 0.0564
fusion_layer_arousal.0.bias: Grad Norm = 0.0109
fusion_layer_arousal.1.weight: Grad Norm = 0.0022
fusion_layer_arousal.1.bias: Grad Norm = 0.0020
fusion_layer_arousal.4.weight: Grad Norm = 0.0684
fusion_layer_arousal.4.bias: Grad Norm = 0.0034
fusion_layer_dominance.0.weight: Grad Norm = 0.1353
fusion_layer_dominance.0.bias: Grad Norm = 0.0275
fusion_layer_dominance.1.weight: Grad Norm = 0.0074
fusion_layer_dominance.1.bias: Grad Norm = 0.0062
fusion_layer_dominance.4.weight: Grad Norm = 0.1788
fusion_layer_dominance.4.bias: Grad Norm = 0.0091
shared_fc_arousal.0.weight: Grad Norm = 0.0589
shared_fc_arousal.0.bias: Grad Norm = 0.0062
shared_fc_arousal.1.weight: Grad Norm = 0.0038
shared_fc_arousal.1.bias: Grad Norm = 0.0032
shared_fc_dominance.0.weight: Grad Norm = 0.1670
shared_fc_dominance.0.bias: Grad Norm = 0.0158
shared_fc_dominance.1.weight: Grad Norm = 0.0146
shared_fc_dominance.1.bias: Grad Norm = 0.0115
output_branch_arousal.0.

Epoch 8/15 [Train]:  40%|███▉      | 401/1004 [03:39<05:02,  1.99it/s, loss=0.2560, a_loss=0.1240, d_loss=0.2999]

fusion_layer_arousal.0.weight: Grad Norm = 0.0905
fusion_layer_arousal.0.bias: Grad Norm = 0.0182
fusion_layer_arousal.1.weight: Grad Norm = 0.0037
fusion_layer_arousal.1.bias: Grad Norm = 0.0032
fusion_layer_arousal.4.weight: Grad Norm = 0.1034
fusion_layer_arousal.4.bias: Grad Norm = 0.0056
fusion_layer_dominance.0.weight: Grad Norm = 0.1803
fusion_layer_dominance.0.bias: Grad Norm = 0.0360
fusion_layer_dominance.1.weight: Grad Norm = 0.0095
fusion_layer_dominance.1.bias: Grad Norm = 0.0085
fusion_layer_dominance.4.weight: Grad Norm = 0.2319
fusion_layer_dominance.4.bias: Grad Norm = 0.0129
shared_fc_arousal.0.weight: Grad Norm = 0.0839
shared_fc_arousal.0.bias: Grad Norm = 0.0089
shared_fc_arousal.1.weight: Grad Norm = 0.0055
shared_fc_arousal.1.bias: Grad Norm = 0.0046
shared_fc_dominance.0.weight: Grad Norm = 0.2130
shared_fc_dominance.0.bias: Grad Norm = 0.0205
shared_fc_dominance.1.weight: Grad Norm = 0.0156
shared_fc_dominance.1.bias: Grad Norm = 0.0147
output_branch_arousal.0.

Epoch 8/15 [Train]:  50%|████▉     | 501/1004 [04:33<04:24,  1.90it/s, loss=0.0915, a_loss=0.0489, d_loss=0.1057]

fusion_layer_arousal.0.weight: Grad Norm = 0.0439
fusion_layer_arousal.0.bias: Grad Norm = 0.0090
fusion_layer_arousal.1.weight: Grad Norm = 0.0018
fusion_layer_arousal.1.bias: Grad Norm = 0.0016
fusion_layer_arousal.4.weight: Grad Norm = 0.0498
fusion_layer_arousal.4.bias: Grad Norm = 0.0024
fusion_layer_dominance.0.weight: Grad Norm = 0.1273
fusion_layer_dominance.0.bias: Grad Norm = 0.0238
fusion_layer_dominance.1.weight: Grad Norm = 0.0071
fusion_layer_dominance.1.bias: Grad Norm = 0.0060
fusion_layer_dominance.4.weight: Grad Norm = 0.1358
fusion_layer_dominance.4.bias: Grad Norm = 0.0069
shared_fc_arousal.0.weight: Grad Norm = 0.0404
shared_fc_arousal.0.bias: Grad Norm = 0.0042
shared_fc_arousal.1.weight: Grad Norm = 0.0023
shared_fc_arousal.1.bias: Grad Norm = 0.0021
shared_fc_dominance.0.weight: Grad Norm = 0.1196
shared_fc_dominance.0.bias: Grad Norm = 0.0109
shared_fc_dominance.1.weight: Grad Norm = 0.0086
shared_fc_dominance.1.bias: Grad Norm = 0.0078
output_branch_arousal.0.

Epoch 8/15 [Train]:  60%|█████▉    | 601/1004 [05:28<03:18,  2.03it/s, loss=0.1772, a_loss=0.1270, d_loss=0.1939]

fusion_layer_arousal.0.weight: Grad Norm = 0.0690
fusion_layer_arousal.0.bias: Grad Norm = 0.0143
fusion_layer_arousal.1.weight: Grad Norm = 0.0031
fusion_layer_arousal.1.bias: Grad Norm = 0.0026
fusion_layer_arousal.4.weight: Grad Norm = 0.0773
fusion_layer_arousal.4.bias: Grad Norm = 0.0039
fusion_layer_dominance.0.weight: Grad Norm = 0.1127
fusion_layer_dominance.0.bias: Grad Norm = 0.0258
fusion_layer_dominance.1.weight: Grad Norm = 0.0068
fusion_layer_dominance.1.bias: Grad Norm = 0.0057
fusion_layer_dominance.4.weight: Grad Norm = 0.1352
fusion_layer_dominance.4.bias: Grad Norm = 0.0064
shared_fc_arousal.0.weight: Grad Norm = 0.0629
shared_fc_arousal.0.bias: Grad Norm = 0.0066
shared_fc_arousal.1.weight: Grad Norm = 0.0030
shared_fc_arousal.1.bias: Grad Norm = 0.0035
shared_fc_dominance.0.weight: Grad Norm = 0.1356
shared_fc_dominance.0.bias: Grad Norm = 0.0107
shared_fc_dominance.1.weight: Grad Norm = 0.0105
shared_fc_dominance.1.bias: Grad Norm = 0.0092
output_branch_arousal.0.

Epoch 8/15 [Train]:  70%|██████▉   | 701/1004 [06:23<02:51,  1.77it/s, loss=0.2291, a_loss=0.1592, d_loss=0.2525]

fusion_layer_arousal.0.weight: Grad Norm = 0.0764
fusion_layer_arousal.0.bias: Grad Norm = 0.0150
fusion_layer_arousal.1.weight: Grad Norm = 0.0032
fusion_layer_arousal.1.bias: Grad Norm = 0.0029
fusion_layer_arousal.4.weight: Grad Norm = 0.0799
fusion_layer_arousal.4.bias: Grad Norm = 0.0046
fusion_layer_dominance.0.weight: Grad Norm = 0.1463
fusion_layer_dominance.0.bias: Grad Norm = 0.0288
fusion_layer_dominance.1.weight: Grad Norm = 0.0082
fusion_layer_dominance.1.bias: Grad Norm = 0.0073
fusion_layer_dominance.4.weight: Grad Norm = 0.1822
fusion_layer_dominance.4.bias: Grad Norm = 0.0091
shared_fc_arousal.0.weight: Grad Norm = 0.0668
shared_fc_arousal.0.bias: Grad Norm = 0.0072
shared_fc_arousal.1.weight: Grad Norm = 0.0044
shared_fc_arousal.1.bias: Grad Norm = 0.0038
shared_fc_dominance.0.weight: Grad Norm = 0.1694
shared_fc_dominance.0.bias: Grad Norm = 0.0147
shared_fc_dominance.1.weight: Grad Norm = 0.0123
shared_fc_dominance.1.bias: Grad Norm = 0.0117
output_branch_arousal.0.

Epoch 8/15 [Train]:  80%|███████▉  | 801/1004 [07:17<01:52,  1.81it/s, loss=0.1566, a_loss=0.1001, d_loss=0.1754]

fusion_layer_arousal.0.weight: Grad Norm = 0.0899
fusion_layer_arousal.0.bias: Grad Norm = 0.0188
fusion_layer_arousal.1.weight: Grad Norm = 0.0041
fusion_layer_arousal.1.bias: Grad Norm = 0.0033
fusion_layer_arousal.4.weight: Grad Norm = 0.1087
fusion_layer_arousal.4.bias: Grad Norm = 0.0055
fusion_layer_dominance.0.weight: Grad Norm = 0.1613
fusion_layer_dominance.0.bias: Grad Norm = 0.0297
fusion_layer_dominance.1.weight: Grad Norm = 0.0094
fusion_layer_dominance.1.bias: Grad Norm = 0.0077
fusion_layer_dominance.4.weight: Grad Norm = 0.2126
fusion_layer_dominance.4.bias: Grad Norm = 0.0116
shared_fc_arousal.0.weight: Grad Norm = 0.0880
shared_fc_arousal.0.bias: Grad Norm = 0.0096
shared_fc_arousal.1.weight: Grad Norm = 0.0044
shared_fc_arousal.1.bias: Grad Norm = 0.0046
shared_fc_dominance.0.weight: Grad Norm = 0.1933
shared_fc_dominance.0.bias: Grad Norm = 0.0184
shared_fc_dominance.1.weight: Grad Norm = 0.0140
shared_fc_dominance.1.bias: Grad Norm = 0.0133
output_branch_arousal.0.

Epoch 8/15 [Train]:  90%|████████▉ | 901/1004 [08:12<00:57,  1.78it/s, loss=0.2429, a_loss=0.1395, d_loss=0.2773]

fusion_layer_arousal.0.weight: Grad Norm = 0.0701
fusion_layer_arousal.0.bias: Grad Norm = 0.0149
fusion_layer_arousal.1.weight: Grad Norm = 0.0031
fusion_layer_arousal.1.bias: Grad Norm = 0.0027
fusion_layer_arousal.4.weight: Grad Norm = 0.0794
fusion_layer_arousal.4.bias: Grad Norm = 0.0040
fusion_layer_dominance.0.weight: Grad Norm = 0.2200
fusion_layer_dominance.0.bias: Grad Norm = 0.0460
fusion_layer_dominance.1.weight: Grad Norm = 0.0130
fusion_layer_dominance.1.bias: Grad Norm = 0.0107
fusion_layer_dominance.4.weight: Grad Norm = 0.2385
fusion_layer_dominance.4.bias: Grad Norm = 0.0131
shared_fc_arousal.0.weight: Grad Norm = 0.0685
shared_fc_arousal.0.bias: Grad Norm = 0.0071
shared_fc_arousal.1.weight: Grad Norm = 0.0042
shared_fc_arousal.1.bias: Grad Norm = 0.0037
shared_fc_dominance.0.weight: Grad Norm = 0.2043
shared_fc_dominance.0.bias: Grad Norm = 0.0174
shared_fc_dominance.1.weight: Grad Norm = 0.0145
shared_fc_dominance.1.bias: Grad Norm = 0.0131
output_branch_arousal.0.

Epoch 8/15 [Train]: 100%|█████████▉| 1001/1004 [09:08<00:01,  1.59it/s, loss=0.2364, a_loss=0.0974, d_loss=0.2827]

fusion_layer_arousal.0.weight: Grad Norm = 0.0725
fusion_layer_arousal.0.bias: Grad Norm = 0.0191
fusion_layer_arousal.1.weight: Grad Norm = 0.0030
fusion_layer_arousal.1.bias: Grad Norm = 0.0029
fusion_layer_arousal.4.weight: Grad Norm = 0.0802
fusion_layer_arousal.4.bias: Grad Norm = 0.0046
fusion_layer_dominance.0.weight: Grad Norm = 0.1623
fusion_layer_dominance.0.bias: Grad Norm = 0.0432
fusion_layer_dominance.1.weight: Grad Norm = 0.0091
fusion_layer_dominance.1.bias: Grad Norm = 0.0082
fusion_layer_dominance.4.weight: Grad Norm = 0.1912
fusion_layer_dominance.4.bias: Grad Norm = 0.0101
shared_fc_arousal.0.weight: Grad Norm = 0.0673
shared_fc_arousal.0.bias: Grad Norm = 0.0073
shared_fc_arousal.1.weight: Grad Norm = 0.0040
shared_fc_arousal.1.bias: Grad Norm = 0.0039
shared_fc_dominance.0.weight: Grad Norm = 0.1644
shared_fc_dominance.0.bias: Grad Norm = 0.0145
shared_fc_dominance.1.weight: Grad Norm = 0.0122
shared_fc_dominance.1.bias: Grad Norm = 0.0113
output_branch_arousal.0.

Epoch 8/15 [Train]: 100%|██████████| 1004/1004 [09:09<00:00,  1.83it/s, loss=0.2987, a_loss=0.0291, d_loss=0.3886]
Epoch 8/15 [Val]: 100%|██████████| 126/126 [00:44<00:00,  2.81it/s, loss=0.2099, a_loss=0.1610, d_loss=0.2588]


Epoch 8/15 Results:
  Train Loss: 0.1706 (Arousal: 0.1335, Dominance: 0.1829)
  Val Loss: 0.1611 (Arousal: 0.1222, Dominance: 0.2001)
  Loss Weights - Arousal: 0.2500, Dominance: 0.7500


Epoch 9/15 [Train]:   0%|          | 1/1004 [00:00<09:01,  1.85it/s, loss=0.2297, a_loss=0.2077, d_loss=0.2371]

fusion_layer_arousal.0.weight: Grad Norm = 0.0969
fusion_layer_arousal.0.bias: Grad Norm = 0.0181
fusion_layer_arousal.1.weight: Grad Norm = 0.0038
fusion_layer_arousal.1.bias: Grad Norm = 0.0033
fusion_layer_arousal.4.weight: Grad Norm = 0.1090
fusion_layer_arousal.4.bias: Grad Norm = 0.0052
fusion_layer_dominance.0.weight: Grad Norm = 0.1355
fusion_layer_dominance.0.bias: Grad Norm = 0.0326
fusion_layer_dominance.1.weight: Grad Norm = 0.0095
fusion_layer_dominance.1.bias: Grad Norm = 0.0071
fusion_layer_dominance.4.weight: Grad Norm = 0.1792
fusion_layer_dominance.4.bias: Grad Norm = 0.0088
shared_fc_arousal.0.weight: Grad Norm = 0.0869
shared_fc_arousal.0.bias: Grad Norm = 0.0087
shared_fc_arousal.1.weight: Grad Norm = 0.0050
shared_fc_arousal.1.bias: Grad Norm = 0.0046
shared_fc_dominance.0.weight: Grad Norm = 0.1763
shared_fc_dominance.0.bias: Grad Norm = 0.0149
shared_fc_dominance.1.weight: Grad Norm = 0.0139
shared_fc_dominance.1.bias: Grad Norm = 0.0124
output_branch_arousal.0.

Epoch 9/15 [Train]:  10%|█         | 101/1004 [00:56<08:02,  1.87it/s, loss=0.1340, a_loss=0.1452, d_loss=0.1302]

fusion_layer_arousal.0.weight: Grad Norm = 0.0907
fusion_layer_arousal.0.bias: Grad Norm = 0.0226
fusion_layer_arousal.1.weight: Grad Norm = 0.0035
fusion_layer_arousal.1.bias: Grad Norm = 0.0035
fusion_layer_arousal.4.weight: Grad Norm = 0.0981
fusion_layer_arousal.4.bias: Grad Norm = 0.0050
fusion_layer_dominance.0.weight: Grad Norm = 0.2396
fusion_layer_dominance.0.bias: Grad Norm = 0.0581
fusion_layer_dominance.1.weight: Grad Norm = 0.0127
fusion_layer_dominance.1.bias: Grad Norm = 0.0114
fusion_layer_dominance.4.weight: Grad Norm = 0.2302
fusion_layer_dominance.4.bias: Grad Norm = 0.0150
shared_fc_arousal.0.weight: Grad Norm = 0.0817
shared_fc_arousal.0.bias: Grad Norm = 0.0079
shared_fc_arousal.1.weight: Grad Norm = 0.0043
shared_fc_arousal.1.bias: Grad Norm = 0.0042
shared_fc_dominance.0.weight: Grad Norm = 0.1909
shared_fc_dominance.0.bias: Grad Norm = 0.0191
shared_fc_dominance.1.weight: Grad Norm = 0.0144
shared_fc_dominance.1.bias: Grad Norm = 0.0143
output_branch_arousal.0.

Epoch 9/15 [Train]:  20%|██        | 201/1004 [01:51<07:09,  1.87it/s, loss=0.2110, a_loss=0.0729, d_loss=0.2570]

fusion_layer_arousal.0.weight: Grad Norm = 0.0487
fusion_layer_arousal.0.bias: Grad Norm = 0.0120
fusion_layer_arousal.1.weight: Grad Norm = 0.0019
fusion_layer_arousal.1.bias: Grad Norm = 0.0020
fusion_layer_arousal.4.weight: Grad Norm = 0.0583
fusion_layer_arousal.4.bias: Grad Norm = 0.0034
fusion_layer_dominance.0.weight: Grad Norm = 0.1760
fusion_layer_dominance.0.bias: Grad Norm = 0.0429
fusion_layer_dominance.1.weight: Grad Norm = 0.0096
fusion_layer_dominance.1.bias: Grad Norm = 0.0097
fusion_layer_dominance.4.weight: Grad Norm = 0.2054
fusion_layer_dominance.4.bias: Grad Norm = 0.0120
shared_fc_arousal.0.weight: Grad Norm = 0.0482
shared_fc_arousal.0.bias: Grad Norm = 0.0049
shared_fc_arousal.1.weight: Grad Norm = 0.0030
shared_fc_arousal.1.bias: Grad Norm = 0.0030
shared_fc_dominance.0.weight: Grad Norm = 0.1786
shared_fc_dominance.0.bias: Grad Norm = 0.0160
shared_fc_dominance.1.weight: Grad Norm = 0.0126
shared_fc_dominance.1.bias: Grad Norm = 0.0133
output_branch_arousal.0.

Epoch 9/15 [Train]:  30%|██▉       | 301/1004 [02:47<07:00,  1.67it/s, loss=0.1095, a_loss=0.0436, d_loss=0.1314]

fusion_layer_arousal.0.weight: Grad Norm = 0.0444
fusion_layer_arousal.0.bias: Grad Norm = 0.0082
fusion_layer_arousal.1.weight: Grad Norm = 0.0019
fusion_layer_arousal.1.bias: Grad Norm = 0.0017
fusion_layer_arousal.4.weight: Grad Norm = 0.0525
fusion_layer_arousal.4.bias: Grad Norm = 0.0029
fusion_layer_dominance.0.weight: Grad Norm = 0.1269
fusion_layer_dominance.0.bias: Grad Norm = 0.0227
fusion_layer_dominance.1.weight: Grad Norm = 0.0066
fusion_layer_dominance.1.bias: Grad Norm = 0.0061
fusion_layer_dominance.4.weight: Grad Norm = 0.1609
fusion_layer_dominance.4.bias: Grad Norm = 0.0088
shared_fc_arousal.0.weight: Grad Norm = 0.0444
shared_fc_arousal.0.bias: Grad Norm = 0.0046
shared_fc_arousal.1.weight: Grad Norm = 0.0022
shared_fc_arousal.1.bias: Grad Norm = 0.0025
shared_fc_dominance.0.weight: Grad Norm = 0.1576
shared_fc_dominance.0.bias: Grad Norm = 0.0123
shared_fc_dominance.1.weight: Grad Norm = 0.0111
shared_fc_dominance.1.bias: Grad Norm = 0.0116
output_branch_arousal.0.

Epoch 9/15 [Train]:  40%|███▉      | 401/1004 [03:43<06:06,  1.65it/s, loss=0.1485, a_loss=0.2129, d_loss=0.1270]

fusion_layer_arousal.0.weight: Grad Norm = 0.0772
fusion_layer_arousal.0.bias: Grad Norm = 0.0168
fusion_layer_arousal.1.weight: Grad Norm = 0.0033
fusion_layer_arousal.1.bias: Grad Norm = 0.0029
fusion_layer_arousal.4.weight: Grad Norm = 0.0967
fusion_layer_arousal.4.bias: Grad Norm = 0.0049
fusion_layer_dominance.0.weight: Grad Norm = 0.0985
fusion_layer_dominance.0.bias: Grad Norm = 0.0169
fusion_layer_dominance.1.weight: Grad Norm = 0.0059
fusion_layer_dominance.1.bias: Grad Norm = 0.0047
fusion_layer_dominance.4.weight: Grad Norm = 0.1219
fusion_layer_dominance.4.bias: Grad Norm = 0.0059
shared_fc_arousal.0.weight: Grad Norm = 0.0818
shared_fc_arousal.0.bias: Grad Norm = 0.0080
shared_fc_arousal.1.weight: Grad Norm = 0.0064
shared_fc_arousal.1.bias: Grad Norm = 0.0049
shared_fc_dominance.0.weight: Grad Norm = 0.1167
shared_fc_dominance.0.bias: Grad Norm = 0.0102
shared_fc_dominance.1.weight: Grad Norm = 0.0092
shared_fc_dominance.1.bias: Grad Norm = 0.0081
output_branch_arousal.0.

Epoch 9/15 [Train]:  50%|████▉     | 501/1004 [04:37<04:56,  1.70it/s, loss=0.0916, a_loss=0.1669, d_loss=0.0665]

fusion_layer_arousal.0.weight: Grad Norm = 0.1162
fusion_layer_arousal.0.bias: Grad Norm = 0.0303
fusion_layer_arousal.1.weight: Grad Norm = 0.0051
fusion_layer_arousal.1.bias: Grad Norm = 0.0045
fusion_layer_arousal.4.weight: Grad Norm = 0.1179
fusion_layer_arousal.4.bias: Grad Norm = 0.0077
fusion_layer_dominance.0.weight: Grad Norm = 0.1093
fusion_layer_dominance.0.bias: Grad Norm = 0.0268
fusion_layer_dominance.1.weight: Grad Norm = 0.0068
fusion_layer_dominance.1.bias: Grad Norm = 0.0060
fusion_layer_dominance.4.weight: Grad Norm = 0.1250
fusion_layer_dominance.4.bias: Grad Norm = 0.0077
shared_fc_arousal.0.weight: Grad Norm = 0.0903
shared_fc_arousal.0.bias: Grad Norm = 0.0108
shared_fc_arousal.1.weight: Grad Norm = 0.0057
shared_fc_arousal.1.bias: Grad Norm = 0.0058
shared_fc_dominance.0.weight: Grad Norm = 0.1104
shared_fc_dominance.0.bias: Grad Norm = 0.0106
shared_fc_dominance.1.weight: Grad Norm = 0.0081
shared_fc_dominance.1.bias: Grad Norm = 0.0083
output_branch_arousal.0.

Epoch 9/15 [Train]:  60%|█████▉    | 601/1004 [05:33<03:57,  1.70it/s, loss=0.1249, a_loss=0.1007, d_loss=0.1329]

fusion_layer_arousal.0.weight: Grad Norm = 0.0654
fusion_layer_arousal.0.bias: Grad Norm = 0.0092
fusion_layer_arousal.1.weight: Grad Norm = 0.0022
fusion_layer_arousal.1.bias: Grad Norm = 0.0022
fusion_layer_arousal.4.weight: Grad Norm = 0.0663
fusion_layer_arousal.4.bias: Grad Norm = 0.0037
fusion_layer_dominance.0.weight: Grad Norm = 0.1120
fusion_layer_dominance.0.bias: Grad Norm = 0.0248
fusion_layer_dominance.1.weight: Grad Norm = 0.0068
fusion_layer_dominance.1.bias: Grad Norm = 0.0054
fusion_layer_dominance.4.weight: Grad Norm = 0.1447
fusion_layer_dominance.4.bias: Grad Norm = 0.0067
shared_fc_arousal.0.weight: Grad Norm = 0.0566
shared_fc_arousal.0.bias: Grad Norm = 0.0068
shared_fc_arousal.1.weight: Grad Norm = 0.0030
shared_fc_arousal.1.bias: Grad Norm = 0.0029
shared_fc_dominance.0.weight: Grad Norm = 0.1255
shared_fc_dominance.0.bias: Grad Norm = 0.0103
shared_fc_dominance.1.weight: Grad Norm = 0.0086
shared_fc_dominance.1.bias: Grad Norm = 0.0086
output_branch_arousal.0.

Epoch 9/15 [Train]:  70%|██████▉   | 701/1004 [06:27<02:59,  1.69it/s, loss=0.1974, a_loss=0.0811, d_loss=0.2362]

fusion_layer_arousal.0.weight: Grad Norm = 0.0644
fusion_layer_arousal.0.bias: Grad Norm = 0.0135
fusion_layer_arousal.1.weight: Grad Norm = 0.0028
fusion_layer_arousal.1.bias: Grad Norm = 0.0024
fusion_layer_arousal.4.weight: Grad Norm = 0.0651
fusion_layer_arousal.4.bias: Grad Norm = 0.0034
fusion_layer_dominance.0.weight: Grad Norm = 0.1502
fusion_layer_dominance.0.bias: Grad Norm = 0.0336
fusion_layer_dominance.1.weight: Grad Norm = 0.0095
fusion_layer_dominance.1.bias: Grad Norm = 0.0080
fusion_layer_dominance.4.weight: Grad Norm = 0.1810
fusion_layer_dominance.4.bias: Grad Norm = 0.0094
shared_fc_arousal.0.weight: Grad Norm = 0.0514
shared_fc_arousal.0.bias: Grad Norm = 0.0053
shared_fc_arousal.1.weight: Grad Norm = 0.0033
shared_fc_arousal.1.bias: Grad Norm = 0.0030
shared_fc_dominance.0.weight: Grad Norm = 0.1725
shared_fc_dominance.0.bias: Grad Norm = 0.0144
shared_fc_dominance.1.weight: Grad Norm = 0.0136
shared_fc_dominance.1.bias: Grad Norm = 0.0128
output_branch_arousal.0.

Epoch 9/15 [Train]:  80%|███████▉  | 801/1004 [07:23<01:55,  1.76it/s, loss=0.1402, a_loss=0.1208, d_loss=0.1466]

fusion_layer_arousal.0.weight: Grad Norm = 0.0759
fusion_layer_arousal.0.bias: Grad Norm = 0.0164
fusion_layer_arousal.1.weight: Grad Norm = 0.0031
fusion_layer_arousal.1.bias: Grad Norm = 0.0031
fusion_layer_arousal.4.weight: Grad Norm = 0.0835
fusion_layer_arousal.4.bias: Grad Norm = 0.0052
fusion_layer_dominance.0.weight: Grad Norm = 0.1291
fusion_layer_dominance.0.bias: Grad Norm = 0.0257
fusion_layer_dominance.1.weight: Grad Norm = 0.0081
fusion_layer_dominance.1.bias: Grad Norm = 0.0066
fusion_layer_dominance.4.weight: Grad Norm = 0.1711
fusion_layer_dominance.4.bias: Grad Norm = 0.0084
shared_fc_arousal.0.weight: Grad Norm = 0.0706
shared_fc_arousal.0.bias: Grad Norm = 0.0077
shared_fc_arousal.1.weight: Grad Norm = 0.0042
shared_fc_arousal.1.bias: Grad Norm = 0.0045
shared_fc_dominance.0.weight: Grad Norm = 0.1711
shared_fc_dominance.0.bias: Grad Norm = 0.0149
shared_fc_dominance.1.weight: Grad Norm = 0.0129
shared_fc_dominance.1.bias: Grad Norm = 0.0119
output_branch_arousal.0.

Epoch 9/15 [Train]:  90%|████████▉ | 901/1004 [08:18<00:54,  1.88it/s, loss=0.1355, a_loss=0.1648, d_loss=0.1257]

fusion_layer_arousal.0.weight: Grad Norm = 0.0812
fusion_layer_arousal.0.bias: Grad Norm = 0.0191
fusion_layer_arousal.1.weight: Grad Norm = 0.0037
fusion_layer_arousal.1.bias: Grad Norm = 0.0034
fusion_layer_arousal.4.weight: Grad Norm = 0.0916
fusion_layer_arousal.4.bias: Grad Norm = 0.0055
fusion_layer_dominance.0.weight: Grad Norm = 0.1045
fusion_layer_dominance.0.bias: Grad Norm = 0.0213
fusion_layer_dominance.1.weight: Grad Norm = 0.0058
fusion_layer_dominance.1.bias: Grad Norm = 0.0052
fusion_layer_dominance.4.weight: Grad Norm = 0.1300
fusion_layer_dominance.4.bias: Grad Norm = 0.0069
shared_fc_arousal.0.weight: Grad Norm = 0.0726
shared_fc_arousal.0.bias: Grad Norm = 0.0076
shared_fc_arousal.1.weight: Grad Norm = 0.0040
shared_fc_arousal.1.bias: Grad Norm = 0.0043
shared_fc_dominance.0.weight: Grad Norm = 0.1167
shared_fc_dominance.0.bias: Grad Norm = 0.0106
shared_fc_dominance.1.weight: Grad Norm = 0.0089
shared_fc_dominance.1.bias: Grad Norm = 0.0087
output_branch_arousal.0.

Epoch 9/15 [Train]: 100%|█████████▉| 1001/1004 [09:13<00:01,  1.86it/s, loss=0.3031, a_loss=0.2789, d_loss=0.3111]

fusion_layer_arousal.0.weight: Grad Norm = 0.1272
fusion_layer_arousal.0.bias: Grad Norm = 0.0239
fusion_layer_arousal.1.weight: Grad Norm = 0.0044
fusion_layer_arousal.1.bias: Grad Norm = 0.0045
fusion_layer_arousal.4.weight: Grad Norm = 0.1131
fusion_layer_arousal.4.bias: Grad Norm = 0.0072
fusion_layer_dominance.0.weight: Grad Norm = 0.1736
fusion_layer_dominance.0.bias: Grad Norm = 0.0438
fusion_layer_dominance.1.weight: Grad Norm = 0.0100
fusion_layer_dominance.1.bias: Grad Norm = 0.0091
fusion_layer_dominance.4.weight: Grad Norm = 0.1964
fusion_layer_dominance.4.bias: Grad Norm = 0.0109
shared_fc_arousal.0.weight: Grad Norm = 0.0901
shared_fc_arousal.0.bias: Grad Norm = 0.0111
shared_fc_arousal.1.weight: Grad Norm = 0.0057
shared_fc_arousal.1.bias: Grad Norm = 0.0052
shared_fc_dominance.0.weight: Grad Norm = 0.1718
shared_fc_dominance.0.bias: Grad Norm = 0.0159
shared_fc_dominance.1.weight: Grad Norm = 0.0119
shared_fc_dominance.1.bias: Grad Norm = 0.0128
output_branch_arousal.0.

Epoch 9/15 [Train]: 100%|██████████| 1004/1004 [09:15<00:00,  1.81it/s, loss=0.0989, a_loss=0.0440, d_loss=0.1171]
Epoch 9/15 [Val]: 100%|██████████| 126/126 [00:44<00:00,  2.80it/s, loss=0.2968, a_loss=0.1694, d_loss=0.4243]


Epoch 9/15 Results:
  Train Loss: 0.1658 (Arousal: 0.1269, Dominance: 0.1788)
  Val Loss: 0.1460 (Arousal: 0.1099, Dominance: 0.1821)
  Loss Weights - Arousal: 0.2500, Dominance: 0.7500
  Saved best model (val_loss: 0.1460)


Epoch 10/15 [Train]:   0%|          | 1/1004 [00:00<15:06,  1.11it/s, loss=0.2754, a_loss=0.1738, d_loss=0.3092]

fusion_layer_arousal.0.weight: Grad Norm = 0.0850
fusion_layer_arousal.0.bias: Grad Norm = 0.0156
fusion_layer_arousal.1.weight: Grad Norm = 0.0032
fusion_layer_arousal.1.bias: Grad Norm = 0.0029
fusion_layer_arousal.4.weight: Grad Norm = 0.0925
fusion_layer_arousal.4.bias: Grad Norm = 0.0044
fusion_layer_dominance.0.weight: Grad Norm = 0.2038
fusion_layer_dominance.0.bias: Grad Norm = 0.0440
fusion_layer_dominance.1.weight: Grad Norm = 0.0125
fusion_layer_dominance.1.bias: Grad Norm = 0.0115
fusion_layer_dominance.4.weight: Grad Norm = 0.2482
fusion_layer_dominance.4.bias: Grad Norm = 0.0161
shared_fc_arousal.0.weight: Grad Norm = 0.0739
shared_fc_arousal.0.bias: Grad Norm = 0.0079
shared_fc_arousal.1.weight: Grad Norm = 0.0041
shared_fc_arousal.1.bias: Grad Norm = 0.0039
shared_fc_dominance.0.weight: Grad Norm = 0.2386
shared_fc_dominance.0.bias: Grad Norm = 0.0211
shared_fc_dominance.1.weight: Grad Norm = 0.0190
shared_fc_dominance.1.bias: Grad Norm = 0.0190
output_branch_arousal.0.

Epoch 10/15 [Train]:  10%|█         | 101/1004 [00:55<08:12,  1.83it/s, loss=0.1290, a_loss=0.0585, d_loss=0.1526]

fusion_layer_arousal.0.weight: Grad Norm = 0.0470
fusion_layer_arousal.0.bias: Grad Norm = 0.0107
fusion_layer_arousal.1.weight: Grad Norm = 0.0017
fusion_layer_arousal.1.bias: Grad Norm = 0.0018
fusion_layer_arousal.4.weight: Grad Norm = 0.0495
fusion_layer_arousal.4.bias: Grad Norm = 0.0031
fusion_layer_dominance.0.weight: Grad Norm = 0.1157
fusion_layer_dominance.0.bias: Grad Norm = 0.0265
fusion_layer_dominance.1.weight: Grad Norm = 0.0066
fusion_layer_dominance.1.bias: Grad Norm = 0.0054
fusion_layer_dominance.4.weight: Grad Norm = 0.1362
fusion_layer_dominance.4.bias: Grad Norm = 0.0068
shared_fc_arousal.0.weight: Grad Norm = 0.0408
shared_fc_arousal.0.bias: Grad Norm = 0.0045
shared_fc_arousal.1.weight: Grad Norm = 0.0027
shared_fc_arousal.1.bias: Grad Norm = 0.0025
shared_fc_dominance.0.weight: Grad Norm = 0.1206
shared_fc_dominance.0.bias: Grad Norm = 0.0101
shared_fc_dominance.1.weight: Grad Norm = 0.0106
shared_fc_dominance.1.bias: Grad Norm = 0.0090
output_branch_arousal.0.

Epoch 10/15 [Train]:  20%|██        | 201/1004 [01:50<07:06,  1.88it/s, loss=0.1922, a_loss=0.0741, d_loss=0.2316]

fusion_layer_arousal.0.weight: Grad Norm = 0.0843
fusion_layer_arousal.0.bias: Grad Norm = 0.0194
fusion_layer_arousal.1.weight: Grad Norm = 0.0033
fusion_layer_arousal.1.bias: Grad Norm = 0.0034
fusion_layer_arousal.4.weight: Grad Norm = 0.0851
fusion_layer_arousal.4.bias: Grad Norm = 0.0052
fusion_layer_dominance.0.weight: Grad Norm = 0.1565
fusion_layer_dominance.0.bias: Grad Norm = 0.0309
fusion_layer_dominance.1.weight: Grad Norm = 0.0094
fusion_layer_dominance.1.bias: Grad Norm = 0.0081
fusion_layer_dominance.4.weight: Grad Norm = 0.1720
fusion_layer_dominance.4.bias: Grad Norm = 0.0088
shared_fc_arousal.0.weight: Grad Norm = 0.0684
shared_fc_arousal.0.bias: Grad Norm = 0.0083
shared_fc_arousal.1.weight: Grad Norm = 0.0039
shared_fc_arousal.1.bias: Grad Norm = 0.0039
shared_fc_dominance.0.weight: Grad Norm = 0.1505
shared_fc_dominance.0.bias: Grad Norm = 0.0121
shared_fc_dominance.1.weight: Grad Norm = 0.0101
shared_fc_dominance.1.bias: Grad Norm = 0.0103
output_branch_arousal.0.

Epoch 10/15 [Train]:  30%|██▉       | 301/1004 [02:46<06:50,  1.71it/s, loss=0.1153, a_loss=0.0932, d_loss=0.1227]

fusion_layer_arousal.0.weight: Grad Norm = 0.0683
fusion_layer_arousal.0.bias: Grad Norm = 0.0142
fusion_layer_arousal.1.weight: Grad Norm = 0.0027
fusion_layer_arousal.1.bias: Grad Norm = 0.0026
fusion_layer_arousal.4.weight: Grad Norm = 0.0639
fusion_layer_arousal.4.bias: Grad Norm = 0.0037
fusion_layer_dominance.0.weight: Grad Norm = 0.1124
fusion_layer_dominance.0.bias: Grad Norm = 0.0235
fusion_layer_dominance.1.weight: Grad Norm = 0.0063
fusion_layer_dominance.1.bias: Grad Norm = 0.0059
fusion_layer_dominance.4.weight: Grad Norm = 0.1265
fusion_layer_dominance.4.bias: Grad Norm = 0.0071
shared_fc_arousal.0.weight: Grad Norm = 0.0517
shared_fc_arousal.0.bias: Grad Norm = 0.0056
shared_fc_arousal.1.weight: Grad Norm = 0.0032
shared_fc_arousal.1.bias: Grad Norm = 0.0030
shared_fc_dominance.0.weight: Grad Norm = 0.1114
shared_fc_dominance.0.bias: Grad Norm = 0.0096
shared_fc_dominance.1.weight: Grad Norm = 0.0083
shared_fc_dominance.1.bias: Grad Norm = 0.0081
output_branch_arousal.0.

Epoch 10/15 [Train]:  40%|███▉      | 401/1004 [03:41<05:06,  1.97it/s, loss=0.2851, a_loss=0.1101, d_loss=0.3435]

fusion_layer_arousal.0.weight: Grad Norm = 0.0691
fusion_layer_arousal.0.bias: Grad Norm = 0.0145
fusion_layer_arousal.1.weight: Grad Norm = 0.0032
fusion_layer_arousal.1.bias: Grad Norm = 0.0028
fusion_layer_arousal.4.weight: Grad Norm = 0.0810
fusion_layer_arousal.4.bias: Grad Norm = 0.0042
fusion_layer_dominance.0.weight: Grad Norm = 0.1457
fusion_layer_dominance.0.bias: Grad Norm = 0.0332
fusion_layer_dominance.1.weight: Grad Norm = 0.0088
fusion_layer_dominance.1.bias: Grad Norm = 0.0073
fusion_layer_dominance.4.weight: Grad Norm = 0.1705
fusion_layer_dominance.4.bias: Grad Norm = 0.0084
shared_fc_arousal.0.weight: Grad Norm = 0.0736
shared_fc_arousal.0.bias: Grad Norm = 0.0074
shared_fc_arousal.1.weight: Grad Norm = 0.0048
shared_fc_arousal.1.bias: Grad Norm = 0.0042
shared_fc_dominance.0.weight: Grad Norm = 0.1564
shared_fc_dominance.0.bias: Grad Norm = 0.0133
shared_fc_dominance.1.weight: Grad Norm = 0.0118
shared_fc_dominance.1.bias: Grad Norm = 0.0110
output_branch_arousal.0.

Epoch 10/15 [Train]:  50%|████▉     | 501/1004 [04:36<04:40,  1.80it/s, loss=0.1428, a_loss=0.0300, d_loss=0.1805]

fusion_layer_arousal.0.weight: Grad Norm = 0.0316
fusion_layer_arousal.0.bias: Grad Norm = 0.0069
fusion_layer_arousal.1.weight: Grad Norm = 0.0013
fusion_layer_arousal.1.bias: Grad Norm = 0.0012
fusion_layer_arousal.4.weight: Grad Norm = 0.0337
fusion_layer_arousal.4.bias: Grad Norm = 0.0019
fusion_layer_dominance.0.weight: Grad Norm = 0.1290
fusion_layer_dominance.0.bias: Grad Norm = 0.0319
fusion_layer_dominance.1.weight: Grad Norm = 0.0074
fusion_layer_dominance.1.bias: Grad Norm = 0.0067
fusion_layer_dominance.4.weight: Grad Norm = 0.1598
fusion_layer_dominance.4.bias: Grad Norm = 0.0087
shared_fc_arousal.0.weight: Grad Norm = 0.0279
shared_fc_arousal.0.bias: Grad Norm = 0.0030
shared_fc_arousal.1.weight: Grad Norm = 0.0017
shared_fc_arousal.1.bias: Grad Norm = 0.0016
shared_fc_dominance.0.weight: Grad Norm = 0.1537
shared_fc_dominance.0.bias: Grad Norm = 0.0128
shared_fc_dominance.1.weight: Grad Norm = 0.0108
shared_fc_dominance.1.bias: Grad Norm = 0.0115
output_branch_arousal.0.

Epoch 10/15 [Train]:  60%|█████▉    | 601/1004 [05:31<03:27,  1.95it/s, loss=0.1982, a_loss=0.1208, d_loss=0.2240]

fusion_layer_arousal.0.weight: Grad Norm = 0.0602
fusion_layer_arousal.0.bias: Grad Norm = 0.0135
fusion_layer_arousal.1.weight: Grad Norm = 0.0027
fusion_layer_arousal.1.bias: Grad Norm = 0.0024
fusion_layer_arousal.4.weight: Grad Norm = 0.0714
fusion_layer_arousal.4.bias: Grad Norm = 0.0040
fusion_layer_dominance.0.weight: Grad Norm = 0.1493
fusion_layer_dominance.0.bias: Grad Norm = 0.0332
fusion_layer_dominance.1.weight: Grad Norm = 0.0092
fusion_layer_dominance.1.bias: Grad Norm = 0.0084
fusion_layer_dominance.4.weight: Grad Norm = 0.1821
fusion_layer_dominance.4.bias: Grad Norm = 0.0099
shared_fc_arousal.0.weight: Grad Norm = 0.0605
shared_fc_arousal.0.bias: Grad Norm = 0.0063
shared_fc_arousal.1.weight: Grad Norm = 0.0038
shared_fc_arousal.1.bias: Grad Norm = 0.0036
shared_fc_dominance.0.weight: Grad Norm = 0.1612
shared_fc_dominance.0.bias: Grad Norm = 0.0154
shared_fc_dominance.1.weight: Grad Norm = 0.0134
shared_fc_dominance.1.bias: Grad Norm = 0.0119
output_branch_arousal.0.

Epoch 10/15 [Train]:  70%|██████▉   | 701/1004 [06:26<02:43,  1.85it/s, loss=0.2326, a_loss=0.1981, d_loss=0.2441]

fusion_layer_arousal.0.weight: Grad Norm = 0.0810
fusion_layer_arousal.0.bias: Grad Norm = 0.0155
fusion_layer_arousal.1.weight: Grad Norm = 0.0032
fusion_layer_arousal.1.bias: Grad Norm = 0.0031
fusion_layer_arousal.4.weight: Grad Norm = 0.0838
fusion_layer_arousal.4.bias: Grad Norm = 0.0052
fusion_layer_dominance.0.weight: Grad Norm = 0.1276
fusion_layer_dominance.0.bias: Grad Norm = 0.0241
fusion_layer_dominance.1.weight: Grad Norm = 0.0071
fusion_layer_dominance.1.bias: Grad Norm = 0.0063
fusion_layer_dominance.4.weight: Grad Norm = 0.1679
fusion_layer_dominance.4.bias: Grad Norm = 0.0095
shared_fc_arousal.0.weight: Grad Norm = 0.0703
shared_fc_arousal.0.bias: Grad Norm = 0.0080
shared_fc_arousal.1.weight: Grad Norm = 0.0046
shared_fc_arousal.1.bias: Grad Norm = 0.0043
shared_fc_dominance.0.weight: Grad Norm = 0.1604
shared_fc_dominance.0.bias: Grad Norm = 0.0158
shared_fc_dominance.1.weight: Grad Norm = 0.0119
shared_fc_dominance.1.bias: Grad Norm = 0.0122
output_branch_arousal.0.

Epoch 10/15 [Train]:  80%|███████▉  | 801/1004 [07:21<01:44,  1.93it/s, loss=0.0765, a_loss=0.1525, d_loss=0.0511]

fusion_layer_arousal.0.weight: Grad Norm = 0.0902
fusion_layer_arousal.0.bias: Grad Norm = 0.0170
fusion_layer_arousal.1.weight: Grad Norm = 0.0038
fusion_layer_arousal.1.bias: Grad Norm = 0.0032
fusion_layer_arousal.4.weight: Grad Norm = 0.0947
fusion_layer_arousal.4.bias: Grad Norm = 0.0060
fusion_layer_dominance.0.weight: Grad Norm = 0.0807
fusion_layer_dominance.0.bias: Grad Norm = 0.0165
fusion_layer_dominance.1.weight: Grad Norm = 0.0043
fusion_layer_dominance.1.bias: Grad Norm = 0.0041
fusion_layer_dominance.4.weight: Grad Norm = 0.0899
fusion_layer_dominance.4.bias: Grad Norm = 0.0050
shared_fc_arousal.0.weight: Grad Norm = 0.0825
shared_fc_arousal.0.bias: Grad Norm = 0.0094
shared_fc_arousal.1.weight: Grad Norm = 0.0054
shared_fc_arousal.1.bias: Grad Norm = 0.0049
shared_fc_dominance.0.weight: Grad Norm = 0.0781
shared_fc_dominance.0.bias: Grad Norm = 0.0066
shared_fc_dominance.1.weight: Grad Norm = 0.0050
shared_fc_dominance.1.bias: Grad Norm = 0.0055
output_branch_arousal.0.

Epoch 10/15 [Train]:  90%|████████▉ | 901/1004 [08:16<00:52,  1.98it/s, loss=0.1393, a_loss=0.1121, d_loss=0.1484]

fusion_layer_arousal.0.weight: Grad Norm = 0.0554
fusion_layer_arousal.0.bias: Grad Norm = 0.0122
fusion_layer_arousal.1.weight: Grad Norm = 0.0025
fusion_layer_arousal.1.bias: Grad Norm = 0.0024
fusion_layer_arousal.4.weight: Grad Norm = 0.0614
fusion_layer_arousal.4.bias: Grad Norm = 0.0035
fusion_layer_dominance.0.weight: Grad Norm = 0.1285
fusion_layer_dominance.0.bias: Grad Norm = 0.0273
fusion_layer_dominance.1.weight: Grad Norm = 0.0075
fusion_layer_dominance.1.bias: Grad Norm = 0.0073
fusion_layer_dominance.4.weight: Grad Norm = 0.1412
fusion_layer_dominance.4.bias: Grad Norm = 0.0082
shared_fc_arousal.0.weight: Grad Norm = 0.0506
shared_fc_arousal.0.bias: Grad Norm = 0.0056
shared_fc_arousal.1.weight: Grad Norm = 0.0033
shared_fc_arousal.1.bias: Grad Norm = 0.0033
shared_fc_dominance.0.weight: Grad Norm = 0.1200
shared_fc_dominance.0.bias: Grad Norm = 0.0112
shared_fc_dominance.1.weight: Grad Norm = 0.0097
shared_fc_dominance.1.bias: Grad Norm = 0.0093
output_branch_arousal.0.

Epoch 10/15 [Train]: 100%|█████████▉| 1001/1004 [09:11<00:01,  1.94it/s, loss=0.2443, a_loss=0.0728, d_loss=0.3014]

fusion_layer_arousal.0.weight: Grad Norm = 0.0523
fusion_layer_arousal.0.bias: Grad Norm = 0.0136
fusion_layer_arousal.1.weight: Grad Norm = 0.0022
fusion_layer_arousal.1.bias: Grad Norm = 0.0021
fusion_layer_arousal.4.weight: Grad Norm = 0.0579
fusion_layer_arousal.4.bias: Grad Norm = 0.0034
fusion_layer_dominance.0.weight: Grad Norm = 0.1642
fusion_layer_dominance.0.bias: Grad Norm = 0.0390
fusion_layer_dominance.1.weight: Grad Norm = 0.0102
fusion_layer_dominance.1.bias: Grad Norm = 0.0079
fusion_layer_dominance.4.weight: Grad Norm = 0.1801
fusion_layer_dominance.4.bias: Grad Norm = 0.0088
shared_fc_arousal.0.weight: Grad Norm = 0.0479
shared_fc_arousal.0.bias: Grad Norm = 0.0050
shared_fc_arousal.1.weight: Grad Norm = 0.0029
shared_fc_arousal.1.bias: Grad Norm = 0.0029
shared_fc_dominance.0.weight: Grad Norm = 0.1695
shared_fc_dominance.0.bias: Grad Norm = 0.0134
shared_fc_dominance.1.weight: Grad Norm = 0.0131
shared_fc_dominance.1.bias: Grad Norm = 0.0121
output_branch_arousal.0.

Epoch 10/15 [Train]: 100%|██████████| 1004/1004 [09:13<00:00,  1.82it/s, loss=0.1353, a_loss=0.1815, d_loss=0.1199]
Epoch 10/15 [Val]: 100%|██████████| 126/126 [00:45<00:00,  2.80it/s, loss=0.3531, a_loss=0.2773, d_loss=0.4289]


Epoch 10/15 Results:
  Train Loss: 0.1619 (Arousal: 0.1233, Dominance: 0.1748)
  Val Loss: 0.1447 (Arousal: 0.1108, Dominance: 0.1787)
  Loss Weights - Arousal: 0.2500, Dominance: 0.7500
  Saved best model (val_loss: 0.1447)


Epoch 11/15 [Train]:   0%|          | 1/1004 [00:00<09:03,  1.85it/s, loss=0.2436, a_loss=0.0973, d_loss=0.2924]

fusion_layer_arousal.0.weight: Grad Norm = 0.0654
fusion_layer_arousal.0.bias: Grad Norm = 0.0161
fusion_layer_arousal.1.weight: Grad Norm = 0.0028
fusion_layer_arousal.1.bias: Grad Norm = 0.0025
fusion_layer_arousal.4.weight: Grad Norm = 0.0715
fusion_layer_arousal.4.bias: Grad Norm = 0.0040
fusion_layer_dominance.0.weight: Grad Norm = 0.1824
fusion_layer_dominance.0.bias: Grad Norm = 0.0519
fusion_layer_dominance.1.weight: Grad Norm = 0.0103
fusion_layer_dominance.1.bias: Grad Norm = 0.0095
fusion_layer_dominance.4.weight: Grad Norm = 0.2030
fusion_layer_dominance.4.bias: Grad Norm = 0.0107
shared_fc_arousal.0.weight: Grad Norm = 0.0576
shared_fc_arousal.0.bias: Grad Norm = 0.0060
shared_fc_arousal.1.weight: Grad Norm = 0.0035
shared_fc_arousal.1.bias: Grad Norm = 0.0030
shared_fc_dominance.0.weight: Grad Norm = 0.1856
shared_fc_dominance.0.bias: Grad Norm = 0.0160
shared_fc_dominance.1.weight: Grad Norm = 0.0140
shared_fc_dominance.1.bias: Grad Norm = 0.0137
output_branch_arousal.0.

Epoch 11/15 [Train]:  10%|█         | 101/1004 [00:56<08:23,  1.79it/s, loss=0.2403, a_loss=0.2489, d_loss=0.2374]

fusion_layer_arousal.0.weight: Grad Norm = 0.1020
fusion_layer_arousal.0.bias: Grad Norm = 0.0214
fusion_layer_arousal.1.weight: Grad Norm = 0.0043
fusion_layer_arousal.1.bias: Grad Norm = 0.0037
fusion_layer_arousal.4.weight: Grad Norm = 0.1014
fusion_layer_arousal.4.bias: Grad Norm = 0.0057
fusion_layer_dominance.0.weight: Grad Norm = 0.1625
fusion_layer_dominance.0.bias: Grad Norm = 0.0293
fusion_layer_dominance.1.weight: Grad Norm = 0.0081
fusion_layer_dominance.1.bias: Grad Norm = 0.0077
fusion_layer_dominance.4.weight: Grad Norm = 0.1676
fusion_layer_dominance.4.bias: Grad Norm = 0.0085
shared_fc_arousal.0.weight: Grad Norm = 0.0840
shared_fc_arousal.0.bias: Grad Norm = 0.0099
shared_fc_arousal.1.weight: Grad Norm = 0.0051
shared_fc_arousal.1.bias: Grad Norm = 0.0048
shared_fc_dominance.0.weight: Grad Norm = 0.1531
shared_fc_dominance.0.bias: Grad Norm = 0.0149
shared_fc_dominance.1.weight: Grad Norm = 0.0098
shared_fc_dominance.1.bias: Grad Norm = 0.0107
output_branch_arousal.0.

Epoch 11/15 [Train]:  20%|██        | 201/1004 [01:51<07:03,  1.89it/s, loss=0.1220, a_loss=0.1783, d_loss=0.1032]

fusion_layer_arousal.0.weight: Grad Norm = 0.1055
fusion_layer_arousal.0.bias: Grad Norm = 0.0215
fusion_layer_arousal.1.weight: Grad Norm = 0.0039
fusion_layer_arousal.1.bias: Grad Norm = 0.0040
fusion_layer_arousal.4.weight: Grad Norm = 0.1057
fusion_layer_arousal.4.bias: Grad Norm = 0.0067
fusion_layer_dominance.0.weight: Grad Norm = 0.0790
fusion_layer_dominance.0.bias: Grad Norm = 0.0179
fusion_layer_dominance.1.weight: Grad Norm = 0.0048
fusion_layer_dominance.1.bias: Grad Norm = 0.0043
fusion_layer_dominance.4.weight: Grad Norm = 0.1050
fusion_layer_dominance.4.bias: Grad Norm = 0.0060
shared_fc_arousal.0.weight: Grad Norm = 0.0834
shared_fc_arousal.0.bias: Grad Norm = 0.0098
shared_fc_arousal.1.weight: Grad Norm = 0.0055
shared_fc_arousal.1.bias: Grad Norm = 0.0050
shared_fc_dominance.0.weight: Grad Norm = 0.1036
shared_fc_dominance.0.bias: Grad Norm = 0.0094
shared_fc_dominance.1.weight: Grad Norm = 0.0080
shared_fc_dominance.1.bias: Grad Norm = 0.0076
output_branch_arousal.0.

Epoch 11/15 [Train]:  30%|██▉       | 301/1004 [02:45<06:17,  1.86it/s, loss=0.2954, a_loss=0.1725, d_loss=0.3364]

fusion_layer_arousal.0.weight: Grad Norm = 0.1015
fusion_layer_arousal.0.bias: Grad Norm = 0.0197
fusion_layer_arousal.1.weight: Grad Norm = 0.0040
fusion_layer_arousal.1.bias: Grad Norm = 0.0034
fusion_layer_arousal.4.weight: Grad Norm = 0.1112
fusion_layer_arousal.4.bias: Grad Norm = 0.0053
fusion_layer_dominance.0.weight: Grad Norm = 0.1674
fusion_layer_dominance.0.bias: Grad Norm = 0.0361
fusion_layer_dominance.1.weight: Grad Norm = 0.0087
fusion_layer_dominance.1.bias: Grad Norm = 0.0086
fusion_layer_dominance.4.weight: Grad Norm = 0.1803
fusion_layer_dominance.4.bias: Grad Norm = 0.0103
shared_fc_arousal.0.weight: Grad Norm = 0.0881
shared_fc_arousal.0.bias: Grad Norm = 0.0090
shared_fc_arousal.1.weight: Grad Norm = 0.0045
shared_fc_arousal.1.bias: Grad Norm = 0.0047
shared_fc_dominance.0.weight: Grad Norm = 0.1539
shared_fc_dominance.0.bias: Grad Norm = 0.0131
shared_fc_dominance.1.weight: Grad Norm = 0.0110
shared_fc_dominance.1.bias: Grad Norm = 0.0115
output_branch_arousal.0.

Epoch 11/15 [Train]:  40%|███▉      | 401/1004 [03:41<05:04,  1.98it/s, loss=0.1183, a_loss=0.0302, d_loss=0.1476]

fusion_layer_arousal.0.weight: Grad Norm = 0.0369
fusion_layer_arousal.0.bias: Grad Norm = 0.0082
fusion_layer_arousal.1.weight: Grad Norm = 0.0016
fusion_layer_arousal.1.bias: Grad Norm = 0.0015
fusion_layer_arousal.4.weight: Grad Norm = 0.0392
fusion_layer_arousal.4.bias: Grad Norm = 0.0024
fusion_layer_dominance.0.weight: Grad Norm = 0.1326
fusion_layer_dominance.0.bias: Grad Norm = 0.0307
fusion_layer_dominance.1.weight: Grad Norm = 0.0068
fusion_layer_dominance.1.bias: Grad Norm = 0.0060
fusion_layer_dominance.4.weight: Grad Norm = 0.1321
fusion_layer_dominance.4.bias: Grad Norm = 0.0062
shared_fc_arousal.0.weight: Grad Norm = 0.0323
shared_fc_arousal.0.bias: Grad Norm = 0.0037
shared_fc_arousal.1.weight: Grad Norm = 0.0017
shared_fc_arousal.1.bias: Grad Norm = 0.0018
shared_fc_dominance.0.weight: Grad Norm = 0.1317
shared_fc_dominance.0.bias: Grad Norm = 0.0116
shared_fc_dominance.1.weight: Grad Norm = 0.0098
shared_fc_dominance.1.bias: Grad Norm = 0.0096
output_branch_arousal.0.

Epoch 11/15 [Train]:  50%|████▉     | 501/1004 [04:35<04:39,  1.80it/s, loss=0.1531, a_loss=0.1972, d_loss=0.1384]

fusion_layer_arousal.0.weight: Grad Norm = 0.0590
fusion_layer_arousal.0.bias: Grad Norm = 0.0141
fusion_layer_arousal.1.weight: Grad Norm = 0.0023
fusion_layer_arousal.1.bias: Grad Norm = 0.0022
fusion_layer_arousal.4.weight: Grad Norm = 0.0685
fusion_layer_arousal.4.bias: Grad Norm = 0.0034
fusion_layer_dominance.0.weight: Grad Norm = 0.1277
fusion_layer_dominance.0.bias: Grad Norm = 0.0276
fusion_layer_dominance.1.weight: Grad Norm = 0.0076
fusion_layer_dominance.1.bias: Grad Norm = 0.0066
fusion_layer_dominance.4.weight: Grad Norm = 0.1428
fusion_layer_dominance.4.bias: Grad Norm = 0.0079
shared_fc_arousal.0.weight: Grad Norm = 0.0588
shared_fc_arousal.0.bias: Grad Norm = 0.0060
shared_fc_arousal.1.weight: Grad Norm = 0.0040
shared_fc_arousal.1.bias: Grad Norm = 0.0035
shared_fc_dominance.0.weight: Grad Norm = 0.1383
shared_fc_dominance.0.bias: Grad Norm = 0.0120
shared_fc_dominance.1.weight: Grad Norm = 0.0107
shared_fc_dominance.1.bias: Grad Norm = 0.0103
output_branch_arousal.0.

Epoch 11/15 [Train]:  60%|█████▉    | 601/1004 [05:30<03:47,  1.77it/s, loss=0.1304, a_loss=0.1079, d_loss=0.1379]

fusion_layer_arousal.0.weight: Grad Norm = 0.0636
fusion_layer_arousal.0.bias: Grad Norm = 0.0118
fusion_layer_arousal.1.weight: Grad Norm = 0.0026
fusion_layer_arousal.1.bias: Grad Norm = 0.0022
fusion_layer_arousal.4.weight: Grad Norm = 0.0619
fusion_layer_arousal.4.bias: Grad Norm = 0.0038
fusion_layer_dominance.0.weight: Grad Norm = 0.1102
fusion_layer_dominance.0.bias: Grad Norm = 0.0224
fusion_layer_dominance.1.weight: Grad Norm = 0.0061
fusion_layer_dominance.1.bias: Grad Norm = 0.0058
fusion_layer_dominance.4.weight: Grad Norm = 0.1226
fusion_layer_dominance.4.bias: Grad Norm = 0.0069
shared_fc_arousal.0.weight: Grad Norm = 0.0489
shared_fc_arousal.0.bias: Grad Norm = 0.0056
shared_fc_arousal.1.weight: Grad Norm = 0.0031
shared_fc_arousal.1.bias: Grad Norm = 0.0029
shared_fc_dominance.0.weight: Grad Norm = 0.1170
shared_fc_dominance.0.bias: Grad Norm = 0.0101
shared_fc_dominance.1.weight: Grad Norm = 0.0086
shared_fc_dominance.1.bias: Grad Norm = 0.0094
output_branch_arousal.0.

Epoch 11/15 [Train]:  70%|██████▉   | 701/1004 [06:26<03:24,  1.48it/s, loss=0.1830, a_loss=0.0604, d_loss=0.2239]

fusion_layer_arousal.0.weight: Grad Norm = 0.0532
fusion_layer_arousal.0.bias: Grad Norm = 0.0108
fusion_layer_arousal.1.weight: Grad Norm = 0.0019
fusion_layer_arousal.1.bias: Grad Norm = 0.0020
fusion_layer_arousal.4.weight: Grad Norm = 0.0547
fusion_layer_arousal.4.bias: Grad Norm = 0.0032
fusion_layer_dominance.0.weight: Grad Norm = 0.1300
fusion_layer_dominance.0.bias: Grad Norm = 0.0300
fusion_layer_dominance.1.weight: Grad Norm = 0.0083
fusion_layer_dominance.1.bias: Grad Norm = 0.0067
fusion_layer_dominance.4.weight: Grad Norm = 0.1623
fusion_layer_dominance.4.bias: Grad Norm = 0.0084
shared_fc_arousal.0.weight: Grad Norm = 0.0478
shared_fc_arousal.0.bias: Grad Norm = 0.0050
shared_fc_arousal.1.weight: Grad Norm = 0.0028
shared_fc_arousal.1.bias: Grad Norm = 0.0027
shared_fc_dominance.0.weight: Grad Norm = 0.1452
shared_fc_dominance.0.bias: Grad Norm = 0.0123
shared_fc_dominance.1.weight: Grad Norm = 0.0099
shared_fc_dominance.1.bias: Grad Norm = 0.0116
output_branch_arousal.0.

Epoch 11/15 [Train]:  80%|███████▉  | 801/1004 [07:21<02:05,  1.62it/s, loss=0.1664, a_loss=0.1294, d_loss=0.1788]

fusion_layer_arousal.0.weight: Grad Norm = 0.0783
fusion_layer_arousal.0.bias: Grad Norm = 0.0197
fusion_layer_arousal.1.weight: Grad Norm = 0.0035
fusion_layer_arousal.1.bias: Grad Norm = 0.0029
fusion_layer_arousal.4.weight: Grad Norm = 0.0794
fusion_layer_arousal.4.bias: Grad Norm = 0.0042
fusion_layer_dominance.0.weight: Grad Norm = 0.1514
fusion_layer_dominance.0.bias: Grad Norm = 0.0331
fusion_layer_dominance.1.weight: Grad Norm = 0.0097
fusion_layer_dominance.1.bias: Grad Norm = 0.0081
fusion_layer_dominance.4.weight: Grad Norm = 0.1589
fusion_layer_dominance.4.bias: Grad Norm = 0.0102
shared_fc_arousal.0.weight: Grad Norm = 0.0667
shared_fc_arousal.0.bias: Grad Norm = 0.0071
shared_fc_arousal.1.weight: Grad Norm = 0.0037
shared_fc_arousal.1.bias: Grad Norm = 0.0038
shared_fc_dominance.0.weight: Grad Norm = 0.1576
shared_fc_dominance.0.bias: Grad Norm = 0.0153
shared_fc_dominance.1.weight: Grad Norm = 0.0137
shared_fc_dominance.1.bias: Grad Norm = 0.0124
output_branch_arousal.0.

Epoch 11/15 [Train]:  90%|████████▉ | 901/1004 [08:16<00:59,  1.72it/s, loss=0.0928, a_loss=0.0660, d_loss=0.1017]

fusion_layer_arousal.0.weight: Grad Norm = 0.0562
fusion_layer_arousal.0.bias: Grad Norm = 0.0155
fusion_layer_arousal.1.weight: Grad Norm = 0.0023
fusion_layer_arousal.1.bias: Grad Norm = 0.0022
fusion_layer_arousal.4.weight: Grad Norm = 0.0547
fusion_layer_arousal.4.bias: Grad Norm = 0.0032
fusion_layer_dominance.0.weight: Grad Norm = 0.1164
fusion_layer_dominance.0.bias: Grad Norm = 0.0254
fusion_layer_dominance.1.weight: Grad Norm = 0.0054
fusion_layer_dominance.1.bias: Grad Norm = 0.0050
fusion_layer_dominance.4.weight: Grad Norm = 0.1238
fusion_layer_dominance.4.bias: Grad Norm = 0.0053
shared_fc_arousal.0.weight: Grad Norm = 0.0444
shared_fc_arousal.0.bias: Grad Norm = 0.0052
shared_fc_arousal.1.weight: Grad Norm = 0.0026
shared_fc_arousal.1.bias: Grad Norm = 0.0025
shared_fc_dominance.0.weight: Grad Norm = 0.1083
shared_fc_dominance.0.bias: Grad Norm = 0.0089
shared_fc_dominance.1.weight: Grad Norm = 0.0077
shared_fc_dominance.1.bias: Grad Norm = 0.0071
output_branch_arousal.0.

Epoch 11/15 [Train]: 100%|█████████▉| 1001/1004 [09:11<00:01,  1.88it/s, loss=0.2096, a_loss=0.1510, d_loss=0.2292]

fusion_layer_arousal.0.weight: Grad Norm = 0.0898
fusion_layer_arousal.0.bias: Grad Norm = 0.0232
fusion_layer_arousal.1.weight: Grad Norm = 0.0037
fusion_layer_arousal.1.bias: Grad Norm = 0.0032
fusion_layer_arousal.4.weight: Grad Norm = 0.0846
fusion_layer_arousal.4.bias: Grad Norm = 0.0041
fusion_layer_dominance.0.weight: Grad Norm = 0.1305
fusion_layer_dominance.0.bias: Grad Norm = 0.0272
fusion_layer_dominance.1.weight: Grad Norm = 0.0065
fusion_layer_dominance.1.bias: Grad Norm = 0.0060
fusion_layer_dominance.4.weight: Grad Norm = 0.1307
fusion_layer_dominance.4.bias: Grad Norm = 0.0068
shared_fc_arousal.0.weight: Grad Norm = 0.0709
shared_fc_arousal.0.bias: Grad Norm = 0.0070
shared_fc_arousal.1.weight: Grad Norm = 0.0042
shared_fc_arousal.1.bias: Grad Norm = 0.0042
shared_fc_dominance.0.weight: Grad Norm = 0.1146
shared_fc_dominance.0.bias: Grad Norm = 0.0107
shared_fc_dominance.1.weight: Grad Norm = 0.0088
shared_fc_dominance.1.bias: Grad Norm = 0.0085
output_branch_arousal.0.

Epoch 11/15 [Train]: 100%|██████████| 1004/1004 [09:13<00:00,  1.81it/s, loss=0.1476, a_loss=0.0850, d_loss=0.1685]
Epoch 11/15 [Val]: 100%|██████████| 126/126 [00:45<00:00,  2.79it/s, loss=0.3495, a_loss=0.1880, d_loss=0.5110]


Epoch 11/15 Results:
  Train Loss: 0.1554 (Arousal: 0.1183, Dominance: 0.1678)
  Val Loss: 0.1429 (Arousal: 0.1025, Dominance: 0.1833)
  Loss Weights - Arousal: 0.2500, Dominance: 0.7500
  Saved best model (val_loss: 0.1429)


Epoch 12/15 [Train]:   0%|          | 1/1004 [00:00<08:52,  1.88it/s, loss=0.2654, a_loss=0.2000, d_loss=0.2872]

fusion_layer_arousal.0.weight: Grad Norm = 0.0872
fusion_layer_arousal.0.bias: Grad Norm = 0.0181
fusion_layer_arousal.1.weight: Grad Norm = 0.0035
fusion_layer_arousal.1.bias: Grad Norm = 0.0033
fusion_layer_arousal.4.weight: Grad Norm = 0.0897
fusion_layer_arousal.4.bias: Grad Norm = 0.0052
fusion_layer_dominance.0.weight: Grad Norm = 0.1568
fusion_layer_dominance.0.bias: Grad Norm = 0.0326
fusion_layer_dominance.1.weight: Grad Norm = 0.0083
fusion_layer_dominance.1.bias: Grad Norm = 0.0078
fusion_layer_dominance.4.weight: Grad Norm = 0.1584
fusion_layer_dominance.4.bias: Grad Norm = 0.0090
shared_fc_arousal.0.weight: Grad Norm = 0.0774
shared_fc_arousal.0.bias: Grad Norm = 0.0087
shared_fc_arousal.1.weight: Grad Norm = 0.0045
shared_fc_arousal.1.bias: Grad Norm = 0.0044
shared_fc_dominance.0.weight: Grad Norm = 0.1515
shared_fc_dominance.0.bias: Grad Norm = 0.0138
shared_fc_dominance.1.weight: Grad Norm = 0.0129
shared_fc_dominance.1.bias: Grad Norm = 0.0121
output_branch_arousal.0.

Epoch 12/15 [Train]:  10%|█         | 101/1004 [00:56<08:36,  1.75it/s, loss=0.1375, a_loss=0.1125, d_loss=0.1458]

fusion_layer_arousal.0.weight: Grad Norm = 0.0679
fusion_layer_arousal.0.bias: Grad Norm = 0.0120
fusion_layer_arousal.1.weight: Grad Norm = 0.0029
fusion_layer_arousal.1.bias: Grad Norm = 0.0023
fusion_layer_arousal.4.weight: Grad Norm = 0.0758
fusion_layer_arousal.4.bias: Grad Norm = 0.0036
fusion_layer_dominance.0.weight: Grad Norm = 0.1488
fusion_layer_dominance.0.bias: Grad Norm = 0.0418
fusion_layer_dominance.1.weight: Grad Norm = 0.0075
fusion_layer_dominance.1.bias: Grad Norm = 0.0073
fusion_layer_dominance.4.weight: Grad Norm = 0.1523
fusion_layer_dominance.4.bias: Grad Norm = 0.0082
shared_fc_arousal.0.weight: Grad Norm = 0.0661
shared_fc_arousal.0.bias: Grad Norm = 0.0061
shared_fc_arousal.1.weight: Grad Norm = 0.0039
shared_fc_arousal.1.bias: Grad Norm = 0.0037
shared_fc_dominance.0.weight: Grad Norm = 0.1376
shared_fc_dominance.0.bias: Grad Norm = 0.0128
shared_fc_dominance.1.weight: Grad Norm = 0.0105
shared_fc_dominance.1.bias: Grad Norm = 0.0097
output_branch_arousal.0.

Epoch 12/15 [Train]:  20%|██        | 201/1004 [01:51<07:56,  1.69it/s, loss=0.1075, a_loss=0.1869, d_loss=0.0810]

fusion_layer_arousal.0.weight: Grad Norm = 0.0834
fusion_layer_arousal.0.bias: Grad Norm = 0.0188
fusion_layer_arousal.1.weight: Grad Norm = 0.0030
fusion_layer_arousal.1.bias: Grad Norm = 0.0032
fusion_layer_arousal.4.weight: Grad Norm = 0.0830
fusion_layer_arousal.4.bias: Grad Norm = 0.0052
fusion_layer_dominance.0.weight: Grad Norm = 0.1140
fusion_layer_dominance.0.bias: Grad Norm = 0.0261
fusion_layer_dominance.1.weight: Grad Norm = 0.0066
fusion_layer_dominance.1.bias: Grad Norm = 0.0060
fusion_layer_dominance.4.weight: Grad Norm = 0.1214
fusion_layer_dominance.4.bias: Grad Norm = 0.0067
shared_fc_arousal.0.weight: Grad Norm = 0.0657
shared_fc_arousal.0.bias: Grad Norm = 0.0076
shared_fc_arousal.1.weight: Grad Norm = 0.0038
shared_fc_arousal.1.bias: Grad Norm = 0.0038
shared_fc_dominance.0.weight: Grad Norm = 0.1097
shared_fc_dominance.0.bias: Grad Norm = 0.0091
shared_fc_dominance.1.weight: Grad Norm = 0.0085
shared_fc_dominance.1.bias: Grad Norm = 0.0082
output_branch_arousal.0.

Epoch 12/15 [Train]:  30%|██▉       | 301/1004 [02:46<06:48,  1.72it/s, loss=0.2052, a_loss=0.1020, d_loss=0.2396]

fusion_layer_arousal.0.weight: Grad Norm = 0.0700
fusion_layer_arousal.0.bias: Grad Norm = 0.0216
fusion_layer_arousal.1.weight: Grad Norm = 0.0027
fusion_layer_arousal.1.bias: Grad Norm = 0.0027
fusion_layer_arousal.4.weight: Grad Norm = 0.0719
fusion_layer_arousal.4.bias: Grad Norm = 0.0040
fusion_layer_dominance.0.weight: Grad Norm = 0.1448
fusion_layer_dominance.0.bias: Grad Norm = 0.0335
fusion_layer_dominance.1.weight: Grad Norm = 0.0079
fusion_layer_dominance.1.bias: Grad Norm = 0.0071
fusion_layer_dominance.4.weight: Grad Norm = 0.1574
fusion_layer_dominance.4.bias: Grad Norm = 0.0074
shared_fc_arousal.0.weight: Grad Norm = 0.0649
shared_fc_arousal.0.bias: Grad Norm = 0.0071
shared_fc_arousal.1.weight: Grad Norm = 0.0037
shared_fc_arousal.1.bias: Grad Norm = 0.0036
shared_fc_dominance.0.weight: Grad Norm = 0.1440
shared_fc_dominance.0.bias: Grad Norm = 0.0114
shared_fc_dominance.1.weight: Grad Norm = 0.0100
shared_fc_dominance.1.bias: Grad Norm = 0.0103
output_branch_arousal.0.

Epoch 12/15 [Train]:  40%|███▉      | 401/1004 [03:42<05:35,  1.80it/s, loss=0.2155, a_loss=0.1552, d_loss=0.2356]

fusion_layer_arousal.0.weight: Grad Norm = 0.0700
fusion_layer_arousal.0.bias: Grad Norm = 0.0140
fusion_layer_arousal.1.weight: Grad Norm = 0.0032
fusion_layer_arousal.1.bias: Grad Norm = 0.0027
fusion_layer_arousal.4.weight: Grad Norm = 0.0819
fusion_layer_arousal.4.bias: Grad Norm = 0.0041
fusion_layer_dominance.0.weight: Grad Norm = 0.1586
fusion_layer_dominance.0.bias: Grad Norm = 0.0286
fusion_layer_dominance.1.weight: Grad Norm = 0.0084
fusion_layer_dominance.1.bias: Grad Norm = 0.0073
fusion_layer_dominance.4.weight: Grad Norm = 0.1617
fusion_layer_dominance.4.bias: Grad Norm = 0.0081
shared_fc_arousal.0.weight: Grad Norm = 0.0683
shared_fc_arousal.0.bias: Grad Norm = 0.0075
shared_fc_arousal.1.weight: Grad Norm = 0.0039
shared_fc_arousal.1.bias: Grad Norm = 0.0039
shared_fc_dominance.0.weight: Grad Norm = 0.1510
shared_fc_dominance.0.bias: Grad Norm = 0.0127
shared_fc_dominance.1.weight: Grad Norm = 0.0128
shared_fc_dominance.1.bias: Grad Norm = 0.0108
output_branch_arousal.0.

Epoch 12/15 [Train]:  50%|████▉     | 501/1004 [04:37<04:08,  2.02it/s, loss=0.1179, a_loss=0.1361, d_loss=0.1118]

fusion_layer_arousal.0.weight: Grad Norm = 0.0757
fusion_layer_arousal.0.bias: Grad Norm = 0.0167
fusion_layer_arousal.1.weight: Grad Norm = 0.0034
fusion_layer_arousal.1.bias: Grad Norm = 0.0028
fusion_layer_arousal.4.weight: Grad Norm = 0.0910
fusion_layer_arousal.4.bias: Grad Norm = 0.0046
fusion_layer_dominance.0.weight: Grad Norm = 0.1377
fusion_layer_dominance.0.bias: Grad Norm = 0.0327
fusion_layer_dominance.1.weight: Grad Norm = 0.0082
fusion_layer_dominance.1.bias: Grad Norm = 0.0074
fusion_layer_dominance.4.weight: Grad Norm = 0.1306
fusion_layer_dominance.4.bias: Grad Norm = 0.0074
shared_fc_arousal.0.weight: Grad Norm = 0.0745
shared_fc_arousal.0.bias: Grad Norm = 0.0074
shared_fc_arousal.1.weight: Grad Norm = 0.0047
shared_fc_arousal.1.bias: Grad Norm = 0.0046
shared_fc_dominance.0.weight: Grad Norm = 0.1205
shared_fc_dominance.0.bias: Grad Norm = 0.0116
shared_fc_dominance.1.weight: Grad Norm = 0.0092
shared_fc_dominance.1.bias: Grad Norm = 0.0092
output_branch_arousal.0.

Epoch 12/15 [Train]:  60%|█████▉    | 601/1004 [05:33<03:38,  1.84it/s, loss=0.1231, a_loss=0.2168, d_loss=0.0919]

fusion_layer_arousal.0.weight: Grad Norm = 0.1106
fusion_layer_arousal.0.bias: Grad Norm = 0.0280
fusion_layer_arousal.1.weight: Grad Norm = 0.0043
fusion_layer_arousal.1.bias: Grad Norm = 0.0046
fusion_layer_arousal.4.weight: Grad Norm = 0.1134
fusion_layer_arousal.4.bias: Grad Norm = 0.0075
fusion_layer_dominance.0.weight: Grad Norm = 0.1234
fusion_layer_dominance.0.bias: Grad Norm = 0.0363
fusion_layer_dominance.1.weight: Grad Norm = 0.0078
fusion_layer_dominance.1.bias: Grad Norm = 0.0064
fusion_layer_dominance.4.weight: Grad Norm = 0.1339
fusion_layer_dominance.4.bias: Grad Norm = 0.0074
shared_fc_arousal.0.weight: Grad Norm = 0.0900
shared_fc_arousal.0.bias: Grad Norm = 0.0119
shared_fc_arousal.1.weight: Grad Norm = 0.0052
shared_fc_arousal.1.bias: Grad Norm = 0.0052
shared_fc_dominance.0.weight: Grad Norm = 0.1190
shared_fc_dominance.0.bias: Grad Norm = 0.0103
shared_fc_dominance.1.weight: Grad Norm = 0.0090
shared_fc_dominance.1.bias: Grad Norm = 0.0086
output_branch_arousal.0.

Epoch 12/15 [Train]:  70%|██████▉   | 701/1004 [06:28<02:51,  1.76it/s, loss=0.1321, a_loss=0.1038, d_loss=0.1415]

fusion_layer_arousal.0.weight: Grad Norm = 0.0669
fusion_layer_arousal.0.bias: Grad Norm = 0.0124
fusion_layer_arousal.1.weight: Grad Norm = 0.0026
fusion_layer_arousal.1.bias: Grad Norm = 0.0023
fusion_layer_arousal.4.weight: Grad Norm = 0.0638
fusion_layer_arousal.4.bias: Grad Norm = 0.0029
fusion_layer_dominance.0.weight: Grad Norm = 0.1082
fusion_layer_dominance.0.bias: Grad Norm = 0.0168
fusion_layer_dominance.1.weight: Grad Norm = 0.0055
fusion_layer_dominance.1.bias: Grad Norm = 0.0052
fusion_layer_dominance.4.weight: Grad Norm = 0.1286
fusion_layer_dominance.4.bias: Grad Norm = 0.0060
shared_fc_arousal.0.weight: Grad Norm = 0.0534
shared_fc_arousal.0.bias: Grad Norm = 0.0050
shared_fc_arousal.1.weight: Grad Norm = 0.0035
shared_fc_arousal.1.bias: Grad Norm = 0.0027
shared_fc_dominance.0.weight: Grad Norm = 0.1211
shared_fc_dominance.0.bias: Grad Norm = 0.0110
shared_fc_dominance.1.weight: Grad Norm = 0.0092
shared_fc_dominance.1.bias: Grad Norm = 0.0081
output_branch_arousal.0.

Epoch 12/15 [Train]:  80%|███████▉  | 801/1004 [07:22<01:49,  1.85it/s, loss=0.1932, a_loss=0.2122, d_loss=0.1869]

fusion_layer_arousal.0.weight: Grad Norm = 0.0913
fusion_layer_arousal.0.bias: Grad Norm = 0.0170
fusion_layer_arousal.1.weight: Grad Norm = 0.0035
fusion_layer_arousal.1.bias: Grad Norm = 0.0032
fusion_layer_arousal.4.weight: Grad Norm = 0.0986
fusion_layer_arousal.4.bias: Grad Norm = 0.0055
fusion_layer_dominance.0.weight: Grad Norm = 0.1612
fusion_layer_dominance.0.bias: Grad Norm = 0.0326
fusion_layer_dominance.1.weight: Grad Norm = 0.0098
fusion_layer_dominance.1.bias: Grad Norm = 0.0089
fusion_layer_dominance.4.weight: Grad Norm = 0.1777
fusion_layer_dominance.4.bias: Grad Norm = 0.0100
shared_fc_arousal.0.weight: Grad Norm = 0.0853
shared_fc_arousal.0.bias: Grad Norm = 0.0091
shared_fc_arousal.1.weight: Grad Norm = 0.0050
shared_fc_arousal.1.bias: Grad Norm = 0.0050
shared_fc_dominance.0.weight: Grad Norm = 0.1728
shared_fc_dominance.0.bias: Grad Norm = 0.0146
shared_fc_dominance.1.weight: Grad Norm = 0.0133
shared_fc_dominance.1.bias: Grad Norm = 0.0130
output_branch_arousal.0.

Epoch 12/15 [Train]:  90%|████████▉ | 901/1004 [08:18<00:57,  1.79it/s, loss=0.0734, a_loss=0.1557, d_loss=0.0459]

fusion_layer_arousal.0.weight: Grad Norm = 0.0579
fusion_layer_arousal.0.bias: Grad Norm = 0.0133
fusion_layer_arousal.1.weight: Grad Norm = 0.0025
fusion_layer_arousal.1.bias: Grad Norm = 0.0022
fusion_layer_arousal.4.weight: Grad Norm = 0.0658
fusion_layer_arousal.4.bias: Grad Norm = 0.0033
fusion_layer_dominance.0.weight: Grad Norm = 0.0705
fusion_layer_dominance.0.bias: Grad Norm = 0.0137
fusion_layer_dominance.1.weight: Grad Norm = 0.0039
fusion_layer_dominance.1.bias: Grad Norm = 0.0035
fusion_layer_dominance.4.weight: Grad Norm = 0.0778
fusion_layer_dominance.4.bias: Grad Norm = 0.0042
shared_fc_arousal.0.weight: Grad Norm = 0.0592
shared_fc_arousal.0.bias: Grad Norm = 0.0057
shared_fc_arousal.1.weight: Grad Norm = 0.0036
shared_fc_arousal.1.bias: Grad Norm = 0.0036
shared_fc_dominance.0.weight: Grad Norm = 0.0674
shared_fc_dominance.0.bias: Grad Norm = 0.0067
shared_fc_dominance.1.weight: Grad Norm = 0.0047
shared_fc_dominance.1.bias: Grad Norm = 0.0049
output_branch_arousal.0.

Epoch 12/15 [Train]: 100%|█████████▉| 1001/1004 [09:15<00:01,  1.60it/s, loss=0.2070, a_loss=0.1954, d_loss=0.2108]

fusion_layer_arousal.0.weight: Grad Norm = 0.0685
fusion_layer_arousal.0.bias: Grad Norm = 0.0161
fusion_layer_arousal.1.weight: Grad Norm = 0.0026
fusion_layer_arousal.1.bias: Grad Norm = 0.0025
fusion_layer_arousal.4.weight: Grad Norm = 0.0770
fusion_layer_arousal.4.bias: Grad Norm = 0.0036
fusion_layer_dominance.0.weight: Grad Norm = 0.1825
fusion_layer_dominance.0.bias: Grad Norm = 0.0416
fusion_layer_dominance.1.weight: Grad Norm = 0.0091
fusion_layer_dominance.1.bias: Grad Norm = 0.0087
fusion_layer_dominance.4.weight: Grad Norm = 0.1661
fusion_layer_dominance.4.bias: Grad Norm = 0.0087
shared_fc_arousal.0.weight: Grad Norm = 0.0577
shared_fc_arousal.0.bias: Grad Norm = 0.0060
shared_fc_arousal.1.weight: Grad Norm = 0.0035
shared_fc_arousal.1.bias: Grad Norm = 0.0032
shared_fc_dominance.0.weight: Grad Norm = 0.1409
shared_fc_dominance.0.bias: Grad Norm = 0.0131
shared_fc_dominance.1.weight: Grad Norm = 0.0106
shared_fc_dominance.1.bias: Grad Norm = 0.0100
output_branch_arousal.0.

Epoch 12/15 [Train]: 100%|██████████| 1004/1004 [09:17<00:00,  1.80it/s, loss=0.1806, a_loss=0.0770, d_loss=0.2152]
Epoch 12/15 [Val]: 100%|██████████| 126/126 [00:45<00:00,  2.79it/s, loss=0.3088, a_loss=0.1785, d_loss=0.4391]


Epoch 12/15 Results:
  Train Loss: 0.1538 (Arousal: 0.1164, Dominance: 0.1663)
  Val Loss: 0.1394 (Arousal: 0.1030, Dominance: 0.1758)
  Loss Weights - Arousal: 0.2500, Dominance: 0.7500
  Saved best model (val_loss: 0.1394)


Epoch 13/15 [Train]:   0%|          | 1/1004 [00:00<09:06,  1.83it/s, loss=0.2851, a_loss=0.0412, d_loss=0.3664]

fusion_layer_arousal.0.weight: Grad Norm = 0.0414
fusion_layer_arousal.0.bias: Grad Norm = 0.0104
fusion_layer_arousal.1.weight: Grad Norm = 0.0015
fusion_layer_arousal.1.bias: Grad Norm = 0.0014
fusion_layer_arousal.4.weight: Grad Norm = 0.0417
fusion_layer_arousal.4.bias: Grad Norm = 0.0020
fusion_layer_dominance.0.weight: Grad Norm = 0.1521
fusion_layer_dominance.0.bias: Grad Norm = 0.0355
fusion_layer_dominance.1.weight: Grad Norm = 0.0077
fusion_layer_dominance.1.bias: Grad Norm = 0.0072
fusion_layer_dominance.4.weight: Grad Norm = 0.1546
fusion_layer_dominance.4.bias: Grad Norm = 0.0074
shared_fc_arousal.0.weight: Grad Norm = 0.0331
shared_fc_arousal.0.bias: Grad Norm = 0.0033
shared_fc_arousal.1.weight: Grad Norm = 0.0019
shared_fc_arousal.1.bias: Grad Norm = 0.0018
shared_fc_dominance.0.weight: Grad Norm = 0.1373
shared_fc_dominance.0.bias: Grad Norm = 0.0124
shared_fc_dominance.1.weight: Grad Norm = 0.0102
shared_fc_dominance.1.bias: Grad Norm = 0.0095
output_branch_arousal.0.

Epoch 13/15 [Train]:  10%|█         | 101/1004 [00:55<07:56,  1.89it/s, loss=0.1007, a_loss=0.0708, d_loss=0.1107]

fusion_layer_arousal.0.weight: Grad Norm = 0.0528
fusion_layer_arousal.0.bias: Grad Norm = 0.0120
fusion_layer_arousal.1.weight: Grad Norm = 0.0020
fusion_layer_arousal.1.bias: Grad Norm = 0.0020
fusion_layer_arousal.4.weight: Grad Norm = 0.0548
fusion_layer_arousal.4.bias: Grad Norm = 0.0032
fusion_layer_dominance.0.weight: Grad Norm = 0.1371
fusion_layer_dominance.0.bias: Grad Norm = 0.0295
fusion_layer_dominance.1.weight: Grad Norm = 0.0067
fusion_layer_dominance.1.bias: Grad Norm = 0.0065
fusion_layer_dominance.4.weight: Grad Norm = 0.1294
fusion_layer_dominance.4.bias: Grad Norm = 0.0082
shared_fc_arousal.0.weight: Grad Norm = 0.0433
shared_fc_arousal.0.bias: Grad Norm = 0.0048
shared_fc_arousal.1.weight: Grad Norm = 0.0024
shared_fc_arousal.1.bias: Grad Norm = 0.0025
shared_fc_dominance.0.weight: Grad Norm = 0.1125
shared_fc_dominance.0.bias: Grad Norm = 0.0112
shared_fc_dominance.1.weight: Grad Norm = 0.0080
shared_fc_dominance.1.bias: Grad Norm = 0.0084
output_branch_arousal.0.

Epoch 13/15 [Train]:  20%|██        | 201/1004 [01:51<07:14,  1.85it/s, loss=0.1571, a_loss=0.1002, d_loss=0.1761]

fusion_layer_arousal.0.weight: Grad Norm = 0.0824
fusion_layer_arousal.0.bias: Grad Norm = 0.0221
fusion_layer_arousal.1.weight: Grad Norm = 0.0030
fusion_layer_arousal.1.bias: Grad Norm = 0.0030
fusion_layer_arousal.4.weight: Grad Norm = 0.0814
fusion_layer_arousal.4.bias: Grad Norm = 0.0048
fusion_layer_dominance.0.weight: Grad Norm = 0.1355
fusion_layer_dominance.0.bias: Grad Norm = 0.0389
fusion_layer_dominance.1.weight: Grad Norm = 0.0073
fusion_layer_dominance.1.bias: Grad Norm = 0.0064
fusion_layer_dominance.4.weight: Grad Norm = 0.1399
fusion_layer_dominance.4.bias: Grad Norm = 0.0060
shared_fc_arousal.0.weight: Grad Norm = 0.0671
shared_fc_arousal.0.bias: Grad Norm = 0.0075
shared_fc_arousal.1.weight: Grad Norm = 0.0033
shared_fc_arousal.1.bias: Grad Norm = 0.0037
shared_fc_dominance.0.weight: Grad Norm = 0.1248
shared_fc_dominance.0.bias: Grad Norm = 0.0098
shared_fc_dominance.1.weight: Grad Norm = 0.0082
shared_fc_dominance.1.bias: Grad Norm = 0.0090
output_branch_arousal.0.

Epoch 13/15 [Train]:  30%|██▉       | 301/1004 [02:47<06:12,  1.89it/s, loss=0.1309, a_loss=0.1042, d_loss=0.1398]

fusion_layer_arousal.0.weight: Grad Norm = 0.0695
fusion_layer_arousal.0.bias: Grad Norm = 0.0127
fusion_layer_arousal.1.weight: Grad Norm = 0.0026
fusion_layer_arousal.1.bias: Grad Norm = 0.0024
fusion_layer_arousal.4.weight: Grad Norm = 0.0735
fusion_layer_arousal.4.bias: Grad Norm = 0.0044
fusion_layer_dominance.0.weight: Grad Norm = 0.1233
fusion_layer_dominance.0.bias: Grad Norm = 0.0229
fusion_layer_dominance.1.weight: Grad Norm = 0.0069
fusion_layer_dominance.1.bias: Grad Norm = 0.0062
fusion_layer_dominance.4.weight: Grad Norm = 0.1389
fusion_layer_dominance.4.bias: Grad Norm = 0.0072
shared_fc_arousal.0.weight: Grad Norm = 0.0603
shared_fc_arousal.0.bias: Grad Norm = 0.0067
shared_fc_arousal.1.weight: Grad Norm = 0.0043
shared_fc_arousal.1.bias: Grad Norm = 0.0037
shared_fc_dominance.0.weight: Grad Norm = 0.1334
shared_fc_dominance.0.bias: Grad Norm = 0.0113
shared_fc_dominance.1.weight: Grad Norm = 0.0101
shared_fc_dominance.1.bias: Grad Norm = 0.0105
output_branch_arousal.0.

Epoch 13/15 [Train]:  40%|███▉      | 401/1004 [03:42<06:01,  1.67it/s, loss=0.1615, a_loss=0.1216, d_loss=0.1748]

fusion_layer_arousal.0.weight: Grad Norm = 0.0999
fusion_layer_arousal.0.bias: Grad Norm = 0.0253
fusion_layer_arousal.1.weight: Grad Norm = 0.0036
fusion_layer_arousal.1.bias: Grad Norm = 0.0039
fusion_layer_arousal.4.weight: Grad Norm = 0.1028
fusion_layer_arousal.4.bias: Grad Norm = 0.0069
fusion_layer_dominance.0.weight: Grad Norm = 0.1591
fusion_layer_dominance.0.bias: Grad Norm = 0.0312
fusion_layer_dominance.1.weight: Grad Norm = 0.0072
fusion_layer_dominance.1.bias: Grad Norm = 0.0065
fusion_layer_dominance.4.weight: Grad Norm = 0.1350
fusion_layer_dominance.4.bias: Grad Norm = 0.0052
shared_fc_arousal.0.weight: Grad Norm = 0.0860
shared_fc_arousal.0.bias: Grad Norm = 0.0102
shared_fc_arousal.1.weight: Grad Norm = 0.0043
shared_fc_arousal.1.bias: Grad Norm = 0.0053
shared_fc_dominance.0.weight: Grad Norm = 0.1221
shared_fc_dominance.0.bias: Grad Norm = 0.0097
shared_fc_dominance.1.weight: Grad Norm = 0.0085
shared_fc_dominance.1.bias: Grad Norm = 0.0086
output_branch_arousal.0.

Epoch 13/15 [Train]:  50%|████▉     | 501/1004 [04:37<05:03,  1.66it/s, loss=0.2197, a_loss=0.1123, d_loss=0.2554]

fusion_layer_arousal.0.weight: Grad Norm = 0.0535
fusion_layer_arousal.0.bias: Grad Norm = 0.0102
fusion_layer_arousal.1.weight: Grad Norm = 0.0021
fusion_layer_arousal.1.bias: Grad Norm = 0.0019
fusion_layer_arousal.4.weight: Grad Norm = 0.0575
fusion_layer_arousal.4.bias: Grad Norm = 0.0031
fusion_layer_dominance.0.weight: Grad Norm = 0.1825
fusion_layer_dominance.0.bias: Grad Norm = 0.0372
fusion_layer_dominance.1.weight: Grad Norm = 0.0098
fusion_layer_dominance.1.bias: Grad Norm = 0.0090
fusion_layer_dominance.4.weight: Grad Norm = 0.1702
fusion_layer_dominance.4.bias: Grad Norm = 0.0103
shared_fc_arousal.0.weight: Grad Norm = 0.0491
shared_fc_arousal.0.bias: Grad Norm = 0.0052
shared_fc_arousal.1.weight: Grad Norm = 0.0029
shared_fc_arousal.1.bias: Grad Norm = 0.0029
shared_fc_dominance.0.weight: Grad Norm = 0.1495
shared_fc_dominance.0.bias: Grad Norm = 0.0138
shared_fc_dominance.1.weight: Grad Norm = 0.0113
shared_fc_dominance.1.bias: Grad Norm = 0.0117
output_branch_arousal.0.

Epoch 13/15 [Train]:  60%|█████▉    | 601/1004 [05:33<03:54,  1.72it/s, loss=0.1357, a_loss=0.0673, d_loss=0.1585]

fusion_layer_arousal.0.weight: Grad Norm = 0.0436
fusion_layer_arousal.0.bias: Grad Norm = 0.0110
fusion_layer_arousal.1.weight: Grad Norm = 0.0016
fusion_layer_arousal.1.bias: Grad Norm = 0.0016
fusion_layer_arousal.4.weight: Grad Norm = 0.0465
fusion_layer_arousal.4.bias: Grad Norm = 0.0024
fusion_layer_dominance.0.weight: Grad Norm = 0.1008
fusion_layer_dominance.0.bias: Grad Norm = 0.0261
fusion_layer_dominance.1.weight: Grad Norm = 0.0047
fusion_layer_dominance.1.bias: Grad Norm = 0.0049
fusion_layer_dominance.4.weight: Grad Norm = 0.1216
fusion_layer_dominance.4.bias: Grad Norm = 0.0050
shared_fc_arousal.0.weight: Grad Norm = 0.0369
shared_fc_arousal.0.bias: Grad Norm = 0.0037
shared_fc_arousal.1.weight: Grad Norm = 0.0021
shared_fc_arousal.1.bias: Grad Norm = 0.0022
shared_fc_dominance.0.weight: Grad Norm = 0.1187
shared_fc_dominance.0.bias: Grad Norm = 0.0094
shared_fc_dominance.1.weight: Grad Norm = 0.0102
shared_fc_dominance.1.bias: Grad Norm = 0.0089
output_branch_arousal.0.

Epoch 13/15 [Train]:  70%|██████▉   | 701/1004 [06:28<02:44,  1.84it/s, loss=0.1340, a_loss=0.0834, d_loss=0.1508]

fusion_layer_arousal.0.weight: Grad Norm = 0.0843
fusion_layer_arousal.0.bias: Grad Norm = 0.0147
fusion_layer_arousal.1.weight: Grad Norm = 0.0032
fusion_layer_arousal.1.bias: Grad Norm = 0.0029
fusion_layer_arousal.4.weight: Grad Norm = 0.0764
fusion_layer_arousal.4.bias: Grad Norm = 0.0047
fusion_layer_dominance.0.weight: Grad Norm = 0.1503
fusion_layer_dominance.0.bias: Grad Norm = 0.0303
fusion_layer_dominance.1.weight: Grad Norm = 0.0070
fusion_layer_dominance.1.bias: Grad Norm = 0.0073
fusion_layer_dominance.4.weight: Grad Norm = 0.1681
fusion_layer_dominance.4.bias: Grad Norm = 0.0099
shared_fc_arousal.0.weight: Grad Norm = 0.0571
shared_fc_arousal.0.bias: Grad Norm = 0.0077
shared_fc_arousal.1.weight: Grad Norm = 0.0032
shared_fc_arousal.1.bias: Grad Norm = 0.0030
shared_fc_dominance.0.weight: Grad Norm = 0.1680
shared_fc_dominance.0.bias: Grad Norm = 0.0145
shared_fc_dominance.1.weight: Grad Norm = 0.0120
shared_fc_dominance.1.bias: Grad Norm = 0.0123
output_branch_arousal.0.

Epoch 13/15 [Train]:  80%|███████▉  | 801/1004 [07:24<01:49,  1.85it/s, loss=0.2353, a_loss=0.1522, d_loss=0.2630]

fusion_layer_arousal.0.weight: Grad Norm = 0.1240
fusion_layer_arousal.0.bias: Grad Norm = 0.0237
fusion_layer_arousal.1.weight: Grad Norm = 0.0045
fusion_layer_arousal.1.bias: Grad Norm = 0.0045
fusion_layer_arousal.4.weight: Grad Norm = 0.1065
fusion_layer_arousal.4.bias: Grad Norm = 0.0076
fusion_layer_dominance.0.weight: Grad Norm = 0.1727
fusion_layer_dominance.0.bias: Grad Norm = 0.0384
fusion_layer_dominance.1.weight: Grad Norm = 0.0102
fusion_layer_dominance.1.bias: Grad Norm = 0.0092
fusion_layer_dominance.4.weight: Grad Norm = 0.2002
fusion_layer_dominance.4.bias: Grad Norm = 0.0118
shared_fc_arousal.0.weight: Grad Norm = 0.0874
shared_fc_arousal.0.bias: Grad Norm = 0.0108
shared_fc_arousal.1.weight: Grad Norm = 0.0056
shared_fc_arousal.1.bias: Grad Norm = 0.0054
shared_fc_dominance.0.weight: Grad Norm = 0.1807
shared_fc_dominance.0.bias: Grad Norm = 0.0176
shared_fc_dominance.1.weight: Grad Norm = 0.0133
shared_fc_dominance.1.bias: Grad Norm = 0.0149
output_branch_arousal.0.

Epoch 13/15 [Train]:  90%|████████▉ | 901/1004 [08:21<00:54,  1.88it/s, loss=0.1318, a_loss=0.1155, d_loss=0.1372]

fusion_layer_arousal.0.weight: Grad Norm = 0.0545
fusion_layer_arousal.0.bias: Grad Norm = 0.0123
fusion_layer_arousal.1.weight: Grad Norm = 0.0025
fusion_layer_arousal.1.bias: Grad Norm = 0.0021
fusion_layer_arousal.4.weight: Grad Norm = 0.0646
fusion_layer_arousal.4.bias: Grad Norm = 0.0033
fusion_layer_dominance.0.weight: Grad Norm = 0.0832
fusion_layer_dominance.0.bias: Grad Norm = 0.0164
fusion_layer_dominance.1.weight: Grad Norm = 0.0049
fusion_layer_dominance.1.bias: Grad Norm = 0.0043
fusion_layer_dominance.4.weight: Grad Norm = 0.0905
fusion_layer_dominance.4.bias: Grad Norm = 0.0048
shared_fc_arousal.0.weight: Grad Norm = 0.0554
shared_fc_arousal.0.bias: Grad Norm = 0.0054
shared_fc_arousal.1.weight: Grad Norm = 0.0040
shared_fc_arousal.1.bias: Grad Norm = 0.0032
shared_fc_dominance.0.weight: Grad Norm = 0.0825
shared_fc_dominance.0.bias: Grad Norm = 0.0063
shared_fc_dominance.1.weight: Grad Norm = 0.0068
shared_fc_dominance.1.bias: Grad Norm = 0.0065
output_branch_arousal.0.

Epoch 13/15 [Train]: 100%|█████████▉| 1001/1004 [09:16<00:01,  1.84it/s, loss=0.1722, a_loss=0.1440, d_loss=0.1816]

fusion_layer_arousal.0.weight: Grad Norm = 0.0716
fusion_layer_arousal.0.bias: Grad Norm = 0.0163
fusion_layer_arousal.1.weight: Grad Norm = 0.0026
fusion_layer_arousal.1.bias: Grad Norm = 0.0028
fusion_layer_arousal.4.weight: Grad Norm = 0.0778
fusion_layer_arousal.4.bias: Grad Norm = 0.0051
fusion_layer_dominance.0.weight: Grad Norm = 0.1224
fusion_layer_dominance.0.bias: Grad Norm = 0.0243
fusion_layer_dominance.1.weight: Grad Norm = 0.0077
fusion_layer_dominance.1.bias: Grad Norm = 0.0062
fusion_layer_dominance.4.weight: Grad Norm = 0.1393
fusion_layer_dominance.4.bias: Grad Norm = 0.0068
shared_fc_arousal.0.weight: Grad Norm = 0.0655
shared_fc_arousal.0.bias: Grad Norm = 0.0073
shared_fc_arousal.1.weight: Grad Norm = 0.0042
shared_fc_arousal.1.bias: Grad Norm = 0.0041
shared_fc_dominance.0.weight: Grad Norm = 0.1356
shared_fc_dominance.0.bias: Grad Norm = 0.0097
shared_fc_dominance.1.weight: Grad Norm = 0.0092
shared_fc_dominance.1.bias: Grad Norm = 0.0098
output_branch_arousal.0.

Epoch 13/15 [Train]: 100%|██████████| 1004/1004 [09:18<00:00,  1.80it/s, loss=0.1678, a_loss=0.1330, d_loss=0.1795]
Epoch 13/15 [Val]: 100%|██████████| 126/126 [00:45<00:00,  2.79it/s, loss=0.1794, a_loss=0.0901, d_loss=0.2687]


Epoch 13/15 Results:
  Train Loss: 0.1544 (Arousal: 0.1184, Dominance: 0.1664)
  Val Loss: 0.1815 (Arousal: 0.1421, Dominance: 0.2209)
  Loss Weights - Arousal: 0.2500, Dominance: 0.7500


Epoch 14/15 [Train]:   0%|          | 1/1004 [00:00<10:11,  1.64it/s, loss=0.0997, a_loss=0.1625, d_loss=0.0787]

fusion_layer_arousal.0.weight: Grad Norm = 0.0834
fusion_layer_arousal.0.bias: Grad Norm = 0.0136
fusion_layer_arousal.1.weight: Grad Norm = 0.0031
fusion_layer_arousal.1.bias: Grad Norm = 0.0027
fusion_layer_arousal.4.weight: Grad Norm = 0.0762
fusion_layer_arousal.4.bias: Grad Norm = 0.0037
fusion_layer_dominance.0.weight: Grad Norm = 0.0727
fusion_layer_dominance.0.bias: Grad Norm = 0.0156
fusion_layer_dominance.1.weight: Grad Norm = 0.0046
fusion_layer_dominance.1.bias: Grad Norm = 0.0044
fusion_layer_dominance.4.weight: Grad Norm = 0.0962
fusion_layer_dominance.4.bias: Grad Norm = 0.0054
shared_fc_arousal.0.weight: Grad Norm = 0.0606
shared_fc_arousal.0.bias: Grad Norm = 0.0060
shared_fc_arousal.1.weight: Grad Norm = 0.0034
shared_fc_arousal.1.bias: Grad Norm = 0.0032
shared_fc_dominance.0.weight: Grad Norm = 0.0900
shared_fc_dominance.0.bias: Grad Norm = 0.0076
shared_fc_dominance.1.weight: Grad Norm = 0.0075
shared_fc_dominance.1.bias: Grad Norm = 0.0068
output_branch_arousal.0.

Epoch 14/15 [Train]:  10%|█         | 101/1004 [00:56<08:12,  1.83it/s, loss=0.0778, a_loss=0.0944, d_loss=0.0723]

fusion_layer_arousal.0.weight: Grad Norm = 0.0520
fusion_layer_arousal.0.bias: Grad Norm = 0.0098
fusion_layer_arousal.1.weight: Grad Norm = 0.0023
fusion_layer_arousal.1.bias: Grad Norm = 0.0018
fusion_layer_arousal.4.weight: Grad Norm = 0.0555
fusion_layer_arousal.4.bias: Grad Norm = 0.0024
fusion_layer_dominance.0.weight: Grad Norm = 0.0818
fusion_layer_dominance.0.bias: Grad Norm = 0.0201
fusion_layer_dominance.1.weight: Grad Norm = 0.0051
fusion_layer_dominance.1.bias: Grad Norm = 0.0047
fusion_layer_dominance.4.weight: Grad Norm = 0.0982
fusion_layer_dominance.4.bias: Grad Norm = 0.0059
shared_fc_arousal.0.weight: Grad Norm = 0.0478
shared_fc_arousal.0.bias: Grad Norm = 0.0047
shared_fc_arousal.1.weight: Grad Norm = 0.0033
shared_fc_arousal.1.bias: Grad Norm = 0.0027
shared_fc_dominance.0.weight: Grad Norm = 0.0938
shared_fc_dominance.0.bias: Grad Norm = 0.0082
shared_fc_dominance.1.weight: Grad Norm = 0.0075
shared_fc_dominance.1.bias: Grad Norm = 0.0074
output_branch_arousal.0.

Epoch 14/15 [Train]:  20%|██        | 201/1004 [01:51<06:52,  1.94it/s, loss=0.1316, a_loss=0.1644, d_loss=0.1207]

fusion_layer_arousal.0.weight: Grad Norm = 0.0500
fusion_layer_arousal.0.bias: Grad Norm = 0.0111
fusion_layer_arousal.1.weight: Grad Norm = 0.0023
fusion_layer_arousal.1.bias: Grad Norm = 0.0019
fusion_layer_arousal.4.weight: Grad Norm = 0.0575
fusion_layer_arousal.4.bias: Grad Norm = 0.0028
fusion_layer_dominance.0.weight: Grad Norm = 0.0864
fusion_layer_dominance.0.bias: Grad Norm = 0.0212
fusion_layer_dominance.1.weight: Grad Norm = 0.0048
fusion_layer_dominance.1.bias: Grad Norm = 0.0040
fusion_layer_dominance.4.weight: Grad Norm = 0.0953
fusion_layer_dominance.4.bias: Grad Norm = 0.0041
shared_fc_arousal.0.weight: Grad Norm = 0.0501
shared_fc_arousal.0.bias: Grad Norm = 0.0048
shared_fc_arousal.1.weight: Grad Norm = 0.0035
shared_fc_arousal.1.bias: Grad Norm = 0.0027
shared_fc_dominance.0.weight: Grad Norm = 0.0942
shared_fc_dominance.0.bias: Grad Norm = 0.0072
shared_fc_dominance.1.weight: Grad Norm = 0.0078
shared_fc_dominance.1.bias: Grad Norm = 0.0073
output_branch_arousal.0.

Epoch 14/15 [Train]:  30%|██▉       | 301/1004 [02:46<06:20,  1.85it/s, loss=0.2739, a_loss=0.1674, d_loss=0.3094]

fusion_layer_arousal.0.weight: Grad Norm = 0.0781
fusion_layer_arousal.0.bias: Grad Norm = 0.0191
fusion_layer_arousal.1.weight: Grad Norm = 0.0035
fusion_layer_arousal.1.bias: Grad Norm = 0.0031
fusion_layer_arousal.4.weight: Grad Norm = 0.0838
fusion_layer_arousal.4.bias: Grad Norm = 0.0042
fusion_layer_dominance.0.weight: Grad Norm = 0.1245
fusion_layer_dominance.0.bias: Grad Norm = 0.0284
fusion_layer_dominance.1.weight: Grad Norm = 0.0077
fusion_layer_dominance.1.bias: Grad Norm = 0.0069
fusion_layer_dominance.4.weight: Grad Norm = 0.1343
fusion_layer_dominance.4.bias: Grad Norm = 0.0074
shared_fc_arousal.0.weight: Grad Norm = 0.0714
shared_fc_arousal.0.bias: Grad Norm = 0.0071
shared_fc_arousal.1.weight: Grad Norm = 0.0041
shared_fc_arousal.1.bias: Grad Norm = 0.0043
shared_fc_dominance.0.weight: Grad Norm = 0.1309
shared_fc_dominance.0.bias: Grad Norm = 0.0111
shared_fc_dominance.1.weight: Grad Norm = 0.0116
shared_fc_dominance.1.bias: Grad Norm = 0.0109
output_branch_arousal.0.

Epoch 14/15 [Train]:  40%|███▉      | 401/1004 [03:42<05:23,  1.87it/s, loss=0.1407, a_loss=0.0740, d_loss=0.1629]

fusion_layer_arousal.0.weight: Grad Norm = 0.0464
fusion_layer_arousal.0.bias: Grad Norm = 0.0084
fusion_layer_arousal.1.weight: Grad Norm = 0.0017
fusion_layer_arousal.1.bias: Grad Norm = 0.0016
fusion_layer_arousal.4.weight: Grad Norm = 0.0474
fusion_layer_arousal.4.bias: Grad Norm = 0.0025
fusion_layer_dominance.0.weight: Grad Norm = 0.0964
fusion_layer_dominance.0.bias: Grad Norm = 0.0175
fusion_layer_dominance.1.weight: Grad Norm = 0.0054
fusion_layer_dominance.1.bias: Grad Norm = 0.0049
fusion_layer_dominance.4.weight: Grad Norm = 0.1357
fusion_layer_dominance.4.bias: Grad Norm = 0.0059
shared_fc_arousal.0.weight: Grad Norm = 0.0420
shared_fc_arousal.0.bias: Grad Norm = 0.0044
shared_fc_arousal.1.weight: Grad Norm = 0.0029
shared_fc_arousal.1.bias: Grad Norm = 0.0024
shared_fc_dominance.0.weight: Grad Norm = 0.1376
shared_fc_dominance.0.bias: Grad Norm = 0.0102
shared_fc_dominance.1.weight: Grad Norm = 0.0114
shared_fc_dominance.1.bias: Grad Norm = 0.0095
output_branch_arousal.0.

Epoch 14/15 [Train]:  50%|████▉     | 501/1004 [04:38<04:23,  1.91it/s, loss=0.0964, a_loss=0.1151, d_loss=0.0901]

fusion_layer_arousal.0.weight: Grad Norm = 0.0574
fusion_layer_arousal.0.bias: Grad Norm = 0.0122
fusion_layer_arousal.1.weight: Grad Norm = 0.0023
fusion_layer_arousal.1.bias: Grad Norm = 0.0021
fusion_layer_arousal.4.weight: Grad Norm = 0.0595
fusion_layer_arousal.4.bias: Grad Norm = 0.0030
fusion_layer_dominance.0.weight: Grad Norm = 0.1066
fusion_layer_dominance.0.bias: Grad Norm = 0.0195
fusion_layer_dominance.1.weight: Grad Norm = 0.0065
fusion_layer_dominance.1.bias: Grad Norm = 0.0056
fusion_layer_dominance.4.weight: Grad Norm = 0.1076
fusion_layer_dominance.4.bias: Grad Norm = 0.0049
shared_fc_arousal.0.weight: Grad Norm = 0.0499
shared_fc_arousal.0.bias: Grad Norm = 0.0051
shared_fc_arousal.1.weight: Grad Norm = 0.0034
shared_fc_arousal.1.bias: Grad Norm = 0.0028
shared_fc_dominance.0.weight: Grad Norm = 0.0957
shared_fc_dominance.0.bias: Grad Norm = 0.0073
shared_fc_dominance.1.weight: Grad Norm = 0.0072
shared_fc_dominance.1.bias: Grad Norm = 0.0068
output_branch_arousal.0.

Epoch 14/15 [Train]:  60%|█████▉    | 601/1004 [05:34<03:41,  1.82it/s, loss=0.1520, a_loss=0.0910, d_loss=0.1723]

fusion_layer_arousal.0.weight: Grad Norm = 0.1009
fusion_layer_arousal.0.bias: Grad Norm = 0.0178
fusion_layer_arousal.1.weight: Grad Norm = 0.0036
fusion_layer_arousal.1.bias: Grad Norm = 0.0034
fusion_layer_arousal.4.weight: Grad Norm = 0.0890
fusion_layer_arousal.4.bias: Grad Norm = 0.0054
fusion_layer_dominance.0.weight: Grad Norm = 0.1007
fusion_layer_dominance.0.bias: Grad Norm = 0.0184
fusion_layer_dominance.1.weight: Grad Norm = 0.0057
fusion_layer_dominance.1.bias: Grad Norm = 0.0051
fusion_layer_dominance.4.weight: Grad Norm = 0.1190
fusion_layer_dominance.4.bias: Grad Norm = 0.0046
shared_fc_arousal.0.weight: Grad Norm = 0.0719
shared_fc_arousal.0.bias: Grad Norm = 0.0080
shared_fc_arousal.1.weight: Grad Norm = 0.0043
shared_fc_arousal.1.bias: Grad Norm = 0.0041
shared_fc_dominance.0.weight: Grad Norm = 0.1161
shared_fc_dominance.0.bias: Grad Norm = 0.0082
shared_fc_dominance.1.weight: Grad Norm = 0.0092
shared_fc_dominance.1.bias: Grad Norm = 0.0081
output_branch_arousal.0.

Epoch 14/15 [Train]:  70%|██████▉   | 701/1004 [06:30<02:36,  1.94it/s, loss=0.0776, a_loss=0.0807, d_loss=0.0766]

fusion_layer_arousal.0.weight: Grad Norm = 0.0627
fusion_layer_arousal.0.bias: Grad Norm = 0.0116
fusion_layer_arousal.1.weight: Grad Norm = 0.0027
fusion_layer_arousal.1.bias: Grad Norm = 0.0022
fusion_layer_arousal.4.weight: Grad Norm = 0.0617
fusion_layer_arousal.4.bias: Grad Norm = 0.0030
fusion_layer_dominance.0.weight: Grad Norm = 0.1139
fusion_layer_dominance.0.bias: Grad Norm = 0.0245
fusion_layer_dominance.1.weight: Grad Norm = 0.0065
fusion_layer_dominance.1.bias: Grad Norm = 0.0058
fusion_layer_dominance.4.weight: Grad Norm = 0.1112
fusion_layer_dominance.4.bias: Grad Norm = 0.0056
shared_fc_arousal.0.weight: Grad Norm = 0.0495
shared_fc_arousal.0.bias: Grad Norm = 0.0050
shared_fc_arousal.1.weight: Grad Norm = 0.0031
shared_fc_arousal.1.bias: Grad Norm = 0.0026
shared_fc_dominance.0.weight: Grad Norm = 0.0982
shared_fc_dominance.0.bias: Grad Norm = 0.0076
shared_fc_dominance.1.weight: Grad Norm = 0.0074
shared_fc_dominance.1.bias: Grad Norm = 0.0078
output_branch_arousal.0.

Epoch 14/15 [Train]:  80%|███████▉  | 801/1004 [07:26<01:56,  1.74it/s, loss=0.1093, a_loss=0.1197, d_loss=0.1058]

fusion_layer_arousal.0.weight: Grad Norm = 0.0648
fusion_layer_arousal.0.bias: Grad Norm = 0.0137
fusion_layer_arousal.1.weight: Grad Norm = 0.0025
fusion_layer_arousal.1.bias: Grad Norm = 0.0025
fusion_layer_arousal.4.weight: Grad Norm = 0.0607
fusion_layer_arousal.4.bias: Grad Norm = 0.0038
fusion_layer_dominance.0.weight: Grad Norm = 0.0746
fusion_layer_dominance.0.bias: Grad Norm = 0.0137
fusion_layer_dominance.1.weight: Grad Norm = 0.0041
fusion_layer_dominance.1.bias: Grad Norm = 0.0037
fusion_layer_dominance.4.weight: Grad Norm = 0.0816
fusion_layer_dominance.4.bias: Grad Norm = 0.0040
shared_fc_arousal.0.weight: Grad Norm = 0.0518
shared_fc_arousal.0.bias: Grad Norm = 0.0054
shared_fc_arousal.1.weight: Grad Norm = 0.0036
shared_fc_arousal.1.bias: Grad Norm = 0.0033
shared_fc_dominance.0.weight: Grad Norm = 0.0768
shared_fc_dominance.0.bias: Grad Norm = 0.0067
shared_fc_dominance.1.weight: Grad Norm = 0.0069
shared_fc_dominance.1.bias: Grad Norm = 0.0061
output_branch_arousal.0.

Epoch 14/15 [Train]:  90%|████████▉ | 901/1004 [08:22<01:06,  1.54it/s, loss=0.1583, a_loss=0.0732, d_loss=0.1866]

fusion_layer_arousal.0.weight: Grad Norm = 0.0487
fusion_layer_arousal.0.bias: Grad Norm = 0.0128
fusion_layer_arousal.1.weight: Grad Norm = 0.0020
fusion_layer_arousal.1.bias: Grad Norm = 0.0019
fusion_layer_arousal.4.weight: Grad Norm = 0.0512
fusion_layer_arousal.4.bias: Grad Norm = 0.0026
fusion_layer_dominance.0.weight: Grad Norm = 0.1030
fusion_layer_dominance.0.bias: Grad Norm = 0.0258
fusion_layer_dominance.1.weight: Grad Norm = 0.0057
fusion_layer_dominance.1.bias: Grad Norm = 0.0058
fusion_layer_dominance.4.weight: Grad Norm = 0.1185
fusion_layer_dominance.4.bias: Grad Norm = 0.0069
shared_fc_arousal.0.weight: Grad Norm = 0.0422
shared_fc_arousal.0.bias: Grad Norm = 0.0045
shared_fc_arousal.1.weight: Grad Norm = 0.0026
shared_fc_arousal.1.bias: Grad Norm = 0.0025
shared_fc_dominance.0.weight: Grad Norm = 0.1157
shared_fc_dominance.0.bias: Grad Norm = 0.0101
shared_fc_dominance.1.weight: Grad Norm = 0.0107
shared_fc_dominance.1.bias: Grad Norm = 0.0094
output_branch_arousal.0.

Epoch 14/15 [Train]: 100%|█████████▉| 1001/1004 [09:19<00:01,  1.71it/s, loss=0.1293, a_loss=0.0956, d_loss=0.1406]

fusion_layer_arousal.0.weight: Grad Norm = 0.0630
fusion_layer_arousal.0.bias: Grad Norm = 0.0144
fusion_layer_arousal.1.weight: Grad Norm = 0.0020
fusion_layer_arousal.1.bias: Grad Norm = 0.0023
fusion_layer_arousal.4.weight: Grad Norm = 0.0588
fusion_layer_arousal.4.bias: Grad Norm = 0.0038
fusion_layer_dominance.0.weight: Grad Norm = 0.1163
fusion_layer_dominance.0.bias: Grad Norm = 0.0257
fusion_layer_dominance.1.weight: Grad Norm = 0.0064
fusion_layer_dominance.1.bias: Grad Norm = 0.0056
fusion_layer_dominance.4.weight: Grad Norm = 0.1115
fusion_layer_dominance.4.bias: Grad Norm = 0.0047
shared_fc_arousal.0.weight: Grad Norm = 0.0471
shared_fc_arousal.0.bias: Grad Norm = 0.0050
shared_fc_arousal.1.weight: Grad Norm = 0.0028
shared_fc_arousal.1.bias: Grad Norm = 0.0030
shared_fc_dominance.0.weight: Grad Norm = 0.1054
shared_fc_dominance.0.bias: Grad Norm = 0.0082
shared_fc_dominance.1.weight: Grad Norm = 0.0096
shared_fc_dominance.1.bias: Grad Norm = 0.0081
output_branch_arousal.0.

Epoch 14/15 [Train]: 100%|██████████| 1004/1004 [09:20<00:00,  1.79it/s, loss=0.1387, a_loss=0.1262, d_loss=0.1429]
Epoch 14/15 [Val]: 100%|██████████| 126/126 [00:45<00:00,  2.79it/s, loss=0.2594, a_loss=0.1245, d_loss=0.3943]


Epoch 14/15 Results:
  Train Loss: 0.1600 (Arousal: 0.1211, Dominance: 0.1730)
  Val Loss: 0.1471 (Arousal: 0.1106, Dominance: 0.1836)
  Loss Weights - Arousal: 0.2500, Dominance: 0.7500


Epoch 15/15 [Train]:   0%|          | 1/1004 [00:00<09:28,  1.76it/s, loss=0.2370, a_loss=0.0678, d_loss=0.2934]

fusion_layer_arousal.0.weight: Grad Norm = 0.0347
fusion_layer_arousal.0.bias: Grad Norm = 0.0065
fusion_layer_arousal.1.weight: Grad Norm = 0.0013
fusion_layer_arousal.1.bias: Grad Norm = 0.0013
fusion_layer_arousal.4.weight: Grad Norm = 0.0353
fusion_layer_arousal.4.bias: Grad Norm = 0.0019
fusion_layer_dominance.0.weight: Grad Norm = 0.1747
fusion_layer_dominance.0.bias: Grad Norm = 0.0368
fusion_layer_dominance.1.weight: Grad Norm = 0.0101
fusion_layer_dominance.1.bias: Grad Norm = 0.0096
fusion_layer_dominance.4.weight: Grad Norm = 0.1607
fusion_layer_dominance.4.bias: Grad Norm = 0.0094
shared_fc_arousal.0.weight: Grad Norm = 0.0305
shared_fc_arousal.0.bias: Grad Norm = 0.0031
shared_fc_arousal.1.weight: Grad Norm = 0.0020
shared_fc_arousal.1.bias: Grad Norm = 0.0018
shared_fc_dominance.0.weight: Grad Norm = 0.1411
shared_fc_dominance.0.bias: Grad Norm = 0.0126
shared_fc_dominance.1.weight: Grad Norm = 0.0118
shared_fc_dominance.1.bias: Grad Norm = 0.0113
output_branch_arousal.0.

Epoch 15/15 [Train]:  10%|█         | 101/1004 [00:56<07:43,  1.95it/s, loss=0.1715, a_loss=0.1502, d_loss=0.1786]

fusion_layer_arousal.0.weight: Grad Norm = 0.0770
fusion_layer_arousal.0.bias: Grad Norm = 0.0149
fusion_layer_arousal.1.weight: Grad Norm = 0.0029
fusion_layer_arousal.1.bias: Grad Norm = 0.0026
fusion_layer_arousal.4.weight: Grad Norm = 0.0686
fusion_layer_arousal.4.bias: Grad Norm = 0.0029
fusion_layer_dominance.0.weight: Grad Norm = 0.1550
fusion_layer_dominance.0.bias: Grad Norm = 0.0347
fusion_layer_dominance.1.weight: Grad Norm = 0.0088
fusion_layer_dominance.1.bias: Grad Norm = 0.0080
fusion_layer_dominance.4.weight: Grad Norm = 0.1298
fusion_layer_dominance.4.bias: Grad Norm = 0.0075
shared_fc_arousal.0.weight: Grad Norm = 0.0577
shared_fc_arousal.0.bias: Grad Norm = 0.0056
shared_fc_arousal.1.weight: Grad Norm = 0.0033
shared_fc_arousal.1.bias: Grad Norm = 0.0033
shared_fc_dominance.0.weight: Grad Norm = 0.1167
shared_fc_dominance.0.bias: Grad Norm = 0.0108
shared_fc_dominance.1.weight: Grad Norm = 0.0105
shared_fc_dominance.1.bias: Grad Norm = 0.0092
output_branch_arousal.0.

Epoch 15/15 [Train]:  20%|██        | 201/1004 [01:52<07:21,  1.82it/s, loss=0.1525, a_loss=0.0794, d_loss=0.1769]

fusion_layer_arousal.0.weight: Grad Norm = 0.0600
fusion_layer_arousal.0.bias: Grad Norm = 0.0108
fusion_layer_arousal.1.weight: Grad Norm = 0.0022
fusion_layer_arousal.1.bias: Grad Norm = 0.0020
fusion_layer_arousal.4.weight: Grad Norm = 0.0535
fusion_layer_arousal.4.bias: Grad Norm = 0.0030
fusion_layer_dominance.0.weight: Grad Norm = 0.1291
fusion_layer_dominance.0.bias: Grad Norm = 0.0349
fusion_layer_dominance.1.weight: Grad Norm = 0.0076
fusion_layer_dominance.1.bias: Grad Norm = 0.0070
fusion_layer_dominance.4.weight: Grad Norm = 0.1390
fusion_layer_dominance.4.bias: Grad Norm = 0.0075
shared_fc_arousal.0.weight: Grad Norm = 0.0419
shared_fc_arousal.0.bias: Grad Norm = 0.0050
shared_fc_arousal.1.weight: Grad Norm = 0.0029
shared_fc_arousal.1.bias: Grad Norm = 0.0023
shared_fc_dominance.0.weight: Grad Norm = 0.1194
shared_fc_dominance.0.bias: Grad Norm = 0.0105
shared_fc_dominance.1.weight: Grad Norm = 0.0106
shared_fc_dominance.1.bias: Grad Norm = 0.0095
output_branch_arousal.0.

Epoch 15/15 [Train]:  30%|██▉       | 301/1004 [02:47<07:35,  1.54it/s, loss=0.1388, a_loss=0.0620, d_loss=0.1644]

fusion_layer_arousal.0.weight: Grad Norm = 0.0554
fusion_layer_arousal.0.bias: Grad Norm = 0.0135
fusion_layer_arousal.1.weight: Grad Norm = 0.0026
fusion_layer_arousal.1.bias: Grad Norm = 0.0022
fusion_layer_arousal.4.weight: Grad Norm = 0.0585
fusion_layer_arousal.4.bias: Grad Norm = 0.0033
fusion_layer_dominance.0.weight: Grad Norm = 0.1333
fusion_layer_dominance.0.bias: Grad Norm = 0.0268
fusion_layer_dominance.1.weight: Grad Norm = 0.0073
fusion_layer_dominance.1.bias: Grad Norm = 0.0066
fusion_layer_dominance.4.weight: Grad Norm = 0.1219
fusion_layer_dominance.4.bias: Grad Norm = 0.0068
shared_fc_arousal.0.weight: Grad Norm = 0.0478
shared_fc_arousal.0.bias: Grad Norm = 0.0047
shared_fc_arousal.1.weight: Grad Norm = 0.0027
shared_fc_arousal.1.bias: Grad Norm = 0.0029
shared_fc_dominance.0.weight: Grad Norm = 0.1073
shared_fc_dominance.0.bias: Grad Norm = 0.0095
shared_fc_dominance.1.weight: Grad Norm = 0.0084
shared_fc_dominance.1.bias: Grad Norm = 0.0085
output_branch_arousal.0.

Epoch 15/15 [Train]:  40%|███▉      | 401/1004 [03:43<06:18,  1.59it/s, loss=0.2130, a_loss=0.0259, d_loss=0.2753]

fusion_layer_arousal.0.weight: Grad Norm = 0.0435
fusion_layer_arousal.0.bias: Grad Norm = 0.0099
fusion_layer_arousal.1.weight: Grad Norm = 0.0017
fusion_layer_arousal.1.bias: Grad Norm = 0.0015
fusion_layer_arousal.4.weight: Grad Norm = 0.0396
fusion_layer_arousal.4.bias: Grad Norm = 0.0021
fusion_layer_dominance.0.weight: Grad Norm = 0.1984
fusion_layer_dominance.0.bias: Grad Norm = 0.0472
fusion_layer_dominance.1.weight: Grad Norm = 0.0120
fusion_layer_dominance.1.bias: Grad Norm = 0.0100
fusion_layer_dominance.4.weight: Grad Norm = 0.1937
fusion_layer_dominance.4.bias: Grad Norm = 0.0113
shared_fc_arousal.0.weight: Grad Norm = 0.0323
shared_fc_arousal.0.bias: Grad Norm = 0.0033
shared_fc_arousal.1.weight: Grad Norm = 0.0018
shared_fc_arousal.1.bias: Grad Norm = 0.0018
shared_fc_dominance.0.weight: Grad Norm = 0.1754
shared_fc_dominance.0.bias: Grad Norm = 0.0155
shared_fc_dominance.1.weight: Grad Norm = 0.0143
shared_fc_dominance.1.bias: Grad Norm = 0.0149
output_branch_arousal.0.

Epoch 15/15 [Train]:  50%|████▉     | 501/1004 [04:38<04:38,  1.81it/s, loss=0.1550, a_loss=0.0972, d_loss=0.1743]

fusion_layer_arousal.0.weight: Grad Norm = 0.0522
fusion_layer_arousal.0.bias: Grad Norm = 0.0093
fusion_layer_arousal.1.weight: Grad Norm = 0.0020
fusion_layer_arousal.1.bias: Grad Norm = 0.0020
fusion_layer_arousal.4.weight: Grad Norm = 0.0522
fusion_layer_arousal.4.bias: Grad Norm = 0.0030
fusion_layer_dominance.0.weight: Grad Norm = 0.2004
fusion_layer_dominance.0.bias: Grad Norm = 0.0428
fusion_layer_dominance.1.weight: Grad Norm = 0.0111
fusion_layer_dominance.1.bias: Grad Norm = 0.0102
fusion_layer_dominance.4.weight: Grad Norm = 0.1752
fusion_layer_dominance.4.bias: Grad Norm = 0.0098
shared_fc_arousal.0.weight: Grad Norm = 0.0457
shared_fc_arousal.0.bias: Grad Norm = 0.0048
shared_fc_arousal.1.weight: Grad Norm = 0.0029
shared_fc_arousal.1.bias: Grad Norm = 0.0027
shared_fc_dominance.0.weight: Grad Norm = 0.1478
shared_fc_dominance.0.bias: Grad Norm = 0.0133
shared_fc_dominance.1.weight: Grad Norm = 0.0124
shared_fc_dominance.1.bias: Grad Norm = 0.0117
output_branch_arousal.0.

Epoch 15/15 [Train]:  60%|█████▉    | 601/1004 [05:34<03:51,  1.74it/s, loss=0.1328, a_loss=0.1651, d_loss=0.1220]

fusion_layer_arousal.0.weight: Grad Norm = 0.0747
fusion_layer_arousal.0.bias: Grad Norm = 0.0140
fusion_layer_arousal.1.weight: Grad Norm = 0.0031
fusion_layer_arousal.1.bias: Grad Norm = 0.0029
fusion_layer_arousal.4.weight: Grad Norm = 0.0674
fusion_layer_arousal.4.bias: Grad Norm = 0.0043
fusion_layer_dominance.0.weight: Grad Norm = 0.0928
fusion_layer_dominance.0.bias: Grad Norm = 0.0155
fusion_layer_dominance.1.weight: Grad Norm = 0.0049
fusion_layer_dominance.1.bias: Grad Norm = 0.0047
fusion_layer_dominance.4.weight: Grad Norm = 0.0993
fusion_layer_dominance.4.bias: Grad Norm = 0.0057
shared_fc_arousal.0.weight: Grad Norm = 0.0558
shared_fc_arousal.0.bias: Grad Norm = 0.0059
shared_fc_arousal.1.weight: Grad Norm = 0.0038
shared_fc_arousal.1.bias: Grad Norm = 0.0035
shared_fc_dominance.0.weight: Grad Norm = 0.0874
shared_fc_dominance.0.bias: Grad Norm = 0.0075
shared_fc_dominance.1.weight: Grad Norm = 0.0085
shared_fc_dominance.1.bias: Grad Norm = 0.0073
output_branch_arousal.0.

Epoch 15/15 [Train]:  70%|██████▉   | 701/1004 [06:30<02:47,  1.81it/s, loss=0.1091, a_loss=0.0529, d_loss=0.1278]

fusion_layer_arousal.0.weight: Grad Norm = 0.0423
fusion_layer_arousal.0.bias: Grad Norm = 0.0083
fusion_layer_arousal.1.weight: Grad Norm = 0.0018
fusion_layer_arousal.1.bias: Grad Norm = 0.0016
fusion_layer_arousal.4.weight: Grad Norm = 0.0456
fusion_layer_arousal.4.bias: Grad Norm = 0.0021
fusion_layer_dominance.0.weight: Grad Norm = 0.0984
fusion_layer_dominance.0.bias: Grad Norm = 0.0208
fusion_layer_dominance.1.weight: Grad Norm = 0.0061
fusion_layer_dominance.1.bias: Grad Norm = 0.0053
fusion_layer_dominance.4.weight: Grad Norm = 0.1039
fusion_layer_dominance.4.bias: Grad Norm = 0.0050
shared_fc_arousal.0.weight: Grad Norm = 0.0392
shared_fc_arousal.0.bias: Grad Norm = 0.0036
shared_fc_arousal.1.weight: Grad Norm = 0.0028
shared_fc_arousal.1.bias: Grad Norm = 0.0024
shared_fc_dominance.0.weight: Grad Norm = 0.0985
shared_fc_dominance.0.bias: Grad Norm = 0.0079
shared_fc_dominance.1.weight: Grad Norm = 0.0087
shared_fc_dominance.1.bias: Grad Norm = 0.0075
output_branch_arousal.0.

Epoch 15/15 [Train]:  80%|███████▉  | 801/1004 [07:26<01:39,  2.04it/s, loss=0.3047, a_loss=0.1272, d_loss=0.3639]

fusion_layer_arousal.0.weight: Grad Norm = 0.0514
fusion_layer_arousal.0.bias: Grad Norm = 0.0108
fusion_layer_arousal.1.weight: Grad Norm = 0.0023
fusion_layer_arousal.1.bias: Grad Norm = 0.0019
fusion_layer_arousal.4.weight: Grad Norm = 0.0549
fusion_layer_arousal.4.bias: Grad Norm = 0.0026
fusion_layer_dominance.0.weight: Grad Norm = 0.1589
fusion_layer_dominance.0.bias: Grad Norm = 0.0396
fusion_layer_dominance.1.weight: Grad Norm = 0.0100
fusion_layer_dominance.1.bias: Grad Norm = 0.0097
fusion_layer_dominance.4.weight: Grad Norm = 0.1644
fusion_layer_dominance.4.bias: Grad Norm = 0.0105
shared_fc_arousal.0.weight: Grad Norm = 0.0511
shared_fc_arousal.0.bias: Grad Norm = 0.0046
shared_fc_arousal.1.weight: Grad Norm = 0.0038
shared_fc_arousal.1.bias: Grad Norm = 0.0031
shared_fc_dominance.0.weight: Grad Norm = 0.1563
shared_fc_dominance.0.bias: Grad Norm = 0.0134
shared_fc_dominance.1.weight: Grad Norm = 0.0130
shared_fc_dominance.1.bias: Grad Norm = 0.0128
output_branch_arousal.0.

Epoch 15/15 [Train]:  90%|████████▉ | 901/1004 [08:20<00:54,  1.88it/s, loss=0.2197, a_loss=0.0917, d_loss=0.2623]

fusion_layer_arousal.0.weight: Grad Norm = 0.0520
fusion_layer_arousal.0.bias: Grad Norm = 0.0127
fusion_layer_arousal.1.weight: Grad Norm = 0.0022
fusion_layer_arousal.1.bias: Grad Norm = 0.0021
fusion_layer_arousal.4.weight: Grad Norm = 0.0567
fusion_layer_arousal.4.bias: Grad Norm = 0.0033
fusion_layer_dominance.0.weight: Grad Norm = 0.1382
fusion_layer_dominance.0.bias: Grad Norm = 0.0263
fusion_layer_dominance.1.weight: Grad Norm = 0.0081
fusion_layer_dominance.1.bias: Grad Norm = 0.0077
fusion_layer_dominance.4.weight: Grad Norm = 0.1366
fusion_layer_dominance.4.bias: Grad Norm = 0.0090
shared_fc_arousal.0.weight: Grad Norm = 0.0494
shared_fc_arousal.0.bias: Grad Norm = 0.0050
shared_fc_arousal.1.weight: Grad Norm = 0.0030
shared_fc_arousal.1.bias: Grad Norm = 0.0032
shared_fc_dominance.0.weight: Grad Norm = 0.1267
shared_fc_dominance.0.bias: Grad Norm = 0.0105
shared_fc_dominance.1.weight: Grad Norm = 0.0104
shared_fc_dominance.1.bias: Grad Norm = 0.0105
output_branch_arousal.0.

Epoch 15/15 [Train]: 100%|█████████▉| 1001/1004 [09:15<00:01,  1.98it/s, loss=0.1470, a_loss=0.0749, d_loss=0.1710]

fusion_layer_arousal.0.weight: Grad Norm = 0.0662
fusion_layer_arousal.0.bias: Grad Norm = 0.0143
fusion_layer_arousal.1.weight: Grad Norm = 0.0030
fusion_layer_arousal.1.bias: Grad Norm = 0.0026
fusion_layer_arousal.4.weight: Grad Norm = 0.0608
fusion_layer_arousal.4.bias: Grad Norm = 0.0036
fusion_layer_dominance.0.weight: Grad Norm = 0.1021
fusion_layer_dominance.0.bias: Grad Norm = 0.0148
fusion_layer_dominance.1.weight: Grad Norm = 0.0057
fusion_layer_dominance.1.bias: Grad Norm = 0.0052
fusion_layer_dominance.4.weight: Grad Norm = 0.1136
fusion_layer_dominance.4.bias: Grad Norm = 0.0056
shared_fc_arousal.0.weight: Grad Norm = 0.0527
shared_fc_arousal.0.bias: Grad Norm = 0.0053
shared_fc_arousal.1.weight: Grad Norm = 0.0032
shared_fc_arousal.1.bias: Grad Norm = 0.0033
shared_fc_dominance.0.weight: Grad Norm = 0.1190
shared_fc_dominance.0.bias: Grad Norm = 0.0087
shared_fc_dominance.1.weight: Grad Norm = 0.0111
shared_fc_dominance.1.bias: Grad Norm = 0.0093
output_branch_arousal.0.

Epoch 15/15 [Train]: 100%|██████████| 1004/1004 [09:16<00:00,  1.80it/s, loss=0.1250, a_loss=0.1409, d_loss=0.1197]
Epoch 15/15 [Val]: 100%|██████████| 126/126 [00:45<00:00,  2.79it/s, loss=0.2454, a_loss=0.1459, d_loss=0.3449]


Epoch 15/15 Results:
  Train Loss: 0.1570 (Arousal: 0.1185, Dominance: 0.1698)
  Val Loss: 0.1363 (Arousal: 0.0996, Dominance: 0.1730)
  Loss Weights - Arousal: 0.2500, Dominance: 0.7500
  Saved best model (val_loss: 0.1363)

Test Results:
  Overall Test Loss: 0.1360
  Arousal MSE: 0.1046, PCC: 0.7292
  Dominance MSE: 0.1675, PCC: 0.6229


In [None]:
import torch
import torch.nn as nn
from transformers import BertModel, BertTokenizer, HubertModel, Wav2Vec2FeatureExtractor
import torchaudio
import os
import shutil
from pathlib import Path

# ValenceRegressor
class ValenceRegressor(nn.Module):
    """Audio + text regressor that predicts a single valence score per utterance.

    Data flow (see ``forward``): frame-level audio features pass through a stack
    of Transformer encoder layers, the token ids pass through BERT (frozen except
    for its last two parameter tensors), both streams are projected to
    ``hidden_dim``, cross-attend to each other, are gated, pooled over time,
    concatenated and regressed to one value mapped into the range 1..5.

    Args:
        audio_dim: Feature size of each incoming audio frame.
        text_dim: Hidden size of the text encoder output.
        hidden_dim: Width of the shared projection / fusion space.
        num_heads: Heads of the audio Transformer layers; the two cross-modal
            attention modules use ``num_heads // 2`` heads.
        num_layers: Number of audio Transformer encoder layers.
        dropout: Dropout probability (the final regression head uses half of it).
    """

    def __init__(self, audio_dim=768, text_dim=768, hidden_dim=192, num_heads=6, num_layers=2, dropout=0.5):
        super().__init__()
        # NOTE: attribute names and the order inside every nn.Sequential define the
        # state_dict keys; keep them stable so existing checkpoints still load.
        self.audio_transformer = nn.ModuleList([
            nn.TransformerEncoderLayer(d_model=audio_dim, nhead=num_heads, dim_feedforward=hidden_dim*4, dropout=dropout, batch_first=True)
            for _ in range(num_layers)
        ])
        self.audio_layer_norm = nn.LayerNorm(audio_dim)
        # Scores each (gated, projected) audio frame for attention pooling.
        self.audio_attention_pool = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim*2), nn.GELU(), nn.Dropout(dropout), nn.Linear(hidden_dim*2, 1)
        )
        self.text_encoder = BertModel.from_pretrained("bert-base-uncased")
        # Freeze the whole text encoder, then re-enable only its last two parameter tensors.
        for param in self.text_encoder.parameters():
            param.requires_grad = False
        for param in list(self.text_encoder.parameters())[-2:]:
            param.requires_grad = True
        self.audio_projection = nn.Linear(audio_dim, hidden_dim)
        self.text_projection = nn.Linear(text_dim, hidden_dim)
        self.audio_to_text_attention = nn.MultiheadAttention(embed_dim=hidden_dim, num_heads=num_heads//2, dropout=dropout, batch_first=True)
        self.text_to_audio_attention = nn.MultiheadAttention(embed_dim=hidden_dim, num_heads=num_heads//2, dropout=dropout, batch_first=True)
        # Gates take [own projection ; cross-attended context] and emit per-feature weights in (0, 1).
        self.audio_gate = nn.Sequential(nn.Linear(hidden_dim*2, hidden_dim), nn.Sigmoid())
        self.text_gate = nn.Sequential(nn.Linear(hidden_dim*2, hidden_dim), nn.Sigmoid())
        self.fusion_layer = nn.Sequential(
            nn.Linear(hidden_dim*2, hidden_dim*2), nn.LayerNorm(hidden_dim*2), nn.GELU(), nn.Dropout(dropout), nn.Linear(hidden_dim*2, hidden_dim)
        )
        self.shared_fc = nn.Sequential(nn.Linear(hidden_dim, hidden_dim), nn.LayerNorm(hidden_dim), nn.GELU(), nn.Dropout(dropout))
        self.output_branch = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim//2), nn.LayerNorm(hidden_dim//2), nn.GELU(), nn.Dropout(dropout*0.5), nn.Linear(hidden_dim//2, 1)
        )

    def audio_attention_pooling(self, x, audio_mask=None):
        """Collapse the time axis of ``x`` using learned softmax weights.

        Args:
            x: Tensor of shape (batch, time, hidden_dim).
            audio_mask: Optional (batch, time) mask that is truthy for real frames;
                masked frames receive zero weight.

        Returns:
            Tensor of shape (batch, hidden_dim).

        Note:
            A row in which every frame is masked yields NaN, because the softmax
            is taken over scores that are all ``-inf``.
        """
        weights = self.audio_attention_pool(x)
        if audio_mask is not None:
            weights = weights.masked_fill(~audio_mask.bool().unsqueeze(-1), float('-inf'))
        weights = torch.softmax(weights, dim=1)
        output = torch.bmm(weights.transpose(1, 2), x)
        return output.squeeze(1)

    def forward(self, audio_features, input_ids, attention_mask):
        """Predict valence for a batch of utterances.

        Args:
            audio_features: (batch, frames, audio_dim) frame features; frames whose
                features are all zero are treated as padding.
            input_ids: (batch, tokens) token ids for the text encoder.
            attention_mask: (batch, tokens) mask, non-zero for real tokens.

        Returns:
            Tensor of shape (batch, 1) with values in the range 1..5.
        """
        # A frame counts as real audio when its feature vector is not (numerically) all zeros.
        audio_mask = audio_features.abs().sum(dim=-1) > 1e-6
        # Key-padding masks must be boolean (True = ignore this key). A float mask is
        # *added* to the attention logits, so the former 0/1 float mask boosted padded
        # frames by +1 instead of hiding them. Inputs without all-zero frames are
        # unaffected by this change.
        audio_padding = ~audio_mask
        text_padding = ~attention_mask.bool()

        audio_repr = audio_features
        for layer in self.audio_transformer:
            audio_repr = layer(audio_repr, src_key_padding_mask=audio_padding)
        audio_repr = self.audio_layer_norm(audio_repr)

        text_outputs = self.text_encoder(input_ids=input_ids, attention_mask=attention_mask)
        text_repr = text_outputs.last_hidden_state

        audio_proj = self.audio_projection(audio_repr)
        text_proj = self.text_projection(text_repr)

        # Each modality queries the other one; padded keys are excluded.
        audio_attended_text, _ = self.audio_to_text_attention(
            query=audio_proj, key=text_proj, value=text_proj, key_padding_mask=text_padding
        )
        text_attended_audio, _ = self.text_to_audio_attention(
            query=text_proj, key=audio_proj, value=audio_proj, key_padding_mask=audio_padding
        )

        audio_gate_value = self.audio_gate(torch.cat([audio_proj, audio_attended_text], dim=-1))
        text_gate_value = self.text_gate(torch.cat([text_proj, text_attended_audio], dim=-1))
        gated_audio = audio_proj * audio_gate_value
        gated_text = text_proj * text_gate_value

        pooled_audio = self.audio_attention_pooling(gated_audio, audio_mask)
        # Masked mean over tokens; the clamp avoids dividing by zero for an empty mask.
        text_sum = torch.sum(gated_text * attention_mask.unsqueeze(-1), dim=1)
        text_count = torch.sum(attention_mask, dim=1, keepdim=True).clamp(min=1)
        pooled_text = text_sum / text_count

        fused = torch.cat([pooled_audio, pooled_text], dim=1)
        joint_repr = self.fusion_layer(fused)
        shared = self.shared_fc(joint_repr)
        output = self.output_branch(shared)
        # Map the unbounded regression output onto the 1..5 rating scale.
        scaled_output = 1.0 + 4.0 * torch.sigmoid(output)
        return scaled_output

# MultimodalArousalDominanceModel
class MultimodalArousalDominanceModel(nn.Module):
    """Audio + text regressor with two heads: arousal and dominance.

    Data flow (see ``forward``): frame-level audio features pass through a stack
    of Transformer encoder layers, the token ids pass through BERT (frozen except
    for its last two parameter tensors), both streams are projected to
    ``hidden_dim``, cross-attend to each other, are gated and pooled over time.
    The concatenated representation then feeds two independent
    fusion -> shared-FC -> output stacks, one per target, each mapped into the
    range 1..5.

    Args:
        audio_dim: Feature size of each incoming audio frame.
        text_dim: Hidden size of the text encoder output.
        hidden_dim: Width of the shared projection / fusion space.
        num_heads: Heads of the audio Transformer layers; the two cross-modal
            attention modules use ``num_heads // 2`` heads.
        num_layers: Number of audio Transformer encoder layers.
        dropout: Dropout probability (the final regression heads use half of it).
    """

    def __init__(self, audio_dim=768, text_dim=768, hidden_dim=192, num_heads=6, num_layers=2, dropout=0.5):
        super().__init__()
        # NOTE: attribute names and the order inside every nn.Sequential define the
        # state_dict keys; keep them stable so existing checkpoints still load.
        self.audio_transformer = nn.ModuleList([
            nn.TransformerEncoderLayer(d_model=audio_dim, nhead=num_heads, dim_feedforward=hidden_dim*4, dropout=dropout, batch_first=True)
            for _ in range(num_layers)
        ])
        self.audio_layer_norm = nn.LayerNorm(audio_dim)
        # Scores each (gated, projected) audio frame for attention pooling.
        self.audio_attention_pool = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim*2), nn.GELU(), nn.Dropout(dropout), nn.Linear(hidden_dim*2, 1)
        )
        self.text_encoder = BertModel.from_pretrained("bert-base-uncased")
        # Freeze the whole text encoder, then re-enable only its last two parameter tensors.
        for param in self.text_encoder.parameters():
            param.requires_grad = False
        for param in list(self.text_encoder.parameters())[-2:]:
            param.requires_grad = True
        self.audio_projection = nn.Linear(audio_dim, hidden_dim)
        self.text_projection = nn.Linear(text_dim, hidden_dim)
        self.audio_to_text_attention = nn.MultiheadAttention(embed_dim=hidden_dim, num_heads=num_heads//2, dropout=dropout, batch_first=True)
        self.text_to_audio_attention = nn.MultiheadAttention(embed_dim=hidden_dim, num_heads=num_heads//2, dropout=dropout, batch_first=True)
        # Gates take [own projection ; cross-attended context] and emit per-feature weights in (0, 1).
        self.audio_gate = nn.Sequential(nn.Linear(hidden_dim*2, hidden_dim), nn.Sigmoid())
        self.text_gate = nn.Sequential(nn.Linear(hidden_dim*2, hidden_dim), nn.Sigmoid())
        # From here on every stage exists twice: one copy per prediction target.
        self.fusion_layer_arousal = nn.Sequential(
            nn.Linear(hidden_dim*2, hidden_dim*2), nn.LayerNorm(hidden_dim*2), nn.GELU(), nn.Dropout(dropout), nn.Linear(hidden_dim*2, hidden_dim)
        )
        self.fusion_layer_dominance = nn.Sequential(
            nn.Linear(hidden_dim*2, hidden_dim*2), nn.LayerNorm(hidden_dim*2), nn.GELU(), nn.Dropout(dropout), nn.Linear(hidden_dim*2, hidden_dim)
        )
        self.shared_fc_arousal = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim), nn.LayerNorm(hidden_dim), nn.GELU(), nn.Dropout(dropout)
        )
        self.shared_fc_dominance = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim), nn.LayerNorm(hidden_dim), nn.GELU(), nn.Dropout(dropout)
        )
        self.output_branch_arousal = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim//2), nn.LayerNorm(hidden_dim//2), nn.GELU(), nn.Dropout(dropout*0.5), nn.Linear(hidden_dim//2, 1)
        )
        self.output_branch_dominance = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim//2), nn.LayerNorm(hidden_dim//2), nn.GELU(), nn.Dropout(dropout*0.5), nn.Linear(hidden_dim//2, 1)
        )

    def audio_attention_pooling(self, x, audio_mask=None):
        """Collapse the time axis of ``x`` using learned softmax weights.

        Args:
            x: Tensor of shape (batch, time, hidden_dim).
            audio_mask: Optional (batch, time) mask that is truthy for real frames;
                masked frames receive zero weight.

        Returns:
            Tensor of shape (batch, hidden_dim).

        Note:
            A row in which every frame is masked yields NaN, because the softmax
            is taken over scores that are all ``-inf``.
        """
        weights = self.audio_attention_pool(x)
        if audio_mask is not None:
            weights = weights.masked_fill(~audio_mask.bool().unsqueeze(-1), float('-inf'))
        weights = torch.softmax(weights, dim=1)
        output = torch.bmm(weights.transpose(1, 2), x)
        return output.squeeze(1)

    def forward(self, audio_features, input_ids, attention_mask):
        """Predict arousal and dominance for a batch of utterances.

        Args:
            audio_features: (batch, frames, audio_dim) frame features; frames whose
                features are all zero are treated as padding.
            input_ids: (batch, tokens) token ids for the text encoder.
            attention_mask: (batch, tokens) mask, non-zero for real tokens.

        Returns:
            Tuple ``(arousal, dominance)``; each is a (batch, 1) tensor with values
            in the range 1..5.
        """
        # A frame counts as real audio when its feature vector is not (numerically) all zeros.
        audio_mask = audio_features.abs().sum(dim=-1) > 1e-6
        # Key-padding masks must be boolean (True = ignore this key). A float mask is
        # *added* to the attention logits, so the former 0/1 float mask boosted padded
        # frames by +1 instead of hiding them. Inputs without all-zero frames are
        # unaffected by this change.
        audio_padding = ~audio_mask
        text_padding = ~attention_mask.bool()

        audio_repr = audio_features
        for layer in self.audio_transformer:
            audio_repr = layer(audio_repr, src_key_padding_mask=audio_padding)
        audio_repr = self.audio_layer_norm(audio_repr)

        text_outputs = self.text_encoder(input_ids=input_ids, attention_mask=attention_mask)
        text_repr = text_outputs.last_hidden_state

        audio_proj = self.audio_projection(audio_repr)
        text_proj = self.text_projection(text_repr)

        # Each modality queries the other one; padded keys are excluded.
        audio_attended_text, _ = self.audio_to_text_attention(
            query=audio_proj, key=text_proj, value=text_proj, key_padding_mask=text_padding
        )
        text_attended_audio, _ = self.text_to_audio_attention(
            query=text_proj, key=audio_proj, value=audio_proj, key_padding_mask=audio_padding
        )

        audio_gate_value = self.audio_gate(torch.cat([audio_proj, audio_attended_text], dim=-1))
        text_gate_value = self.text_gate(torch.cat([text_proj, text_attended_audio], dim=-1))
        gated_audio = audio_proj * audio_gate_value
        gated_text = text_proj * text_gate_value

        pooled_audio = self.audio_attention_pooling(gated_audio, audio_mask)
        # Masked mean over tokens; the clamp avoids dividing by zero for an empty mask.
        text_sum = torch.sum(gated_text * attention_mask.unsqueeze(-1), dim=1)
        text_count = torch.sum(attention_mask, dim=1, keepdim=True).clamp(min=1)
        pooled_text = text_sum / text_count

        fused = torch.cat([pooled_audio, pooled_text], dim=1)

        # Two independent heads read the same fused representation.
        shared_arousal = self.shared_fc_arousal(self.fusion_layer_arousal(fused))
        shared_dominance = self.shared_fc_dominance(self.fusion_layer_dominance(fused))
        output_arousal = self.output_branch_arousal(shared_arousal)
        output_dominance = self.output_branch_dominance(shared_dominance)

        # Map the unbounded regression outputs onto the 1..5 rating scale.
        scaled_arousal = 1.0 + 4.0 * torch.sigmoid(output_arousal)
        scaled_dominance = 1.0 + 4.0 * torch.sigmoid(output_dominance)
        return scaled_arousal, scaled_dominance

# Function to clear Hugging Face cache
def clear_huggingface_cache():
    """Remove ``~/.cache/huggingface/transformers`` if it exists.

    Prints the removed path on success and does nothing when the directory
    is absent.

    NOTE(review): newer ``transformers`` releases appear to cache under
    ``~/.cache/huggingface/hub`` instead; confirm this is still the directory
    that needs clearing.
    """
    target = Path.home().joinpath(".cache", "huggingface", "transformers")
    if not target.exists():
        return
    shutil.rmtree(target)
    print(f"Cleared cache at {target}")

# Function to load feature extractor with retry
def load_feature_extractor(model_name, max_retries=3):
    """Load a ``Wav2Vec2FeatureExtractor``, retrying after clearing the local cache.

    Args:
        model_name: Model identifier passed to ``from_pretrained``.
        max_retries: Maximum number of load attempts.

    Returns:
        The loaded feature extractor.

    Raises:
        RuntimeError: If every attempt fails with ``OSError`` or ``ValueError``.
            The last such error is attached as ``__cause__`` so the root cause
            is not lost from the traceback.
    """
    last_error = None
    for attempt in range(max_retries):
        try:
            return Wav2Vec2FeatureExtractor.from_pretrained(model_name)
        except (OSError, ValueError) as e:
            last_error = e
            print(f"Attempt {attempt+1}/{max_retries} failed: {e}")
            # Clearing the cache only helps if another attempt follows.
            if attempt < max_retries - 1:
                clear_huggingface_cache()
    raise RuntimeError(f"Failed to load Wav2Vec2FeatureExtractor after {max_retries} attempts") from last_error

# Function to extract audio features using HuBERT
def extract_hubert_features(audio_path, processor, hubert_model, device, sampling_rate=16000, max_audio_samples=128000):
    """Turn one audio file into HuBERT frame features.

    The waveform is loaded, resampled to ``sampling_rate`` when needed, reduced
    to its first channel, and then cut or zero-padded to exactly
    ``max_audio_samples`` samples before being run through ``processor`` and
    ``hubert_model`` without gradient tracking.

    Args:
        audio_path: Path of the audio file to read.
        processor: Callable feature extractor returning a dict of tensors.
        hubert_model: Model called with the processor output as keyword arguments.
        device: Device the processor output is moved to.
        sampling_rate: Target sample rate in Hz.
        max_audio_samples: Fixed waveform length fed to the processor.

    Returns:
        ``last_hidden_state`` of the model output.
    """
    waveform, native_rate = torchaudio.load(audio_path)
    if native_rate != sampling_rate:
        resampler = torchaudio.transforms.Resample(native_rate, sampling_rate)
        waveform = resampler(waveform)

    # Drop a leading singleton channel axis; for multi-channel audio keep channel 0.
    waveform = waveform.squeeze(0)
    if waveform.dim() > 1:
        waveform = waveform[0]

    # Force a fixed length: truncate long clips, right-pad short ones with zeros.
    missing = max_audio_samples - waveform.size(0)
    if missing < 0:
        waveform = waveform[:max_audio_samples]
    elif missing > 0:
        waveform = torch.nn.functional.pad(waveform, (0, missing))

    samples = waveform.cpu().numpy()
    batch = processor(samples, sampling_rate=sampling_rate, return_tensors="pt", padding=False, truncation=False)
    batch = {name: tensor.to(device) for name, tensor in batch.items()}
    with torch.no_grad():
        hidden = hubert_model(**batch).last_hidden_state
    return hidden

# Function to preprocess inputs
def preprocess_inputs(text, audio_path, tokenizer, processor, hubert_model, device, max_length=512):
    """Build the model inputs for one (transcription, audio file) pair.

    Args:
        text: Transcription to tokenize.
        audio_path: Path of the matching audio file.
        tokenizer: Callable tokenizer returning ``input_ids`` and ``attention_mask``.
        processor: Audio feature extractor forwarded to ``extract_hubert_features``.
        hubert_model: Audio encoder forwarded to ``extract_hubert_features``.
        device: Device the token tensors are moved to.
        max_length: Token length the text is padded / truncated to.

    Returns:
        Tuple ``(input_ids, attention_mask, audio_features)``.
    """
    tokenizer_options = dict(max_length=max_length, padding='max_length', truncation=True, return_tensors='pt')
    tokens = tokenizer(text, **tokenizer_options)
    ids, mask = (tokens[field].to(device) for field in ('input_ids', 'attention_mask'))
    frames = extract_hubert_features(audio_path, processor, hubert_model, device)
    return ids, mask, frames

# Helper shared by both models: restore weights, optionally wrap, switch to eval
def _restore_for_inference(model, checkpoint_path, device):
    """Load ``checkpoint['model_state_dict']`` into ``model`` and prepare it for inference.

    Returns the model moved to ``device`` in eval mode, wrapped in
    ``nn.DataParallel`` when more than one CUDA device is visible.
    """
    from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present

    # NOTE: torch.load unpickles the file - only load checkpoints from a trusted source.
    checkpoint = torch.load(checkpoint_path, map_location=device)
    state_dict = checkpoint['model_state_dict']
    # A checkpoint saved from an nn.DataParallel wrapper prefixes every key with
    # "module."; strip it (in place) so the bare model accepts the weights.
    consume_prefix_in_state_dict_if_present(state_dict, 'module.')
    model.load_state_dict(state_dict)
    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)
    model.to(device)
    model.eval()
    return model


# Function to load models
def load_models(valence_checkpoint_path, ad_checkpoint_path, device):
    """Build both regressors and restore their trained weights.

    Args:
        valence_checkpoint_path: Checkpoint file for ``ValenceRegressor``; must
            contain a ``'model_state_dict'`` entry.
        ad_checkpoint_path: Checkpoint file for ``MultimodalArousalDominanceModel``;
            must contain a ``'model_state_dict'`` entry.
        device: Device the weights are mapped to and the models are moved to.

    Returns:
        Tuple ``(valence_model, ad_model)``, both in eval mode on ``device``.

    Raises:
        KeyError: If a checkpoint has no ``'model_state_dict'`` entry.
    """
    # Same hyper-parameters for both architectures.
    architecture = dict(audio_dim=768, text_dim=768, hidden_dim=192, num_heads=6, num_layers=2, dropout=0.5)

    valence_model = _restore_for_inference(ValenceRegressor(**architecture), valence_checkpoint_path, device)
    ad_model = _restore_for_inference(MultimodalArousalDominanceModel(**architecture), ad_checkpoint_path, device)

    return valence_model, ad_model

# Main function for prediction
def predict_emotions(audio_path, transcription, valence_checkpoint_path, ad_checkpoint_path):
    """Predict valence, arousal and dominance for a single utterance.

    Every call loads the tokenizer, the HuBERT feature extractor and encoder,
    and both regression models, then runs one forward pass without gradients.

    Args:
        audio_path: Path of the utterance audio file.
        transcription: Text of the utterance.
        valence_checkpoint_path: Checkpoint for the valence model.
        ad_checkpoint_path: Checkpoint for the arousal/dominance model.

    Returns:
        Dict with float entries ``'valence'``, ``'arousal'`` and ``'dominance'``.
    """
    hubert_name = 'facebook/hubert-base-ls960'
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Pretrained front-ends: text tokenizer, audio feature extractor, audio encoder.
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    processor = load_feature_extractor(hubert_name)
    hubert_model = HubertModel.from_pretrained(hubert_name).to(device)
    hubert_model.eval()

    valence_model, ad_model = load_models(valence_checkpoint_path, ad_checkpoint_path, device)

    ids, mask, frames = preprocess_inputs(
        text=transcription,
        audio_path=audio_path,
        tokenizer=tokenizer,
        processor=processor,
        hubert_model=hubert_model,
        device=device,
        max_length=512
    )

    with torch.no_grad():
        valence = valence_model(frames, ids, mask)
        arousal, dominance = ad_model(frames, ids, mask)

    # .item() requires single-element tensors, i.e. exactly one utterance.
    scores = (('valence', valence), ('arousal', arousal), ('dominance', dominance))
    return {label: tensor.item() for label, tensor in scores}

# Example usage
if __name__ == "__main__":
    # Demo run for a single utterance. The two 'path/to/...' values below are
    # placeholders and must be pointed at real files before this can succeed.
    valence_checkpoint_path = 'path/to/valence_model_checkpoint.pth'  # Replace with the trained valence checkpoint
    ad_checkpoint_path = '/content/best_arousal_dominance_model.pth'
    audio_path = 'path/to/audio.wav'  # Replace with the utterance audio file
    transcription = "This is a sample utterance from IEMOCAP."

    # Loads every model and returns a dict with 'valence', 'arousal' and 'dominance'.
    predictions = predict_emotions(audio_path, transcription, valence_checkpoint_path, ad_checkpoint_path)

    # Report each predicted score with four decimals.
    print(f"Predicted Valence: {predictions['valence']:.4f}")
    print(f"Predicted Arousal: {predictions['arousal']:.4f}")
    print(f"Predicted Dominance: {predictions['dominance']:.4f}")