In [3]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import torchaudio
from transformers import Wav2Vec2FeatureExtractor, HubertModel
import numpy as np
import math


In [4]:
class VADRegressor(nn.Module):
    def __init__(self, input_dim=768, gru_stack_depth=1, hidden_dim=256):
        super().__init__()
        self.gru = nn.GRU(input_dim, hidden_dim, num_layers=gru_stack_depth, batch_first=True, bidirectional=True)
        self.attn_fc = nn.Linear(hidden_dim * 2, hidden_dim * 2)
        self.attn_softmax = nn.Softmax(dim=-1)
        self.fc1 = nn.Linear(hidden_dim * 2, 256)
        self.dropout = nn.Dropout(0.25)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 3)

    def attention(self, x):
        query = key = value = self.attn_fc(x)
        d_k = query.size(-1)
        attn_scores = torch.matmul(query, key.transpose(-1, -2)) / math.sqrt(d_k)
        attn_weights = self.attn_softmax(attn_scores)
        context = torch.matmul(attn_weights, value)
        return context.sum(dim=1)

    def forward(self, x):
        gru_out, _ = self.gru(x)
        context = self.attention(gru_out)
        x = torch.relu(self.fc1(context))
        x = self.dropout(x)
        x = torch.relu(self.fc2(x))
        return self.fc3(x)


In [5]:
class IEMOCAPDataset(Dataset):
    def __init__(self, df, feature_extractor):
        self.df = df
        self.feature_extractor = feature_extractor

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        audio_path = row['filepath']
        labels = torch.tensor([row['EmoVal'], row['EmoAct'], row['EmoDom']], dtype=torch.float32)

        waveform, sample_rate = torchaudio.load(audio_path)
        if sample_rate != 16000:
            waveform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(waveform)
        waveform = waveform.squeeze(0)

        inputs = self.feature_extractor(waveform, sampling_rate=16000, return_tensors="pt", padding=True)
        return {'input_values': inputs['input_values'].squeeze(0)}, labels

def collate_fn(batch):
    input_values = [item[0]['input_values'] for item in batch]
    labels = torch.stack([item[1] for item in batch])
    input_values = torch.nn.utils.rnn.pad_sequence(input_values, batch_first=True)
    return {'input_values': input_values}, labels


In [6]:
csv_path = "/kaggle/input/iemocap/updated_iemocap_metadata.csv"
df = pd.read_csv(csv_path)

train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

print(f"Train: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}")

feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/hubert-base-ls960")
hubert_model = HubertModel.from_pretrained("facebook/hubert-base-ls960").cuda()


Train: 8031, Val: 1004, Test: 1004


preprocessor_config.json:   0%|          | 0.00/213 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/378M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/378M [00:00<?, ?B/s]

In [7]:
train_dataset = IEMOCAPDataset(train_df, feature_extractor)
val_dataset = IEMOCAPDataset(val_df, feature_extractor)
test_dataset = IEMOCAPDataset(test_df, feature_extractor)

train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False, collate_fn=collate_fn)

model = VADRegressor(input_dim=768).cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.MSELoss()
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2, verbose=True)


In [8]:
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0
    for batch_inputs, labels in train_loader:
        input_values = batch_inputs['input_values'].cuda()
        labels = labels.cuda()

        optimizer.zero_grad()
        with torch.no_grad():
            hubert_out = hubert_model(input_values).last_hidden_state

        outputs = model(hubert_out)
        loss = criterion(outputs, labels)
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        train_loss += loss.item()

    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for batch_inputs, labels in val_loader:
            input_values = batch_inputs['input_values'].cuda()
            labels = labels.cuda()

            hubert_out = hubert_model(input_values).last_hidden_state
            outputs = model(hubert_out)
            loss = criterion(outputs, labels)
            val_loss += loss.item()

    avg_train = train_loss / len(train_loader)
    avg_val = val_loss / len(val_loader)
    scheduler.step(avg_val)

    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {avg_train:.4f}, Val Loss: {avg_val:.4f}")


Epoch 1/10, Train Loss: 4.8469, Val Loss: 1.0423
Epoch 2/10, Train Loss: 21.8486, Val Loss: 1.2042
Epoch 3/10, Train Loss: 3.3898, Val Loss: 1.2999
Epoch 4/10, Train Loss: 1.7979, Val Loss: 0.5945
Epoch 5/10, Train Loss: 2.2594, Val Loss: 0.7895
Epoch 6/10, Train Loss: 1.0268, Val Loss: 0.6343
Epoch 7/10, Train Loss: 3.4467, Val Loss: 0.7481
Epoch 8/10, Train Loss: 0.5916, Val Loss: 0.5448
Epoch 9/10, Train Loss: 0.8344, Val Loss: 0.6399
Epoch 10/10, Train Loss: 124.2276, Val Loss: 0.5714


In [9]:
model.eval()
test_loss = 0.0
with torch.no_grad():
    for batch_inputs, labels in test_loader:
        input_values = batch_inputs['input_values'].cuda()
        labels = labels.cuda()

        hubert_out = hubert_model(input_values).last_hidden_state
        outputs = model(hubert_out)
        loss = criterion(outputs, labels)
        test_loss += loss.item()

print(f"Test Loss: {test_loss / len(test_loader):.4f}")


Test Loss: 0.6045


In [10]:
torch.save(model.state_dict(), "vad_regressor.pth")


In [24]:
import torch
from transformers import Wav2Vec2FeatureExtractor, HubertModel
import torchaudio

# Load the saved model weights
model = VADRegressor(input_dim=768)  # Use the same model architecture you trained
model.load_state_dict(torch.load("/kaggle/working/vad_regressor.pth", map_location=torch.device('cpu')))  # Load the trained weights onto CPU
model = model.cpu()  # Ensure model is on CPU
model.eval()  # Set the model to evaluation mode

# Load the feature extractor (Wav2Vec2 or HuBERT, depending on the original model)
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/hubert-base-ls960")

# Function to predict VAD scores from an audio file
def predict_vad(audio_path):
    # Load and preprocess audio
    waveform, sample_rate = torchaudio.load(audio_path)
    if sample_rate != 16000:
        waveform = torchaudio.transforms.Resample(sample_rate, 16000)(waveform)
    waveform = waveform.squeeze(0)  # Remove channel dim if mono

    # Extract features
    inputs = feature_extractor(waveform, sampling_rate=16000, return_tensors="pt", padding=True)
    inputs = {k: v.squeeze(0) for k, v in inputs.items()}  # Remove batch dim

    # Predict VAD scores
    with torch.no_grad():
        hubert_model = HubertModel.from_pretrained("facebook/hubert-base-ls960")
        hubert_outputs = hubert_model(input_values=inputs['input_values'].unsqueeze(0)).last_hidden_state  # (1, seq_len, 768)
        vad_scores = model(hubert_outputs)

    return vad_scores.cpu().numpy()

# Example usage:
audio_path = "/kaggle/input/iemocap/iemocap_audio/Ses01F_impro01_F000.wav"
vad_scores = predict_vad(audio_path)
print("Predicted VAD scores:", vad_scores)


  model.load_state_dict(torch.load("/kaggle/working/vad_regressor.pth", map_location=torch.device('cpu')))  # Load the trained weights onto CPU


Predicted VAD scores: [[2.3477838 2.1251054 2.2284825]]


In [26]:
import torch
from transformers import Wav2Vec2FeatureExtractor, HubertModel
import torchaudio

# Load the saved model weights
model = VADRegressor(input_dim=768)  # Use the same model architecture you trained
model.load_state_dict(torch.load("/kaggle/working/vad_regressor.pth", map_location=torch.device('cpu')))  # Load the trained weights onto CPU
model = model.cpu()  # Ensure model is on CPU
model.eval()  # Set the model to evaluation mode

# Load the feature extractor (Wav2Vec2 or HuBERT, depending on the original model)
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/hubert-base-ls960")

# Function to predict VAD scores from an audio file
def predict_vad(audio_path):
    # Load and preprocess audio
    waveform, sample_rate = torchaudio.load(audio_path)
    
    # Resample to 16kHz if necessary
    if sample_rate != 16000:
        waveform = torchaudio.transforms.Resample(sample_rate, 16000)(waveform)
    
    # Convert to mono (if stereo or multi-channel)
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)  # Averaging the channels
    
    waveform = waveform.squeeze(0)  # Remove channel dim if mono

    # Extract features using the feature extractor
    inputs = feature_extractor(waveform, sampling_rate=16000, return_tensors="pt", padding=True)
    inputs = {k: v.squeeze(0) for k, v in inputs.items()}  # Remove batch dim

    # Predict VAD scores
    with torch.no_grad():
        # Load the pre-trained HuBERT model to extract features
        hubert_model = HubertModel.from_pretrained("facebook/hubert-base-ls960")
        
        # Extract features from HuBERT
        hubert_outputs = hubert_model(input_values=inputs['input_values'].unsqueeze(0)).last_hidden_state  # (1, seq_len, 768)
        
        # Predict the VAD scores using your trained model
        vad_scores = model(hubert_outputs)

    return vad_scores.cpu().numpy()

# Example usage with custom recorded audio:
audio_path = "/kaggle/input/iemocap/iemocap_audio/Ses01F_impro01_F000.wav"  # Path to your custom uploaded audio file
vad_scores = predict_vad(audio_path)
print("Predicted VAD scores:", vad_scores)


  model.load_state_dict(torch.load("/kaggle/working/vad_regressor.pth", map_location=torch.device('cpu')))  # Load the trained weights onto CPU


Predicted VAD scores: [[2.3477838 2.1251054 2.2284825]]
