In [2]:
import os
import torch
import torchaudio
from torch.utils.data import Dataset, DataLoader
from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification
from torch.optim import AdamW
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# Emotion to integer mapping for the CREMA-D dataset
emotion_map = {
    "ANG": 0,  # Anger
    "DIS": 1,  # Disgust
    "FEA": 2,  # Fear
    "HAP": 3,  # Happy
    "NEU": 4,  # Neutral
    "SAD": 5,  # Sad
}

class CREMADataset(Dataset):
    def __init__(self, audio_dir, processor, emotion_map, file_paths=None, labels=None, sample_rate=16000):
        self.audio_dir = audio_dir
        self.processor = processor
        self.emotion_map = emotion_map
        self.sample_rate = sample_rate
        self.file_paths = []
        self.labels = []

        if file_paths is not None and labels is not None:
            self.file_paths = file_paths
            self.labels = labels
        else:
            for file_name in os.listdir(audio_dir):
                if file_name.endswith(".wav"):
                    parts = file_name.split('_')
                    if len(parts) < 3:
                        continue
                    emotion = parts[2]
                    label = self.emotion_map.get(emotion, -1)
                    if label != -1:
                        self.file_paths.append(os.path.join(audio_dir, file_name))
                        self.labels.append(label)


    def __len__(self):
        return len(self.file_paths)

    def __getitem__(self, idx):
        file_path = self.file_paths[idx]
        label = self.labels[idx]
        
        # Load audio file
        waveform, sr = torchaudio.load(file_path)
        
        # Resample if necessary
        if sr != self.sample_rate:
            waveform = torchaudio.transforms.Resample(sr, self.sample_rate)(waveform)
        
        # Use Wav2Vec2 processor to get features
        inputs = self.processor(waveform.squeeze(0).numpy(), sampling_rate=self.sample_rate, return_tensors="pt", padding=True)
        
        # print(f"Processor output: {inputs}")  # Inspect what keys are returned
        # if 'attention_mask' not in inputs:
        #     print(f"Warning: attention_mask not found for file {file_path}")

        
        input_values = inputs.input_values.squeeze(0)
        attention_mask = inputs.get('attention_mask', torch.ones_like(input_values))

        return {
            'input_values': input_values,
            'attention_mask': attention_mask,
            'labels': torch.tensor(label, dtype=torch.long)
        }




import os

audio_dir = "audio-emotion-dataset2"  # Replace with your actual path

# List all files in the directory
files = os.listdir(audio_dir)
print(f"Files in directory: {files[:10]}")  # Print first 10 files to check

# Initialize Wav2Vec2 Processor (for Wav2Vec2 model)
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")

# Create dataset instance
dataset = CREMADataset(audio_dir=audio_dir, processor=processor, emotion_map=emotion_map)

# Check how many samples are loaded
print(f"Number of samples in dataset: {len(dataset)}")
print(f"First few file paths: {dataset.file_paths[:5]}")  # Show first 5 file paths
print(f"First few labels: {dataset.labels[:5]}")  # Show first 5 labels

from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    # Extract lists for input_values, attention_mask, and labels
    input_values = [item['input_values'] for item in batch]
    attention_mask = [item['attention_mask'] for item in batch]
    labels = [item['labels'] for item in batch]
    
    # Pad input sequences to the same length
    input_values = pad_sequence(input_values, batch_first=True, padding_value=0)
    attention_mask = pad_sequence(attention_mask, batch_first=True, padding_value=0)
    
    # Stack labels as they are integers and don't need padding
    labels = torch.stack(labels, dim=0)
    
    return {
        'input_values': input_values,
        'attention_mask': attention_mask,
        'labels': labels
    }


# Split file paths and labels
train_file_paths, val_file_paths, train_labels, val_labels = train_test_split(
    dataset.file_paths, dataset.labels, test_size=0.2, random_state=42
)

# Create separate datasets
train_dataset = CREMADataset(audio_dir, processor, emotion_map, train_file_paths, train_labels)
val_dataset = CREMADataset(audio_dir, processor, emotion_map, val_file_paths, val_labels)

# Create DataLoaders for each
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False, collate_fn=collate_fn)

# Load pre-trained Wav2Vec2 model for sequence classification
model = Wav2Vec2ForSequenceClassification.from_pretrained("facebook/wav2vec2-base-960h", num_labels=len(emotion_map))

# Set up optimizer
optimizer = AdamW(model.parameters(), lr=1e-5)

# Training loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

epochs = 3
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_preds = 0
    total_preds = 0
    
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1} - Training"):
        batch = {key: value.to(device) for key, value in batch.items()}
        inputs = batch['input_values']
        attention_mask = batch['attention_mask']
        labels = batch['labels']
        
        optimizer.zero_grad()
        
        outputs = model(input_values=inputs, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs.logits
        
        # Backpropagate the loss
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        preds = torch.argmax(logits, dim=-1)
        correct_preds += (preds == labels).sum().item()
        total_preds += labels.size(0)
    
    train_accuracy = correct_preds / total_preds
    avg_train_loss = total_loss / len(train_loader)
    
    print(f"Train Loss: {avg_train_loss:.4f} - Train Accuracy: {train_accuracy:.4f}")
    
    # Validation loop
    model.eval()
    val_loss = 0
    correct_preds = 0
    total_preds = 0
    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Epoch {epoch+1} - Validation"):
            batch = {key: value.to(device) for key, value in batch.items()}
            inputs = batch['input_values']
            attention_mask = batch['attention_mask']
            labels = batch['labels']
            
            outputs = model(input_values=inputs, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            logits = outputs.logits
            
            val_loss += loss.item()
            preds = torch.argmax(logits, dim=-1)
            correct_preds += (preds == labels).sum().item()
            total_preds += labels.size(0)
    
    val_accuracy = correct_preds / total_preds
    avg_val_loss = val_loss / len(val_loader)
    
    print(f"Validation Loss: {avg_val_loss:.4f} - Validation Accuracy: {val_accuracy:.4f}")

# Save the model after training
model.save_pretrained("wav2vec2_emotion_classifier")


Files in directory: ['1001_DFA_ANG_XX.wav', '1001_DFA_DIS_XX.wav', '1001_DFA_FEA_XX.wav', '1001_DFA_HAP_XX.wav', '1001_DFA_NEU_XX.wav', '1001_DFA_SAD_XX.wav', '1001_IEO_ANG_HI.wav', '1001_IEO_ANG_LO.wav', '1001_IEO_ANG_MD.wav', '1001_IEO_DIS_HI.wav']
Number of samples in dataset: 7441
First few file paths: ['audio-emotion-dataset2\\1001_DFA_ANG_XX.wav', 'audio-emotion-dataset2\\1001_DFA_DIS_XX.wav', 'audio-emotion-dataset2\\1001_DFA_FEA_XX.wav', 'audio-emotion-dataset2\\1001_DFA_HAP_XX.wav', 'audio-emotion-dataset2\\1001_DFA_NEU_XX.wav']
First few labels: [0, 1, 2, 3, 4]


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight', 'wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1 - Training: 100%|██████████| 744/744 [08:56<00:00,  1.39it/s]


Train Loss: 1.7067 - Train Accuracy: 0.2747


Epoch 1 - Validation: 100%|██████████| 187/187 [00:26<00:00,  6.98it/s]


Validation Loss: 1.5835 - Validation Accuracy: 0.3277


Epoch 2 - Training: 100%|██████████| 744/744 [08:51<00:00,  1.40it/s]


Train Loss: 1.5556 - Train Accuracy: 0.3555


Epoch 2 - Validation: 100%|██████████| 187/187 [00:27<00:00,  6.83it/s]


Validation Loss: 1.5550 - Validation Accuracy: 0.3398


Epoch 3 - Training: 100%|██████████| 744/744 [09:08<00:00,  1.36it/s]


Train Loss: 1.4612 - Train Accuracy: 0.4187


Epoch 3 - Validation: 100%|██████████| 187/187 [00:26<00:00,  6.93it/s]


Validation Loss: 1.3767 - Validation Accuracy: 0.4527


In [4]:
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2Processor

model_path = "wav2vec2_emotion_classifier"

# Load model and processor
model = Wav2Vec2ForSequenceClassification.from_pretrained(model_path)
# processor = Wav2Vec2Processor.from_pretrained(model_path)

model.to(device)  # Send model to GPU or CPU as needed
model.eval()


Wav2Vec2ForSequenceClassification(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2GroupNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
          (activation): GELUActivation()
          (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
        )
        (1-4): 4 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
        (5-6): 2 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): Wav2Vec2FeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projection): Linear(in_features=512, out_features=768, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)


In [None]:
from torch.utils.data import DataLoader

test_loader = DataLoader(test_dataset, batch_size=1)  # Use batch_size=1 or adjust as needed
