In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import os
import numpy as np

# Define a function to load the acoustic features from .npy files
def load_acoustic_features(feature_dir):
    """Load every ``.npy`` file in *feature_dir* into one float32 tensor.

    Files are read in sorted filename order and stacked along a new leading
    axis, so every file must contain an array of the same shape.

    Args:
        feature_dir: Path of the directory that holds the ``.npy`` files.

    Returns:
        A ``torch.float32`` tensor whose first axis indexes the files, i.e.
        shape ``[num_files, *per_file_shape]``.

    Raises:
        FileNotFoundError: If *feature_dir* does not exist.
        ValueError: If the directory contains no ``.npy`` files, or the
            arrays cannot be stacked because their shapes differ.
    """
    # Only consider .npy files: other directory entries (for example the
    # ``.DS_Store`` file macOS creates) would make ``np.load`` fail.
    feature_files = sorted(
        name for name in os.listdir(feature_dir) if name.lower().endswith(".npy")
    )
    if not feature_files:
        raise ValueError(f"No .npy feature files found in {feature_dir!r}")

    # NOTE(security): allow_pickle=True lets np.load unpickle object arrays,
    # which can execute arbitrary code. Only point this at trusted files.
    arrays = [
        np.load(os.path.join(feature_dir, name), allow_pickle=True)
        for name in feature_files
    ]
    stacked = np.stack(arrays, axis=0)  # new leading axis: one entry per file
    return torch.tensor(stacked, dtype=torch.float32)

# Auxiliary Modal Redundancy Reduction Algorithm for acoustic features
class AuxiliaryModalRedundancyReduction(nn.Module):
    """Single-head scaled dot-product self-attention over acoustic features.

    The input is projected by three independent linear layers into queries,
    keys and values; the output is the attention-weighted sum of the values.
    """

    def __init__(self, input_dim: int, output_dim: int) -> None:
        """Create the query, key and value projections.

        Args:
            input_dim: Size of the last axis of the incoming features.
            output_dim: Size of the last axis of the projected features.
        """
        super().__init__()
        # Independent projections of the same acoustic input
        self.acoustic_linear_Q = nn.Linear(input_dim, output_dim)
        self.acoustic_linear_K = nn.Linear(input_dim, output_dim)
        self.acoustic_linear_V = nn.Linear(input_dim, output_dim)

    def forward(self, acoustic_features: torch.Tensor) -> torch.Tensor:
        """Apply self-attention to ``acoustic_features``.

        Args:
            acoustic_features: Tensor assumed to be laid out as
                ``[batch_size, seq_len, input_dim]``.

        Returns:
            The refined features, ``[batch_size, seq_len, output_dim]``.
        """
        # Project the input; each result is [batch_size, seq_len, output_dim]
        queries = self.acoustic_linear_Q(acoustic_features)
        keys = self.acoustic_linear_K(acoustic_features)
        values = self.acoustic_linear_V(acoustic_features)

        # Pairwise similarity scaled by sqrt(output_dim), then normalised over
        # the key axis -> [batch_size, seq_len, seq_len]
        scale = keys.size(-1) ** 0.5
        scores = torch.matmul(queries, keys.transpose(-1, -2)) / scale
        weights = torch.softmax(scores, dim=-1)

        # Weighted sum of the values -> [batch_size, seq_len, output_dim]
        return torch.matmul(weights, values)

# Example usage:
# Feature sizes: width of each acoustic feature vector and of the model output
input_dim, output_dim = 16, 768

# Read all stored acoustic features from disk into a single tensor
acoustic_feature_dir = r"/Users/dinesh/College/final proj/attempt3/features/audio"
acoustic_features = load_acoustic_features(acoustic_feature_dir)

# Use the MPS backend (Apple Silicon) when it is available, else the CPU
if torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')

# Place the features on the chosen device
acoustic_features = acoustic_features.to(device)

# A 2-D tensor has no sequence axis; insert one of length 1 at position 1.
# NOTE(review): with a length-1 sequence the softmax is over a single score,
# so every attention weight is 1 — confirm this layout is intended.
if acoustic_features.ndim == 2:
    acoustic_features = acoustic_features.unsqueeze(1)

# Build the model and place it on the same device as the features
model = AuxiliaryModalRedundancyReduction(input_dim=input_dim, output_dim=output_dim)
model = model.to(device)

# Run the features through the model and report the output shape
F2 = model(acoustic_features)
print(F2.shape)


torch.Size([2089, 1, 768])
