# In [2]:
import os
import torch
import torch.nn as nn
import numpy as np

# Define the CrossModalAttention model
class CrossModalAttention(nn.Module):
    """Cross-modal attention in which text queries attend over acoustic keys/values.

    Both modalities are first projected to ``output_dim``. The projected text
    provides the queries, the projected audio provides the keys and values,
    and the attended audio is added back onto the projected text (residual).

    Args:
        input_dim: Size of the last dimension of the text features.
        output_dim: Size of the shared embedding / output dimension.
        audio_dim: Size of the last dimension of the acoustic features.
            Defaults to 16, the value that used to be hard-coded.
    """

    def __init__(self, input_dim: int, output_dim: int, audio_dim: int = 16):
        super().__init__()
        # Layer creation order is kept as-is so seeded initialisation is unchanged.
        self.text_linear = nn.Linear(input_dim, output_dim)
        self.audio_linear = nn.Linear(audio_dim, output_dim)  # Project audio to the text embedding size
        self.query_linear = nn.Linear(output_dim, output_dim)
        self.key_linear = nn.Linear(output_dim, output_dim)
        self.value_linear = nn.Linear(output_dim, output_dim)
        self.softmax = nn.Softmax(dim=-1)
        self.output_dim = output_dim
        self.audio_dim = audio_dim

    def forward(self, text_features: torch.Tensor, acoustic_features: torch.Tensor) -> torch.Tensor:
        """Fuse acoustic information into the text representation.

        Args:
            text_features: Tensor of shape ``[batch_size, seq_len, input_dim]``.
            acoustic_features: Either one vector per sample,
                ``[batch_size, audio_dim]`` (repeated along the text sequence
                axis), or an acoustic sequence
                ``[batch_size, audio_len, audio_dim]`` (used as is).

        Returns:
            Tensor of shape ``[batch_size, seq_len, output_dim]``.
        """
        if acoustic_features.dim() == 2:
            # One vector per sample: repeat it for every text position so the
            # key/value sequence lines up with the text sequence.
            acoustic_features = acoustic_features.unsqueeze(1).expand(-1, text_features.size(1), -1)

        # Step 1: Embedding
        ft = self.text_linear(text_features)        # [batch_size, seq_len, output_dim]
        fa = self.audio_linear(acoustic_features)   # [batch_size, audio_len, output_dim]

        # Step 2: Linear transformations for Query, Key, Value
        Q_t = self.query_linear(ft)  # [batch_size, seq_len, output_dim]
        K_a = self.key_linear(fa)    # [batch_size, audio_len, output_dim]
        V_a = self.value_linear(fa)  # [batch_size, audio_len, output_dim]

        # Step 3: Cross-modal Attention
        # NOTE: the scores are not divided by sqrt(output_dim) as in standard
        # scaled dot-product attention; left unscaled so existing outputs do not change.
        attention_scores = self.softmax(torch.matmul(Q_t, K_a.transpose(-2, -1)))  # [batch_size, seq_len, audio_len]
        attention_output = torch.matmul(attention_scores, V_a)  # [batch_size, seq_len, output_dim]

        # Step 4: Residual connection
        F1 = ft + attention_output  # [batch_size, seq_len, output_dim]

        return F1

# Function to load features from .npy files in a directory
def load_features(feature_dir):
    """Load every ``.npy`` file in *feature_dir* into one stacked float32 tensor.

    Files are visited in sorted name order, so the first axis of the result
    follows that order. Entries that do not end in ``.npy`` are ignored. A
    file that fails to load is reported on stdout and skipped.

    Args:
        feature_dir: Path of the directory holding the ``.npy`` files.

    Returns:
        A ``torch.float32`` tensor whose first axis indexes the loaded files.

    Raises:
        ValueError: If no ``.npy`` file could be loaded, or if the loaded
            arrays do not share a shape (raised by ``np.stack``).
    """
    features = []
    for file_name in sorted(os.listdir(feature_dir)):
        if not file_name.endswith('.npy'):  # Load only .npy files
            continue
        file_path = os.path.join(feature_dir, file_name)
        try:
            # SECURITY: allow_pickle=True can execute arbitrary code while
            # unpickling; only point this function at trusted files.
            features.append(np.load(file_path, allow_pickle=True))
        except Exception as e:
            # Deliberate best-effort: report the unreadable file and keep going.
            print(f"Error loading {file_path}: {e}")

    if not features:
        # np.stack([]) would fail with an opaque "need at least one array" message.
        raise ValueError(f"No loadable .npy feature files found in {feature_dir!r}")

    stacked = np.stack(features, axis=0)  # Stack into a single array
    return torch.tensor(stacked, dtype=torch.float32)

# Per-modality locations of the saved .npy feature files
text_feature_dir = r"/Users/dinesh/College/final proj/attempt3/features/text"
audio_feature_dir = r"/Users/dinesh/College/final proj/attempt3/features/audio"

# Read each modality into a float32 tensor; the first axis indexes the files.
# In the recorded run the text tensor was 3-D and the acoustic tensor 2-D.
text_features = load_features(text_feature_dir)
acoustic_features = load_features(audio_feature_dir)

# Use Apple's MPS backend when it is available, otherwise fall back to the CPU
if torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')

# Report what was loaded
print("Text features shape:", text_features.shape)
print("Acoustic features shape:", acoustic_features.shape)

# The text feature width drives the model's input size; the acoustic width is
# fixed inside the model and is not derived from the loaded data here.
input_dim = text_features.size(-1)
output_dim = 768  # Desired output dimension

# Build the model, then place model and inputs on the selected device
model = CrossModalAttention(input_dim=input_dim, output_dim=output_dim)
model = model.to(device)
text_features, acoustic_features = text_features.to(device), acoustic_features.to(device)

# Single forward pass over the whole dataset
F1 = model(text_features, acoustic_features)

# Show the fused representation's shape
print(F1.shape)


# Output:
# Text features shape: torch.Size([2089, 1, 768])
# Acoustic features shape: torch.Size([2089, 16])
# torch.Size([2089, 1, 768])
