In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import os

# CrossModalAttention Module
class CrossModalAttention(nn.Module):
    """Single-head dot-product attention where text queries attend over visual features.

    The textual input is projected to queries and the visual input to keys and
    values, each through its own ``feature_dim -> feature_dim`` linear layer.
    Scores are raw dot products (no ``1/sqrt(d)`` scaling is applied) and are
    normalised by a softmax over the last axis, i.e. over the visual positions.
    """

    def __init__(self, feature_dim):
        super().__init__()
        # Attribute names are also the state_dict key prefixes; keep them stable.
        self.query = nn.Linear(feature_dim, feature_dim)
        self.key = nn.Linear(feature_dim, feature_dim)
        self.value = nn.Linear(feature_dim, feature_dim)
        self.softmax = nn.Softmax(dim=-1)

    @staticmethod
    def _with_sequence_axis(features):
        """Return ``features`` unchanged unless it is 2-D, in which case axis 1 is inserted."""
        return features.unsqueeze(1) if features.dim() == 2 else features

    def forward(self, visual, textual):
        """Attend over ``visual`` using ``textual`` as the query.

        Args:
            visual: Tensor fed to the key/value projections. A 2-D ``(B, Dim)``
                input is promoted to ``(B, 1, Dim)``.
            textual: Tensor fed to the query projection. A 2-D ``(B, Dim)``
                input is promoted to ``(B, 1, Dim)``.

        Returns:
            tuple: ``(attended_features, attention_weights)``. The weights are
            the softmax-normalised query/key scores; the attended features are
            the weight-averaged values with axis 1 squeezed out when it has
            length one.
        """
        visual = self._with_sequence_axis(visual)
        textual = self._with_sequence_axis(textual)

        queries = self.query(textual)
        keys = self.key(visual)
        values = self.value(visual)

        # (B, Q, Dim) x (B, Dim, Seq) -> one score per (query, visual position) pair.
        scores = torch.bmm(queries, keys.transpose(1, 2))
        attention_weights = self.softmax(scores)

        # Weighted sum of the value vectors, then drop the singleton query axis.
        attended_features = torch.bmm(attention_weights, values).squeeze(1)
        return attended_features, attention_weights


# CrossModalModel with CrossModalAttention
class CrossModalModel(nn.Module):
    """Cross-modal attention followed by a linear projection of the attended output."""

    def __init__(self, feature_dim):
        super().__init__()
        # Attribute names are also the state_dict key prefixes; keep them stable.
        self.cross_modal_attention = CrossModalAttention(feature_dim)
        self.fc = nn.Linear(feature_dim, feature_dim)

    def forward(self, visual, textual):
        """Run attention over ``visual`` with ``textual`` as query, then project.

        Args:
            visual: Visual features passed through to the attention module.
            textual: Textual features passed through to the attention module.

        Returns:
            tuple: ``(projected_features, attention_weights)`` where the first
            element is the attention output after the ``fc`` layer and the
            second is returned by the attention module unchanged.
        """
        attended, weights = self.cross_modal_attention(visual, textual)
        return self.fc(attended), weights


# Dataset for Cross-Modal Data
class CrossModalDataset(Dataset):
    """Pairs ``.npy`` visual feature files with ``.npy`` embedding files by position.

    Both folders are listed, restricted to ``.npy`` files and sorted in
    *natural* order (``1, 2, ..., 10`` rather than ``1, 10, 11, ..., 2``), and
    the i-th visual file is paired with the i-th embedding file. Natural order
    matters because later stages name their outputs ``<index + 1>.npy``; with a
    plain lexicographic sort, index 1 would be ``10.npy`` and every downstream
    filename-based pairing would be shifted.

    Args:
        bilstm_folder: Directory containing the visual feature ``.npy`` files.
        embedding_folder: Directory containing the embedding ``.npy`` files.

    Raises:
        ValueError: If the two folders do not contain the same number of
            ``.npy`` files, since positional pairing would then be wrong.
    """

    def __init__(self, bilstm_folder, embedding_folder):
        self.bilstm_folder = bilstm_folder
        self.embedding_folder = embedding_folder
        self.bilstm_files = self._list_npy_files(bilstm_folder)
        self.embedding_files = self._list_npy_files(embedding_folder)

        if len(self.bilstm_files) != len(self.embedding_files):
            raise ValueError(
                f"Cannot pair files by position: {len(self.bilstm_files)} .npy files in "
                f"{bilstm_folder!r} but {len(self.embedding_files)} in {embedding_folder!r}."
            )

    @staticmethod
    def _natural_key(filename):
        """Sort key that compares digit runs numerically and the rest as text."""
        import re  # local import: only needed here, keeps the cell's import list unchanged

        # re.split with a capturing group alternates text, digits, text, ...;
        # odd positions are therefore always digit runs and safe to convert.
        chunks = re.split(r"(\d+)", filename)
        return [int(chunk) if pos % 2 else chunk for pos, chunk in enumerate(chunks)]

    @classmethod
    def _list_npy_files(cls, folder):
        """Return the ``.npy`` file names in ``folder`` in natural order."""
        # Skip stray non-array files (e.g. desktop.ini in synced folders).
        names = [name for name in os.listdir(folder) if name.lower().endswith(".npy")]
        # The raw name is a tie-breaker so that e.g. "01.npy"/"1.npy" order is deterministic.
        return sorted(names, key=lambda name: (cls._natural_key(name), name))

    def __len__(self):
        return len(self.bilstm_files)

    def __getitem__(self, idx):
        """Load the ``idx``-th (visual, embedding) pair as float32 tensors."""
        bilstm_path = os.path.join(self.bilstm_folder, self.bilstm_files[idx])
        embedding_path = os.path.join(self.embedding_folder, self.embedding_files[idx])

        visual_features = np.load(bilstm_path)  # expected shape (Seq, Dim) per the original notebook
        embedding_vector = np.load(embedding_path)  # expected shape (1, Dim) per the original notebook

        # Convert to tensors
        visual_features = torch.tensor(visual_features, dtype=torch.float32)
        embedding_vector = torch.tensor(embedding_vector, dtype=torch.float32)

        return visual_features, embedding_vector


# Loss Function for Cross-Modal Training
def cross_modal_loss(predictions, targets):
    """Return the mean squared error between ``predictions`` and ``targets``.

    The error is averaged over all elements, yielding a scalar tensor.
    """
    mse = nn.functional.mse_loss(predictions, targets)
    return mse


# Training Function
def train_model(dataset, feature_dim, epochs=10, lr=1e-4, batch_size=4, device=None):
    """Train a ``CrossModalModel`` to reproduce the text embedding from visual features.

    For every batch the model output is compared, via ``cross_modal_loss``, with
    the textual input after its axis 1 has been squeezed.

    Args:
        dataset: Map-style dataset yielding ``(visual, textual)`` tensor pairs.
        feature_dim: Feature width passed to ``CrossModalModel``.
        epochs: Number of passes over the dataset.
        lr: Learning rate for the Adam optimiser.
        batch_size: Number of samples per batch.
        device: Device to train on. ``None`` (default) selects CUDA when it is
            available and falls back to the CPU otherwise.

    Returns:
        The trained ``CrossModalModel`` (left on ``device``).

    Raises:
        ValueError: If ``dataset`` is empty.
    """
    if len(dataset) == 0:
        # Fail early with a clear message instead of an opaque sampler /
        # division-by-zero error further down.
        raise ValueError("Cannot train on an empty dataset.")

    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    model = CrossModalModel(feature_dim).to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)

    for epoch in range(epochs):
        model.train()
        total_loss = 0.0

        for visual, textual in dataloader:
            visual, textual = visual.to(device), textual.to(device)

            optimizer.zero_grad()
            visual_features, _ = model(visual, textual)
            # The regression target is the text embedding without its singleton axis 1.
            loss = cross_modal_loss(visual_features, textual.squeeze(1))
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        # Mean of the per-batch losses for this epoch.
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss / len(dataloader):.4f}")

    return model




In [3]:
# Main Code
if __name__ == "__main__":
    # Seed the global RNG so weight initialisation and DataLoader shuffling are
    # repeatable across kernel restarts (GPU kernels may still add small
    # run-to-run differences).
    SEED = 42
    torch.manual_seed(SEED)

    # NOTE: machine-specific absolute paths; adjust them when running elsewhere.
    bilstm_folder = "C:/Users/prath/OneDrive/Desktop/bilstm_features_10"  # Path to BiLSTM features folder
    embedding_folder = "C:/Users/prath/OneDrive/Desktop/paper2/embedding_vectors_1_10"  # Folder containing embedding .npy files

    feature_dim = 1024  # Dimension of the features

    dataset = CrossModalDataset(bilstm_folder, embedding_folder)
    trained_model = train_model(dataset, feature_dim)

    # Save only the weights (state_dict); the path is relative to the working directory.
    torch.save(trained_model.state_dict(), "cross_modal_model.pth")


Epoch 1/10, Loss: 0.0339
Epoch 2/10, Loss: 0.0147
Epoch 3/10, Loss: 0.0100
Epoch 4/10, Loss: 0.0075
Epoch 5/10, Loss: 0.0060
Epoch 6/10, Loss: 0.0047
Epoch 7/10, Loss: 0.0039
Epoch 8/10, Loss: 0.0032
Epoch 9/10, Loss: 0.0027
Epoch 10/10, Loss: 0.0023


In [5]:
import torch
from torch.utils.data import DataLoader
import os
import numpy as np

def extract_attended_features(model, dataset, output_folder, batch_size=4):
    """
    Run the model over the dataset and save one output vector per sample.

    Each file is named after the sample's embedding file when the dataset
    exposes an ``embedding_files`` list (as ``CrossModalDataset`` does), so
    that a later stage can pair outputs with embeddings by file name. Naming
    by running index alone is only correct if the dataset order happens to be
    ``1.npy, 2.npy, ...``; it is kept as a fallback for datasets without
    file names.

    Args:
        model: Trained CrossModalModel (any module returning ``(features, weights)``).
        dataset: Dataset containing visual and textual inputs.
        output_folder: Folder to save the attended features.
        batch_size: Batch size for processing.
    """
    os.makedirs(output_folder, exist_ok=True)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
    model.eval()

    # Feed inputs to whatever device the model lives on instead of assuming CUDA.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    sample_names = getattr(dataset, "embedding_files", None)

    sample_index = 0  # position in the (unshuffled) dataset
    with torch.no_grad():
        for visual, textual in dataloader:
            visual, textual = visual.to(device), textual.to(device)
            attended_features, _ = model(visual, textual)  # Get attended features

            # Save every row of the batch as its own file.
            for row in attended_features.cpu().numpy():
                if sample_names is not None:
                    # np.save appends ".npy" when missing; normalise up front so the
                    # printed path is the path actually written.
                    filename = os.path.splitext(sample_names[sample_index])[0] + ".npy"
                else:
                    filename = f"{sample_index + 1}.npy"
                feature_path = os.path.join(output_folder, filename)
                np.save(feature_path, row)
                print(f"Saved attended feature to {feature_path}")
                sample_index += 1

# Usage
if __name__ == "__main__":
    # NOTE: machine-specific absolute paths; adjust them when running elsewhere.
    bilstm_folder = "C:/Users/prath/OneDrive/Desktop/bilstm_features_10"
    embedding_folder = "C:/Users/prath/OneDrive/Desktop/paper2/embedding_vectors_1_10"
    output_folder = "C:/Users/prath/OneDrive/Desktop/attended_features10"

    feature_dim = 1024
    dataset = CrossModalDataset(bilstm_folder, embedding_folder)

    # Load trained model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = CrossModalModel(feature_dim).to(device)
    # weights_only=True restricts unpickling to tensors/plain containers, which is
    # all a state_dict holds; it also silences the torch.load FutureWarning.
    # map_location lets a GPU-saved checkpoint load on a CPU-only machine.
    state_dict = torch.load("cross_modal_model.pth", map_location=device, weights_only=True)
    model.load_state_dict(state_dict)

    # Extract and save features
    extract_attended_features(model, dataset, output_folder)


  model.load_state_dict(torch.load("cross_modal_model.pth"))


Saved attended feature to C:/Users/prath/OneDrive/Desktop/attended_features10\1.npy
Saved attended feature to C:/Users/prath/OneDrive/Desktop/attended_features10\2.npy
Saved attended feature to C:/Users/prath/OneDrive/Desktop/attended_features10\3.npy
Saved attended feature to C:/Users/prath/OneDrive/Desktop/attended_features10\4.npy
Saved attended feature to C:/Users/prath/OneDrive/Desktop/attended_features10\5.npy
Saved attended feature to C:/Users/prath/OneDrive/Desktop/attended_features10\6.npy
Saved attended feature to C:/Users/prath/OneDrive/Desktop/attended_features10\7.npy
Saved attended feature to C:/Users/prath/OneDrive/Desktop/attended_features10\8.npy
Saved attended feature to C:/Users/prath/OneDrive/Desktop/attended_features10\9.npy
Saved attended feature to C:/Users/prath/OneDrive/Desktop/attended_features10\10.npy
Saved attended feature to C:/Users/prath/OneDrive/Desktop/attended_features10\11.npy
Saved attended feature to C:/Users/prath/OneDrive/Desktop/attended_feature

COMPACT BILINEAR POOLING

In [6]:
import os
import numpy as np
import torch
import torch.nn as nn
from torch.nn.functional import normalize

# Compact Bilinear Pooling Implementation
class CompactBilinearPooling(nn.Module):
    """Compact bilinear pooling of two feature vectors via sketching and FFT.

    Each input is projected to ``output_dim`` buckets: feature ``i`` is
    multiplied by a fixed random sign (+1/-1) and added to a fixed random
    bucket. The two sketches are then combined by circular convolution,
    computed as an element-wise product in the FFT domain, and the result is
    L2-normalised per row.

    Args:
        input_dim1: Width of the first input.
        input_dim2: Width of the second input.
        output_dim: Width of the pooled output.
        seed: Optional integer seed for the random buckets and signs. ``None``
            (default) draws them from the global torch RNG. Pass a fixed seed
            when features pooled in different sessions must be comparable,
            because the buckets/signs define the projection.
    """

    def __init__(self, input_dim1, input_dim2, output_dim, seed=None):
        super(CompactBilinearPooling, self).__init__()
        self.output_dim = output_dim

        generator = None
        if seed is not None:
            generator = torch.Generator()
            generator.manual_seed(seed)

        # Fixed (non-trainable) hash buckets and signs; stored on the module so
        # they travel with state_dict() and .to(device).
        self.sketch1 = nn.Parameter(
            torch.randint(0, output_dim, (input_dim1,), generator=generator, dtype=torch.long),
            requires_grad=False,
        )
        self.sign1 = nn.Parameter(
            torch.randint(0, 2, (input_dim1,), generator=generator, dtype=torch.float32) * 2 - 1,
            requires_grad=False,
        )
        self.sketch2 = nn.Parameter(
            torch.randint(0, output_dim, (input_dim2,), generator=generator, dtype=torch.long),
            requires_grad=False,
        )
        self.sign2 = nn.Parameter(
            torch.randint(0, 2, (input_dim2,), generator=generator, dtype=torch.float32) * 2 - 1,
            requires_grad=False,
        )

    def _sketch(self, x, buckets, signs, arg_name):
        """Scatter-add the signed columns of ``x`` into ``output_dim`` buckets."""
        if x.dim() != 2 or x.size(1) != buckets.numel():
            raise ValueError(
                f"{arg_name} must have shape (batch, {buckets.numel()}), got {tuple(x.shape)}"
            )
        sketch = torch.zeros(x.size(0), self.output_dim, device=x.device)
        # Follow the input's device so a module left on the CPU still works.
        signed = (x * signs.to(x.device)).to(sketch.dtype)
        # One vectorised scatter-add replaces a per-feature Python loop;
        # columns that share a bucket are summed.
        sketch.index_add_(1, buckets.to(x.device), signed)
        return sketch

    def forward(self, x1, x2):
        """Pool ``x1`` and ``x2`` into a unit-norm ``(batch, output_dim)`` tensor.

        Args:
            x1: Tensor of shape ``(batch, input_dim1)``.
            x2: Tensor of shape ``(batch, input_dim2)``.

        Returns:
            Row-wise L2-normalised circular convolution of the two sketches.

        Raises:
            ValueError: If an input is not 2-D or its width does not match the
                dimension given at construction.
        """
        x1_sketch = self._sketch(x1, self.sketch1, self.sign1, "x1")
        x2_sketch = self._sketch(x2, self.sketch2, self.sign2, "x2")

        # Perform FFT, element-wise multiplication, and inverse FFT
        # (equivalent to circular convolution of the two sketches).
        fft_x1 = torch.fft.rfft(x1_sketch, dim=1)
        fft_x2 = torch.fft.rfft(x2_sketch, dim=1)
        fft_product = fft_x1 * fft_x2
        result = torch.fft.irfft(fft_product, n=self.output_dim, dim=1)

        return normalize(result, p=2, dim=1)  # Normalize the output

    
   # Paths to your data folders
attended_features_folder = "C:/Users/prath/OneDrive/Desktop/attended_features10"
embedding_vectors_folder = "C:/Users/prath/OneDrive/Desktop/paper2/embedding_vectors_1_10"
output_folder = "C:/Users/prath/OneDrive/Desktop/pooled features10"

# Hyperparameters
input_dim1, input_dim2, output_dim = 1024, 1024, 2048  # Adjust dimensions as needed

# The pooling layer draws its random buckets/signs at construction time. Seed the
# RNG first so that features pooled in different sessions use the same projection
# and remain comparable.
CBP_SEED = 0
torch.manual_seed(CBP_SEED)
cbp = CompactBilinearPooling(input_dim1, input_dim2, output_dim)

# Ensure output folder exists
os.makedirs(output_folder, exist_ok=True)

# Process data (sorted so the processing order is deterministic)
with torch.no_grad():
    for filename in sorted(os.listdir(attended_features_folder)):
        # Attended features and embeddings are paired by identical file name.
        attended_path = os.path.join(attended_features_folder, filename)
        embedding_path = os.path.join(embedding_vectors_folder, filename)

        if not os.path.isfile(attended_path):
            continue
        if not os.path.isfile(embedding_path):
            # Report the gap instead of dropping the sample silently.
            print(f"No embedding vector found for {filename}. Skipping...")
            continue

        attended_visual_features = torch.tensor(np.load(attended_path), dtype=torch.float32)
        embedding_vectors = torch.tensor(np.load(embedding_path), dtype=torch.float32)

        # The pooling layer expects (batch, dim); promote bare vectors to a batch of one.
        if attended_visual_features.dim() == 1:
            attended_visual_features = attended_visual_features.unsqueeze(0)
        if embedding_vectors.dim() == 1:
            embedding_vectors = embedding_vectors.unsqueeze(0)

        # Ensure feature dimensions are aligned
        if attended_visual_features.shape[1] != embedding_vectors.shape[1]:
            print(f"Feature size mismatch for {filename}. Skipping...")
            continue

        # Perform Compact Bilinear Pooling
        pooled_features = cbp(attended_visual_features, embedding_vectors)

        # Save pooled features
        output_path = os.path.join(output_folder, filename)
        np.save(output_path, pooled_features.detach().cpu().numpy())
        print(f"Processed and saved pooled features for {filename} to {output_path}")


Processed and saved pooled features for 1.npy to C:/Users/prath/OneDrive/Desktop/pooled features10\1.npy
Processed and saved pooled features for 10.npy to C:/Users/prath/OneDrive/Desktop/pooled features10\10.npy
Processed and saved pooled features for 11.npy to C:/Users/prath/OneDrive/Desktop/pooled features10\11.npy
Processed and saved pooled features for 12.npy to C:/Users/prath/OneDrive/Desktop/pooled features10\12.npy
Processed and saved pooled features for 13.npy to C:/Users/prath/OneDrive/Desktop/pooled features10\13.npy
Processed and saved pooled features for 14.npy to C:/Users/prath/OneDrive/Desktop/pooled features10\14.npy
Processed and saved pooled features for 15.npy to C:/Users/prath/OneDrive/Desktop/pooled features10\15.npy
Processed and saved pooled features for 16.npy to C:/Users/prath/OneDrive/Desktop/pooled features10\16.npy
Processed and saved pooled features for 17.npy to C:/Users/prath/OneDrive/Desktop/pooled features10\17.npy
Processed and saved pooled features for