In [7]:
import torch
import torchvision
from torchvision.models.detection import fasterrcnn_resnet50_fpn
from torchvision.transforms import functional as F
import torch.nn.functional as Fnn
from torch import nn

In [13]:
class ImageModalityEncoder(nn.Module):
    def __init__(self, num_objects=100, vector_dim=256):
        super(ImageModalityEncoder, self).__init__()
        self.num_objects = num_objects
        self.vector_dim = vector_dim

        # Load pretrained Faster R-CNN model
        self.faster_rcnn = fasterrcnn_resnet50_fpn(pretrained=True)
        self.faster_rcnn.eval()  # Set model to eval mode

        # Define the learnable parameters for position-aware encoding
        self.Wf = nn.Linear(1024, vector_dim)
        self.Wb = nn.Linear(4, vector_dim)
        self.bf = nn.Parameter(torch.zeros(vector_dim))
        self.bb = nn.Parameter(torch.zeros(vector_dim))

        # Layer normalization
        self.norm_f = nn.LayerNorm(vector_dim)
        self.norm_b = nn.LayerNorm(vector_dim)

    def forward(self, image):
        # Register hook to capture features from the backbone
        feature_maps = []
        
        def hook_fn(module, input, output):
            feature_maps.append(output)
        
        hook = self.faster_rcnn.backbone.register_forward_hook(hook_fn)
        
        # Apply Faster R-CNN to extract object detections
        with torch.no_grad():
            predictions = self.faster_rcnn([image])
        
        hook.remove()  # Remove the hook after the forward pass
        
        # The feature maps from the backbone are now stored in `feature_maps`
        feature_map = feature_maps[0]  # [batch_size, channels, height, width]
        
        # Get the top `num_objects` detected objects (bounding boxes and features)
        boxes = predictions[0]['boxes'][:self.num_objects]  # [num_objects, 4]

        # Normalize bounding boxes to the feature map dimensions
        image_height, image_width = image.shape[-2:]
        feature_map_height, feature_map_width = feature_map.shape[-2:]

        # Normalize bounding box coordinates to [0, 1] with respect to image size
        boxes[:, [0, 2]] /= image_width
        boxes[:, [1, 3]] /= image_height

        # Scale boxes to the feature map size
        boxes[:, [0, 2]] *= feature_map_width
        boxes[:, [1, 3]] *= feature_map_height

        # Add batch index (since RoIAlign expects [batch_idx, x1, y1, x2, y2])
        box_indices = torch.zeros(boxes.size(0), dtype=torch.int, device=boxes.device)
        boxes_with_indices = torch.cat([box_indices[:, None], boxes], dim=1)

        # Use RoIAlign to pool features for each bounding box
        roi_align = torchvision.ops.RoIAlign(output_size=(7, 7), spatial_scale=1.0, sampling_ratio=-1)
        pooled_features = roi_align(feature_map, [boxes_with_indices])  # [num_objects, channels, 7, 7]
        
        # Flatten pooled features and encode them
        pooled_features = pooled_features.view(pooled_features.size(0), -1)  # [num_objects, channels * 7 * 7]
        
        # Normalize and encode features and bounding box coordinates
        fj = self.norm_f(self.Wf(pooled_features)) + self.bf  # [num_objects, vector_dim]
        bj = self.norm_b(self.Wb(boxes)) + self.bb  # [num_objects, vector_dim]

        # Compute the final position-aware object representations
        vj = (fj + bj) / 2  # [num_objects, vector_dim]

        # Aggregate all object representations to get the final image representation
        V = torch.mean(vj, dim=0)  # [vector_dim]
        
        return V

In [14]:
# Example usage
def preprocess_image(image_path):
    from PIL import Image
    image = Image.open(image_path)
    image = F.to_tensor(image)  # Convert image to tensor
    return image

In [15]:
# Initialize the model
image_encoder = ImageModalityEncoder()

In [16]:
# Load and preprocess an example image
image_tensor = preprocess_image("D:/Project_phase_1/train2014/COCO_train2014_000000000009.jpg")

In [18]:
# Add a batch dimension
image_tensor = image_tensor.unsqueeze(0)

In [27]:
import torch
import torchvision.transforms as transforms
from torchvision.models.detection import fasterrcnn_resnet50_fpn
from PIL import Image

# Check if a GPU is available and use it if possible
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define the Image Encoder class
class ImageEncoder(torch.nn.Module):
    def __init__(self):
        super(ImageEncoder, self).__init__()
        self.faster_rcnn = fasterrcnn_resnet50_fpn(pretrained=True)
        self.faster_rcnn.eval()  # Set to evaluation mode

    def forward(self, image):
        # Ensure image is on the same device as the model
        image = image.to(device)
        with torch.no_grad():
            predictions = self.faster_rcnn([image])  # Pass image as a list
        print("Predictions Output: ", predictions)
        
        # Example processing for the output
        boxes = predictions[0]['boxes']
        labels = predictions[0]['labels']
        scores = predictions[0]['scores']

        # Use only boxes with a score > threshold
        threshold = 0.5
        valid_indices = scores > threshold
        boxes = boxes[valid_indices]
        
        # Further processing can go here (e.g., RoIAlign, etc.)

        return boxes  # Return the processed boxes or other features

# Create an instance of the ImageEncoder
image_encoder = ImageEncoder().to(device)

# Load an image from the MS COCO dataset
image_path = 'D:/Project_phase_1/train2014/COCO_train2014_000000000009.jpg'  # Update this path to your image file
image = Image.open(image_path).convert("RGB")

# Define the transformation
transform = transforms.Compose([
    transforms.ToTensor(),  # Convert to tensor
    transforms.Resize((480, 640)),  # Resize if necessary
])

# Preprocess the image
image_tensor = transform(image).unsqueeze(0).to(device)  # Add batch dimension and move to device

# Get the final image representation
image_representation = image_encoder(image_tensor.squeeze(0))  # Pass image tensor directly
print(image_representation.shape)  # Output the final vector dimension


Predictions Output:  [{'boxes': tensor([[3.1598e+02, 1.5025e+00, 6.1762e+02, 2.3067e+02],
        [2.5639e+02, 2.3405e+02, 5.7711e+02, 4.6684e+02],
        [0.0000e+00, 1.2453e+01, 4.3430e+02, 3.8766e+02],
        [3.9088e+02, 7.3975e+01, 4.6952e+02, 1.4071e+02],
        [4.6476e+02, 4.0634e+01, 5.2319e+02, 8.9265e+01],
        [3.3420e+01, 1.9109e+02, 6.1583e+02, 4.8000e+02],
        [3.6256e+02, 2.6677e+00, 4.5770e+02, 6.7999e+01],
        [3.2560e+02, 4.9800e+00, 4.9315e+02, 1.3748e+02],
        [4.6027e+02, 5.5890e+01, 5.7009e+02, 1.4032e+02],
        [3.7236e+02, 3.6787e+01, 4.5330e+02, 8.3875e+01],
        [0.0000e+00, 1.1023e+02, 2.3114e+02, 3.0810e+02],
        [5.1149e+02, 2.0075e+02, 6.4000e+02, 3.2278e+02],
        [0.0000e+00, 1.1126e+02, 2.2888e+02, 3.0539e+02],
        [1.3820e+02, 5.6518e+00, 5.4538e+02, 2.6526e+02],
        [5.0799e+02, 2.0302e+02, 6.4000e+02, 3.6213e+02],
        [9.1573e+00, 3.1162e+02, 3.5753e+02, 4.8000e+02],
        [3.0671e+02, 3.7328e+00, 4.5103e