In [46]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
pip install pycoco

In [None]:
import torchvision.datasets as dset
import torchvision.transforms as transforms
cap = dset.CocoCaptions(root = '/kaggle/input/coco-2017-dataset/coco2017/train2017',
                        annFile = '/kaggle/input/coco-2017-dataset/coco2017/annotations/captions_train2017.json',
                        transform=transforms.ToTensor())

print('Number of samples: ', len(cap))
img, target = cap[3] # load 4th sample

print("Image Size: ", img.size())
print(target)

In [None]:
pip install transformers

In [47]:
from transformers import AutoTokenizer, AutoModelForMaskedLM

In [48]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
model = AutoModelForMaskedLM.from_pretrained("google-bert/bert-base-uncased")

Some weights of the model checkpoint at google-bert/bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [49]:


import os
import json
from PIL import Image
from pycocotools.coco import COCO
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import random


In [50]:

class COCOImageCaptionDataset(Dataset):
    def __init__(self, img_dir, annotations_file, transform=None):
        self.img_dir = img_dir
        
        # Load annotations
        with open(annotations_file, 'r') as f:
            self.annotations = json.load(f)
        
        # Define default transform if not provided
        self.transform = transform or transforms.Compose([
            transforms.RandomGrayscale(p=0.3),
            transforms.RandomRotation(5),
            transforms.RandomHorizontalFlip(p=0.5),
            transforms.ColorJitter(brightness=0.1, contrast=0.1),
            transforms.RandomResizedCrop(224, scale=(0.9, 1.0)),
            transforms.GaussianBlur(kernel_size=(3, 3), sigma=(0.1, 1.0)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])
        
        # Initialize tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")

        
        # Get list of image files
        self.image_files = [
            os.path.join(img_dir, file)
            for file in os.listdir(img_dir)
            if file.endswith(('.png', '.jpg', '.jpeg', '.bmp', '.tiff'))
        ]
    
    def __len__(self):
        return len(self.annotations['annotations'])
    
    def __getitem__(self, idx):
        # Get annotation
        ann = self.annotations['annotations'][idx]
        
        # Load image
        img_path = f"{self.img_dir}/{ann['image_id']:012d}.jpg"
        image = Image.open(img_path).convert('RGB')
        
        # Apply transformations
        if self.transform:
            image = self.transform(image)
        
        # Anchor caption
        print(ann)
        caption = ann['caption']
        encoding = self.tokenizer(caption, padding='max_length', 
                                  truncation=True, max_length=64,
                                  return_tensors='pt')
        
        # Negative caption (random from dataset)
        neg_idx = random.choice([i for i in range(len(self)) if i != idx])
        
        neg_caption = self.annotations['annotations'][neg_idx]['caption']
        print(neg_caption)
        neg_encoding = self.tokenizer(neg_caption, padding='max_length',
                                      truncation=True, max_length=64,
                                      return_tensors='pt')
        
        return {
            'image': image,
            'caption_ids': encoding['input_ids'].squeeze(0),
            'caption_mask': encoding['attention_mask'].squeeze(0),
            'neg_caption_ids': neg_encoding['input_ids'].squeeze(0),
            'neg_caption_mask': neg_encoding['attention_mask'].squeeze(0)
        }

In [51]:
ob = COCOImageCaptionDataset("/kaggle/input/coco-2017-dataset/coco2017/train2017", "/kaggle/input/coco-2017-dataset/coco2017/annotations/captions_train2017.json")

{'image_id': 106140, 'id': 98, 'caption': 'A large passenger airplane flying through the air.'}
A large pink and white bird standing in a pool of water.


{'image': tensor([[[-1.4158, -0.8678, -0.4739,  ..., -2.1179, -2.1179, -2.1179],
          [-1.2959, -0.7822, -0.4397,  ..., -2.1179, -2.1179, -2.1179],
          [-1.1760, -0.6965, -0.4054,  ..., -2.1179, -2.1179, -2.1179],
          ...,
          [-2.1179, -2.1179, -2.1179,  ..., -1.1589, -1.1760, -1.1589],
          [-2.1179, -2.1179, -2.1179,  ..., -1.1589, -1.1075, -1.0390],
          [-2.1179, -2.1179, -2.1179,  ..., -1.1075, -1.0048, -0.8507]],
 
         [[-1.3004, -0.7227, -0.2850,  ..., -2.0357, -2.0357, -2.0357],
          [-1.1779, -0.6352, -0.2675,  ..., -2.0357, -2.0357, -2.0357],
          [-1.0203, -0.5126, -0.2325,  ..., -2.0357, -2.0357, -2.0357],
          ...,
          [-2.0357, -2.0357, -2.0357,  ..., -0.7927, -0.7752, -0.7752],
          [-2.0357, -2.0357, -2.0357,  ..., -0.7577, -0.7227, -0.6877],
          [-2.0357, -2.0357, -2.0357,  ..., -0.7227, -0.6877, -0.6352]],
 
         [[-1.0027, -0.3753,  0.0779,  ..., -1.8044, -1.8044, -1.8044],
          [-0.8633,

In [61]:
from torch import nn

In [62]:
class ImageEncoder(nn.Module):
    def __init__(self, projection_dim=128):
        super(ImgEncoder_CNN, self).__init__()
        
        base_model = models.resnet50(pretrained=False)
        self.base_model = nn.Sequential(*list(base_model.children())[:-1])
        
        # Freezeing the parameters of the base model
        for param in self.base_model.parameters():
            param.requires_grad = False  # Corrected attribute name
        
        # Define the projection head
        self.projection_head = ProjectionHead(2048, 256, projection_dim)  # Corrected input_dim

    def forward(self, x):
    
        # Extract features from the base model
        h = self.base_model(x).squeeze()  # Shape: [batch_size, 2048]
        
        # Pass through the projection head
        z = self.projection_head(h)  # Shape: [batch_size, projection_dim]
        
        # Normalize the output embeddings
        return normalize(z, dim=1)