#### Import

In [14]:
import os
import sys
# Run from the notebooks directory so the relative paths below resolve.
# NOTE: machine-specific absolute path -- change if necessary.
os.chdir('/home/sayem/Desktop/ImageCaption/notebooks')

# Expose the project's src directory to the import system. The membership
# check keeps sys.path free of duplicate entries when this cell is re-run.
if '../src' not in sys.path:
    sys.path.append('../src')

In [15]:
# import sys
# # Add the src directory to the Python path
# sys.path.append('../src')
import torch
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
from pathlib import Path
from sklearn.model_selection import train_test_split

import pandas as pd
# Widen the pandas display limits so large frames are not truncated:
# show up to 500 rows ...
pd.options.display.max_rows = 500
# ... and up to 1500 columns.
pd.options.display.max_columns = 1500

# Root directory of the COCO data used by the cells below.
data_folder = "/media/sayem/510B93E12554BBD1/CocoData"

#### Dataset Building

In [16]:
import torchvision.transforms as transforms
from torchvision.datasets import CocoCaptions
from torch.utils.data import DataLoader
from dataset.dataset import CocoCaptionsDataset, collate_fn, Vocabulary

# Preprocessing pipeline applied to each image before it reaches the model.
transform = transforms.Compose([
    # Resize (aspect ratio preserved), then keep the central 224x224 patch.
    transforms.Resize(256),
    transforms.CenterCrop(224),
    # Random augmentation: horizontal flip and mild colour perturbation.
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1, hue=0.1),
    # Convert to a PyTorch tensor and normalise with the per-channel mean / std
    # used for pretrained models.
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
])

# # Enhanced image transformations
# transform = transforms.Compose([
#     transforms.Resize(256),  # Resize the image to a larger size
#     transforms.RandomCrop(224),  # Randomly crop the image to 224x224
#     transforms.RandomHorizontalFlip(),  # Randomly flip the image horizontally
#     transforms.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1, hue=0.1),  # Randomly change color properties
#     transforms.ToTensor(),  # Convert the image to a PyTorch tensor
#     transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))  # Normalize with mean and std for pretrained models
# ])

In [17]:
import os
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter
from pycocotools.coco import COCO

# Directory that holds the NLTK resources; created if it does not exist yet.
nltk_data_path = "/home/sayem/nltk_data"
os.makedirs(nltk_data_path, exist_ok=True)

# Register the directory with NLTK's resource search path. The membership
# check keeps the list free of duplicate entries when this cell is re-run.
if nltk_data_path not in nltk.data.path:
    nltk.data.path.append(nltk_data_path)

# Download the 'punkt' tokenizer models
nltk.download('punkt', download_dir=nltk_data_path)

def build_vocab(json_file, threshold=5):
    """Build a Vocabulary from the captions in a COCO annotation file.

    Every caption is lower-cased and tokenized with ``word_tokenize``. Tokens
    are counted over the whole file, and only those that occur at least
    ``threshold`` times are added to the vocabulary, in first-seen order.

    Args:
        json_file: Path to the COCO captions annotation JSON file.
        threshold: Minimum number of occurrences a token needs to be kept.

    Returns:
        A ``Vocabulary`` populated through ``add_word`` with the kept tokens.
    """
    coco = COCO(json_file)

    # Count token frequencies across all captions. Iterating over the
    # annotation records directly avoids an unused index and a loop variable
    # that shadows the builtin ``id``.
    counter = Counter()
    for annotation in coco.anns.values():
        caption = str(annotation['caption']).lower()
        counter.update(word_tokenize(caption))

    # Keep only the tokens that are frequent enough.
    vocab = Vocabulary()
    for word, count in counter.items():
        if count >= threshold:
            vocab.add_word(word)

    return vocab

# Build the vocabulary from the training-split caption annotations.
vocab = build_vocab(
    json_file=os.path.join(data_folder, 'annotations', 'captions_train2017.json'),
)

[nltk_data] Downloading package punkt to /home/sayem/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


loading annotations into memory...
Done (t=0.42s)
creating index...
index created!


In [18]:
# Training split: images under train2017 paired with their caption annotations.
train_dataset = CocoCaptionsDataset(
    root_dir=data_folder+'/train2017',
    ann_file=data_folder+'/annotations/captions_train2017.json',
    vocab=vocab,
    transform=transform,
)

# Loader over the training split: shuffled, one sample per batch.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=1,
    shuffle=True,
    collate_fn=collate_fn,
)

loading annotations into memory...
Done (t=0.41s)
creating index...
index created!


In [19]:
# Deterministic preprocessing for evaluation: the same resize / centre crop /
# normalisation as the training pipeline, but without the random horizontal
# flip and colour jitter, so validation inputs are identical between runs.
val_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
])

# Create the validation dataset
val_dataset = CocoCaptionsDataset(
    root_dir=data_folder + '/val2017',  # Path to validation images
    ann_file=data_folder + '/annotations/captions_val2017.json',  # Path to validation annotations
    vocab=vocab,  # Vocabulary instance
    transform=val_transform  # Augmentation-free image transformations
)

# Create the DataLoader for the validation dataset
val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=32,  # You can adjust the batch size if needed
    shuffle=False,  # Keep a fixed order so evaluation is repeatable
    collate_fn=collate_fn  # Use the same collate function as for the training dataset
)

loading annotations into memory...
Done (t=0.03s)
creating index...
index created!


In [20]:
# Pull one batch from the validation loader and report the shape of each of
# the three elements it yields.
img, caption, length = next(iter(val_loader))

print(img.shape, caption.shape, length.shape, sep="\n")

torch.Size([32, 3, 224, 224])
torch.Size([32, 19])
torch.Size([32])


#### Model Building

In [21]:
from models.enocders import EncoderCNN

In [22]:
# Smoke test: push a random batch through the encoder and inspect the result.
# Dummy input laid out as [batch_size, channels, height, width]:
# 32 images, 3 colour channels, 224x224 pixels.
dummy_input = torch.randn((32, 3, 224, 224))

# Build the encoder with an example embedding size.
embed_size = 256
encoder = EncoderCNN(embed_size)

# Forward pass on the dummy batch.
output_features = encoder(dummy_input)

# Expected size: [32, embed_size]
print("Output features size:", output_features.size())

Output features size: torch.Size([32, 256])
