# Build Model

In [1]:
import os
import torch
from torch import nn
import numpy as np
import PIL

In [2]:
# Set pytorch device
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [3]:
PROJECT_DIR = os.getcwd()  # get the current working directory
DATA_DIR = os.path.join(PROJECT_DIR, 'data') # Concatenate the folder name which contains images
DATA_DIR  # Check the final path value

'/home/bairathirahul/Workspace/image-captioning/data'

In [4]:
# Load dataset
meme_captions = np.load(os.path.join(DATA_DIR, 'meme_captions.npy'), allow_pickle=True)
print(meme_captions.shape)
meme_captions[0]

(368342, 3)


array(['yuno.jpg', 'y-u-no.jpg',
       'the real slim shady y you no stand up ?'], dtype='<U689')

In [5]:
from torchtext.data.utils import get_tokenizer

from torchtext.vocab import GloVe

# Torch Module using GloVe Embeddings
class CaptionEncoder(nn.Module):
    def __init__(self, embed_size):
        super(CaptionEncoder, self).__init__()

        self.embed_size = embed_size

        self.tokenizer = get_tokenizer('basic_english')
        
        self.embedding = GloVe(name='twitter.27B', dim=embed_size)

        self.dropout = nn.Dropout(p=0.2)

    def forward(self, x):

        tokens = self.tokenizer(x)

        print(self.embedding.device)

        vectors = self.embedding.get_vecs_by_tokens(tokens)

        return self.dropout(vectors)
    
text = meme_captions[0][1]
print(text)

model = CaptionEncoder(200).to(device)

embedding = model(text)

embedding = CaptionEncoder(200)(text)
print(embedding.device)
print(embedding.shape)
embedding

y-u-no.jpg


AttributeError: 'GloVe' object has no attribute 'device'

In [37]:
# Torch module for image encoding using Inception V3
import torchvision.models as models
import torchvision.transforms as transforms


class ImageEncoder(nn.Module):
    def __init__(self, embed_size):
        super(ImageEncoder, self).__init__()

        self.embed_size = embed_size

        self.inception = models.inception_v3(
            weights=models.Inception_V3_Weights.DEFAULT
        )

        self.inception.eval()

        self.fc = nn.Linear(in_features=1000, out_features=embed_size)

    def forward(self, x):
        vectors = self.inception(x)

        return self.fc(vectors)


with torch.no_grad():
    image = PIL.Image.open(os.path.join(DATA_DIR, "memes", "y-u-no.jpg"))

    image_tensors = transforms.ToTensor()(image)

    image_tensors_resized = transforms.Resize((299, 299), antialias=True)(
        image_tensors
    ).unsqueeze(0)

    embedding = ImageEncoder(200)(image_tensors_resized)

print(embedding.shape)

embedding

torch.Size([1, 200])


tensor([[-0.1927, -0.0785, -0.5948, -0.2257,  0.1356, -0.1189, -1.0368, -0.7186,
          0.3053, -0.1520,  0.7308, -0.0852, -0.7530, -0.2627, -0.3564,  0.2473,
          0.4449,  0.2278, -0.1392, -0.4025,  0.1228, -1.1415,  0.5589,  0.1911,
          0.2443,  0.2190, -0.8402,  0.3068, -0.0629, -0.2389, -0.1502, -0.4937,
         -0.3047,  0.4230, -0.0375,  0.5902,  0.1465, -0.4917, -0.1273, -0.6709,
          0.8164,  0.8518,  0.3649, -0.4915,  0.9703, -0.8993, -0.4560,  0.8441,
         -0.2487,  0.1850, -0.2462,  0.3159, -0.2192,  0.3389, -0.3913, -0.4828,
          0.1312, -1.5410,  0.6713, -0.7781, -0.2192, -0.6062, -0.0609,  1.2553,
         -0.4616,  0.0454,  1.1359,  0.4332,  0.3019,  1.0750,  0.7409,  0.5057,
         -0.2241, -0.4372, -0.1643, -0.6472, -0.2351,  0.1329,  0.7971,  0.1797,
          0.9245,  0.2165, -0.7465,  0.2341, -1.2789,  0.0423,  0.0151,  0.4509,
         -0.4216, -1.0433, -1.0251,  1.4620, -0.8937,  0.4200, -0.7396, -0.3900,
          0.8238,  0.7280,  

In [None]:
class ImageLabelEncoder(nn.Module):
    """ImageLabel encoder.

    Encodes images and text labels into a single embedding of size `emb_dim`.
    """

    def __init__(self, num_tokens, emb_dim=256, dropout=0.2):
        """Initializes LabelEncoder.

        Args:
            num_tokens: number of tokens in the vocabulary
            emb_dim (int): dimensions of the output embedding
            dropout (float): dropout for the encoded features
        """
        super().__init__()
        self.image_encoder = ImageEncoder(emb_dim, dropout)
        self.label_encoder = LabelEncoder(num_tokens, emb_dim, dropout)
        self.linear = nn.Linear(2 * emb_dim, emb_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, images, labels):
        """
        Args:
            images (torch.Tensor): input images of shape `[bs, width, height]`
            labels (torch.Tensor): input text labels of shape `[bs, seq_len]`

        Returns:
            torch.Tensor: combined image-label embedding of shape `[bs, emb_dim]`
        """
        image_emb = self.image_encoder(images)
        label_emb = self.label_encoder(labels)

        emb = torch.cat([image_emb, label_emb], dim=1)
        emb = self.dropout(self.linear(emb))

        return emb