<a href="https://colab.research.google.com/github/srishti-git1110/Lets-go-deep-with-PyTorch/blob/main/Dataset%20and%20DataPipes%20blog/Dataset_and_DataPipe_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Getting the dataset from Kaggle.**

Here's the link for you to check it out - https://www.kaggle.com/datasets/lefterislymp/neuralsntua-image-captioning

In [None]:
!pip install -q kaggle
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!ls ~/.kaggle
!chmod 600 /root/.kaggle/kaggle.json
!kaggle datasets download -d lefterislymp/neuralsntua-image-captioning
!unzip /content/neuralsntua-image-captioning.zip

# PyTorch makes deep learning easier and highly accessible, and so we "depend" a lot on it.
<br> So, some dependencies -

In [None]:
import torch.nn as nn
import torch
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms

from PIL import Image
import os
import pandas as pd

!pip install transformers
from transformers import AutoTokenizer

**Custom Dataset class**

In [None]:
class KaggleImageCaptioningDataset(Dataset):
  """Map-style dataset that yields (image, tokenized caption) pairs.

  The captions file is read as a header-less, '|'-separated CSV. Column 0 is
  used as the image file name (relative to ``root_dir``) and column 2 as the
  caption text.

  Args:
    train_captions: Path of the captions CSV file.
    root_dir: Directory that contains the image files.
    transform: Optional callable applied to the RGB PIL image.
    bert_model: Name handed to ``AutoTokenizer.from_pretrained``.
    max_len: Length every caption is padded / truncated to.
  """

  def __init__(self, train_captions, root_dir, transform=None, bert_model='distilbert-base-uncased', max_len=512):
    self.df = pd.read_csv(train_captions, header=None, sep='|')
    self.root_dir = root_dir
    self.transform = transform
    self.tokenizer = AutoTokenizer.from_pretrained(bert_model)
    self.max_len = max_len

    # Column 0 holds the image file names, column 2 the caption text.
    self.images = self.df.iloc[:,0]
    self.captions = self.df.iloc[:,2]

  def __len__(self):
    """Return the number of rows in the captions file."""
    return len(self.df)


  def __getitem__(self, idx):
    """Return ``(image, input_ids)`` for the row at position ``idx``.

    Rows are looked up by position (``.iloc``) rather than by index label.
    Negative indices therefore work, and an out-of-range index raises
    ``IndexError`` -- the exception Python's sequence-iteration protocol
    needs in order to stop ``for sample in dataset`` cleanly.
    """
    caption = self.captions.iloc[idx]
    image_id = self.images.iloc[idx]
    path_to_image = os.path.join(self.root_dir, image_id)
    image = Image.open(path_to_image).convert('RGB')

    if self.transform is not None:
      image = self.transform(image)

    # NOTE(review): with return_tensors='pt' the tokenizer presumably returns
    # input_ids with a leading batch axis of size 1 -- confirm before feeding
    # batches to a model.
    tokenized_caption = self.tokenizer(caption,
                                      padding='max_length',  # Pad to max_length
                                      truncation=True,  # Truncate to max_length
                                      max_length=self.max_len,
                                      return_tensors='pt')['input_ids']

    return image, tokenized_caption

# Let's load the data with the mighty DataLoader

In [None]:
# Locations of the extracted Kaggle files and the checkpoint whose tokenizer is used.
root_dir = '/content/flickr30k-images-ecemod/image_dir'
train_captions = '/content/train_captions.csv'
bert_model = 'distilbert-base-uncased'

# Image preprocessing: resize, centre-crop to 224, then convert the PIL image to a tensor.
transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.PILToTensor(),
])

train_dataset = KaggleImageCaptioningDataset(
    train_captions=train_captions,
    root_dir=root_dir,
    transform=transform,
    bert_model=bert_model,
)

# Shuffled batches of 64, loaded with two workers.
train_loader = DataLoader(
    train_dataset,
    batch_size=64,
    num_workers=2,
    shuffle=True,
)

*Hoping everything went right...*

In [None]:
# Report the sizes of the first four batches (indices 0-3), then stop.
for batch_num, (image, caption) in enumerate(train_loader):
  if batch_num <= 3:
    print(f'batch number {batch_num} has {image.shape[0]} images and correspondingly {caption.shape[0]} tokenized captions')
  else:
    break

batch number 0 has 64 images and correspondingly 64 tokenized captions
batch number 1 has 64 images and correspondingly 64 tokenized captions
batch number 2 has 64 images and correspondingly 64 tokenized captions
batch number 3 has 64 images and correspondingly 64 tokenized captions


IT DID !!!

Now, let's look at the new DataPipes.



# DataPipes

In [2]:
!pip install torchdata
import torchdata.datapipes as dp
from torch.utils.data.backward_compatibility import worker_init_fn
from torch.utils.data import DataLoader

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [24]:
training_csv = '/content/train_captions.csv'

# Open the file -> parse '|'-separated rows -> shuffle with a 2000-row buffer
# -> apply the sharding filter.
train_dp = (
    dp.iter.FileOpener([training_csv])
    .parse_csv(delimiter='|')
    .shuffle(buffer_size=2000)
    .sharding_filter()
)

In [25]:
# Directory holding the images referenced by the captions CSV.
root_dir = '/content/flickr30k-images-ecemod/image_dir'
# Every caption is padded / truncated to this many tokens.
max_len = 512

def apply_image_transforms(image):
  """Resize, centre-crop to 224 and convert ``image`` to a tensor.

  Args:
    image: The image to preprocess (the caller passes an RGB PIL image).

  Returns:
    Whatever the composed torchvision pipeline returns for ``image``.
  """
  steps = [
      transforms.Resize(256),
      transforms.CenterCrop(224),
      transforms.PILToTensor(),
  ]
  return transforms.Compose(steps)(image)

def open_image_from_imagepath(row):
  """Turn one parsed CSV row into a training sample.

  Args:
    row: A 3-item sequence ``(image_id, <ignored>, caption)``.

  Returns:
    dict with key ``'image'`` (the transformed RGB image) and key
    ``'caption'`` (the tokenizer's ``input_ids`` for the caption).

  Reads the module-level names ``root_dir``, ``max_len`` and ``tokenizer``.
  """
  image_id, _, caption = row

  # Load the image from disk and preprocess it.
  rgb_image = Image.open(os.path.join(root_dir, image_id)).convert('RGB')
  image_tensor = apply_image_transforms(rgb_image)

  # Pad / truncate every caption to max_len tokens.
  encoding = tokenizer(
      caption,
      padding='max_length',
      truncation=True,
      max_length=max_len,
      return_tensors='pt',
  )
  return {'image': image_tensor, 'caption': encoding['input_ids']}

  
# Map each parsed CSV row to an {'image', 'caption'} sample, then batch the samples.
train_dp = train_dp.map(open_image_from_imagepath)
train_loader = DataLoader(
    dataset=train_dp,
    shuffle=True,
    batch_size=32,
    num_workers=2,
    worker_init_fn=worker_init_fn,
)

In [27]:
num_epochs = 1
bert_model = 'distilbert-base-uncased'    # use any model of your choice
# open_image_from_imagepath reads this module-level `tokenizer`, so it has to
# exist before the loader is iterated.
tokenizer = AutoTokenizer.from_pretrained(bert_model)

# Report the sizes of the first three batches (indices 0-2) of every epoch.
for epoch in range(num_epochs):
  for batch_num, batch_dict in enumerate(train_loader):
    if batch_num <= 2:
      images = batch_dict['image']
      captions = batch_dict['caption']
      print(f'Batch {batch_num} has {images.shape[0]} images and correspondingly {captions.shape[0]} captions')
    else:
      break



Batch 0 has 32 images and correspondingly 32 captions
Batch 1 has 32 images and correspondingly 32 captions
Batch 2 has 32 images and correspondingly 32 captions


# That's how the amazing 🤗 tokenizers work!

In [None]:
!pip install transformers
from transformers import AutoTokenizer

# Load the tokenizer of the chosen checkpoint and display what it returns for a short sentence.
bert_model = 'distilbert-base-uncased'    # use any model of your choice
tokenizer = AutoTokenizer.from_pretrained(bert_model)
sample_text = 'hi how are you'
tokenizer(sample_text)

In [None]:
# Same checkpoint and tokenizer as the previous cell, created again here.
bert_model = 'distilbert-base-uncased'    # use any model of your choice
tokenizer = AutoTokenizer.from_pretrained(
    bert_model
)


**Thanks for going through my notebook. I hope to see you in a new PyTorch blog of mine!** 👋