<a href="https://colab.research.google.com/github/rahiakela/deep-learning-research-and-practice/blob/main/deep-learning-fundamentals/unit04-multilayer-networks/3_efficient_data_loaders.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Unit 4.4: Defining Efficient Data Loaders

How to set up a `DataLoader` for a folder containing image files.

## Setup

In [None]:
!pip install GitPython

In [2]:
import os
from git import Repo

# Clone the MNIST PNG repository only when the target path does not exist yet,
# so re-running this cell does not attempt a second clone.
dataset_present = os.path.exists("mnist-pngs")
if not dataset_present:
  Repo.clone_from(url="https://github.com/rasbt/mnist-pngs", to_path="mnist-pngs")

In [9]:
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torchvision.utils as vutils
from PIL import Image
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from torchvision import transforms

In [11]:
def viz_batch_images(batch, max_images=64):
  """Show the leading images of one mini-batch as a single grid figure.

  Args:
    batch: Indexable whose first element holds the images; only `batch[0]` is
      read (presumably an `(images, labels)` pair from a `DataLoader`, with
      images shaped (N, C, H, W) -- confirm against the caller).
    max_images: Upper bound on how many leading images are placed in the grid.

  Note:
    Uses the module-level imports `np` (numpy), `vutils` (torchvision.utils)
    and `plt` (matplotlib.pyplot).
  """
  # Tile the images into one picture; normalize=True rescales pixel values
  # into the [0, 1] range so they display correctly.
  grid = vutils.make_grid(batch[0][:max_images], padding=2, normalize=True)

  plt.figure(figsize=(8, 8))
  plt.axis("off")
  plt.title("Training images")
  # make_grid yields a channels-first image (C, H, W); imshow wants (H, W, C).
  plt.imshow(np.transpose(grid, (1, 2, 0)))
  plt.show()

## 1) Data Preparation

- Here, we check the CSV files listing the image names and labels

In [4]:
# Load the CSV that lists the training images and preview its first rows.
df_train = pd.read_csv('mnist-pngs/train.csv')
df_train.head()

Unnamed: 0,filepath,label
0,train/0/16585.png,0
1,train/0/24537.png,0
2,train/0/25629.png,0
3,train/0/20751.png,0
4,train/0/34730.png,0


In [5]:
# Load the CSV that lists the test images and preview its first rows.
df_test = pd.read_csv('mnist-pngs/test.csv')
df_test.head()

Unnamed: 0,filepath,label
0,test/0/66062.png,0
1,test/0/64675.png,0
2,test/0/62204.png,0
3,test/0/60407.png,0
4,test/0/67368.png,0


## 2) Creating a validation split

- MNIST doesn't come with a validation set partition, so we are creating it here from the training set, using 10% of the training data for validation.

In [6]:
# Re-read the training CSV and shuffle all rows (frac=1) with a fixed seed
# so the split below is the same on every run.
df_train = pd.read_csv('mnist-pngs/train.csv').sample(frac=1, random_state=123)

# Position of the first validation row: the leading 90% of the shuffled rows
# become the new training set, the rest the validation set.
loc = round(len(df_train) * 0.9)
df_new_train, df_new_val = df_train.iloc[:loc], df_train.iloc[loc:]

# Write both partitions next to the original CSV files, without the DataFrame index.
df_new_val.to_csv('mnist-pngs/new_val.csv', index=None)
df_new_train.to_csv('mnist-pngs/new_train.csv', index=None)

In [7]:
# Preview the first rows of the new (shuffled) training partition.
df_new_train.head()

Unnamed: 0,filepath,label
29561,train/4/38855.png,4
26640,train/4/17837.png,4
24498,train/3/7672.png,3
24594,train/3/17906.png,3
24249,train/3/41969.png,3


In [8]:
# Preview the first rows of the held-out validation partition.
df_new_val.head()

Unnamed: 0,filepath,label
8023,train/1/9221.png,1
26302,train/4/32108.png,4
54489,train/9/7053.png,9
2712,train/0/32086.png,0
1463,train/0/12095.png,0


## 3) Defining the Dataset Class

In [12]:
class MyDataset(Dataset):
  """Map-style dataset over image files listed in a CSV file.

  The CSV must provide a "filepath" column (image path, joined onto `img_dir`)
  and a "label" column.
  """

  def __init__(self, csv_path, img_dir, transform=None):
    """Read the CSV and remember where the images live.

    Args:
      csv_path: Path of the CSV file with "filepath" and "label" columns.
      img_dir: Directory that the "filepath" entries are relative to.
      transform: Optional callable applied to each opened image.
    """
    df = pd.read_csv(csv_path)

    self.img_dir = img_dir
    self.transform = transform

    # based on DataFrame columns
    self.img_names = df["filepath"]
    self.labels = df["label"]

  def __getitem__(self, index):
    """Return the `(image, label)` pair stored at position `index`.

    Lookups are positional (`.iloc`), not label-based: negative indices count
    from the end, and an out-of-range index raises `IndexError`, which is what
    lets plain `for img, label in dataset` iteration terminate.

    Raises:
      IndexError: If `index` is outside the dataset.
    """
    img_path = os.path.join(self.img_dir, self.img_names.iloc[index])
    img = Image.open(img_path)

    if self.transform is not None:
      img = self.transform(img)

    label = self.labels.iloc[index]
    return img, label

  def __len__(self):
    """Return the number of rows (samples) listed in the CSV."""
    return len(self.labels)

## 4) Defining optional image transformations

In [13]:
# Preprocessing pipelines keyed by split name.
# Both resize to 32 and crop back to 28x28; "train" crops at a random position
# (augmentation) while "test" always takes the center crop.
# Mean/std are written as one-element tuples -- `(0.5)` is just the float 0.5.
data_transforms = {
  "train": transforms.Compose([
    transforms.Resize(32),
    transforms.RandomCrop((28, 28)),
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))  # normalize images to [-1, 1] range
  ]),
  "test": transforms.Compose([
    transforms.Resize(32),
    transforms.CenterCrop((28, 28)),
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))  # normalize images to [-1, 1] range
  ]),
}

## 5) Defining the data loaders

In [14]:
def _make_dataset(csv_path, split):
  """Build a `MyDataset` for `csv_path` using the transform registered under `split`."""
  return MyDataset(
    csv_path=csv_path,
    img_dir="mnist-pngs/",
    transform=data_transforms[split],
  )


# Validation and test data share the "test" pipeline (center crop, no random crop).
train_dataset = _make_dataset("mnist-pngs/new_train.csv", "train")
val_dataset = _make_dataset("mnist-pngs/new_val.csv", "test")
test_dataset = _make_dataset("mnist-pngs/test.csv", "test")

In [16]:
# Settings common to all three loaders.
common_loader_kwargs = dict(batch_size=32, num_workers=2)

# Training loader: shuffled, and the final incomplete batch is dropped.
train_loader = DataLoader(
  train_dataset, shuffle=True, drop_last=True, **common_loader_kwargs
)

# Evaluation loaders: fixed order, every sample kept.
val_loader = DataLoader(val_dataset, shuffle=False, **common_loader_kwargs)
test_loader = DataLoader(test_dataset, shuffle=False, **common_loader_kwargs)

## 6) Testing the data loaders

In [17]:
# Smoke test: pull a few batches from the training loader and report their shapes.
num_epochs = 1
num_batches_to_show = 3

for epoch in range(num_epochs):
  for batch_idx, (x, y) in enumerate(train_loader):
    print(f"Batch index: {batch_idx} | Batch size: {y.shape[0]} | x shape: {x.shape} | y shape: {y.shape}")
    # Stop right after the last reported batch, so that `y` below still refers
    # to a batch that was actually printed and no extra batch is loaded.
    if batch_idx + 1 >= num_batches_to_show:
      break

print(f"Labels from current batch: {y}")

Batch index: 0 | Batch size: 32 | x shape: torch.Size([32, 1, 28, 28]) | y shape: torch.Size([32])
Batch index: 1 | Batch size: 32 | x shape: torch.Size([32, 1, 28, 28]) | y shape: torch.Size([32])
Batch index: 2 | Batch size: 32 | x shape: torch.Size([32, 1, 28, 28]) | y shape: torch.Size([32])
Labels from current batch: tensor([7, 8, 6, 1, 0, 8, 0, 8, 2, 7, 4, 7, 9, 3, 9, 9, 0, 8, 3, 0, 8, 1, 0, 6,
        1, 4, 7, 0, 3, 9, 9, 3])
