Understanding the building blocks of data loading in PyTorch: the Dataset and DataLoader primitives

In [2]:
from pathlib import Path

import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import datasets, transforms

In [3]:
# Report whether this PyTorch build can currently use a CUDA device
torch.cuda.is_available()

False

Using a built-in dataset

In [4]:
# ToTensor converts each image into a tensor
transform = transforms.ToTensor()

# MNIST training split, stored under ./data (download requested)
train_dataset = datasets.MNIST("data", train=True, transform=transform, download=True)

# Serve the training set in shuffled mini-batches of 32
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# Pull a single batch and show the shapes of its images and labels
images, labels = next(iter(train_loader))
print(images.shape, labels.shape, sep="\n")

100%|██████████| 9.91M/9.91M [00:01<00:00, 6.20MB/s]
100%|██████████| 28.9k/28.9k [00:00<00:00, 162kB/s]
100%|██████████| 1.65M/1.65M [00:01<00:00, 1.54MB/s]
100%|██████████| 4.54k/4.54k [00:00<00:00, 11.4MB/s]


torch.Size([32, 1, 28, 28])
torch.Size([32])


Custom dataset

In [5]:
class MyDataset(Dataset):
    """Map-style dataset backed by a CSV file loaded into a pandas DataFrame.

    Every column except the last is used as a feature and the last column is
    used as the target. Each sample is returned as a pair of float32 tensors.
    """

    def __init__(self, csv_file):
        """Load ``csv_file`` with ``pd.read_csv`` and keep the frame in ``self.data``."""
        self.data = pd.read_csv(csv_file)

    def __len__(self):
        """Return the number of rows in the loaded frame."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``(x, y)`` for the row at positional ``index``.

        ``x`` holds all columns but the last, ``y`` holds the last column;
        both are ``torch.float32`` tensors.
        """
        # Cast to float32 while still in NumPy. A row that spans columns of
        # different dtypes (e.g. bool and float) comes back as an object-dtype
        # array from ``.values``, which ``torch.tensor`` refuses to convert.
        x = self.data.iloc[index, :-1].to_numpy(dtype="float32")
        y = self.data.iloc[index, -1]

        x = torch.tensor(x, dtype=torch.float32)
        y = torch.tensor(y, dtype=torch.float32)

        return x, y

In [6]:
# google.colab only exists inside Google Colab; keep `files` defined either way
# so the notebook can also run in a local Jupyter kernel.
try:
    from google.colab import files
except ImportError:
    files = None

# Only prompt for an upload when the CSV is missing, so Restart & Run All does
# not block on the interactive widget. Delete the file to force a re-upload.
if Path("sample_dataset.csv").exists():
    uploaded = {}  # same type as files.upload() returns: {filename: bytes}
elif files is not None:
    uploaded = files.upload()
else:
    raise FileNotFoundError(
        "sample_dataset.csv was not found and google.colab is unavailable; "
        "place the file next to this notebook or run the notebook in Colab."
    )

KeyboardInterrupt: 

In [7]:
# Wrap the CSV in the custom dataset and serve it in shuffled batches of 16
dataset = MyDataset("sample_dataset.csv")
dataloader = DataLoader(dataset=dataset, batch_size=16, shuffle=True)

# Inspect the first batch only, then stop iterating
for batch_x, batch_y in dataloader:
    # Leading dimension is the batch size (at most 16); the feature count
    # depends on how many columns the CSV has.
    print(batch_x.shape, batch_y.shape, sep="\n")
    break

FileNotFoundError: [Errno 2] No such file or directory: 'sample_dataset.csv'