# Using Dataset and DataLoader

Here we show how to use PyTorch's Dataset and DataLoader utilities, which make it easy to manage large datasets.

In [1]:
import torch
from torch.utils.data import Dataset, DataLoader  # new imports
import numpy as np

### Create your Dataset

Our dataset is a class that extends torch.utils.data.Dataset. We just have to override two methods: 
* __\_\_getitem\_\_(self, index)__: returns a single item (input, output) from the dataset based on an index
* __\_\_len\_\_(self)__: returns the length of the dataset (number of samples)

There are pre-built datasets (MNIST, FashionMNIST, COCO, CIFAR, ...) and dataset folder iterators (ImageFolder, DatasetFolder, ...) under the __torchvision.datasets__ module.

In [2]:
class DiabetesDataset(Dataset):
    """Map-style dataset backed by a comma-separated numeric file.

    Every column except the last is treated as an input feature; the last
    column is the target, kept as a column vector so that the target of a
    single sample has shape ``(1,)``.
    """

    # Initialize data, download, ...
    def __init__(self, path='data/data-diabetes.csv'):
        """Load the whole CSV file at *path* into memory as float32.

        Args:
            path: Location of the CSV file. Defaults to the file used
                throughout this notebook.
        """
        # ndmin=2 keeps the array two-dimensional even when the file holds a
        # single row; without it the column slicing below raises IndexError.
        xy = np.loadtxt(path, delimiter=',', dtype=np.float32, ndmin=2)
        self.x_data = xy[:, :-1]
        self.y_data = xy[:, -1].reshape(-1, 1)

    # Return one item at the specified index
    def __getitem__(self, index):
        """Return the ``(features, target)`` pair stored at *index*."""
        return self.x_data[index], self.y_data[index]

    def __len__(self):
        """Return the number of samples (rows) in the dataset."""
        return len(self.x_data)

# Instantiate the dataset once; the constructor reads the CSV file into memory.
dataset = DiabetesDataset()

### Define the DataLoader

The DataLoader is an iterable that yields batches of samples from the dataset. We can set the batch size and other important options (shuffling, for instance).

In [3]:
# Wrap the dataset in a DataLoader that yields shuffled mini-batches of up to
# 8 samples, using 2 worker subprocesses to load the data.
train_loader = DataLoader(dataset=dataset,
                          batch_size=8,
                          shuffle=True,
                          num_workers=2)

#### Define the model

No modifications

In [4]:
class Model(torch.nn.Module):
    """Fully connected network with three linear layers (8 -> 60 -> 10 -> 1)."""

    def __init__(self):
        """
        Build the three linear layers that hold the trainable weights
        """
        super().__init__()

        linear = torch.nn.Linear
        # Layers are created in l1, l2, l3 order so that seeded weight
        # initialisation stays reproducible.
        self.l1 = linear(8, 60)
        self.l2 = linear(60, 10)
        self.l3 = linear(10, 1)

    def forward(self, x):
        """
        Map a tensor whose last dimension has 8 features to one value
        per sample: two ReLU-activated hidden layers followed by a
        sigmoid on the single output unit, so the result lies in [0, 1]
        """
        hidden = torch.relu(self.l1(x))
        hidden = torch.relu(self.l2(hidden))
        return torch.sigmoid(self.l3(hidden))

# Instantiate the network and print its layer summary.
model = Model()
print(model)

Model(
  (l1): Linear(in_features=8, out_features=60, bias=True)
  (l2): Linear(in_features=60, out_features=10, bias=True)
  (l3): Linear(in_features=10, out_features=1, bias=True)
)


#### Construct our loss function and an optimizer

No modifications

In [5]:
# Binary cross entropy pairs with the model's sigmoid output.
criterion = torch.nn.BCELoss()   # binary cross entropy
# Stochastic gradient descent over all model parameters, learning rate 0.1.
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

### Train the model

Use the DataLoader as an iterator

Note: the printed loss is not a validation loss; it is the loss computed on the last training batch of each epoch, which is why it fluctuates from one epoch to the next.

In [6]:
# Train for 10 epochs; each epoch iterates once over every batch of the loader.
for epoch in range(10):
    for x_data, y_data in train_loader:

        # Forward pass: predictions for the batch and their loss against the targets
        y_pred = model(x_data)
        loss = criterion(y_pred, y_data)

        optimizer.zero_grad()  # zero gradient buffers
        loss.backward()        # computes gradients
        optimizer.step()       # does the update
        
    # `loss` still holds the value from the last batch of this epoch
    print(epoch, loss.data)

0 tensor(0.6538)
1 tensor(0.7838)
2 tensor(0.8948)
3 tensor(0.2937)
4 tensor(0.5904)
5 tensor(0.4483)
6 tensor(0.4679)
7 tensor(0.2547)
8 tensor(0.5974)
9 tensor(0.2500)
