DATASET & DATALOADER - BATCH TRAINING

In [3]:
# Batch-training terminology used in this notebook. The cell is a bare string literal, so the notebook echoes it back as the cell output.
'''
epoch = 1 forward and backward pass of all training samples

batch_size = number of training samples in one forward and backward pass

number of iterations per epoch = number of passes, each pass using [batch_size] number of samples

e.g. 100 samples, batch_size = 20 --> 100/20 = 5 iterations for 1 epoch
'''


'\nepoch = 1 forward and backward pass of all training samples\n\nbatch_size = number of training samples in one forward and backward pass\n\nnumber of iterations per epoch = number of passes, each pass using [batch_size] number of samples\n\ne.g. 100 samples, batch_size = 20 --> 100/20 = 5 iterations for 1 epoch\n'

In [5]:
# Preview the raw CSV with pandas before building a PyTorch Dataset from the same file.
import pandas as pd
dv = pd.read_csv('wine.csv')  # the first row of the file is used as the column header
dv  # bare last expression: the notebook renders the DataFrame as the cell output

Unnamed: 0,Wine,Alcohol,Malic.acid,Ash,Acl,Mg,Phenols,Flavanoids,Nonflavanoid.phenols,Proanth,Color.int,Hue,OD,Proline
0,1,14.23,1.71,2.43,15.6,127,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.20,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050
2,1,13.16,2.36,2.67,18.6,101,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.50,16.8,113,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,3,13.71,5.65,2.45,20.5,95,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740
174,3,13.40,3.91,2.48,23.0,102,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750
175,3,13.27,4.28,2.26,20.0,120,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835
176,3,13.17,2.59,2.37,20.0,120,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840


In [10]:
import torch 
import torchvision
from torch.utils.data import Dataset,DataLoader
import numpy as np
import math  

Implementing Our Own Custom Dataset 

In [15]:
class WineDataset(Dataset):  # must inherit from Dataset
    """Wine samples loaded from a CSV file, exposed as (features, label) pairs.

    The first CSV column is treated as the class label and every remaining
    column as a feature. The whole file is read eagerly as float32.

    Args:
        csv_path: Path to the comma-separated file. Its first row is assumed
            to be a header and is skipped. Defaults to 'wine.csv'.

    Attributes:
        x: Feature tensor of shape (n_samples, n_columns - 1).
        y: Label tensor of shape (n_samples, 1).
        n_samples: Number of data rows read from the file.
    """

    def __init__(self, csv_path='wine.csv'):
        # delimiter="," because the file is comma separated; skiprows=1 drops
        # the header row. ndmin=2 keeps the array 2-D even when the file holds
        # a single data row, so the column slicing below always works.
        xy = np.loadtxt(csv_path, delimiter=",", dtype=np.float32, skiprows=1, ndmin=2)
        # Every column except the 0th is a feature; the 0th is the class label.
        self.x = torch.from_numpy(xy[:, 1:])
        # Indexing with the list [0] (not the scalar 0) keeps y as (n_samples, 1).
        self.y = torch.from_numpy(xy[:, [0]])
        self.n_samples = xy.shape[0]

    def __getitem__(self, index):
        """Return the (features, label) tuple at ``index``; enables dataset[i]."""
        return self.x[index], self.y[index]

    def __len__(self):
        """Return the number of samples; enables len(dataset)."""
        return self.n_samples
       
# Create the dataset and inspect its first sample.
dataset = WineDataset()
first_data = dataset[0]  # indexing goes through WineDataset.__getitem__, which returns a tuple
features,labels = first_data  # unpack the tuple into the feature vector and its label
print(features,labels)

tensor([1.4230e+01, 1.7100e+00, 2.4300e+00, 1.5600e+01, 1.2700e+02, 2.8000e+00,
        3.0600e+00, 2.8000e-01, 2.2900e+00, 5.6400e+00, 1.0400e+00, 3.9200e+00,
        1.0650e+03]) tensor([1.])


How To Use The Dataloader 

In [17]:
# Wrap the dataset in a DataLoader that yields shuffled mini-batches of 4 samples.
# Passing num_workers=<k> would additionally load batches in k sub-processes;
# it is not passed here, so the loader keeps its default.
dataloader = DataLoader(dataset=dataset, batch_size=4, shuffle=True)
# A DataLoader is iterable: turn it into an iterator and pull only the first batch.
# Because shuffle=True, which samples land in this batch varies from run to run.
dataiter = iter(dataloader)
data = next(dataiter)  # built-in next() instead of calling __next__() directly
# Unpack the batch into its feature tensor and label tensor.
features, labels = data
print(features, labels)

tensor([[1.2580e+01, 1.2900e+00, 2.1000e+00, 2.0000e+01, 1.0300e+02, 1.4800e+00,
         5.8000e-01, 5.3000e-01, 1.4000e+00, 7.6000e+00, 5.8000e-01, 1.5500e+00,
         6.4000e+02],
        [1.3160e+01, 3.5700e+00, 2.1500e+00, 2.1000e+01, 1.0200e+02, 1.5000e+00,
         5.5000e-01, 4.3000e-01, 1.3000e+00, 4.0000e+00, 6.0000e-01, 1.6800e+00,
         8.3000e+02],
        [1.2210e+01, 1.1900e+00, 1.7500e+00, 1.6800e+01, 1.5100e+02, 1.8500e+00,
         1.2800e+00, 1.4000e-01, 2.5000e+00, 2.8500e+00, 1.2800e+00, 3.0700e+00,
         7.1800e+02],
        [1.4230e+01, 1.7100e+00, 2.4300e+00, 1.5600e+01, 1.2700e+02, 2.8000e+00,
         3.0600e+00, 2.8000e-01, 2.2900e+00, 5.6400e+00, 1.0400e+00, 3.9200e+00,
         1.0650e+03]]) tensor([[3.],
        [3.],
        [2.],
        [1.]])


Because the batch size is 4, the features tensor contains 4 feature vectors and the labels tensor contains the 4 corresponding class labels (one per sample).

Training Loop

In [20]:
num_epochs = 3
total_samples = len(dataset)
# Number of iterations (batches) per epoch. len(dataloader) is computed from the
# loader's own batch_size, so the batch size is not repeated here as a magic number;
# with the loader as configured it equals ceil(total_samples / batch_size).
n_iterations = len(dataloader)
print(total_samples, n_iterations)

178 45


In [21]:
for epoch in range(num_epochs):
    # One full pass over the dataloader is one epoch; each item it yields is one batch.
    for i, (inputs, labels) in enumerate(dataloader):
        # A real training step would do the forward pass, backward pass and weight
        # update for this batch here; this demo only reports progress.
        if (i + 1) % 5:
            continue  # skip everything except every 5th iteration
        print(f'epoch no. {epoch+1}/{num_epochs}, iteration no. {i+1}/{n_iterations}, batch shape: {inputs.shape}, output shape: {labels.shape} ')

epoch no. 1/3, iteration no. 5/45, batch shape: torch.Size([4, 13]), output shape: torch.Size([4, 1]) 
epoch no. 1/3, iteration no. 10/45, batch shape: torch.Size([4, 13]), output shape: torch.Size([4, 1]) 
epoch no. 1/3, iteration no. 15/45, batch shape: torch.Size([4, 13]), output shape: torch.Size([4, 1]) 
epoch no. 1/3, iteration no. 20/45, batch shape: torch.Size([4, 13]), output shape: torch.Size([4, 1]) 
epoch no. 1/3, iteration no. 25/45, batch shape: torch.Size([4, 13]), output shape: torch.Size([4, 1]) 
epoch no. 1/3, iteration no. 30/45, batch shape: torch.Size([4, 13]), output shape: torch.Size([4, 1]) 
epoch no. 1/3, iteration no. 35/45, batch shape: torch.Size([4, 13]), output shape: torch.Size([4, 1]) 
epoch no. 1/3, iteration no. 40/45, batch shape: torch.Size([4, 13]), output shape: torch.Size([4, 1]) 
epoch no. 1/3, iteration no. 45/45, batch shape: torch.Size([2, 13]), output shape: torch.Size([2, 1]) 
epoch no. 2/3, iteration no. 5/45, batch shape: torch.Size([4, 13

PyTorch (via torchvision) also ships with some built-in datasets, for example:
torchvision.datasets.MNIST()
as well as Fashion-MNIST, CIFAR, COCO, etc.