Using the `num_workers` argument of PyTorch's `DataLoader` class to increase training speed.<br>
(building on pracFMNIST4)

In [0]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms

from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from IPython.display import display, clear_output
import pandas as pd
import time
import json

from itertools import product
from collections import namedtuple
from collections import OrderedDict

In [2]:
# Global switch read by RunManager.begin_run and the training cell: tensors and
# the model are moved to the GPU only when this is True AND CUDA is available.
use_cuda = True
# Bare expression so the cell displays whether CUDA is usable in this session.
torch.cuda.is_available()

True

In [0]:
class RunBuilder():
    """Expands a mapping of hyper-parameter lists into concrete run configurations."""

    @staticmethod
    def get_runs(params):
        """Return one ``Run`` namedtuple per combination of parameter values.

        Args:
            params: mapping whose keys are parameter names and whose values are
                iterables of candidate values (e.g. an ``OrderedDict``).

        Returns:
            list of ``Run`` namedtuples, one for every element of the Cartesian
            product of the value lists, in ``itertools.product`` order. The
            namedtuple fields are the keys of ``params``.
        """
        Run = namedtuple('Run', params.keys())
        return [Run(*combo) for combo in product(*params.values())]

In [0]:
class RunManager():
    """Collects per-epoch metrics for a sequence of training runs.

    For each run it writes loss/accuracy scalars and parameter histograms to a
    TensorBoard ``SummaryWriter``, accumulates one result row per epoch in
    ``run_data``, shows the running table in the notebook, and can dump the
    table to CSV and JSON.

    Relies on the module-level ``use_cuda`` flag to decide whether the sample
    batch used for the TensorBoard graph is moved to the GPU.
    """

    def __init__(self):
        # Per-epoch accumulators (reset in begin_epoch).
        self.epoch_count = 0
        self.epoch_loss = 0
        self.epoch_num_correct = 0
        self.epoch_start_time = None

        # Per-run bookkeeping; run_data keeps growing across runs.
        self.run_params = None
        self.run_count = 0
        self.run_data = []
        self.run_start_time = None

        # Objects supplied by begin_run.
        self.network = None
        self.loader = None
        self.tb = None

    def begin_run(self,run,network,loader):
        """Start a new run: record its parameters and open a TensorBoard writer.

        Args:
            run: namedtuple of hyper-parameters (must support ``_asdict()``);
                its repr is appended to the TensorBoard log directory name.
            network: the model being trained.
            loader: the data loader used for training; one batch is drawn from
                it to log an image grid and the model graph.
        """
        self.run_start_time = time.time()

        self.run_params = run
        self.run_count += 1

        self.network = network
        self.loader = loader
        self.tb = SummaryWriter(comment=f'-{run}')

        # Only the images of the sample batch are needed here.
        images,_ = next(iter(self.loader))

        if use_cuda and torch.cuda.is_available():
            images = images.cuda()

        grid = torchvision.utils.make_grid(images)

        self.tb.add_image('images',grid)
        self.tb.add_graph(self.network,images)

    def end_run(self):
        """Close the TensorBoard writer and reset the epoch counter for the next run."""
        self.tb.close()
        self.epoch_count = 0

    def begin_epoch(self):
        """Start timing a new epoch and reset the loss / correct-count accumulators."""
        self.epoch_start_time = time.time()

        self.epoch_count += 1
        self.epoch_loss = 0
        self.epoch_num_correct = 0

    def end_epoch(self):
        """Finalize the epoch: log metrics, append a result row and display the table."""
        epoch_duration = time.time() - self.epoch_start_time
        run_duration = time.time() - self.run_start_time

        # Both metrics are normalized by the full dataset size.
        num_samples = len(self.loader.dataset)
        loss = self.epoch_loss / num_samples
        accuracy = self.epoch_num_correct / num_samples

        self.tb.add_scalar('Loss',loss,self.epoch_count)
        self.tb.add_scalar('Accuracy',accuracy,self.epoch_count)

        for name,param in self.network.named_parameters():
            self.tb.add_histogram(name,param,self.epoch_count)
            # Parameters that never received a gradient have grad == None,
            # which add_histogram cannot consume; skip those.
            if param.grad is not None:
                self.tb.add_histogram(f'{name}.grad',param.grad,self.epoch_count)

        results = OrderedDict()
        results["run"] = self.run_count
        results["epoch"] = self.epoch_count
        results["loss"] = loss
        results["accuracy"] = accuracy
        results["epoch duration"] = epoch_duration
        results["run duration"] = run_duration

        # Append the run's hyper-parameters as extra columns.
        for k,v in self.run_params._asdict().items():
            results[k] = v
        self.run_data.append(results)

        df = pd.DataFrame.from_dict(self.run_data,orient='columns')

        clear_output(wait=True)     #specific to ipynbs
        display(df)                 #specific to ipynbs

    def track_loss(self,loss,batch_size=None):
        """Accumulate the epoch loss from one batch.

        Args:
            loss: scalar tensor holding the batch loss; it is multiplied by the
                batch size before being added to the running total.
            batch_size: number of samples in this batch. Defaults to
                ``self.loader.batch_size``; pass the real size when the final
                batch of an epoch is smaller than the nominal batch size.
        """
        if batch_size is None:
            batch_size = self.loader.batch_size
        self.epoch_loss += loss.item()*batch_size

    def track_num_correct(self,preds,labels):
        """Add the number of correct predictions in this batch to the epoch total."""
        self.epoch_num_correct += self._get_num_correct(preds,labels)

    @torch.no_grad()
    def _get_num_correct(self,pred,labels):
        """Return how many rows of ``pred`` have their argmax (dim 1) equal to ``labels``."""
        # Use the argument itself; the previous body read an undefined name
        # `preds` and only worked when a global of that name happened to exist.
        return pred.argmax(dim=1).eq(labels).sum().item()

    def save(self,fileName):
        """Write the accumulated results to ``<fileName>.csv`` and ``<fileName>.json``."""
        pd.DataFrame.from_dict(
            self.run_data,
            orient="columns",
        ).to_csv(f'{fileName}.csv')

        with open(f'{fileName}.json','w',encoding = 'utf-8') as f:
            json.dump(self.run_data,f,ensure_ascii = False,indent=4)



In [0]:
class Network(nn.Module):
    """Small CNN: two 5x5 conv layers followed by three fully connected layers.

    ``fc1`` expects 12*4*4 flattened features, which is what the two
    conv + 2x2 max-pool stages produce for a single-channel 28x28 input.
    The final layer emits 10 raw scores per sample (no softmax is applied).
    """

    def __init__(self):
        super().__init__()
        # Convolutional feature extractor.
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5)
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=12, kernel_size=5)

        # Classifier head.
        self.fc1 = nn.Linear(in_features=12*4*4, out_features=120)
        self.fc2 = nn.Linear(in_features=120, out_features=60)
        self.out = nn.Linear(in_features=60, out_features=10)

    def forward(self,t):
        """Run the forward pass and return the output of the final linear layer."""
        # conv -> ReLU -> 2x2 max-pool, twice
        t = F.max_pool2d(F.relu(self.conv1(t)), kernel_size=2, stride=2)
        t = F.max_pool2d(F.relu(self.conv2(t)), kernel_size=2, stride=2)

        # keep the batch dimension, flatten everything else
        t = torch.flatten(t, start_dim=1)

        t = F.relu(self.fc1(t))
        t = F.relu(self.fc2(t))
        return self.out(t)

In [6]:
# FashionMNIST training split; downloaded into ./data when not already present.
# Each image is converted to a tensor by the ToTensor transform.
train_set = torchvision.datasets.FashionMNIST(
    root="./data",
    train=True,
    download=True,
    transform=transforms.Compose([transforms.ToTensor()]),
)

# Default loader: shuffled batches of 100 samples.
train_loader = DataLoader(dataset=train_set, batch_size=100, shuffle=True)

0it [00:00, ?it/s]

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz to ./data/FashionMNIST/raw/train-images-idx3-ubyte.gz


26427392it [00:02, 9607628.44it/s]                              


Extracting ./data/FashionMNIST/raw/train-images-idx3-ubyte.gz to ./data/FashionMNIST/raw


0it [00:00, ?it/s]

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw/train-labels-idx1-ubyte.gz


32768it [00:00, 75342.87it/s]                            
0it [00:00, ?it/s]

Extracting ./data/FashionMNIST/raw/train-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz to ./data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz


4423680it [00:01, 3097575.73it/s]                            
0it [00:00, ?it/s]

Extracting ./data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz to ./data/FashionMNIST/raw
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz


8192it [00:00, 26960.57it/s]            

Extracting ./data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz to ./data/FashionMNIST/raw
Processing...
Done!





In [0]:
# Hyper-parameter grid: every combination of these values becomes one run.
params = OrderedDict([
    ('lr', [0.01]),
    ('batch_size', [100, 1000, 10000]),
    ('num_workers', [0, 1, 2, 4, 8, 16]),
])

# NOTE(review): a class named RunManager is also defined in an earlier cell of
# this notebook; this later definition is the one in effect once the cell runs.
class RunManager():
    """Collects per-epoch metrics for a sequence of training runs.

    For each run it writes loss/accuracy scalars and parameter histograms to a
    TensorBoard ``SummaryWriter``, accumulates one result row per epoch in
    ``run_data``, shows the running table in the notebook, and can dump the
    table to CSV and JSON.

    Relies on the module-level ``use_cuda`` flag to decide whether the sample
    batch used for the TensorBoard graph is moved to the GPU.
    """

    def __init__(self):
        # Per-epoch accumulators (reset in begin_epoch).
        self.epoch_count = 0
        self.epoch_loss = 0
        self.epoch_num_correct = 0
        self.epoch_start_time = None

        # Per-run bookkeeping; run_data keeps growing across runs.
        self.run_params = None
        self.run_count = 0
        self.run_data = []
        self.run_start_time = None

        # Objects supplied by begin_run.
        self.network = None
        self.loader = None
        self.tb = None

    def begin_run(self,run,network,loader):
        """Start a new run: record its parameters and open a TensorBoard writer.

        Args:
            run: namedtuple of hyper-parameters (must support ``_asdict()``);
                its repr is appended to the TensorBoard log directory name.
            network: the model being trained.
            loader: the data loader used for training; one batch is drawn from
                it to log an image grid and the model graph.
        """
        self.run_start_time = time.time()

        self.run_params = run
        self.run_count += 1

        self.network = network
        self.loader = loader
        self.tb = SummaryWriter(comment=f'-{run}')

        # Only the images of the sample batch are needed here.
        images,_ = next(iter(self.loader))

        if use_cuda and torch.cuda.is_available():
            images = images.cuda()

        grid = torchvision.utils.make_grid(images)

        self.tb.add_image('images',grid)
        self.tb.add_graph(self.network,images)

    def end_run(self):
        """Close the TensorBoard writer and reset the epoch counter for the next run."""
        self.tb.close()
        self.epoch_count = 0

    def begin_epoch(self):
        """Start timing a new epoch and reset the loss / correct-count accumulators."""
        self.epoch_start_time = time.time()

        self.epoch_count += 1
        self.epoch_loss = 0
        self.epoch_num_correct = 0

    def end_epoch(self):
        """Finalize the epoch: log metrics, append a result row and display the table."""
        epoch_duration = time.time() - self.epoch_start_time
        run_duration = time.time() - self.run_start_time

        # Both metrics are normalized by the full dataset size.
        num_samples = len(self.loader.dataset)
        loss = self.epoch_loss / num_samples
        accuracy = self.epoch_num_correct / num_samples

        self.tb.add_scalar('Loss',loss,self.epoch_count)
        self.tb.add_scalar('Accuracy',accuracy,self.epoch_count)

        for name,param in self.network.named_parameters():
            self.tb.add_histogram(name,param,self.epoch_count)
            # Parameters that never received a gradient have grad == None,
            # which add_histogram cannot consume; skip those.
            if param.grad is not None:
                self.tb.add_histogram(f'{name}.grad',param.grad,self.epoch_count)

        results = OrderedDict()
        results["run"] = self.run_count
        results["epoch"] = self.epoch_count
        results["loss"] = loss
        results["accuracy"] = accuracy
        results["epoch duration"] = epoch_duration
        results["run duration"] = run_duration

        # Append the run's hyper-parameters as extra columns.
        for k,v in self.run_params._asdict().items():
            results[k] = v
        self.run_data.append(results)

        df = pd.DataFrame.from_dict(self.run_data,orient='columns')

        clear_output(wait=True)     #specific to ipynbs
        display(df)                 #specific to ipynbs

    def track_loss(self,loss,batch_size=None):
        """Accumulate the epoch loss from one batch.

        Args:
            loss: scalar tensor holding the batch loss; it is multiplied by the
                batch size before being added to the running total.
            batch_size: number of samples in this batch. Defaults to
                ``self.loader.batch_size``; pass the real size when the final
                batch of an epoch is smaller than the nominal batch size.
        """
        if batch_size is None:
            batch_size = self.loader.batch_size
        self.epoch_loss += loss.item()*batch_size

    def track_num_correct(self,preds,labels):
        """Add the number of correct predictions in this batch to the epoch total."""
        self.epoch_num_correct += self._get_num_correct(preds,labels)

    @torch.no_grad()
    def _get_num_correct(self,pred,labels):
        """Return how many rows of ``pred`` have their argmax (dim 1) equal to ``labels``."""
        # Use the argument itself; the previous body read an undefined name
        # `preds` and only worked when a global of that name happened to exist.
        return pred.argmax(dim=1).eq(labels).sum().item()

    def save(self,fileName):
        """Write the accumulated results to ``<fileName>.csv`` and ``<fileName>.json``."""
        pd.DataFrame.from_dict(
            self.run_data,
            orient="columns",
        ).to_csv(f'{fileName}.csv')

        with open(f'{fileName}.json','w',encoding = 'utf-8') as f:
            json.dump(self.run_data,f,ensure_ascii = False,indent=4)



In [8]:
!rm -r runs

rm: cannot remove 'runs': No such file or directory


In [9]:
# Number of passes over the training set for every run in the sweep.
NUM_EPOCHS = 1

# Pick the device once, using the same condition RunManager.begin_run applies,
# so the model and every batch always live on the same device. Previously the
# batches were moved with an unconditional .cuda(), which fails without a GPU
# or when use_cuda is False.
device = torch.device('cuda' if use_cuda and torch.cuda.is_available() else 'cpu')

m = RunManager()
for run in RunBuilder.get_runs(params):

    # Fresh model and optimizer for every hyper-parameter combination.
    network = Network().to(device)

    loader = DataLoader(train_set,batch_size = run.batch_size,num_workers = run.num_workers)
    optimizer = optim.Adam(network.parameters(),lr=run.lr)

    m.begin_run(run,network,loader)
    for epoch in range(NUM_EPOCHS):
        m.begin_epoch()
        for batch in loader:

            images = batch[0].to(device)
            labels = batch[1].to(device)

            preds = network(images)
            loss = F.cross_entropy(preds,labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            m.track_loss(loss)
            m.track_num_correct(preds,labels)

        m.end_epoch()
    m.end_run()

# Persist the collected metrics as results.csv and results.json.
m.save('results')

Unnamed: 0,run,epoch,loss,accuracy,epoch duration,run duration,lr,batch_size,num_workers
0,1,1,0.579461,0.780567,9.546074,11.560656,0.01,100,0
1,2,1,0.552522,0.788583,10.557312,10.7384,0.01,100,1
2,3,1,0.537637,0.794117,9.256456,9.467849,0.01,100,2
3,4,1,0.54489,0.79285,9.140315,9.470607,0.01,100,4
4,5,1,0.575684,0.778983,9.372262,9.942099,0.01,100,8
5,6,1,0.55075,0.792567,9.786679,10.822042,0.01,100,16
6,7,1,1.011354,0.60785,8.1819,8.819124,0.01,1000,0
7,8,1,1.002562,0.621433,7.62465,8.454855,0.01,1000,1
8,9,1,1.095863,0.5809,6.105537,7.062789,0.01,1000,2
9,10,1,0.95991,0.631033,6.201076,7.575699,0.01,1000,4


Zipping the `runs` directory so it can be downloaded and visualised in TensorBoard.

In [0]:
# Recursively archive the TensorBoard log directory `runs` into runs3.zip.
!zip -r runs3 runs

  adding: runs/ (stored 0%)
  adding: runs/Oct29_13-50-59_0eca79d7de00-Run(lr=0.01, batch_size=1000, shuffle=True)/ (stored 0%)
  adding: runs/Oct29_13-50-59_0eca79d7de00-Run(lr=0.01, batch_size=1000, shuffle=True)/events.out.tfevents.1572357059.0eca79d7de00.122.6 (deflated 51%)
  adding: runs/Oct29_13-53-07_0eca79d7de00-Run(lr=0.01, batch_size=2000, shuffle=False)/ (stored 0%)
  adding: runs/Oct29_13-53-07_0eca79d7de00-Run(lr=0.01, batch_size=2000, shuffle=False)/events.out.tfevents.1572357187.0eca79d7de00.122.9 (deflated 35%)
  adding: runs/Oct29_13-51-41_0eca79d7de00-Run(lr=0.01, batch_size=1000, shuffle=False)/ (stored 0%)
  adding: runs/Oct29_13-51-41_0eca79d7de00-Run(lr=0.01, batch_size=1000, shuffle=False)/events.out.tfevents.1572357101.0eca79d7de00.122.7 (deflated 51%)
  adding: runs/Oct29_13-52-24_0eca79d7de00-Run(lr=0.01, batch_size=2000, shuffle=True)/ (stored 0%)
  adding: runs/Oct29_13-52-24_0eca79d7de00-Run(lr=0.01, batch_size=2000, shuffle=True)/events.out.tfevents.15723

Optimal `num_workers` = 1; increasing it further doesn't help much.

`num_workers` indicates how many subprocesses to use for data loading.<br>
From now on, a hard-coded value of 1 will be used.

In [11]:
# Recursively archive the TensorBoard log directory `runs` into runs4.zip.
!zip -r runs4 runs

  adding: runs/ (stored 0%)
  adding: runs/Oct29_15-13-45_4762073abd70-Run(lr=0.01, batch_size=1000, num_workers=1)/ (stored 0%)
  adding: runs/Oct29_15-13-45_4762073abd70-Run(lr=0.01, batch_size=1000, num_workers=1)/events.out.tfevents.1572362025.4762073abd70.122.7 (deflated 20%)
  adding: runs/Oct29_15-14-28_4762073abd70-Run(lr=0.01, batch_size=10000, num_workers=0)/ (stored 0%)
  adding: runs/Oct29_15-14-28_4762073abd70-Run(lr=0.01, batch_size=10000, num_workers=0)/events.out.tfevents.1572362068.4762073abd70.122.12 (deflated 6%)
  adding: runs/Oct29_15-13-15_4762073abd70-Run(lr=0.01, batch_size=100, num_workers=8)/ (stored 0%)
  adding: runs/Oct29_15-13-15_4762073abd70-Run(lr=0.01, batch_size=100, num_workers=8)/events.out.tfevents.1572361995.4762073abd70.122.4 (deflated 65%)
  adding: runs/Oct29_15-14-09_4762073abd70-Run(lr=0.01, batch_size=1000, num_workers=8)/ (stored 0%)
  adding: runs/Oct29_15-14-09_4762073abd70-Run(lr=0.01, batch_size=1000, num_workers=8)/events.out.tfevents.1