In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import torchvision
import torchvision.transforms as transforms

from torch.utils.tensorboard import SummaryWriter

In [22]:
# Report the installed torch / torchvision versions, one per line.
print(torch.__version__, torchvision.__version__, sep='\n')

1.3.0+cu100
0.4.1+cu100


In [0]:
def get_num_correct(preds,labels):
    """Count the predictions whose highest-scoring class equals the label.

    Args:
        preds: tensor of per-class scores; the arg-max is taken along dim 1
            (the training loops in this notebook pass the network output,
            one row of 10 scores per sample).
        labels: tensor of target class indices, compared element-wise with
            the arg-max of ``preds``.

    Returns:
        The number of matching positions as a plain Python int.
    """
    predicted = preds.argmax(dim=1)
    hits = predicted.eq(labels)
    return hits.sum().item()

In [0]:
class Network(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1,6,5)
        self.conv2 = nn.Conv2d(6,12,5)

        self.fc1 = nn.Linear(12*4*4,120)
        self.fc2 = nn.Linear(120,60)
        self.out = nn.Linear(60,10)

    def forward(self,t):
        t = F.relu(self.conv1(t))
        t = F.max_pool2d(t,kernel_size=2,stride=2)
        
        t = F.relu(self.conv2(t))
        t = F.max_pool2d(t,kernel_size=2,stride=2)

        t = t.flatten(start_dim=1)
        t = F.relu(self.fc1(t))

        t = F.relu(self.fc2(t))

        t = self.out(t)

        return t

In [0]:
# FashionMNIST training split, stored under ./data (downloaded if missing);
# ToTensor() is applied to every image as it is read.
train_set = torchvision.datasets.FashionMNIST(
    "./data",
    train=True,
    download=True,
    transform=transforms.Compose([transforms.ToTensor()]),
)

In [0]:
# Mini-batches of 100 samples, reshuffled on every pass over the data.
train_loader = torch.utils.data.DataLoader(
    dataset=train_set,
    shuffle=True,
    batch_size=100,
)

Using TensorBoard to log the network graph and sample images

NOTE: the SummaryWriter output files are stored on my local system so they can be viewed with TensorBoard on localhost.<br>
They are saved on GitHub as well.<br>
ngrok was not working.<br>


In [0]:
# To archive a directory, use: !zip -r <zip file> <directory to zip>

The training loop review

In [8]:
# Baseline run: train a fresh network for one epoch with Adam and report the
# number of correct predictions and the summed per-batch loss.
network = Network()
train_loader = torch.utils.data.DataLoader(train_set, shuffle=True, batch_size=100)
optimizer = optim.Adam(network.parameters(), lr=0.01)

for epoch in range(1):
    total_loss, total_correct = 0, 0

    for images, labels in train_loader:
        # Clear the gradients left over from the previous step.
        optimizer.zero_grad()

        # Forward pass and loss for this batch.
        preds = network(images)
        loss = F.cross_entropy(preds, labels)

        # Backward pass and weight update.
        loss.backward()
        optimizer.step()

        # Accumulate the epoch statistics.
        total_correct += get_num_correct(preds, labels)
        total_loss += loss.item()

    print("epoch", epoch,'\n', "total_correct:",total_correct,'\n',"loss:",total_loss)


KeyboardInterrupt: ignored

Now adding TensorBoard calls to the training loop to collect data for analysis.

In [0]:
!rm -r runs

In [10]:
# One-epoch training run that logs to TensorBoard: a grid of sample images and
# the network graph up front, then scalars and conv1 histograms once per epoch.
network = Network()
train_loader = torch.utils.data.DataLoader(train_set,batch_size=100,shuffle=True)
optimizer = optim.Adam(network.parameters(),lr=0.01)

# One batch is enough for the image grid and for tracing the graph.
images,labels = next(iter(train_loader))
grid = torchvision.utils.make_grid(images)

tb = SummaryWriter(comment='sample run')
try:
    tb.add_image('images',grid)
    tb.add_graph(network,images)

    for epoch in range(1):
        total_loss = 0
        total_correct = 0

        for batch in train_loader: # getting batch
            images,labels = batch # separating images and labels

            preds = network(images)
            loss = F.cross_entropy(preds,labels) # calculating loss

            optimizer.zero_grad()
            loss.backward()  # calculating gradients
            optimizer.step() # updating weights

            total_loss += loss.item()
            total_correct += get_num_correct(preds,labels)

        # Log once per epoch, after the totals are complete. Logging inside the
        # batch loop with `epoch` as the step would write every batch to the
        # same step and record a partial `total_correct`.
        tb.add_scalar('Loss',total_loss,epoch)
        tb.add_scalar('Number correct',total_correct,epoch)
        tb.add_scalar('Accuracy',total_correct/len(train_set),epoch)

        # Parameter and gradient distributions of the first conv layer
        # (the gradient is the one left by the epoch's last backward pass).
        tb.add_histogram('conv1.bias',network.conv1.bias,epoch)
        tb.add_histogram('conv1.weight',network.conv1.weight,epoch)
        tb.add_histogram('conv1.weight.grad',network.conv1.weight.grad,epoch)

        print("epoch", epoch,'\n', "total_correct:",total_correct,'\n',"loss:",total_loss)
finally:
    # Flush and close the event file even if training is interrupted.
    tb.close()

epoch 0 
 total_correct: 47774 
 loss: 324.24461951851845


  adding: runs/ (stored 0%)
  adding: runs/Oct28_16-50-13_9f396b488efb/ (stored 0%)
  adding: runs/Oct28_16-50-13_9f396b488efb/events.out.tfevents.1572281413.9f396b488efb.131.5 (deflated 97%)


Hyperparameter tuning using tensorboard

shifting from hard-coded values to variables for hyperparameter testing

In [0]:
# Candidate values for the hyperparameter sweep.
batch_size_list = [100, 1_000, 10_000]
lr_list = [1e-2, 1e-3, 1e-4, 1e-5]

testing for all the possible combinations of lr and batch_size now

rerunning using variable hyperparameters and an appropriately named summarywriter()

Because we're now using different batch sizes, the loss is accumulated differently to account for the variation: the cross-entropy loss function used here averages the per-sample losses of a batch and returns that average, so each batch's loss is multiplied by the number of samples in the batch before being summed.<br>
We could use the `reduction` parameter of the cross-entropy function instead.<br>
Currently using the first method.

In [0]:
!rm -r runs

In [0]:
# Grid search: train a fresh network for 5 epochs for every (batch_size, lr)
# pair and log each run to its own TensorBoard directory.
for batch_size in batch_size_list:
    for lr in lr_list:
        network = Network()

        # shuffle=True matches the loaders used in the earlier cells.
        train_loader = torch.utils.data.DataLoader(
            train_set,batch_size=batch_size,shuffle=True
        )
        optimizer = optim.Adam(network.parameters(),lr=lr)

        # One batch for the image grid and for tracing the graph.
        images,labels = next(iter(train_loader))
        grid = torchvision.utils.make_grid(images)

        # The comment is appended to the run directory name; the leading space
        # keeps it from being glued to the hostname.
        comment = f' batch_size = {batch_size} lr = {lr}'

        tb = SummaryWriter(comment = comment)
        try:
            tb.add_image('images',grid)
            tb.add_graph(network,images)

            for epoch in range(5):
                total_loss = 0
                total_correct = 0
                for batch in train_loader:
                    images,labels = batch
                    preds = network(images)

                    # cross_entropy returns the mean loss over the batch.
                    loss = F.cross_entropy(preds,labels)

                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

                    # Undo the per-batch mean using the actual number of samples,
                    # so a short final batch is not over-weighted.
                    total_loss += loss.item()*len(labels)
                    total_correct += get_num_correct(preds,labels)

                tb.add_scalar('Loss',total_loss,epoch)
                tb.add_scalar('Number_correct',total_correct,epoch)
                tb.add_scalar('Accuracy',total_correct/len(train_set),epoch)

                # Distribution of every parameter and of its most recent gradient.
                for name,param in network.named_parameters():
                    tb.add_histogram(name,param,epoch)
                    tb.add_histogram(f'{name}.grad',param.grad,epoch)

                # Separate arguments so the fields are printed space-separated.
                print(
                    f"epoch: {epoch}",
                    f"total_correct: {total_correct}",
                    f"loss: {total_loss}"
                )
        finally:
            # Flush and close the event file even if training is interrupted.
            tb.close()

using cartesian products of sets instead of nested for loops for more readable code

In [0]:
# Search space: each key is a hyperparameter name, each value its candidates.
parameters = {
    'lr': [0.01, 0.0001],
    'batch_size': [100, 1000, 10000],
}

In [31]:
# Candidate lists in the dict's insertion order (lr values, then batch sizes).
param_values = list(parameters.values())
param_values

[[0.01, 0.0001], [100, 1000, 10000]]

looping over the cartesian product

In [0]:
from itertools import product

In [33]:
# Preview every (lr, batch_size) combination the sweep will visit.
for lr, batch_size in product(*param_values):
    print(f"{lr} {batch_size}")
        

0.01 100
0.01 1000
0.01 10000
0.0001 100
0.0001 1000
0.0001 10000


In [34]:
# Same sweep as a single loop over the cartesian product: train a fresh
# network for 5 epochs per (lr, batch_size) pair, one TensorBoard run each.
for lr,batch_size in product(*param_values):
    network = Network()

    # shuffle=True matches the loaders used in the earlier cells.
    train_loader = torch.utils.data.DataLoader(
        train_set,batch_size=batch_size,shuffle=True
    )
    optimizer = optim.Adam(network.parameters(),lr=lr)

    # One batch for the image grid and for tracing the graph.
    images,labels = next(iter(train_loader))
    grid = torchvision.utils.make_grid(images)

    # The comment is appended to the run directory name; the leading space
    # keeps it from being glued to the hostname.
    comment = f' batch_size = {batch_size} lr = {lr}'

    tb = SummaryWriter(comment = comment)
    try:
        tb.add_image('images',grid)
        tb.add_graph(network,images)

        for epoch in range(5):
            total_loss = 0
            total_correct = 0
            for batch in train_loader:
                images,labels = batch
                preds = network(images)

                # cross_entropy returns the mean loss over the batch.
                loss = F.cross_entropy(preds,labels)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                # Undo the per-batch mean using the actual number of samples,
                # so a short final batch is not over-weighted.
                total_loss += loss.item()*len(labels)
                total_correct += get_num_correct(preds,labels)

            tb.add_scalar('Loss',total_loss,epoch)
            tb.add_scalar('Number_correct',total_correct,epoch)
            tb.add_scalar('Accuracy',total_correct/len(train_set),epoch)

            # Distribution of every parameter and of its most recent gradient.
            for name,param in network.named_parameters():
                tb.add_histogram(name,param,epoch)
                tb.add_histogram(f'{name}.grad',param.grad,epoch)

            # Separate arguments so the fields are printed space-separated.
            print(
                f"epoch: {epoch}",
                f"total_correct: {total_correct}",
                f"loss: {total_loss}"
            )
    finally:
        # Flush and close the event file even if training is interrupted.
        tb.close()

epoch: 0 total_correct: 47713loss: 32399.01174902916
epoch: 1 total_correct: 51708loss: 22450.518448650837
epoch: 2 total_correct: 52307loss: 20951.674051582813
epoch: 3 total_correct: 52748loss: 19802.543548494577
epoch: 4 total_correct: 52932loss: 19280.239336192608
epoch: 0 total_correct: 36035loss: 60779.63650226593
epoch: 1 total_correct: 46798loss: 34281.91778063774
epoch: 2 total_correct: 49336loss: 28776.17898583412
epoch: 3 total_correct: 50606loss: 25469.293296337128
epoch: 4 total_correct: 51303loss: 23690.684527158737
epoch: 0 total_correct: 10224loss: 130767.5302028656
epoch: 1 total_correct: 26044loss: 88107.41543769836
epoch: 2 total_correct: 33444loss: 66302.87647247314
epoch: 3 total_correct: 37982loss: 56926.10323429108
epoch: 4 total_correct: 40507loss: 49828.25756072998
epoch: 0 total_correct: 29521loss: 87715.04522562027
epoch: 1 total_correct: 42290loss: 46644.066524505615
epoch: 2 total_correct: 43774loss: 41928.28097939491
epoch: 3 total_correct: 44776loss: 3898

Clearing runs for the final time (done with cleaning up the code), running the model, then downloading the zipped runs folder and viewing the logged data on localhost using TensorBoard.

In [35]:
!zip -r runs runs/

  adding: runs/ (stored 0%)
  adding: runs/Oct29_06-14-51_da21891f4764batch_size = 100 lr = 0.01/ (stored 0%)
  adding: runs/Oct29_06-14-51_da21891f4764batch_size = 100 lr = 0.01/events.out.tfevents.1572329691.da21891f4764.123.3 (deflated 88%)
  adding: runs/Oct29_06-17-44_da21891f4764batch_size = 10000 lr = 0.01/ (stored 0%)
  adding: runs/Oct29_06-17-44_da21891f4764batch_size = 10000 lr = 0.01/events.out.tfevents.1572329864.da21891f4764.123.5 (deflated 12%)
  adding: runs/Oct29_06-19-09_da21891f4764batch_size = 100 lr = 0.0001/ (stored 0%)
  adding: runs/Oct29_06-19-09_da21891f4764batch_size = 100 lr = 0.0001/events.out.tfevents.1572329949.da21891f4764.123.6 (deflated 88%)
  adding: runs/Oct29_06-22-07_da21891f4764batch_size = 10000 lr = 0.0001/ (stored 0%)
  adding: runs/Oct29_06-22-07_da21891f4764batch_size = 10000 lr = 0.0001/events.out.tfevents.1572330127.da21891f4764.123.8 (deflated 12%)
  adding: runs/Oct29_06-16-23_da21891f4764batch_size = 1000 lr = 0.01/ (stored 0%)
  adding:

Runs uploaded to GitHub.