#### Performance on MNIST with a linear model doesn't go beyond 87%, so now we'll try a simple feed-forward neural net

In [1]:
#import everything
import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F

import numpy as np
import matplotlib.pyplot as plt

from torchvision.datasets import MNIST
from torchvision.transforms import ToTensor
from torch.utils.data.dataloader import DataLoader
from torch.utils.data import random_split

In [2]:
%matplotlib inline

In [3]:
dataset = MNIST(root='datasets/mnist/',transform=ToTensor())

In [4]:
dataset

Dataset MNIST
    Number of datapoints: 60000
    Root location: datasets/mnist/
    Split: Train
    StandardTransform
Transform: ToTensor()

In [5]:
#the default split is the 60k-sample training set; hold out 10k of it as a validation set
#the split is seeded so the same samples land in train/validation on every Restart & Run All
train_ds,valid_ds = random_split(dataset,[50000,10000],generator=torch.Generator().manual_seed(42))

In [6]:
batch_size = 128

In [7]:
#Build the loaders that serve the train/validation splits in mini-batches.
#The training loader shuffles its samples (shuffle=True); the validation loader keeps the
#default ordering and uses a batch twice as large (batch_size*2).
#pin_memory=True is intended to speed up host-to-GPU copies; the gain is expected to be small for MNIST.
train_loader = DataLoader(train_ds,batch_size,shuffle=True,pin_memory=True,num_workers=2)
valid_loader = DataLoader(valid_ds,batch_size*2,num_workers=4,pin_memory=True)

#The images are 28*28 pixels, which flatten to 784 values,<br>
#so the input layer size is 784.<br>
#Compared to the linear model this adds one extra matrix multiplication, with a ReLU in between.<br>
#If the linear model was y1 = w1*x + b1, we now have y2 = w2*relu(y1) + b2.<br>
#We go from 784 to 32 (the hidden layer), then from 32 to 10, which is the number of output classes (digits 0-9).

In [8]:
#model creation
class MnistModel(nn.Module):
    """Feed-forward classifier with one hidden layer: Linear -> ReLU -> Linear.

    Args:
        input_size: number of features per sample once the input is flattened.
        hidden_size: width of the hidden layer.
        out_size: number of output classes (one logit per class).
    """

    def __init__(self,input_size,hidden_size,out_size):
        super().__init__()
        self.linear1 = nn.Linear(input_size,hidden_size)
        self.linear2 = nn.Linear(hidden_size,out_size)

    def forward(self,X_batch):
        """Flatten each sample and return raw logits of shape (batch, out_size)."""
        # reshape (unlike view) also accepts non-contiguous inputs, e.g. permuted tensors
        X_batch = X_batch.reshape(X_batch.size(0),-1)
        out = self.linear1(X_batch)
        out = F.relu(out)
        out = self.linear2(out)
        return out

    def train_step(self,X_batch):
        """Return the cross-entropy loss for one (images, labels) batch, with autograd graph attached."""
        images,labels = X_batch
        # call the module itself rather than .forward so registered hooks are honoured
        out = self(images)
        loss = F.cross_entropy(out,labels)
        return loss

    def valid_step(self,X_batch):
        """Return {"loss", "acc"} for one (images, labels) batch without tracking gradients.

        The loss is computed under torch.no_grad(), so the returned tensor does not keep
        an autograd graph alive while results are being accumulated across batches.
        """
        images,labels = X_batch
        with torch.no_grad():
            out = self(images)
            loss = F.cross_entropy(out,labels)
        # predicted class = index of the largest logit in each row
        _,preds = torch.max(out,dim=1)
        acc = torch.tensor(torch.sum(preds==labels).item()/len(preds))
        return {"loss":loss,"acc":acc}

In [9]:
model = MnistModel(784,32,10)

In [10]:
def get_default_device():
    """Return the CUDA device when torch reports one as available, otherwise the CPU device."""
    device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
    return torch.device(device_name)

In [11]:
device = get_default_device()

In [12]:
device

device(type='cuda')

In [13]:
def to_device(data,device):
    """Move `data` to `device`.

    Lists and tuples are processed element by element (recursively) and always come
    back as a list; anything else is moved with `.to(device, non_blocking=True)`.
    """
    if not isinstance(data,(list,tuple)):
        return data.to(device,non_blocking=True)
    moved = []
    for item in data:
        moved.append(to_device(item,device))
    return moved

In [14]:
#now data loader has to put data on the cuda device
#for this we need a __iter__ and __len__ method to retrieve a batch and number of batches
class DeviceDataLoader():
    """Wrap a data loader so that every batch it yields is first moved to `device`."""

    def __init__(self,dl,device):
        self.dl, self.device = dl, device

    def __len__(self):
        # number of batches, delegated to the wrapped loader
        return len(self.dl)

    def __iter__(self):
        # lazily walk the wrapped loader, moving each batch just before handing it out
        yield from (to_device(batch,self.device) for batch in self.dl)

In [15]:
train_loader = DeviceDataLoader(train_loader,device)
valid_loader = DeviceDataLoader(valid_loader,device)

In [16]:
def evaluate(model,valid_loader):
    """Run `model.valid_step` over every batch, print and return the averaged metrics.

    Args:
        model: object exposing `valid_step(batch)` that returns a dict with
            tensor entries "loss" and "acc".
        valid_loader: iterable of batches.

    Returns:
        dict with float entries "loss" and "acc". Each is the mean of the per-batch
        values, so every batch is weighted equally regardless of its size.
    """
    # evaluation never calls backward(), so skip building autograd graphs
    with torch.no_grad():
        outputs = [model.valid_step(batch) for batch in valid_loader]
    loss = torch.stack([x['loss'] for x in outputs]).mean()
    acc = torch.stack([x['acc'] for x in outputs]).mean()
    print(f"loss is {loss} and acc is {acc}")
    # callers bind the result (`res = evaluate(...)`), so hand the metrics back
    return {"loss":loss.item(),"acc":acc.item()}

In [17]:
#start again from a freshly initialised model (784 inputs, 32 hidden units, 10 classes)
#and move it to the selected device; the returned module is only echoed as the cell output
model = MnistModel(784,32,10)
to_device(model,device)

MnistModel(
  (linear1): Linear(in_features=784, out_features=32, bias=True)
  (linear2): Linear(in_features=32, out_features=10, bias=True)
)

In [18]:
def train(model,train_loader,valid_loader,epochs=20,lr=0.01,optim_func=torch.optim.SGD):
    """Train `model` for `epochs` passes over `train_loader`, evaluating after each pass.

    Args:
        model: module exposing `parameters()` and `train_step(batch)` (which returns a loss).
        train_loader: iterable of training batches.
        valid_loader: iterable of batches passed to `evaluate` at the end of every epoch.
        epochs: number of passes over `train_loader`.
        lr: learning rate passed to the optimizer.
        optim_func: optimizer constructor, called as `optim_func(model.parameters(), lr)`.
    """
    optimizer = optim_func(model.parameters(),lr)
    for _ in range(epochs):
        for batch in train_loader:
            # backprop this batch's loss, apply the update, then clear the gradients
            model.train_step(batch).backward()
            optimizer.step()
            optimizer.zero_grad()
        # report validation metrics once per epoch
        evaluate(model,valid_loader)

In [19]:
evaluate(model,valid_loader)

loss is 2.3045036792755127 and acc is 0.10488281399011612


In [20]:
train(model,train_loader,valid_loader,10,0.1)

loss is 0.3634752333164215 and acc is 0.8985351324081421
loss is 0.30991020798683167 and acc is 0.911914050579071
loss is 0.27482175827026367 and acc is 0.919238269329071
loss is 0.24124775826931 and acc is 0.9287109375
loss is 0.2216728776693344 and acc is 0.9351562261581421
loss is 0.20776580274105072 and acc is 0.94287109375
loss is 0.19890932738780975 and acc is 0.9447265863418579
loss is 0.18860359489917755 and acc is 0.94775390625
loss is 0.1803520768880844 and acc is 0.949414074420929
loss is 0.17987488210201263 and acc is 0.946484386920929


In [21]:
train(model,train_loader,valid_loader,5,0.1)

loss is 0.16593708097934723 and acc is 0.954296886920929
loss is 0.16016976535320282 and acc is 0.955859363079071
loss is 0.15378592908382416 and acc is 0.956835925579071
loss is 0.15331526100635529 and acc is 0.956835925579071
loss is 0.15337739884853363 and acc is 0.9571288824081421


In [22]:
#we now reach ~95% validation accuracy with just two stacked linear layers; the ReLU between them is what makes this a neural net rather than a single linear model
test_dataset = MNIST(root='datasets/mnist/',train=False,transform=ToTensor())

In [23]:
test_loader = DeviceDataLoader(DataLoader(test_dataset,batch_size=256),device)

In [24]:
result = evaluate(model,test_loader)

loss is 0.12460916489362717 and acc is 0.962695300579071


In [None]:
#now try same model with Fashion MNIST