# Neural Networks on MNIST data

In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
import torch
from IPython.core.debugger import set_trace

In [None]:
from fastai.imports import *
from fastai.torch_imports import *
from fastai.io import *

---
## Data loading 

In [None]:
path = 'data/'

In [None]:
URL='http://deeplearning.net/data/mnist/'
FILENAME='mnist.pkl.gz'

def load_mnist(filename):
    """Load the pickled MNIST splits from a gzip-compressed file.

    Args:
        filename: Path to the gzip-compressed pickle file.

    Returns:
        The object stored in the pickle. The caller in this notebook unpacks
        it as ``((x, y), (x_valid, y_valid), (x_test, y_test))``.
    """
    # NOTE: unpickling can execute arbitrary code, so only load archives that
    # come from a trusted source.
    # The context manager closes the file handle even if unpickling raises;
    # the previous one-liner never closed it.
    with gzip.open(filename, 'rb') as f:
        # NOTE(review): encoding='latin-1' is presumably needed because the
        # archive was pickled under Python 2 -- confirm.
        return pickle.load(f, encoding='latin-1')

In [None]:
get_data(URL+FILENAME, path+FILENAME)

((x, y), (x_valid, y_valid), (x_test,y_test)) = load_mnist(path+FILENAME)

In [None]:
len(x), len(y), len(x_valid), len(y_valid), len(x_test),len(y_test)

The training set has 50,000 images; the validation and test sets have 10,000 images each. There are 10 classes.

---
## Normalization

##### Normalizing the inputs helps when feeding data to a neural network. A random forest (RF) is not affected by the scale of its features, so it does not need normalization. A neural network is affected by scale: without normalization it has to learn the mean and standard deviation of the data by itself.

In [None]:
mean = x.mean()

std = x.std()

x = (x -mean)/std

mean,std,x.mean(),x.std()

### Note on normalization:
* Train and validation data should use the same normalization
* For RGB images, normalize each channel separately
* Structured data: normalize each feature separately

##### The same normalization has to be applied to the validation/test data

In [None]:
x_valid = (x_valid-mean)/std

In [None]:
x_valid.mean(),x_valid.std()

In [None]:
x_valid.shape

### Reshaping your tensors

In [None]:
x_valid.reshape(-1, 28,28).shape #-1 is entered as reshape will identify by itself what is that order based on input

In [None]:
x_imgs = x_valid.reshape(-1,28,28) #store 10000 images as matrix of 28*28

In [None]:
def img_show(img,title = None):
    """Draw `img` with a gray colormap via `plt.imshow`; add `title` when one is given."""
    plt.imshow(img, cmap='gray')
    if title is None:
        return
    plt.title(title)

In [None]:
img_show(x_imgs[0],"sample") #this will pick the first image

In [None]:
img_show(x_imgs[0,0:28,0:15]) #indexing into the image and grabbing a portion of the image

In [None]:
img_show(x_imgs[1,0:15,])

##### len will return the length of the 1st dimension

In [None]:
len(x_imgs) 

In [None]:
def plots(ims, figsize=(12,6), rows=2, titles=None):
    """Show a sequence of images in a grid of subplots.

    Args:
        ims: Sequence of images, each accepted by ``plt.imshow``.
        figsize: Figure size passed to ``plt.figure``.
        rows: Number of rows in the grid.
        titles: Optional sequence of titles, indexed like ``ims``.
    """
    f = plt.figure(figsize=figsize)
    # Round the column count up so every image gets a cell. Floor division
    # left too few cells when len(ims) was not a multiple of rows (and zero
    # columns when len(ims) < rows), which made add_subplot fail.
    cols = max(1, -(-len(ims) // rows))
    for i, im in enumerate(ims):
        sp = f.add_subplot(rows, cols, i+1)
        sp.axis('Off')
        if titles is not None: sp.set_title(titles[i], fontsize=16)
        plt.imshow(im, cmap='gray')

## Neural Nets with Pytorch

In [None]:
from fastai.metrics import *
from fastai.model import *
from fastai.dataset import *
import torch.nn as nn
import torch
from torch.autograd import Variable


In [None]:
d = torch.randn(2,3);d #creating a tensor of size 2*3

### Creating neural network, the easiest way

### Defining the network using pytorch completely

In [None]:
# Single-layer classifier: a 784 -> 10 linear layer followed by log-softmax.
net1 = nn.Sequential(
    nn.Linear(28*28,10,bias=True), #input: number of images * 784, output: number of images * 10
    # Name the class dimension explicitly. LogSoftmax without `dim` relies on
    # a deprecated implicit choice and emits a warning.
    nn.LogSoftmax(dim=1))

In [None]:
md = ImageClassifierData.from_arrays(path,(x,y),(x_valid,y_valid))
# net1 already ends in LogSoftmax, so the matching criterion is NLLLoss.
# CrossEntropyLoss applies log-softmax itself and expects raw scores; feeding
# it log-probabilities gives the same value but does the work twice.
loss = nn.NLLLoss() #negative log-likelihood on log-probabilities (= cross entropy)
metrics = [accuracy] #calculate accuracy based on predicted labels
opt = optim.Adam(net1.parameters()) #default learning rate = 1e-3

### Breaking it down

#### Looking at the linear layer

In [None]:
a = nn.Linear(2*2,2);a

In [None]:
a.weight,a.bias #these are randomly initialized

In [None]:
# 5 sample rows with 4 features each, matching the 2*2 inputs of the linear layer `a`.
# The cell now displays b; it used to display x, the MNIST training array.
b = torch.autograd.Variable(torch.randn(5,4));b #sample input

In [None]:
a(b) #it applies the linear layer on the sample input defined

#### Looking at the softmax function

In [None]:
# dim=1 normalizes across the classes of each row; omitting dim is deprecated.
m = nn.LogSoftmax(dim=1)

In [None]:
a(b) #sample input: assume 5 images of 2*2 size flattened out

In [None]:
o = m(a(b));o #applying logsoftmax to the linear layer's output a(b)

In [None]:
c = torch.exp(o);c #visualize what happens before the log....each row is an image with probabilities for 2 classes

In [None]:
c.sum(dim = 1) #summing the probabilities in each row should give 1, as expected

#### Understanding how loss function works

In [None]:
o #output of the network: 5 images and 2 probabilities, one for each class

In [None]:
target = torch.autograd.Variable(torch.LongTensor([1,0,0,1,1]));target #actual labels for the images

In [None]:
loss(o,target)

#### Understanding the parameters of the network

##### Parameters are special variables that need to be optimized

In [None]:
#print the weight and bias that is initialized for the network
for param in net1.parameters():
    print(param.data,param.size())

### Network training 

In [None]:
fit(net1,md,opt=opt,epochs=3,crit=loss, metrics = metrics) #accuracy is calculated on validation: the right most value

**Learning rate annealing: when the accuracy stops improving, the learning rate may be too high (each step overshoots), so decrease it.**

In [None]:
set_lrs(opt,1e-4) #you can change the learning rate. I'm reducing it for now to 1e-4

In [None]:
fit(net1,md,opt=opt,epochs=3,crit=loss, metrics = metrics)

##### And we can see that it worked!!

### Prediction on validation and calculating the accuracy

In [None]:
preds_logp = predict(net1,md.val_dl)

In [None]:
preds_logp.shape

In [None]:
preds_logp #these are log probabilities for 10000 images and 10 classes

In [None]:
preds = preds_logp.argmax(axis=1) #converting them to labels

In [None]:
preds.shape,preds #class prediction for each image

In [None]:
np.mean(preds == md.val_y) # accuracy calculation

---

### Building it from scratch with minimum use of pytorch inbuilt functions

### Defining your network

##### We are defining layers by ourselves instead of using nn.Sequential

In [None]:
def get_params(*dims):
    """Return a trainable nn.Parameter of shape `dims`: torch.randn values divided by dims[0]."""
    # Dividing by the first dimension is for numerical stability; otherwise
    # the weights will explode or die out.
    scaled = torch.randn(dims) / dims[0]
    return nn.Parameter(scaled)
def softmax(x):
    """Row-wise softmax of a 2-D tensor of scores.

    Args:
        x: Tensor of shape (n_rows, n_classes).

    Returns:
        Tensor of the same shape whose rows are non-negative and sum to 1.
    """
    # Subtract each row's maximum before exponentiating. Softmax is unchanged
    # by a per-row shift, and this keeps exp() from overflowing to inf (which
    # turned the result into nan) when the scores are large.
    shifted = x - x.max(dim=1)[0][:,None]
    e = torch.exp(shifted)
    return e/(e.sum(dim=1)[:,None])
class custom_net(nn.Module):
    """Hand-written single-layer classifier: flatten -> x@w + b -> log(softmax)."""
    def __init__(self):
        super().__init__()
        self.w = get_params(28*28,10) # weight matrix: 784 inputs x 10 classes
        self.b = get_params(10) # bias: one entry per class
    def forward(self,x):
        n_rows = x.size(0)
        flat = x.view(n_rows,-1) # keep the first dimension, flatten the rest
        scores = torch.matmul(flat, self.w) + self.b # the linear layer
        return torch.log(softmax(scores)) # log of the softmax probabilities

net2 = custom_net()
md = ImageClassifierData.from_arrays(path,(x,y),(x_valid,y_valid),bs=64)
opt = optim.Adam(net2.parameters(),lr=1e-3)
# custom_net.forward already returns log(softmax(...)), so the matching
# criterion is NLLLoss. CrossEntropyLoss would apply log-softmax a second time.
loss = nn.NLLLoss() #negative log-likelihood on log-probabilities (= cross entropy)
metrics = [accuracy] #calculate accuracy based on predicted labels

In [None]:
net2.b.size(),net2.w.size()

### Training your network

In [None]:
fit(data=md,model=net2,crit=loss,epochs=3,metrics=metrics,opt=opt)

In [None]:
t = [o.numel() for o in net2.parameters()] #to get number of elements in each layer: weights and bias matrix
t, sum(t)

* When the module is called as a function, its forward method is called by default
* Log-probabilities are used for numerical stability

### Predicting on a batch of validation image through the network

In [None]:
xmb,ymb = next(iter(md.val_dl)) #you get a batch of 64 images

In [None]:
vxmb = Variable(xmb) #creating it as variable

In [None]:
vxmb

In [None]:
pred_mb = net2(vxmb).exp();pred_mb #actual probabilities

In [None]:
y_pred = pred_mb.max(1)[1] #pytorch version of argmax: max(1) returns (max values, their indices) for each row; [1] keeps the indices, i.e. the predicted class

In [None]:
np.mean(to_np(y_pred)==to_np(ymb)) #to_np converts it to numpy array

---

### Deeper Neural Networks

In [None]:
#deeper nn
# Two-layer network: 784 -> 100 hidden units with ReLU -> 10 classes, then log-softmax.
net3 = nn.Sequential(
    nn.Linear(28*28,100),
    nn.ReLU(),
    nn.Linear(100,10),
    # Name the class dimension explicitly; LogSoftmax without `dim` is deprecated.
    nn.LogSoftmax(dim=1)) #use cuda() for GPU

md = ImageClassifierData.from_arrays(path,(x,y),(x_valid,y_valid),bs=64)
opt = optim.Adam(net3.parameters(),lr=1e-3)
# net3 ends in LogSoftmax, so the matching criterion is NLLLoss.
# CrossEntropyLoss would apply log-softmax a second time.
loss = nn.NLLLoss() #negative log-likelihood on log-probabilities (= cross entropy)
metrics = [accuracy] #calculate accuracy based on predicted labels

In [None]:
fit(data=md,model=net3,crit=loss,epochs=3,metrics=metrics,opt=opt)

#### We can see that deeper networks are working much better

---

### Breaking it further into elements

##### Understanding how optimizer works

In [None]:
net4 = custom_net()
md = ImageClassifierData.from_arrays(path,(x,y),(x_valid,y_valid),bs=64)
opt = optim.Adam(net4.parameters(),lr=1e-3) 

In [None]:
xt,yt = next(iter(md.trn_dl))

In [None]:
xt.shape

In [None]:
ypred = net4(Variable(xt))

In [None]:
ypred #prediction on a batch of training images

In [None]:
w,b = net4.w,net4.b #storing weights and biases

In [None]:
w,b 

In [None]:
l = loss(ypred,Variable(yt));l

In [None]:
l.backward() #calculates gradients of loss with respect to every parameter

In [None]:
w.grad.data,b.grad.data

In [None]:
learning_rate = 1e-3

In [None]:
w.data - w.grad.data*learning_rate #this is what optimizer is doing behind the scenes

In [None]:
opt.step()

In [None]:
w.data

##### Replacing the optimizer and metrics

In [None]:
def score(x, y, model=None):
    """Accuracy of a model on one mini-batch.

    Args:
        x: Batch of inputs; wrapped with ``V`` before being passed to the model.
        y: Batch of true class labels.
        model: Model to evaluate. Defaults to ``net5``, the network trained by
            the manual SGD loop that calls this function.

    Returns:
        Fraction of rows whose arg-max prediction equals the label.
    """
    # This used to be hard-wired to net2, so the validation score printed
    # after training net5 described a different network.
    if model is None: model = net5
    y_pred = to_np(model(V(x)))
    return np.sum(y_pred.argmax(axis=1) == to_np(y))/len(y_pred)

In [None]:
net5 = custom_net()
# custom_net already returns log-probabilities, so NLLLoss is the matching
# criterion (CrossEntropyLoss would apply log-softmax a second time).
loss=nn.NLLLoss()
lr = 1e-3
w,b = net5.w,net5.b

for epoch in range(1):
    losses=[]
    # Iterate the loader directly instead of range(len(iter(...))) + next(),
    # which only works when the iterator object itself supports len().
    for xt, yt in md.trn_dl:
        y_pred = net5(V(xt))
        l = loss(y_pred, Variable(yt))
        # Keep only the numeric loss value; appending `l` itself kept every
        # batch's autograd Variable alive for the whole epoch.
        losses.append(to_np(l))

        # Backward pass: compute gradient of the loss with respect to model parameters
        l.backward()
        # Manual SGD step, standing in for optimizer.step()
        w.data -= w.grad.data * lr
        b.data -= b.grad.data * lr

        # Reset the gradients, standing in for optimizer.zero_grad();
        # backward() would otherwise add to the gradients of the previous batch.
        w.grad.data.zero_()
        b.grad.data.zero_()

    # Mean of the per-batch accuracies over the validation loader
    val_scores = [score(xv, yv) for xv, yv in md.val_dl]
    print(np.mean(val_scores))

* Regularization: weight decay or dropout
* Weight decay is similar to L1/L2 regularization but is not exactly the same
* Weight decay can be applied either during the weight update or as a regularization term in the loss function
* With the Adam optimizer, however, these two are different concepts
* Sometimes weight decay can improve the loss surface
* Weight decay ultimately results in a relatively poorer training loss

In [None]:
w.grad.data

----

## Broadcasting

In [None]:
a = np.array([1,5,3])
b = np.array([3,4,4])

In [None]:
a.shape,b.shape

#### element wise operation

In [None]:
a+b

In [None]:
(a>b)

In [None]:
a*b #element wise multiplication

In [None]:
a = T([1,5,3])
b = T([3,4,4])

In [None]:
a,b

In [None]:
a.size(),b.size()

In [None]:
a+b

In [None]:
a>b

0 is being broadcast here

In [None]:
a>0 #comparison of rank 1 tensor with rank 0 tensor

In [None]:
x = np.array([[1,2,3],[3,4,5],[9,1,2]])

In [None]:
x

In [None]:
x.shape

In [None]:
2*x

2 has been broadcast here

In [None]:
np.broadcast_to(2,x.shape)

In [None]:
y = np.array([10,20,30])

In [None]:
y.shape #rank 1 tensor is considered as a row vector

In [None]:
x

In [None]:
x+y

In [None]:
np.broadcast_to(y,x.shape)

In [None]:
y[None]

In [None]:
y[:,None]

In [None]:
x+y[:,None]

In [None]:
np.broadcast_to(y[:,None],x.shape)

In [None]:
y[:,None].shape

In [None]:
y[None,:,None]

In [None]:
x = np.array([[1,2,3],[3,4,5],[9,1,2],[1,3,4]])

In [None]:
x.shape

In [None]:
np.broadcast_to(y, x.shape)

In [None]:
x@y

In [None]:
torch.randn(4,3) + torch.randn(3)

## Broadcasting rules

In [None]:
s = y[None]

In [None]:
t = y[:,None]

In [None]:
s,t

In [None]:
s.shape,t.shape

In [None]:
s@t

In [None]:
s*t

In [None]:
t@s

In [None]:
s*y

In [None]:
t*y

In [None]:
x,y

In [None]:
x.shape

In [None]:
y.shape

In [None]:
(x@y).shape #empty dim is taken as 1

In [None]:
(x@y)

In [None]:
(T(x)@T(y)).size()

In [None]:
(T(x)@T(y))

In [None]:
(x*y).sum(axis =1) #same as matrix multiplication

----