In [None]:
%pip install numpy
%pip install matplotlib
%pip install graphviz
%pip install scikit-learn
%pip install adjustText

import math
import numpy as np
import matplotlib.pyplot as plt

from sklearn import datasets
from graphviz import Digraph
from adjustText import adjust_text

seed = 1337

np.random.seed(seed)

%matplotlib inline

In [None]:
# Defining a grid of Xs and Ys
# X and Y are coordinate grids with `resolution` points per axis, spanning [-1, 1].
resolution = 100
X, Y = np.meshgrid( np.linspace(-1,1,resolution), np.linspace(-1,1,resolution) )

# Defining 4 different 2D functions
# Each G_k is a bump centred at (mux, muy) whose peak value is exp(0) = 1.
# NOTE(review): `/2.0*sigma**2` parses as `(... / 2.0) * sigma**2`, so a larger
# `sigma` makes the bump NARROWER here, unlike the textbook Gaussian
# `/(2*sigma**2)`. Confirm which shape is intended before changing it; the
# optimisation demo further down depends on this landscape.
mux, muy, sigma = 0.3, -0.3, 4
G1 = np.exp(-((X-mux)**2+(Y-muy)**2)/2.0*sigma**2)

mux, muy, sigma = -0.3, 0.3, 2
G2 = np.exp(-((X-mux)**2+(Y-muy)**2)/2.0*sigma**2)

mux, muy, sigma = 0.6, 0.6, 2
G3 = np.exp(-((X-mux)**2+(Y-muy)**2)/2.0*sigma**2)

mux ,muy, sigma = -0.4, -0.2, 3
G4 = np.exp(-((X-mux)**2+(Y-muy)**2)/2.0*sigma**2)


# Composing the final function
# Two positive bumps (G1, G2) and two negative ones (G3, G4): G has hills and valleys.
G = G1 + G2 - G3 - G4


fig  = plt.figure(figsize=(6*4,6)) # Defining the figure space
axes = fig.subplots(1, 4)          # Defining the subplots in the figure

for ax, g, t in zip(axes.flat, [G1, G2, G3, G4], ['G1', 'G2', 'G3', 'G4']): # Iterating over axes and functions
    ax.imshow(g, vmin=-1, vmax=1, cmap='jet')                               # Ploting the function on the subplot
    ax.set(title=t, xlim=(0, 100), ylim=(0, 100))                           # Setting the title and limits of the subplot (limits are in pixel indices, hard-coded to match resolution = 100)

fig.tight_layout() # Removes extra spacing from the figure

# Second figure: the composed landscape G with a colour bar.
fig, ax = plt.subplots()

cax = ax.imshow(G, vmin=-1, vmax=1, cmap='jet')
ax.set(title="Function", xlim=(0, 100), ylim=(0, 100))

fig.colorbar(cax) # Attaching the colorbar to the figure

fig.tight_layout() 
plt.show()               # Instruct Matplotlib to show the figures created

# fig.savefig("./assets/images/Optimization.png", dpi=300)

In [None]:
n_iter = 5     # Number of Steps to take for optimisation
alpha  = 0.03  # Learning rate of the optimisation

w = np.array([70.0, 60.0]) # Starting Parameter (Point), stored as (x, y) = (column, row) of G
sigma  = 3                 # Standard deviation of the samples around current parameter vector


def landscape_value(point):
    """Return G at an (x, y) point given in pixel coordinates.

    The image is indexed as G[row, col] = G[y, x]. Coordinates are clamped to
    the grid so samples that wander off the landscape read the nearest edge
    value instead of wrapping around (negative index) or raising IndexError.
    """
    col = int(np.clip(point[0], 0, G.shape[1] - 1))
    row = int(np.clip(point[1], 0, G.shape[0] - 1))
    return G[row, col]


fig  = plt.figure( figsize=(5*n_iter, 5) )
# fig.subplots returns a bare Axes (not an array) when n_iter == 1
axes = np.atleast_1d(fig.subplots(1, n_iter))

prevx, prevy = [], []
for q, ax in zip(range(n_iter), axes):

    # Draw the Optimization Landscape
    ax.imshow(G, vmin=-1, vmax=1, cmap='jet')

    # Sample Random Population
    noise = np.random.randn(200, 2)
    wp = np.expand_dims(w, 0) + sigma * noise
    x,y = zip(*wp)

    # Estimate Gradient (Direction)
    # Rewards are standardised so the step size does not depend on the local scale of G.
    R  = np.array([landscape_value(wi) for wi in wp])
    R -= R.mean()
    spread = R.std()
    if spread > 0:          # a perfectly flat neighbourhood carries no direction
        R /= spread
    g  = np.dot(R, noise)
    u  = alpha * g

    prevx.append(w[0])
    prevy.append(w[1])

    # Draw Population on Landscape (Black Points)
    ax.scatter(x, y, 4, 'k', edgecolors='face')

    # Draw estimated gradient (direction) as arrow (White Arrow)
    ax.arrow(w[0], w[1], u[0], u[1], head_width=3, head_length=5, fc='w', ec='w')

    # Draw Parameter History (White Points)
    ax.plot(prevx, prevy, 'wo-')

    # Update Parameter According to the gradient
    w += u

    # Report the reward at the UPDATED point, using the same (row=y, col=x)
    # convention as the sampling above.
    ax.set(title=f"Iteration: {q+1} | Reward: {landscape_value(w):.2f}", xlim=(0, 100), ylim=(0, 100))

fig.tight_layout()

In [None]:
class Value:
    """A scalar node in a dynamically built computation graph with reverse-mode autodiff.

    Every arithmetic operation returns a new ``Value`` that remembers its
    operands (``_prev``), the operation that produced it (``_op``) and a
    closure (``_backward``) that pushes its gradient to those operands.
    Calling ``backward()`` on a result fills in ``grad`` on every node that
    contributed to it.

    Args:
        data: the numeric value held by this node.
        label: display name (used by ``__repr__`` and graph drawing).
        _children: operand nodes this value was computed from.
        _op: short name of the producing operation ('' for leaf nodes).
    """

    def __init__(self, data, label='', _children=(), _op=''):

        # Information about value, gradient and its name
        self.data  = data
        self.grad  = 0.0
        self.label = label

        # Utility attributes for the calculating and passing gradients (Backprop)
        self._backward = lambda: None
        self._prev     = set(_children)
        self._op       = _op

    # Simple arithemtic operations on value and computing corresponding gradients
    def __add__(self, other):
        """self + other; plain numbers are wrapped in a Value first."""
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data + other.data, label='+', _children=(self, other), _op='+')

        def _backward():
            # d(a+b)/da = d(a+b)/db = 1; '+=' because a node may feed several consumers
            self.grad  += out.grad
            other.grad += out.grad
        out._backward = _backward

        return out

    def __mul__(self, other):
        """self * other; plain numbers are wrapped in a Value first."""
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data * other.data, label='*', _children=(self, other), _op='*')

        def _backward():
            # d(a*b)/da = b, d(a*b)/db = a
            self.grad  += other.data * out.grad
            other.grad += self.data  * out.grad
        out._backward = _backward

        return out

    def __pow__(self, other):
        """self ** other for a constant int/float exponent (no gradient flows to the exponent)."""
        assert isinstance(other, (int, float)), "only supporting int/float powers for now"
        out = Value(self.data**other, label=f'**{other}', _children=(self,), _op='**')

        def _backward():
            # d(x^n)/dx = n * x^(n-1)
            self.grad += (other * self.data**(other-1)) * out.grad
        out._backward = _backward

        return out

    # Other arithmetic operations
    ### Don't need to define backward functions since, they use __mul__ or __add__ for which backward is already defined.
    def __neg__(self): # -self
        return self * -1

    def __radd__(self, other): # other + self
        return self + other

    def __sub__(self, other): # self - other
        return self + (-other)

    def __rsub__(self, other): # other - self
        return other + (-self)

    def __rmul__(self, other): # other * self
        return self * other

    def __truediv__(self, other): # self / other
        return self * other**-1

    def __rtruediv__(self, other): # other / self
        return other * self**-1

    # Simple transformations on Value and computing corresponding gradients
    def relu(self):
        """max(0, self); gradient passes through only where the output is positive."""
        out = Value(0 if self.data < 0 else self.data, label='ReLU', _children=(self,), _op='ReLU')

        def _backward():
            self.grad += (out.data > 0) * out.grad
        out._backward = _backward

        return out

    def tanh(self):
        """Hyperbolic tangent of self."""
        # math.tanh saturates cleanly to +/-1 for large |x|. The explicit
        # (e^{2x} - 1) / (e^{2x} + 1) form raises OverflowError once
        # math.exp(2*x) exceeds the float range.
        t = math.tanh(self.data)
        out = Value(t, label='Tanh', _children=(self, ), _op='Tanh')

        def _backward():
            # d tanh(x)/dx = 1 - tanh(x)^2
            self.grad += (1 - t**2) * out.grad
        out._backward = _backward

        return out

    def exp(self):
        """e ** self."""
        x = self.data
        out = Value(math.exp(x), label='Exp',  _children=(self, ), _op='Exp')

        def _backward():
            # d e^x/dx = e^x, which is exactly out.data
            self.grad += out.data * out.grad
        out._backward = _backward

        return out

    # Information when printing instance
    def __repr__(self):
        if self.label:
            return f"Value(node={self.label}, data={self.data}, grad={self.grad})"
        else:
            return f"Value(data={self.data}, grad={self.grad})"

    # Call backward on every node in reverse topological order -> Backprop
    def backward(self):
        """Back-propagate from this node, accumulating d(self)/d(node) into every ancestor's ``grad``.

        Gradients are accumulated with ``+=``; reset them (set ``grad = 0.0``)
        on nodes that are reused across several backward passes.
        """
        # Post-order depth-first traversal with an explicit stack. It yields
        # the same ordering as the natural recursive version but does not hit
        # Python's recursion limit on long chains (e.g. a sum over thousands
        # of terms).
        topo = []
        visited = {self}
        stack = [(self, iter(self._prev))]
        while stack:
            node, pending = stack[-1]
            for child in pending:
                if child not in visited:
                    visited.add(child)
                    stack.append((child, iter(child._prev)))
                    break
            else:
                # every operand of `node` is already placed -> place `node`
                topo.append(node)
                stack.pop()

        self.grad = 1
        for v in reversed(topo):
            v._backward()

In [None]:
# Collects every node and edge reachable from a root node
def trace(root):
    """Walk the graph backwards from ``root`` through each node's ``_prev`` set.

    Returns:
        (nodes, edges): ``nodes`` is the set of all reachable nodes including
        ``root``; ``edges`` is a set of ``(operand, result)`` pairs.
    """
    seen_nodes = set()
    seen_edges = set()

    def visit(node):
        # guard clause: each node is expanded only once
        if node in seen_nodes:
            return
        seen_nodes.add(node)
        for operand in node._prev:
            seen_edges.add((operand, node))
            visit(operand)

    visit(root)
    return seen_nodes, seen_edges

# Renders the graph reachable from a root node with graphviz
def draw_dot(root):
    """Build a left-to-right graphviz ``Digraph`` of the computation behind ``root``.

    Each value becomes a record node showing label, data and grad. Each value
    that was produced by an operation also gets a small operation node feeding
    into it, and operands are wired to that operation node.
    """
    graph_nodes, graph_edges = trace(root)
    dot = Digraph(format='svg', graph_attr={'rankdir': 'LR'})

    for node in graph_nodes:
        node_id = str(id(node))
        record = "{ %s | data %.4f | grad %.4f }" % (node.label, node.data, node.grad)
        dot.node(name = node_id, label = record, shape='record')

        if not node._op:
            continue  # leaf value: nothing produced it

        # operation node is keyed by "<value id><op>" so it is unique per result
        op_id = node_id + node._op
        dot.node(name = op_id, label = node._op)
        dot.edge(op_id, node_id)

    for operand, result in graph_edges:
        # operands point at the operation node of the value they produced
        dot.edge(str(id(operand)), str(id(result)) + result._op)

    return dot

In [None]:
def cost_function(x):
    """Quadratic cost f(x) = x^2 - 4x + 3.

    Works for anything supporting ``**``, ``*``, ``-`` and ``+`` with numbers.
    The operations are applied in the same order as the one-line expression,
    so graph-building operands record an identical graph.
    """
    squared = x**2
    linear  = 4*x
    return squared - linear + 3

In [None]:
# Evaluate the cost function on 100 evenly spaced points in [-7, 15] and plot the curve.
# NOTE: this rebinds the notebook-level names X and Y, which earlier held the
# 2-D meshgrid; from here on they are 1-D arrays.
X = np.linspace(-7, 15, 100)
Y = cost_function(X)

plt.plot(X, Y)
plt.show()

In [None]:
# Run the same cost function on a Value so the computation graph gets recorded,
# then draw that graph (the drawing is the cell's output).
x = Value(15.0, label='X')
y = cost_function(x)

draw_dot(y)

In [None]:
# Back-propagate from y; afterwards x.grad holds dy/dx at x = 15.
# Re-running this cell without rebuilding x adds to x.grad again (grads accumulate).
y.backward()
x

In [None]:
alpha = 0.3 # 0.1 # 0.3 # 0.9 #   (learning rate; the alternatives are values worth trying)
num_iterations = 10

x = Value(15.0, label='X')

xy_list = []
for i in range(num_iterations):

    # Gradients accumulate with '+=' inside Value, so clear the previous step's gradient
    x.grad = 0.0

    # Calculate f(x)
    y = cost_function(x)

    # Calculate dy/dx
    y.backward()

    xy_list.append((x.data, y.data))
    print(f"Step: {i+1:2d} | X: {x.data:5.2f} | f(X): {y.data:8.4f} | Gradient dy/dx: {x.grad:7.4f}")

    # Update x
    # Update the stored number in place. `x -= alpha * x.grad` would instead
    # rebind x to a brand-new '+' node whose history contains every previous
    # step, so each backward pass would walk an ever-growing graph and the
    # 'X' label would be lost.
    x.data -= alpha * x.grad

print(f"\n{'-'*70}\n")

# One row per step: column 0 is x, column 1 is f(x)
xy_list = np.asarray(xy_list)

fig = plt.figure(figsize=(16, 8))
ax  = fig.subplots()

# Cost curve (X, Y come from the plotting cell above) and the descent path on top of it
ax.plot(X, Y)
ax.plot(xy_list[:, 0], xy_list[:, 1], 'r--', marker="o")

# Label every visited point as (step, x, f(x)); adjust_text moves labels apart so they do not overlap
texts = []
for i in range(len(xy_list)):
    text = ax.text(xy_list[i, 0], xy_list[i, 1], f"({i+1}, {round(xy_list[i, 0], 2)}, {round(xy_list[i, 1], 4)})", ha='center', va='center')
    texts.append(text)
adjust_text(texts, expand=(3, 3.5), arrowprops=dict(arrowstyle='->', color='grey'))

ax.set(xlabel='X', ylabel='Cost Function', title=f"$f(x) = y = x^2 - 4x + 3$")
plt.show()

In [None]:
# Two-moons toy classification data: 100 training points and 100 noisier test points.
# NOTE(review): no random_state is passed; presumably the draw then depends on
# the global NumPy RNG state seeded at the top of the notebook — confirm.
X_train, y_train = datasets.make_moons(n_samples=100, noise=0.1)
X_test,  y_test  = datasets.make_moons(n_samples=100, noise=0.15)

# make y be -1 or 1
y_train = y_train*2 - 1 
y_test  = y_test*2 - 1
 
cmap = plt.cm.Spectral

fig, ax = plt.subplots()

# Labels are -1 / 1, so `i % 200` gives colormap indices 199 / 1: two well-separated colours.
# Training points are filled; test points are white with a coloured edge.
ax.scatter(X_train[:,0], X_train[:,1], c=[cmap(i%200) for i in y_train],               s=20)
ax.scatter(X_test[:,0],  X_test[:,1],  edgecolor=[cmap(i%200) for i in y_test], c='w', s=20)

ax.set(xlabel="X$_1$", ylabel="X$_2$", xlim =(X_train[:, 0].min()-1, X_train[:, 0].max()+1), ylim=(X_train[:, 1].min()-1, X_train[:, 1].max()+1))
plt.show()

In [None]:
# A single tanh neuron built by hand, one operation at a time, so that every
# intermediate node can be given a readable label in the drawn graph.

# Inputs x1,x2
x1 = Value(2.0, label='x1')
x2 = Value(0.0, label='x2')

# Weights w1,w2
w1 = Value(-3.0, label='w1')
w2 = Value(1.0, label='w2')

# Bias of the neuron
# With these inputs and weights the pre-activation is n = -6 + 0 + b ~= 0.8814.
b = Value(6.8813735870195432, label='b')

# x1*w1 + x2*w2 + b
# Operations set a generic label ('*', '+'); it is overwritten right after each step.
x1w1 = x1*w1; x1w1.label = 'x1*w1'
x2w2 = x2*w2; x2w2.label = 'x2*w2'

x1w1x2w2 = x1w1 + x2w2; x1w1x2w2.label = 'x1*w1 + x2*w2'

n = x1w1x2w2 + b; n.label = 'n'

# Neuron output after the tanh activation
o = n.tanh(); o.label = 'o'

draw_dot(o)

In [None]:
def plot(x, y, dy, title, ax):
    """Draw a function and its derivative on ``ax`` with axes crossing at the origin.

    Args:
        x: sample points.
        y: function values at ``x``.
        dy: derivative values at ``x``.
        title: name of the function, inserted into the subplot title.
        ax: the Matplotlib axes to draw on.
    """
    curves = (
        (y,  "f(x)",  "#69acc7"),
        (dy, "f'(x)", "#97c784"),
    )
    for values, legend_name, colour in curves:
        ax.plot(x, values, linewidth=3, label=legend_name, color=colour)

    ax.set_title(f"Curve for {title} with its derivative")
    ax.legend(loc='best')

    # Move the left/bottom spines to pass through zero and hide the other two
    for side in ('left', 'bottom'):
        ax.spines[side].set_position('zero')
    for side in ('right', 'top'):
        ax.spines[side].set_color('none')

    ax.xaxis.set_ticks_position('bottom')
    ax.yaxis.set_ticks_position('left')

from scipy.special import erf

# A 2 x 5 gallery: each panel shows one activation function f(x) and its
# analytic derivative f'(x) on x in [-10, 10).
# NOTE: this cell rebinds the notebook-level names x, y and alpha.
fig = plt.figure(figsize=(4*5, 4*2))
axes = fig.subplots(2, 5)

x = np.arange(-10, 10, 0.1)

# Linear: f(x) = x, f'(x) = 1
y  = x 
dy = np.ones_like(x)
plot(x, y, dy, 'Linear', axes[0, 0])

# Sigmoid: f(x) = 1 / (1 + e^-x), f'(x) = f(x) * (1 - f(x))
y  = 1/(1+np.exp(-x)) 
dy = y*(1-y)
plot(x, y, dy, 'Sigmoid', axes[0, 1])

# Tanh: f'(x) = 1 - f(x)^2
y  = (np.exp(x) - np.exp(-x))/(np.exp(x) + np.exp(-x)) 
dy = 1-(y**2)
plot(x, y, dy, 'Tanh', axes[0, 2])

# ReLU: f(x) = max(x, 0); the derivative is drawn as a step that takes the value 1 at x = 0
y  = np.maximum(x, 0) 
dy = np.heaviside(x,1) 
plot(x, y, dy, 'ReLU', axes[0, 3])

# LeakyReLU with negative-side slope alpha
alpha = 0.1
y  = np.where(x<0, alpha*x, x)
dy = np.where(x<0, alpha,   1)
plot(x, y, dy, 'LeakyReLU', axes[0, 4])

# Step: derivative is zero everywhere it is defined
y  = np.heaviside(x,1) 
dy = np.zeros_like(y)
plot(x, y, dy, 'Step', axes[1, 0])

# Softplus: f(x) = log(1 + e^x); its derivative is the sigmoid
y  = np.log(1+np.exp(x))
dy = 1/(1+np.exp(-x))
plot(x, y, dy, 'Softplus', axes[1, 1])

# ELU: alpha * (e^x - 1) for x <= 0, identity otherwise
alpha = 2
y  = np.where(x<=0, alpha*(np.exp(x)-1), x)
dy = np.where(x<=0, alpha*np.exp(x), 1)
plot(x, y, dy, 'ELU', axes[1, 2])

# Swish: f(x) = x * sigmoid(x) = x / f with f = 1 + e^-x;
# quotient rule gives f'(x) = (f + x e^-x) / f^2
f = 1 + np.exp(-x)
y  = x/f
dy = (f + (x*np.exp(-x)))/(f**2)
plot(x, y, dy, 'Swish', axes[1, 3])

# GELU: f(x) = 0.5 x (1 + erf(x / sqrt(2)));
# product + chain rule: f'(x) = 0.5 (1 + erf(s)) + 0.5 x erf'(s) / sqrt(2), s = x / sqrt(2)
s = x / np.sqrt(2)
erf_prime = lambda x: (2 / np.sqrt(np.pi)) * np.exp(-(x ** 2))
y  = 0.5 * x * (1 + erf(s))
dy = 0.5 + 0.5 * erf(s) + ((0.5 * x * erf_prime(s)) / np.sqrt(2))
plot(x, y, dy, 'GELU', axes[1, 4])


fig.tight_layout()
plt.show()

In [None]:
class Module:
    """Base class for anything that owns trainable parameters."""

    # List of Parameters (subclasses override this; the base owns none)
    def parameters(self):
        return []

    # Explictly reset the gradient of every parameter to 0.0
    def zero_grad(self):
        for param in self.parameters():
            param.grad = 0.0

class Neuron(Module):
    """One neuron: a weighted sum of its inputs plus a bias, then an optional activation."""

    # Initialises weights, bias and activations for the neuron
    def __init__(self, nin, activation='ReLU', layer_name='', neuron_name=''):
        # One weight per input, drawn uniformly from [-1, 1); the bias starts at 0
        self.w = []
        for i in range(nin):
            weight = Value(np.random.uniform(-1,1), label=f"Weight of {layer_name} {neuron_name} for Input {i+1}")
            self.w.append(weight)
        self.b = Value(0, label=f"Bias of {layer_name} {neuron_name}")
        self.activation = activation

    # Sets the list of parameters in the neuron
    def parameters(self):
        return self.w + [self.b]

    # Information when printing neuron
    def __repr__(self):
        return f"{self.activation}Neuron(nin={len(self.w)})"

    # Forward Pass -> Compute the output of the neuron
    def __call__(self, x):
        weighted_inputs = (wi*xi for wi,xi in zip(self.w, x))
        pre_activation = sum(weighted_inputs) + self.b

        if self.activation == 'ReLU':
            return pre_activation.relu()
        if self.activation == 'Tanh':
            return pre_activation.tanh()

        # 'Linear' (and any other name) leaves the weighted sum untouched
        return pre_activation

class Layer(Module):
    """A group of ``nout`` neurons that all read the same ``nin`` inputs."""

    def __init__(self, nin, nout, **kwargs):
        # Define neurons of a layer; extra keyword arguments are forwarded to every Neuron
        self.neurons = []
        for i in range(nout):
            self.neurons.append(Neuron(nin, neuron_name=f"Neuron {i+1}", **kwargs))

    # Sets the list of parameters in the layer
    def parameters(self):
        collected = []
        for neuron in self.neurons:
            for param in neuron.parameters():
                collected.append(param)
        return collected

    # Information when printing layer
    def __repr__(self):
        return f"Layer of [ {', '.join(str(n) for n in self.neurons)} ]"

    # Forward Pass -> Compute the output of the layer
    def __call__(self, x):
        outputs = [neuron(x) for neuron in self.neurons]
        # A single-neuron layer returns the bare value instead of a one-element list
        if len(outputs) == 1:
            return outputs[0]
        return outputs

class MLP(Module):
    """Multi-layer perceptron: layers applied one after another.

    Args:
        nin: number of inputs.
        nouts: list with the number of neurons of each layer, in order.
        activations: one activation name per layer; every layer is 'Linear'
            when omitted.
    """

    def __init__(self, nin, nouts, activations=None):
        if activations is None:
            activations = ['Linear'] * len(nouts)
        else:
            assert len(nouts) == len(activations), 'Activations not defined for some layers'

        sz = [nin] + nouts

        # Define layers of a MLP: layer i maps sz[i] inputs to sz[i+1] outputs
        self.layers = []
        for i, (fan_in, fan_out) in enumerate(zip(sz, sz[1:])):
            self.layers.append(Layer(fan_in, fan_out, activation=activations[i], layer_name=f"Layer {i+1}"))

    # Sets the list of parameters in the MLP
    def parameters(self):
        collected = []
        for layer in self.layers:
            collected.extend(layer.parameters())
        return collected

    # Information when printing MLP
    def __repr__(self):
        new_line = f"\n{'-'*8}> "
        return f"MLP of [{new_line}{new_line.join(str(layer) for layer in self.layers)}\n]"

    # Forward Pass -> Compute the output of the MLP
    def __call__(self, x):
        activations_so_far = x
        for layer in self.layers:
            activations_so_far = layer(activations_so_far)
        return activations_so_far

In [None]:
def compute_loss(model, batch_size=None, X=None, y=None):
    """Run ``model`` on a dataset and return loss, scores, predictions and accuracy.

    Args:
        model: callable mapping a list of two ``Value`` inputs to one ``Value`` score.
        batch_size: when given, evaluate on a random subset of this many rows
            (drawn with the global NumPy RNG); otherwise use every row.
        X: array of 2-feature rows. Defaults to the notebook's ``X_train``.
        y: array of labels in {-1, 1}. Defaults to the notebook's ``y_train``.

    Returns:
        dict with keys
        ``'loss'`` (``Value``: mean max-margin loss + L2 penalty),
        ``'scores'`` (list of ``Value``),
        ``'predictions'`` (NumPy array of -1 / 1) and
        ``'accuracy'`` (percentage, 0-100).
    """
    # Resolve the defaults at call time. Binding `X=X_train` in the signature
    # would freeze whatever array existed when this cell ran, silently using
    # stale data after the dataset cell is re-executed.
    if X is None:
        X = X_train
    if y is None:
        y = y_train

    # Process Data in batches, in case data is too big to handle
    if batch_size is None:
        Xb, yb = X, y
    else:
        ri = np.random.permutation(X.shape[0])[:batch_size]
        Xb, yb = X[ri], y[ri]

    # Format Data to our Datatype
    inputs = [ [Value(xrow[0], label='X'), Value(xrow[1], label='Y')] for xrow in Xb]

    # Forward Pass to get the scores
    scores = list(map(model, inputs))

    # Max-Margin Loss to calculate fitness based on scores and y
    # relu(1 - y*score): zero once a point is on the right side with margin >= 1
    losses = [(1 + -yi*scorei).relu() for yi, scorei in zip(yb, scores)]
    output_loss = sum(losses) * (1.0 / len(losses))

    # L2 Regularization (Optional)
    ## To improve performance, we also regularise the parameters.
    alpha = 1e-4
    reg_loss = alpha * sum((p*p for p in model.parameters()))

    # Compute Final Loss -> Max-Margin Loss + L2 Regularization Loss
    loss = output_loss + reg_loss

    # Compute Predictions and Accuracy (the sign of the score is the predicted class)
    predictions = np.array([1 if (scorei.data > 0) else -1 for scorei in scores])
    accuracy    = sum([(yi > 0) == (scorei.data > 0) for yi, scorei in zip(yb, scores)])/len(yb)

    # Return everything required
    data = {}
    data['loss']        = loss
    data['scores']      = scores
    data['predictions'] = predictions
    data['accuracy']    = 100*accuracy
    return data

In [None]:
# Small untrained network used to inspect the graph: 2 inputs -> 2 -> 2 -> 1 neurons.
# No `activations` argument is passed, so every layer uses the 'Linear' default.
model = MLP(nin=2, nouts=[2, 2, 1]) # three layers of neurons: 2, 2 and 1 (output)
print(model)
print(f"Number of Parameters: {len(model.parameters())}")

In [None]:
# Loss and accuracy of the untrained model on the full training set.
data = compute_loss(model)
print(f"Loss: {data['loss'].data:.4f} | Accuracy: {data['accuracy']: 5.2f}%")

In [None]:
# Back-propagate from the score of the first training sample only, then draw
# the graph that produced it (the drawn nodes show the resulting gradients).
data['scores'][0].backward()
draw_dot(data['scores'][0])

In [None]:
n_iter = 20          # number of full-batch optimisation steps
n_log  = 1           # log every n_log steps
learning_rate = lr = 1.0   # base learning rate; `lr` is the per-step decayed value

model = MLP(nin=2, nouts=[16, 16, 1], activations=['ReLU', 'ReLU', 'Linear']) # two hidden ReLU layers + linear output
print(model)
print(f"Number of Parameters: {len(model.parameters())}")
print(f"\n{'-'*70}\n")
train_history = []   # one (loss, accuracy, learning rate) tuple per logged step

# Optimize Iteratively
for k in range(n_iter):

    # Zero-Grad
    model.zero_grad()

    # Forward Pass -> Compute Loss
    data = compute_loss(model)

    # Backward Pass
    data['loss'].backward()

    # Linearly decayed learning rate for THIS step. It is computed before
    # logging so the logged value is the one actually applied below, not the
    # value left over from the previous step.
    lr = learning_rate - 0.9*(k+1)/n_iter

    # Log Details
    if k % n_log == 0:
        print(f"Step: {k+1:3d} | Loss: {data['loss'].data:.4f} | Accuracy: {data['accuracy']:5.2f}% | Learning Rate: {lr:.2f}")
        train_history.append((data['loss'].data, data['accuracy'], lr))

    # Update Weights using SGD
    for p in model.parameters():
        p.data -= lr * p.grad
    

In [None]:
# Plot the three logged training curves side by side.
fig = plt.figure(figsize=(6*3, 6))
axes = fig.subplots(1, 3)

# Turns the list of (loss, accuracy, learning rate) tuples into an array with one column each.
# NOTE: this rebinds train_history, so re-running only this cell is harmless,
# but re-running the training cell resets it to a list.
train_history = np.asarray(train_history)

for ax, d, t in zip(axes.flat, [train_history[:, 0], train_history[:, 1], train_history[:, 2]], ['Loss', 'Accuracy', 'Learning Rate']):
    ax.plot(d)
    ax.set(title=t, xlim=(0, n_iter))

plt.show()

In [None]:
# Evaluate the trained model on both splits and plot the points.
train_data = compute_loss(model, X=X_train, y=y_train)
test_data  = compute_loss(model, X=X_test,  y=y_test)

cmap = plt.cm.Spectral
fig, ax = plt.subplots()

# Correctly classified points keep their class colour; misclassified ones are black.
train_color = [cmap(j%200) if i==j else 'k' for i, j in zip(train_data['predictions'], y_train)]
test_color  = [cmap(j%200) if i==j else 'k' for i, j in zip(test_data['predictions'],  y_test)]

# Training points are filled; test points are white with a coloured edge.
ax.scatter(X_train[:,0], X_train[:,1], c=train_color,                s=20)
ax.scatter(X_test[:,0],  X_test[:,1],  edgecolor=test_color, c='w',  s=20)

ax.set(xlabel="X$_1$", ylabel="X$_2$", xlim =(X_train[:, 0].min()-1, X_train[:, 0].max()+1), ylim=(X_train[:, 1].min()-1, X_train[:, 1].max()+1))
plt.show()

In [None]:
# Visualise Decision Boundary
resolution = 0.25
x_min, x_max = X_train[:, 0].min() - 1, X_train[:, 0].max() + 1 
y_min, y_max = X_train[:, 1].min() - 1, X_train[:, 1].max() + 1 

xx, yy = np.meshgrid(np.arange(x_min, x_max+ resolution, resolution), np.arange(y_min, y_max+ resolution, resolution))

Xmesh = np.c_[xx.ravel(), yy.ravel()]
inputs = [list(map(Value, xrow)) for xrow in Xmesh]

scores = list(map(model, inputs))

Z = np.array([s.data > 0 for s in scores]).reshape(xx.shape)

cmap = plt.cm.Spectral
fig, ax = plt.subplots()

ax.contourf(xx, yy, Z, colors=[cmap(-1%200), cmap(1)], alpha=0.25)

train_color = [cmap(j%200) if i==j else 'k' for i, j in zip(train_data['predictions'], y_train)]
test_color  = [cmap(j%200) if i==j else 'k' for i, j in zip(test_data['predictions'],  y_test)]

ax.scatter(X_train[:,0], X_train[:,1], c=train_color,                s=20)
ax.scatter(X_test[:,0],  X_test[:,1],  edgecolor=test_color, c='w',  s=20)

ax.set(xlabel="X$_1$", ylabel="X$_2$", xlim =(x_min, x_max), ylim=(y_min, y_max))
plt.show()

In [None]:
%pip install torch torchvision

from tqdm.auto import tqdm
from itertools import repeat
from sklearn import manifold
from sklearn.preprocessing import MinMaxScaler
from matplotlib import offsetbox

import torch
import torchvision
import torch.nn.functional as F
from torch import nn as nn, optim as optim
from torchvision import datasets, transforms, utils as vutils

# Seed PyTorch with the notebook-wide seed so weight initialisation is repeatable
torch.manual_seed(seed)
# Run on the GPU when one is available, otherwise fall back to the CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
class Net(nn.Module):
    """Fully connected classifier: 784 -> 512 -> 128 -> 32 -> 10.

    ``forward`` expects a batch of flattened inputs of shape (batch, 784) and
    returns per-class log-probabilities of shape (batch, 10).
    """

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(784, 512)
        self.fc2 = nn.Linear(512, 128)
        self.fc3 = nn.Linear(128, 32)
        self.fc4 = nn.Linear(32, 10)

    def forward(self, x):
        # three hidden layers, each followed by ReLU
        hidden = F.relu(self.fc1(x))
        hidden = F.relu(self.fc2(hidden))
        hidden = F.relu(self.fc3(hidden))
        # output layer: log-softmax over the 10 classes
        logits = self.fc4(hidden)
        return F.log_softmax(logits, dim=1)

In [None]:
# Batch sizes for training and evaluation
batch_size      = 128
test_batch_size = 512

# Keyword arguments forwarded to the DataLoader constructors below.
# NOTE(review): no 'shuffle' entry is set, so the training loader presumably
# iterates the dataset in its stored order — confirm this is intended.
train_kwargs = {'batch_size': batch_size}
test_kwargs  = {'batch_size': test_batch_size}

def infinite_loader(data_loader):
    """Yield items from ``data_loader`` forever, restarting it each time it is exhausted.

    Each pass re-iterates ``data_loader`` from the start, so it must be
    re-iterable (a list or DataLoader, not a one-shot generator).
    """
    while True:
        yield from data_loader
            
# MNIST images converted to tensors by ToTensor; files live under ../data.
# NOTE: `datasets` here is torchvision.datasets (it shadows the earlier
# `from sklearn import datasets`).
# NOTE(review): only the training split passes download=True; the test split
# presumably relies on that download having put all files in the same folder — confirm.
train_dataset = datasets.MNIST('../data', train=True,  download=True, transform=transforms.ToTensor())
test_dataset  = datasets.MNIST('../data', train=False, transform=transforms.ToTensor())
# train_loader is an endless generator of batches; test_loader is a regular, finite DataLoader
train_loader  = infinite_loader(torch.utils.data.DataLoader(train_dataset,**train_kwargs))
test_loader   = torch.utils.data.DataLoader(test_dataset, **test_kwargs)

In [None]:
# Pull one batch from the endless training generator (this consumes it: training
# will start from the following batch) and show its first 32 images as a grid.
batch = next(iter(train_loader))

fig = plt.figure(figsize=(8, 8))
ax = fig.subplots()

# make_grid returns a channels-first image; transpose to (H, W, C) for imshow
ax.imshow(np.transpose(vutils.make_grid(batch[0].to(device)[:32], padding=2, normalize=True).cpu(), (1,2,0)))
ax.set(xticks=[], yticks=[])
plt.show()

In [None]:
# Training configuration for the MNIST model.
# NOTE: rebinds the notebook-level names n_iter, learning_rate, n_log and model.
n_iter          = 1000   # number of optimisation steps (batches)
learning_rate   = 0.1    # initial SGD learning rate
n_log           = 1      # log every n_log steps

model = Net().to(device)
optimizer = optim.SGD(model.parameters(), lr=learning_rate)
# Cosine schedule from `learning_rate` down to eta_min over n_iter scheduler steps
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, eta_min=0.01, T_max=n_iter)

In [None]:
def train(model, device, train_loader, optimizer, scheduler, n_iter):
    """Train ``model`` for ``n_iter`` batches drawn from ``train_loader``.

    Args:
        model: network taking flattened (batch, 784) inputs and returning
            log-probabilities.
        device: device the batches are moved to.
        train_loader: iterable of ``(data, target)`` batches; may be endless.
        optimizer: optimiser stepping ``model``'s parameters.
        scheduler: learning-rate scheduler, stepped once per batch.
        n_iter: number of batches to train on.

    Returns:
        list of ``(loss, learning_rate)`` float tuples, one per logged step
        (every ``n_log`` batches, where ``n_log`` is the notebook-level setting).
    """
    train_history = []
    model.train()
    with tqdm(total=n_iter) as bar:
        # zip with range(n_iter) stops after exactly n_iter batches (including
        # n_iter <= 0) without pulling an extra batch from an endless loader.
        for batch_idx, (data, target) in zip(range(n_iter), train_loader):

            # Converting data to required format
            data, target = data.to(device), target.to(device)
            data = data.view(-1, 784)

            # Explictily Zeroing Gradients
            optimizer.zero_grad()

            # Forward Pass
            output = model(data)

            # Calculate Loss
            loss = F.nll_loss(output, target)

            # Backward Pass
            loss.backward()

            # Updating Weights
            optimizer.step()

            if batch_idx % n_log == 0:
                # loss.item() (called!) gives the Python float; storing the bare
                # `loss.item` would record a bound method instead of a number.
                loss_value = loss.item()
                current_lr = scheduler.get_last_lr()[0]
                bar.update(n_log)
                bar.set_postfix({'Loss':  f"{loss_value:.4f}", 'Learning Rate': f"{current_lr:.4f}"})
                train_history.append((loss_value, current_lr))

            # Changing Learning Rate
            scheduler.step()

    return train_history

def test(model, device, test_loader):
    """Evaluate ``model`` on every batch of ``test_loader`` and print loss and accuracy.

    Args:
        model: network taking flattened (batch, 784) inputs and returning
            log-probabilities.
        device: device the batches are moved to.
        test_loader: finite DataLoader of ``(data, target)`` batches; its
            ``.dataset`` length is used to average the metrics.

    Returns:
        (images, labels, outputs) as NumPy arrays: images reshaped to
        (N, 28, 28, 1), the integer labels, and the model's output vectors.
    """
    model.eval()

    test_loss = 0
    correct   = 0

    images    = []
    labels    = []
    outputs   = []

    # Two context managers separated by a comma. Writing `A and B` here would
    # evaluate to B alone, so torch.no_grad() would never be entered and
    # autograd would stay on during evaluation.
    with torch.no_grad(), tqdm(total=len(test_loader)) as bar:
        for data, target in test_loader:

            # Converting data to required format
            data, target = data.to(device), target.to(device)
            data = data.view(-1, 784)

            # Forward Pass
            output = model(data)

            # Calculate Loss (summed over the batch; averaged over the dataset below)
            loss = F.nll_loss(output, target, reduction='sum').item()

            # Get Prediction
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability

            correct   += pred.eq(target.view_as(pred)).sum().item()
            test_loss += loss
            outputs.extend(output.detach().cpu().numpy())
            images.extend(data.detach().cpu().numpy())
            labels.extend(target.detach().cpu().numpy())

            bar.update(1)

    test_loss /= len(test_loader.dataset)
    correct   /= len(test_loader.dataset)
    # flattened rows -> (N, 1, 28, 28) -> channels-last (N, 28, 28, 1) for plotting
    images      = np.asarray(images).reshape(-1, 1, 28, 28).transpose(0, 2, 3, 1)
    labels      = np.asarray(labels)
    outputs     = np.asarray(outputs)

    print(f"Test set--- Average loss: {test_loss:.4f}, Accuracy: {100. * correct:.2f}%")
    return images, labels, outputs

In [None]:
# Bind whatever train() returns so the cell does not echo it as output
mnist_train_history = train(model, device, train_loader, optimizer, scheduler, n_iter)

In [None]:
# Compile the trained model with TorchScript and write it to 'mnist-try.pt' in the working directory.
torch.jit.script(model).save('mnist-try.pt')

In [None]:
# Load the TorchScript file saved in the previous cell back into a new object.
trained_model = torch.jit.load('mnist-try.pt')

In [None]:
# Evaluate the reloaded model; keeps the test images, labels and model outputs for the embedding plots.
images, labels, outputs = test(trained_model, device, test_loader)

In [None]:
def plot_embedding(embedding, images, labels, title):
    """Scatter a 2-D embedding with digit markers and image thumbnails.

    Args:
        embedding: array of 2-D points, one row per sample.
        images: image per sample, drawn as a thumbnail at its embedded position.
        labels: digit (0-9) per sample; selects the marker glyph and colour.
        title: figure title.
    """
    fig = plt.figure(figsize=(16, 9))
    ax  = fig.subplots()

    # Rescale both embedding axes to [0, 1] so the distance threshold below is scale-free
    X = MinMaxScaler().fit_transform(embedding)

    # One scatter call per digit: the digit itself is used as the marker
    for digit in range(10):
        members = X[labels == digit]
        ax.scatter(*members.T, marker=f"${digit}$", s=60, color=plt.cm.Dark2(digit), alpha=0.425, zorder=2,)

    # Thumbnails are only drawn for points far enough from every thumbnail
    # placed so far; the list starts with a dummy point at the (1, 1) corner.
    shown_images = np.array([[1.0, 1.0]])
    for i, point in enumerate(X):
        squared_distances = np.sum((point - shown_images) ** 2, 1)
        too_close = np.min(squared_distances) < 4e-3
        if not too_close:
            shown_images = np.concatenate([shown_images, [point]], axis=0)
            thumbnail = offsetbox.OffsetImage(images[i], cmap=plt.cm.gray_r)
            imagebox = offsetbox.AnnotationBbox(thumbnail, point)
            imagebox.set(zorder=1)
            ax.add_artist(imagebox)

    ax.set_title(title)
    ax.axis("off")

# Embed the first `num_images` test samples in 2-D with t-SNE, twice:
# once from the raw pixels and once from the network's output vectors.
num_images = 1000

# NOTE(review): no random_state is passed to TSNE, so the layout presumably
# changes between runs — confirm whether reproducibility matters here.
tsne = manifold.TSNE(n_components=2)

# The labels are passed as the second argument of fit_transform.
# NOTE(review): presumably t-SNE is unsupervised and ignores it — verify against the scikit-learn docs.
image_embedding = tsne.fit_transform(images[:num_images].reshape(num_images, -1), labels[:num_images])
plot_embedding(image_embedding, images[:num_images], labels[:num_images], "Raw Data Representation")

output_embedding = tsne.fit_transform(outputs[:num_images].reshape(num_images, -1), labels[:num_images])
plot_embedding(output_embedding, images[:num_images], labels[:num_images], "Neural Network Representation")

In [None]:
%pip install git+https://github.com/RobustBench/robustbench.git 

from scipy.optimize import differential_evolution as de

from robustbench.data import load_cifar10
from robustbench.utils import load_model

# When running on the GPU, release cached GPU memory before loading the next model.
if device == 'cuda':
    torch.cuda.empty_cache()

In [None]:
# Load 50 CIFAR-10 examples and a pretrained classifier from RobustBench.
# NOTE: rebinds the notebook-level names y_test and model used by earlier sections.
x_test, y_test = load_cifar10(n_examples=50)
model = load_model(model_name='Standard', dataset='cifar10', threat_model='Linf').to(device)
# Human-readable class names, indexed by the integer label.
# NOTE(review): assumed to follow the standard CIFAR-10 label order — confirm.
classes = ['Airplane', 'Automobile', 'Bird', 'Cat', 'Deer', 'Dog', 'Frog', 'Horse', 'Ship', 'Truck']

In [None]:
# Pick the example that the rest of the attack demo works on.
image_id = 10
image = x_test[image_id]
label = y_test[image_id]

In [None]:
# Classify the unmodified image: add a batch dimension, run the model, and
# turn the outputs into probabilities.
output = torch.softmax(model(image[None, ...].to(device)), dim=1)
# Highest probability and the class index it belongs to
conf, pred = output.max(dim=1)

fig = plt.figure()
ax = fig.subplots()
# permute channels-first (C, H, W) to channels-last (H, W, C) for imshow
ax.imshow(image.permute(1,2,0))
ax.set(xticks=[], yticks=[], title=f"True Class: {classes[label]}", xlabel=f"Predicted Class {classes[pred.item()]} has {conf.item()*100:.2f}% confidence")
plt.show()

In [None]:
def perturb_image(perturbations, img):
    """Return copies of ``img`` with individual pixels overwritten.

    Args:
        perturbations: 1-D array describing one candidate, or 2-D array with
            one candidate per row. Each candidate is a flat sequence of
            5-tuples ``(x_pos, y_pos, c0, c1, c2)``: a position followed by
            one 0-255 value per channel. Fractional entries are truncated.
        img: channels-first image tensor of shape (C, H, W) with values in [0, 1].

    Returns:
        float tensor of shape (n_candidates, C, H, W) with values in [0, 1].
        ``x_pos`` indexes the second axis of ``img`` and ``y_pos`` the third.

    Positions are clamped to the image and colour values to [0, 255], so a
    candidate sitting exactly on an optimiser bound (e.g. position 32 on a
    32-pixel image, or colour 256) cannot raise IndexError or produce a pixel
    value above 1.
    """
    perturbations = np.atleast_2d(np.asarray(perturbations)).astype(int)

    # Round (rather than truncate) when going back to 0-255 integers: a float
    # such as 126.99999 must map to 127, otherwise untouched pixels drift by one level.
    base = np.rint((img * 255).detach().cpu().numpy()).astype(int)
    last_row = base.shape[1] - 1
    last_col = base.shape[2] - 1

    # One independent copy of the image per candidate
    imgs = np.tile(base, (len(perturbations), 1, 1, 1))

    for candidate, canvas in zip(perturbations, imgs):
        # `canvas` is a view into `imgs`, so the assignment edits the batch in place
        for x_pos, y_pos, *value in candidate.reshape(-1, 5):
            row = min(max(int(x_pos), 0), last_row)
            col = min(max(int(y_pos), 0), last_col)
            canvas[:, row, col] = np.clip(value, 0, 255)

    imgs = imgs/255.0
    imgs = torch.from_numpy(imgs).float()

    return imgs

In [None]:
# Hand-made single-pixel perturbation: position (6, 6) set to pure red (255, 0, 0).
pixel = np.array([6, 6, 255, 0, 0]) 
# perturb_image returns a batch; take its only element
image_perturbed = perturb_image(pixel, image)[0]

# Classify the perturbed image and show it with the prediction
output = torch.softmax(model(image_perturbed[None, ...].to(device)), dim=1)
conf, pred = output.max(dim=1)

fig = plt.figure()
ax = fig.subplots()
ax.imshow(image_perturbed.permute(1,2,0))
ax.set(xticks=[], yticks=[], title=f"True Class: {classes[label]}", xlabel=f"Predicted Class {classes[pred.item()]} has {conf.item()*100:.2f}% confidence")
plt.show()

In [None]:
def attack(image, target, pixel_count=1, maxiter=100, popsize=256):
    """Search for a few-pixel perturbation that makes the model misclassify ``image``.

    Differential evolution minimises the model's confidence in ``target`` over
    candidates made of ``pixel_count`` 5-tuples ``(x_pos, y_pos, r, g, b)``.
    The search stops early as soon as the best candidate is no longer
    classified as ``target``. Uses the notebook-level ``model`` and ``device``.

    Args:
        image: channels-first image tensor (3, H, W) with values in [0, 1].
        target: index of the class to move the prediction away from (the true label).
        pixel_count: number of pixels the perturbation may change.
        maxiter: maximum number of generations.
        popsize: approximate population size; converted to SciPy's per-parameter multiplier.

    Returns:
        1-D array with the best perturbation found (``5 * pixel_count`` entries).
    """
    # Position bounds follow the image size (32 x 32 for CIFAR-10); colours span 0-256
    height, width = image.shape[-2], image.shape[-1]
    bounds = [(0,height), (0,width), (0,256), (0,256), (0,256)] * pixel_count
    # SciPy's `popsize` is a multiplier on the number of parameters
    popmul = max(1, popsize // len(bounds))

    def run(perturbations, image, target, evaluate=False):
        images_perturbed = perturb_image(perturbations, image).to(device)
        # Black-box evaluation only: no gradients are needed, so skip building
        # an autograd graph for every candidate in the population.
        with torch.no_grad():
            probabilities = torch.softmax(model(images_perturbed), dim=1)
        if evaluate:
            prediction = torch.argmax(probabilities, dim=1)
            return prediction != target
        else:
            confidence = probabilities[:, target].detach().cpu().numpy()
            return confidence

    def predict_fn(xs):
        # vectorized=True passes candidates as columns: (n_params, n_candidates)
        xs = xs.transpose()
        return run(xs, image, target, evaluate=False)

    def callback_fn(x, convergence):
        # Returning True stops the optimisation; hand SciPy a plain bool, not a tensor
        return bool(run(x, image, target, evaluate=True).any())

    result = de(
        predict_fn, bounds=bounds, maxiter=maxiter, popsize=popmul, vectorized=True,
        updating='deferred',  # the mode vectorized evaluation runs in; stated explicitly
        recombination=1, atol=-1, callback=callback_fn, polish=False, disp=True)

    return result.x

In [None]:
# Search for a 5-pixel perturbation that moves the prediction away from the true label.
attacked_perturbation = attack(image, label, pixel_count=5)

In [None]:
# Apply the perturbation found by the attack (perturb_image returns a batch of one)
image_perturbed = perturb_image(attacked_perturbation, image)[0]

# Inference only: no autograd graph needed
with torch.no_grad():
    output = torch.softmax(model(image_perturbed[None, ...].to(device)), dim=1)
conf, pred = output.max(dim=1)   # top probability and its class index
true_conf = output[:, label]     # probability still assigned to the true class

fig = plt.figure()
ax = fig.subplots()
# permute channels-first (C, H, W) to channels-last (H, W, C) for imshow
ax.imshow(image_perturbed.permute(1,2,0))
ax.set(xticks=[], yticks=[], title=f"True Class: {classes[label]}", xlabel=f"Predicted Class {classes[pred.item()]} has {conf.item()*100:.2f}% confidence\nTrue Class {classes[label]} has {true_conf.item()*100:.2f}% confidence")
plt.show()