<a href="https://colab.research.google.com/github/sangchun1/AI4DL/blob/main/Week%203/chapter02_mathematical-building-blocks.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This is a companion notebook for the book [Deep Learning with Python, Second Edition](https://www.manning.com/books/deep-learning-with-python-second-edition?a_aid=keras&a_bid=76564dff). For readability, it only contains runnable code blocks and section titles, and omits everything else in the book: text paragraphs, figures, and pseudocode.

**If you want to be able to follow what's going on, I recommend reading the notebook side by side with your copy of the book.**

This notebook was generated for TensorFlow 2.6.

# The mathematical building blocks of neural networks

## The gears of neural networks: tensor operations

### Element-wise operations

In [1]:
def naive_relu(x):
    """Return a copy of the rank-2 array ``x`` with every entry replaced by ``max(entry, 0)``.

    The argument itself is never modified: the clamping is done on a copy,
    one element at a time, with plain Python loops.
    """
    assert len(x.shape) == 2
    clamped = x.copy()
    n_rows, n_cols = clamped.shape
    for row in range(n_rows):
        for col in range(n_cols):
            clamped[row, col] = max(clamped[row, col], 0)
    return clamped

In [2]:
import numpy as np
# Apply naive_relu to a small integer matrix that contains negative entries.
x = np.array([[1,-2,3,-4], [5,6,-7,8]])
print(naive_relu(x))

[[1 0 3 0]
 [5 6 0 8]]


In [3]:
def naive_add(x, y):
    """Element-wise sum of two same-shaped rank-2 arrays, computed with explicit loops.

    A copy of ``x`` serves as the accumulator, so neither argument is
    modified; the returned array is that updated copy.
    """
    assert len(x.shape) == 2
    assert x.shape == y.shape
    total = x.copy()
    n_rows, n_cols = total.shape
    for row in range(n_rows):
        for col in range(n_cols):
            total[row, col] += y[row, col]
    return total

In [4]:
# Check naive_add on two 2x3 integer matrices.
x = np.array([[1,2,3], [4,5,6]])
y = np.array([[1,2,3], [1,2,3]])
print(naive_add(x, y))

[[2 4 6]
 [5 7 9]]


In [5]:
import time

# Two random 20x100 matrices used as benchmark inputs; the next timing cell reuses them.
x = np.random.random((20, 100))
y = np.random.random((20, 100))

# Time 1000 rounds of NumPy's vectorized addition followed by an element-wise maximum with 0.
t0 = time.time()
for _ in range(1000):
    z = x + y
    z = np.maximum(z, 0.)
print("Took: {0:.2f} s".format(time.time() - t0))

Took: 0.01 s


In [6]:
# Same 1000-round benchmark on the same x and y, this time using the
# pure-Python loop implementations naive_add and naive_relu.
t0 = time.time()
for _ in range(1000):
    z = naive_add(x, y)
    z = naive_relu(z)
print("Took: {0:.2f} s".format(time.time() - t0))

Took: 2.30 s


### Broadcasting

In [7]:
import numpy as np
# X is a (32, 10) matrix and y a length-10 vector, both filled with random values.
X = np.random.random((32, 10))
y = np.random.random((10,))

In [8]:
# Insert a new leading axis into y. Not idempotent: every re-run of this cell adds one more axis.
y = np.expand_dims(y, axis=0)

In [9]:
# Repeat y 32 times along axis 0 to build Y.
Y = np.concatenate([y] * 32, axis=0)

In [10]:
def naive_add_matrix_and_vector(x, y):
    """Add the rank-1 array ``y`` to every row of the rank-2 array ``x``.

    ``y`` must have as many entries as ``x`` has columns. The sum is built
    element by element on a copy of ``x``, which is returned; the inputs
    are left unchanged.
    """
    assert len(x.shape) == 2
    assert len(y.shape) == 1
    assert x.shape[1] == y.shape[0]
    shifted = x.copy()
    n_rows, n_cols = shifted.shape
    for row in range(n_rows):
        for col in range(n_cols):
            shifted[row, col] += y[col]
    return shifted

In [13]:
# Re-create X (32, 10) and a rank-1 y (10,): naive_add_matrix_and_vector asserts
# that its second argument has exactly one axis.
X = np.random.random((32, 10))
y = np.random.random((10,))
n = naive_add_matrix_and_vector(X, y)
print(n.shape)

(32, 10)


In [15]:
# Small worked example: add the vector b to each row of the 2x4 matrix a,
# printing the shape and contents of the inputs and of the result.
a = np.array([[1, 2, 3, 4], [5, 6, 7, 8]])
print(a.shape, a)
b = np.array([10, 10, 10, 10])
print(b.shape, b)
ab = naive_add_matrix_and_vector(a, b)
print(ab.shape, ab)

(2, 4) [[1 2 3 4]
 [5 6 7 8]]
(4,) [10 10 10 10]
(2, 4) [[11 12 13 14]
 [15 16 17 18]]


In [16]:
import numpy as np
# Element-wise maximum of a (64, 3, 32, 10) tensor and a (32, 10) matrix:
# y is broadcast over the two leading axes of x, so z has x's shape.
x = np.random.random((64, 3, 32, 10))
y = np.random.random((32, 10))
z = np.maximum(x, y)
z.shape

(64, 3, 32, 10)

In [17]:
# Worked broadcasting example: the length-3 vector b is compared against
# every length-3 row along the last axis of the (2, 2, 3) tensor a.
a = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]])
print(a.shape)
b = np.array([3, 9, 13])
print(b.shape)
c = np.maximum(a, b)
print(c, c.shape)

(2, 2, 3)
(3,)
[[[ 3  9 13]
  [ 4  9 13]]

 [[ 7  9 13]
  [10 11 13]]] (2, 2, 3)


### Tensor product

In [18]:
# Dot product of two random rank-1 arrays of length 32, using NumPy's built-in np.dot.
x = np.random.random((32,))
y = np.random.random((32,))
z = np.dot(x, y)

In [19]:
def naive_vector_dot(x, y):
    """Dot product of two equal-length rank-1 arrays, computed with an explicit loop.

    The running sum starts from the float ``0.``, so two empty vectors
    yield ``0.0``.
    """
    assert len(x.shape) == 1
    assert len(y.shape) == 1
    assert x.shape[0] == y.shape[0]
    total = 0.
    # Walk both vectors in lockstep, accumulating products left to right.
    for left, right in zip(x, y):
        total += left * right
    return total

In [20]:
# Compare naive_vector_dot against the same dot product written out by hand.
a = np.array([1, 2, 3, 4, 5])
b = np.array([6, 7, 8, 9, 10])
c = naive_vector_dot(a, b)
print(c)
print(1*6 + 2*7 + 3*8 + 4*9 + 5*10)

130.0
130


In [21]:
def naive_matrix_vector_dot(x, y):
    """Product of the rank-2 array ``x`` with the rank-1 array ``y``, via nested loops.

    ``y`` must have one entry per column of ``x``. The result is a rank-1
    array created with ``np.zeros``, holding one entry per row of ``x``.
    """
    assert len(x.shape) == 2
    assert len(y.shape) == 1
    assert x.shape[1] == y.shape[0]
    n_rows, n_cols = x.shape
    result = np.zeros(n_rows)
    for row in range(n_rows):
        for col in range(n_cols):
            result[row] += x[row, col] * y[col]
    return result

In [22]:
# Compare naive_matrix_vector_dot against the two row-by-vector sums written out by hand.
a = np.array([[1, 2, 3], [4, 5, 6]])
b = np.array([6, 7, 8])
c = naive_matrix_vector_dot(a, b)
print(c)
print(1*6 + 2*7 + 3*8, 4*6 + 5*7 + 6*8)

[ 44. 107.]
44 107


In [23]:
def naive_matrix_vector_dot(x, y):
    """Matrix-vector product expressed as one ``naive_vector_dot`` per row of ``x``.

    This redefinition replaces the nested-loop function of the same name
    from the earlier cell. It performs no shape checks of its own; the
    assertions inside ``naive_vector_dot`` validate each row against ``y``.
    """
    n_rows = x.shape[0]
    result = np.zeros(n_rows)
    for row_index in range(n_rows):
        result[row_index] = naive_vector_dot(x[row_index, :], y)
    return result

In [24]:
# Re-run the same check against the redefined, row-wise naive_matrix_vector_dot.
a = np.array([[1, 2, 3], [4, 5, 6]])
b = np.array([6, 7, 8])
c = naive_matrix_vector_dot(a, b)
print(c)
print(1*6 + 2*7 + 3*8, 4*6 + 5*7 + 6*8)

[ 44. 107.]
44 107


In [25]:
def naive_matrix_dot(x, y):
    """Product of two rank-2 arrays, built from row-by-column ``naive_vector_dot`` calls.

    The number of columns of ``x`` must equal the number of rows of ``y``.
    The result is created with ``np.zeros`` and has shape
    ``(x.shape[0], y.shape[1])``.
    """
    assert len(x.shape) == 2
    assert len(y.shape) == 2
    assert x.shape[1] == y.shape[0]
    n_rows, n_cols = x.shape[0], y.shape[1]
    product = np.zeros((n_rows, n_cols))
    for i in range(n_rows):
        for j in range(n_cols):
            # Entry (i, j) is row i of x dotted with column j of y.
            product[i, j] = naive_vector_dot(x[i, :], y[:, j])
    return product

In [26]:
# Multiply a 2x3 matrix by a 3x2 matrix with naive_matrix_dot and compare
# against the four entries written out by hand.
a = np.array([[1, 2, 3], [4, 5, 6]])
b = np.array([[6, 7], [8, 9], [10, 11]])
c = naive_matrix_dot(a, b)
print(c)
print(1*6 + 2*8 + 3*10, 1*7 + 2*9 + 3*11, 4*6 + 5*8 + 6*10, 4*7 + 5*9 + 6*11)

[[ 52.  58.]
 [124. 139.]]
52 58 124 139


### Tensor reshaping

In [28]:
from tensorflow.keras.datasets import mnist
# Load MNIST as separate (images, labels) pairs for the training and test sets.
(train_images, train_labels), (test_images, test_labels) = mnist.load_data()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz
[1m11490434/11490434[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [29]:
# Reshape the training images into a 2-D array of 60000 rows and 28 * 28 columns.
train_images = train_images.reshape((60000, 28 * 28))

In [30]:
# A 3x2 float matrix used for the reshape example in the next cell.
x = np.array([[0., 1.],
             [2., 3.],
             [4., 5.]])
x.shape

(3, 2)

In [31]:
# Reshape x into a single column of 6 entries; it still holds the same six values.
x = x.reshape((6, 1))
x

array([[0.],
       [1.],
       [2.],
       [3.],
       [4.],
       [5.]])

In [32]:
# Transposing a (300, 20) matrix swaps its two axes, giving shape (20, 300).
x = np.zeros((300, 20))
x = np.transpose(x)
x.shape

(20, 300)

### Geometric interpretation of tensor operations

### A geometric interpretation of deep learning

## The engine of neural networks: gradient-based optimization

### What's a derivative?

### Derivative of a tensor operation: the gradient

### Stochastic gradient descent

### Chaining derivatives: The Backpropagation algorithm

#### The chain rule

#### Automatic differentiation with computation graphs

#### The gradient tape in TensorFlow

In [33]:
import tensorflow as tf
# Record y = 2x + 3 on a gradient tape for a scalar variable x,
# then ask the tape for the gradient of y with respect to x.
x = tf.Variable(0.)
with tf.GradientTape() as tape:
    y = 2 * x + 3
grad_of_y_wrt_x = tape.gradient(y, x)
grad_of_y_wrt_x

<tf.Tensor: shape=(), dtype=float32, numpy=2.0>

In [34]:
# Same computation with a 2x2 variable; the displayed gradient has the same shape as x.
x = tf.Variable(tf.random.uniform((2, 2)))
with tf.GradientTape() as tape:
    y = 2 * x + 3
grad_of_y_wrt_x = tape.gradient(y, x)
grad_of_y_wrt_x

<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
array([[2., 2.],
       [2., 2.]], dtype=float32)>

In [35]:
# W and b are variables, while x is a plain random tensor used as input.
# Passing [W, b] to tape.gradient returns a list with one gradient per entry, in the same order.
W = tf.Variable(tf.random.uniform((2, 2)))
b = tf.Variable(tf.zeros((2,)))
x = tf.random.uniform((2, 2))
with tf.GradientTape() as tape:
    y = tf.matmul(x, W) + b
grad_of_y_wrt_W_and_b = tape.gradient(y, [W, b])
grad_of_y_wrt_W_and_b

[<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
 array([[0.384385 , 0.384385 ],
        [1.4284033, 1.4284033]], dtype=float32)>,
 <tf.Tensor: shape=(2,), dtype=float32, numpy=array([2., 2.], dtype=float32)>]

In [36]:
# Find derivatives of f(x, y) = x^2*y + x*y + 3*y with respect to x and y.
# Analytically df/dx = 2*x*y + y and df/dy = x^2 + x + 3, i.e. 5 and 9 at (x, y) = (2, 1).
x = tf.Variable(2.)
y = tf.Variable(1.)
with tf.GradientTape() as tape:
  f = pow(x, 2)*y + x*y + 3*y
grad_of_f_wrt_x_and_y = tape.gradient(f, [x, y])
grad_of_f_wrt_x_and_y

[<tf.Tensor: shape=(), dtype=float32, numpy=5.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=9.0>]

In [37]:
# Fixed (1, 3) input x = [1, 4, 3], a random (3, 2) weight matrix W and a zero bias b of length 2.
# The gradient of y = xW + b is requested for both W and b.
x = tf.constant(np.array([1.,4.,3.]).reshape(1,3),dtype=tf.float32)
W = tf.Variable(tf.random.uniform((3,2)),dtype=tf.float32)
b = tf.Variable(tf.zeros((2,)),dtype=tf.float32)
with tf.GradientTape() as tape:
  y = tf.matmul(x, W) + b
grad_of_y_wrt_W_and_b = tape.gradient(y, [W, b])
grad_of_y_wrt_W_and_b

[<tf.Tensor: shape=(3, 2), dtype=float32, numpy=
 array([[1., 1.],
        [4., 4.],
        [3., 3.]], dtype=float32)>,
 <tf.Tensor: shape=(2,), dtype=float32, numpy=array([1., 1.], dtype=float32)>]

In [38]:
# Reuses x, W and b from the previous cell (run that cell first).
# Here the affine output is cubed element-wise before the gradient is taken.
with tf.GradientTape() as tape:
  y = pow((tf.matmul(x, W) + b), 3)
grad_of_y_wrt_W_and_b = tape.gradient(y, [W, b])
grad_of_y_wrt_W_and_b

[<tf.Tensor: shape=(3, 2), dtype=float32, numpy=
 array([[ 23.65624 ,  35.195885],
        [ 94.62496 , 140.78354 ],
        [ 70.96872 , 105.587654]], dtype=float32)>,
 <tf.Tensor: shape=(2,), dtype=float32, numpy=array([23.65624 , 35.195885], dtype=float32)>]

## Looking back at our first example

In [39]:
# Reload MNIST so this cell starts from the raw arrays, then for both splits:
# reshape to (num_images, 28 * 28), cast to float32 and divide by 255.
(train_images, train_labels), (test_images, test_labels) = mnist.load_data()
train_images = train_images.reshape((60000, 28 * 28))
train_images = train_images.astype("float32") / 255
test_images = test_images.reshape((10000, 28 * 28))
test_images = test_images.astype("float32") / 255

In [41]:
from tensorflow import keras
from tensorflow.keras import layers
# Two-layer classifier: a 512-unit ReLU layer followed by a 10-unit softmax output layer.
model = keras.Sequential([
    layers.Dense(512, activation="relu"),
    layers.Dense(10, activation="softmax")
])

In [42]:
# Configure training: RMSprop optimizer, sparse categorical cross-entropy loss
# (the labels are passed to fit as-is, without one-hot encoding), and accuracy as the reported metric.
model.compile(optimizer="rmsprop",
              loss="sparse_categorical_crossentropy",
              metrics=["accuracy"])

In [43]:
# Train for 5 epochs in mini-batches of 128 samples.
model.fit(train_images, train_labels, epochs=5, batch_size=128)

Epoch 1/5
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.8713 - loss: 0.4440
Epoch 2/5
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9651 - loss: 0.1214
Epoch 3/5
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9783 - loss: 0.0731
Epoch 4/5
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9840 - loss: 0.0517
Epoch 5/5
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9887 - loss: 0.0389


<keras.src.callbacks.history.History at 0x7e779d794650>

### Reimplementing our first example from scratch in TensorFlow

#### A simple Dense class

In [44]:
import tensorflow as tf

class NaiveDense:
    """Minimal dense layer computing ``activation(inputs @ W + b)``.

    ``W`` is a ``tf.Variable`` of shape ``(input_size, output_size)``
    initialised from ``tf.random.uniform`` with ``minval=0`` and
    ``maxval=1e-1``; ``b`` is a ``tf.Variable`` of shape ``(output_size,)``
    initialised to zeros.
    """

    def __init__(self, input_size, output_size, activation):
        self.activation = activation
        # Kernel: small non-negative random starting values.
        self.W = tf.Variable(
            tf.random.uniform((input_size, output_size), minval=0, maxval=1e-1))
        # Bias: one zero per output unit.
        self.b = tf.Variable(tf.zeros((output_size,)))

    def __call__(self, inputs):
        """Apply the affine transform to ``inputs``, then the activation function."""
        pre_activation = tf.matmul(inputs, self.W) + self.b
        return self.activation(pre_activation)

    @property
    def weights(self):
        """The layer's variables as a list: kernel first, then bias."""
        return [self.W, self.b]

#### A simple Sequential class

In [45]:
class NaiveSequential:
    """A stack of layers applied one after another."""

    def __init__(self, layers):
        self.layers = layers

    def __call__(self, inputs):
        """Feed ``inputs`` through every layer in order and return the last output."""
        output = inputs
        for layer in self.layers:
            output = layer(output)
        return output

    @property
    def weights(self):
        """Flat list of every layer's weights, in layer order."""
        return [weight for layer in self.layers for weight in layer.weights]

In [46]:
# Rebuild the classifier from the hand-written classes; this rebinds `model`,
# replacing the Keras model created earlier in the notebook.
model = NaiveSequential([
    NaiveDense(input_size=28 * 28, output_size=512, activation=tf.nn.relu),
    NaiveDense(input_size=512, output_size=10, activation=tf.nn.softmax)
])
# Two layers with a kernel and a bias each.
assert len(model.weights) == 4

#### A batch generator

In [47]:
import math

class BatchGenerator:
    """Hands out consecutive fixed-size slices of a paired (images, labels) dataset.

    Samples are served in their stored order (no shuffling). ``num_batches``
    is ``ceil(len(images) / batch_size)``, so the final batch may be shorter
    than ``batch_size``.
    """

    def __init__(self, images, labels, batch_size=128):
        assert len(images) == len(labels)
        self.images = images
        self.labels = labels
        self.batch_size = batch_size
        self.index = 0  # start offset of the next batch
        self.num_batches = math.ceil(len(images) / batch_size)

    def next(self):
        """Return the next ``(images, labels)`` slice and advance the cursor by ``batch_size``."""
        start = self.index
        stop = start + self.batch_size
        batch = (self.images[start:stop], self.labels[start:stop])
        self.index = stop
        return batch

### Running one training step

In [48]:
def one_training_step(model, images_batch, labels_batch):
    """Run one forward/backward pass on a batch and update the model's weights.

    The forward pass and loss are recorded on a gradient tape; the gradients
    of the mean loss with respect to ``model.weights`` are then handed to
    ``update_weights``. Returns the mean loss of the batch.
    """
    with tf.GradientTape() as tape:
        outputs = model(images_batch)
        loss_per_sample = tf.keras.losses.sparse_categorical_crossentropy(
            labels_batch, outputs)
        mean_loss = tf.reduce_mean(loss_per_sample)
    # Differentiate outside the `with` block, once recording has finished.
    grads = tape.gradient(mean_loss, model.weights)
    update_weights(grads, model.weights)
    return mean_loss

In [49]:
learning_rate = 1e-3

def update_weights(gradients, weights):
    """Plain gradient-descent step: subtract ``gradient * learning_rate`` from each weight in place.

    ``gradients`` and ``weights`` are paired positionally. A later cell
    redefines ``update_weights`` to delegate to a Keras optimizer instead.
    """
    for gradient, weight in zip(gradients, weights):
        step = gradient * learning_rate
        weight.assign_sub(step)

In [50]:
from tensorflow.keras import optimizers

optimizer = optimizers.RMSprop()

def update_weights(gradients, weights):
    """Apply one step of the module-level RMSprop ``optimizer``.

    This redefinition replaces the manual ``learning_rate``-based
    ``update_weights`` from the previous cell.
    """
    grads_and_vars = zip(gradients, weights)
    optimizer.apply_gradients(grads_and_vars)

### The full training loop

In [51]:
def fit(model, images, labels, epochs, batch_size=128):
    """Train ``model`` for ``epochs`` full passes over the data in mini-batches.

    Args:
        model: Object accepted by ``one_training_step``.
        images: Training inputs; must have the same length as ``labels``.
        labels: Training targets.
        epochs: Number of passes over the whole dataset.
        batch_size: Number of samples per training step. It is forwarded to
            ``BatchGenerator`` so the caller's value is actually used.

    Prints the epoch number at the start of each epoch and the loss of
    every 100th batch.
    """
    for epoch_counter in range(epochs):
        print(f"Epoch {epoch_counter+1}")
        # A fresh generator per epoch restarts iteration at the first sample.
        batch_generator = BatchGenerator(images, labels, batch_size=batch_size)
        for batch_counter in range(batch_generator.num_batches):
            images_batch, labels_batch = batch_generator.next()
            loss = one_training_step(model, images_batch, labels_batch)
            if batch_counter % 100 == 0:
                print(f"loss at batch {batch_counter}: {loss:.2f}")

In [52]:
from tensorflow.keras.datasets import mnist
# Reload and preprocess MNIST (reshape to rows of 28 * 28 values, cast to float32, divide by 255).
(train_images, train_labels), (test_images, test_labels) = mnist.load_data()

train_images = train_images.reshape((60000, 28 * 28))
train_images = train_images.astype("float32") / 255
test_images = test_images.reshape((10000, 28 * 28))
test_images = test_images.astype("float32") / 255

# Train the hand-written model with the from-scratch training loop.
fit(model, train_images, train_labels, epochs=10, batch_size=128)

Epoch 1
loss at batch 0: 5.77
loss at batch 100: 0.69
loss at batch 200: 0.69
loss at batch 300: 0.35
loss at batch 400: 0.54
Epoch 2
loss at batch 0: 0.26
loss at batch 100: 0.44
loss at batch 200: 0.20
loss at batch 300: 0.32
loss at batch 400: 0.45
Epoch 3
loss at batch 0: 0.20
loss at batch 100: 0.21
loss at batch 200: 0.20
loss at batch 300: 0.27
loss at batch 400: 0.32
Epoch 4
loss at batch 0: 0.15
loss at batch 100: 0.13
loss at batch 200: 0.15
loss at batch 300: 0.16
loss at batch 400: 0.21
Epoch 5
loss at batch 0: 0.09
loss at batch 100: 0.10
loss at batch 200: 0.10
loss at batch 300: 0.11
loss at batch 400: 0.16
Epoch 6
loss at batch 0: 0.06
loss at batch 100: 0.07
loss at batch 200: 0.07
loss at batch 300: 0.09
loss at batch 400: 0.13
Epoch 7
loss at batch 0: 0.04
loss at batch 100: 0.05
loss at batch 200: 0.05
loss at batch 300: 0.08
loss at batch 400: 0.11
Epoch 8
loss at batch 0: 0.02
loss at batch 100: 0.04
loss at batch 200: 0.04
loss at batch 300: 0.07
loss at batch 40

### Evaluating the model

In [53]:
# Run the whole test set through the model in a single call and convert the result to NumPy.
predictions = model(test_images)
predictions = predictions.numpy()
# Predicted class for each sample is the index of the largest value along axis 1.
predicted_labels = np.argmax(predictions, axis=1)
# Accuracy is the fraction of predictions that equal the true labels.
matches = predicted_labels == test_labels
print(f"accuracy: {matches.mean():.2f}")

accuracy: 0.98


## Summary