# Using vectorization in Python

This notebook demonstrates how to use vectorization in Python as well as comparisons of using vectorization and using for-loops.

In [1]:
# import libraries
import numpy as np
import time

HOW_LONG_TO_RUN = 100

a = np.random.rand(1000000)
b = np.random.rand(1000000)

# Vectorized vs. For Loop

In [3]:
tic = time.time()
c = 0
for j in range(len(a)):
    c += a[j] * b[j]
toc = time.time()
print(c)
print("Loop version: " + str(1000*(toc - tic)) + " ms")

250047.14194216448
Loop version: 553.812026978 ms


In [4]:
tic = time.time()
c = np.dot(a, b)
toc = time.time()
print(c)
print("Loop version: " + str(1000*(toc - tic)) + " ms")

250047.14194217467
Loop version: 9.2921257019 ms


# More Vectorization Examples

In [45]:
def vectorized(a, b, nb_times):
    m = []
    for i in xrange(nb_times):
        tic = time.time()
        c = np.dot(a, b)
        toc = time.time()
        #print("Vectorized version: " + str(1000*(toc - tic)) + " ms")
        m.append(1000*(toc - tic))

    mean_m = (np.array(m).sum())/float(nb_times)
    return mean_m, c

In [46]:
mean_m, c = vectorized(a, b, HOW_LONG_TO_RUN)
print("Mean of "+ str(HOW_LONG_TO_RUN) +" runnings as vectorized version: "+ str(mean_m) +" ms")
print("Value: "+ str(c))

Mean of 100 runnings as vectorized version: 0.910415649414 ms
Value: 250490.600103


In [47]:
def for_loop_zip(a, b, nb_times):
    m = []
    c = 0
    for i in xrange(nb_times):
        tic = time.time()
        for a_j, b_j in zip(a, b):
            c += a_j * b_j
        toc = time.time()
        #print("Zip version: " + str(1000*(toc - tic)) + " ms")
        m.append(1000*(toc - tic))

    mean_m = (np.array(m).sum())/float(nb_times)
    return mean_m, c

In [48]:
mean_m, c = for_loop_zip(a, b, HOW_LONG_TO_RUN)
print("Mean of "+ str(HOW_LONG_TO_RUN) +" runnings as for loop version: "+ str(mean_m) +" ms")
print("Value: "+ str(c))

Mean of 100 runnings as for loop version: 406.943519115 ms
Value: 25049060.0103


In [49]:
def for_loop(a, b, nb_times):
    m = []
    c = 0
    for i in xrange(nb_times):
        tic = time.time()
        for j in range(len(a)):
                c += a[j] * b[j]
        toc = time.time()
        #print("Loop version: " + str(1000*(toc - tic)) + " ms")
        m.append(1000*(toc - tic))

    mean_m = (np.array(m).sum())/float(nb_times)
    return mean_m, c

In [50]:
mean_m, c = for_loop(a, b, HOW_LONG_TO_RUN)
print("Mean of "+ str(HOW_LONG_TO_RUN) +" runnings as for loop version: "+ str(mean_m) +" ms")
print("Value: "+ str(c))

Mean of 100 runnings as for loop version: 349.924705029 ms
Value: 25049060.0103


### Other vectorizations in Python Numpy library

In [8]:
print('vector: {}'.format(a))
u = np.exp(a) # exponential
print('exponential: {}'.format(u))
v = np.log(a)
print('log: {}'.format(v))
w = np.abs(a)
print('abs: {}'.format(w))
x = np.maximum(a, b)
print('maximum: {}'.format(x))
y = a**2
print('square: {}'.format(y))

vector: [0.14923004 0.68556227 0.85590332 ... 0.6686893  0.52930779 0.32877505]
exponential: [1.16094003 1.98488756 2.35349938 ... 1.95167758 1.69775669 1.38926531]
log: [-1.90226624 -0.37751595 -0.15559785 ... -0.40243575 -0.63618519
 -1.1123815 ]
abs: [0.14923004 0.68556227 0.85590332 ... 0.6686893  0.52930779 0.32877505]
maximum: [0.24038818 0.68556227 0.85590332 ... 0.89155843 0.52930779 0.32877505]
square: [0.02226961 0.46999562 0.73257049 ... 0.44714538 0.28016673 0.10809303]


### Logistic Regression with vectors

Calculating for $m$ examples and $n = 2$ features <font color='red'>without</font> vectorization

$
J = 0;\ \ dw_1 = 0;\ \ dw_2 = 0;\ \ db = 0 \\
\\
\text{For}\ \ i=1\ \ \text{to}\ \ m \\
\hspace{15pt}z^{(i)} = w^Tx^{(i)} + b \\ 
\hspace{15pt}a^{(i)} = \sigma(z^{(i)}) \\
\hspace{15pt}J\ += - [y^{(i)}\ log(a^{(i)}) + (1 - y^{(i)})\ log(1 - a^{(i)})] \\
\hspace{15pt}dz^{(i)} = a^{(i)} - y^{(i)} \\
\hspace{15pt}\text{For}\ \ j=1 \ \text{to} \ \ n \\
\hspace{30pt}dw_j += x_j^{(i)}dz^{(i)} \\
\hspace{15pt}\text{End for} \\
\hspace{15pt}db += dz^{(i)} \\
\hspace{15pt}J\ /=\ m \\
\hspace{15pt}dw_1\ /=\ m \\
\hspace{15pt}dw_2\ /=\ m \\
\hspace{15pt}db\ /=\ m \\
\text{End for}
$

### Updating weights

$
w_1\ := w_1 - \alpha dw_1 \\
w_2\ := w_2 - \alpha dw_2 \\
b\ := b - \alpha db \\
$

The code in Python implementing this function is presented below:

In [103]:
# Initialize elements
def sigmoid(z):
    return 1./(1. + np.exp(-1.*z))

#hyperparameter
lr = 0.1
# input
x = np.array([
    [0.1, 0.1],
    [0.2, 0.2],
    [0.3, 0.3]
])
m = x.shape[0]
y = np.array([1, 2, 3])
# weights
w = np.array([0.2, 0.3])
n = w.shape[0]
dw = np.zeros((n, 1))
# bias
b = np.array([1.0])
db = np.zeros(b.shape)

z = np.zeros((m, 1))
dz = np.zeros((m, 1))
a = np.zeros((m, 1))

In [83]:
J, dw1, dw2, db = 0.0, 0.0, 0.0, 0.0
for i in range(x.shape[0]):
    z[i] = np.dot(w, x[i]) + b
    a[i] = sigmoid(z[i])
    J += -(y[i] * math.log(a[i]) + (1 - y[i]) * math.log(1 - a[i]))
    dz[i] = a[i] - y[i]
    for j in range(n):
        dw[j] += x[i][j] * dz[i]
    db += dz[i]
J /= m
for j in range(n):
    dw[j] /= m
db /= m
    
for j in range(n):
    w[j] = w[j] - lr * dw[j]
b = b - lr * db

print('Initial error: 0.0')
print('Error: {}'.format(J))
print('Initial weights: [0.2, 0.3]')
print('Updated weights: {}'.format(w))

Initial error: 0.0
Error: -0.845841870694
Initial weights: [0.2, 0.3]
Updated weights: [0.23160057 0.33160057]


# Vectorizing Logistic Regression

**Calculating for $m$ examples and $n = 2$ features <font color='red'>with</font> vectorization**

Consider that we have `m` training examples. In training using for loops, we make a prediction on the first example by computing $Z^{(1)}$. Then compute the activations $a^{(1)}$ in this first example. Next, we make a prediction on the second training example by computing $Z^{(2)}$ and then computing the activations $a^{(2)}$. Next, we make a prediction on the third example, by computing $Z^{(3)}$ and then computing the activations $a^{(3)}$, and so on. And you might need to do this `m` times if you have `m` training examples.

$$
\begin{matrix}
z^{(1)} = w^Tx^{(1)}+b & \ \ \ \ \ & z^{(2)} = w^Tx^{(2)}+b & \ \ \ \ \ & z^{(3)} = w^Tx^{(3)}+b \\ 
a^{(1)} = \sigma (z^{(1)}) & \ \ \ \ \ & a^{(2)} = \sigma (z^{(2)}) & \ \ \ \ \ & a^{(3)} = \sigma (z^{(3)}) \\
\end{matrix}
$$

Here, we define a matrix capital `X` to be your training inputs, stacked together in different columns as illustred below. This matrix is a ($n_x \times m$) matrix. So, this means that `X` is a $\mathbb{R}^{n_x,m}$ dimensional matrix. 

$
X = \begin{bmatrix}
| & | &  & |\\
x^{(1)} & x^{(2)} & \dots & x^{(m)} \\
| & | &  & |\\
\end{bmatrix} \hspace{15pt}\rightarrow\hspace{15pt} (n_x, m)\hspace{15pt} \mathbb{R}^{n_x,m}
$

To compute $Z^{(1)}$, $Z^{(2)}$, $Z^{(3)}$ and so on, all in one step, we construct a ($1\times M$) matrix (row vector) to compute them all at the same time. 

$
Z = [ z^{(1)}\ \ z^{(2)}\ \ \dots \ \ z^{(m)}] \\ 
Z = w^TX + 
\begin{bmatrix}
b_1 & b_2 & \ldots & b_m
\end{bmatrix} \hspace{15pt}\rightarrow\hspace{15pt} \text{where}\ \ \ b \rightarrow (1,m) \ \ \ \text{i.e.,} \ \ \  b \in \mathbb{R}^{1,m} \ \ \ \text{dimension} \\
Z = np.dot(w^T, X) + b \\
A = [ a^{(1)}\ \ a^{(2)}\ \ \dots\ \ a^{(m)}] = \sigma(Z)
$

Thus, we compute:

$
Z = \begin{bmatrix}
w_1 & w_2 & \ldots & w_n
\end{bmatrix}
\begin{bmatrix}
| & | &  & |\\
x^{(1)} & x^{(2)} & \dots & x^{(m)} \\
| & | &  & |\\
\end{bmatrix} + 
\begin{bmatrix}
b_1 & b_2 & \ldots & b_m
\end{bmatrix} \\
Z = \begin{matrix}
[w^Tx^{(1)}+b_1 & w^Tx^{(2)}+b_2 & ...  & w^Tx^{(m)}+b_m] & \hspace{15pt}\rightarrow\hspace{15pt} (1, m)\hspace{15pt} \mathbb{R}^{1,m}
\end{matrix}
$

In Python, we can perform this calculation using a single command as:

```python
Z = np.dot(w.T, X) + b
```

Where `b` can be a $\mathbb{R}^{1}$ with numpy library performing "broadcast" of the elements. Stacking elements $a^{(1)}$, $a^{(2)}$, ... $a^{(m)}$ into a single row vector, we obtain the vector `A` as:

$
A = \begin{bmatrix}
a^{(1)} & a^{(2)} & \ldots & a^{(m)}
\end{bmatrix} \hspace{15pt}\rightarrow\hspace{15pt} (1, m)\hspace{15pt} \mathbb{R}^{1,m}
$




# Vectorizing Logistic Regression's Gradient Output

So, for the gradient computation, we computed $dz^{(1)}$ for the first example, which is $a^{(1)} - y^1$, $dz^{(2)}$ which is equal to $a^{(2)} - y^2$ and so on for all `m` training examples.

$$
\begin{matrix}
dz^{(1)} = a^{(1)} - y^1 & \ \ \ \ \ & dz^{(2)} = a^{(2)} - y^2 & \ \ \ \ \ & dz^{(3)} = a^{(3)} - y^3 \\ 
\end{matrix}
$$

When vectorizing, we define a new variable `dZ` that contains $dz^{(1)}$, $dz^{(2)}$, up to $dz^{(m)}$ as illustred below. 

$$
dZ = \begin{bmatrix}
dz^{(1)} & dz^{(2)} & \ldots & dz^{(m)} \\ 
\end{bmatrix}
$$

Note that, all the `dz` variables are stacked horizontally, generating a ($1, m$) matrix or alternatively a $\mathbb{R}^{1,m}$ dimensional row vector. We know that `A` and `Y` are also matrices with variables stacked horizontally as:

$$
A = \begin{bmatrix}
a^{(1)} & a^{(2)} & \ldots & a^{(m)} \\ 
\end{bmatrix} \\
Y = \begin{bmatrix}
y^{(1)} & y^{(2)} & \ldots & y^{(m)} \\ 
\end{bmatrix}
$$

Based on these definitions, we can compute `dZ` as `A` minus `Y` because it's going to be equal to: 

$$
dZ = A - Y \\
dZ = \begin{bmatrix}
a^{(1)} - y^{(1)} & a^{(2)} - y^{(2)} & \ldots & a^{(m)} - y^{(m)} \\ 
\end{bmatrix}
$$

In order to compute `db`, we sum all elements of `dz` and divide by the number of elements `m` as:

$$
db = \frac{1}{m} \sum_{i=1}^{m} dz^{(i)}
$$

In a vectorized implementation using python, we can run:

```python
db = 1./m * np.sum(dZ)
```

Finally, for computing `dw` in non-vectorized version, we have:

$
dw = 0 \\
dw\ += x^{(1)} dz^{(1)} \\
dw\ += x^{(2)} dz^{(2)} \\
\ldots \\
dw\ += x^{(m)} dz^{(m)} \\
dw\ /= m
$

On the other hand, with a vectorized implementation, we have:

$$
dw = \frac{1}{m} X dZ^T
$$

Representing each matrix of the equation, we have:

$$
dw = \frac{1}{m} \begin{bmatrix}
| & | &  & |\\
x^{(1)} & x^{(2)} & \dots & x^{(m)} \\
| & | &  & |\\
\end{bmatrix} \begin{bmatrix}
dz^{(1)} \\
dz^{(2)} \\
\ldots \\
dz^{(m)}
\end{bmatrix} = \frac{1}{m} \begin{bmatrix}
x^{(1)} dz^{(1)} & x^{(2)} dz^{(2)} & \ldots & x^{(m)} dz^{(m)} \\
\end{bmatrix}
$$

which generates a ($1, m$) matrix. Finally, our algorithm to compute the derivatives becomes:

$$
\begin{matrix}
Z = w^{T}X + b & \text{# np.dot(w.T, X) + b} \\
A = \sigma(Z)  & \\
dZ = A - Y & \\
dw = \frac{1}{m} X dZ^{T} & \\
db = \frac{1}{m} \text{np.sum(dZ)} & \\
\end{matrix}
$$

And for update weights, we compute:

$$
w = w - \alpha dw \\
b = b - \alpha db \\
$$

Below, there is an example of the Python implementation of this vectorized version:

In [124]:
# Initialize elements
def sigmoid(Z):
    return 1./(1. + np.exp(-1.*Z))

#hyperparameter
lr = 0.1
nb_epochs = 10
# input (nx, m) matrix 3 features, 4 examples
X = np.array([
    [0.1, 0.1, 0.1, 0.1],
    [0.2, 0.2, 0.2, 0.2],
    [0.3, 0.3, 0.3, 0.3]
])
n, m = X.shape
Y = np.array([1, 2, 0, 2])

# weights (1, n) matrix
w = np.array([0.02, 0.03, 0.01])
dw = np.zeros((1, m))
# bias
b = np.array([1.0])
db = 0

Z = np.zeros((1, m))
dZ = np.zeros((1, m))
A = np.zeros((1, m))

In [125]:
for i in range(nb_epochs):
    Z = np.dot(w.T, X) + b
    A = sigmoid(Z)
    dZ = A - Y
    dw = 1./m * np.dot(X, dZ.T)
    db = 1./m * np.sum(dZ)
    J = 1./m * (- np.dot(Y, np.log(A)) + np.dot((1 - Y), np.log(1 - A)))
    
    w = w - lr*dw
    b = b - lr*db
    print("Error: {}".format(J))


Error: 0.718222810093
Error: 0.709879917577
Error: 0.702678351072
Error: 0.696521727895
Error: 0.691322332225
Error: 0.687000366625
Error: 0.683483249913
Error: 0.680704964154
Error: 0.678605451445
Error: 0.677130059769


# Broadcasting in Python



# Questions

1. Vectorization cannot be done without a GPU.<br>

&#9745; False<br>
&#9744; True

2. What are the dimensions of matrix `X` in this video?<br>

&#9744; ($m, n_x$)<br>
&#9745; ($n_x, m$)<br>
&#9744; ($m, $)<br>
&#9744; ($m, 1$)

3. How do you compute the derivative of `b` in one line of code in Python numpy?<br>

&#9745; `1 / m*(np.sum(dz))`<br>
&#9744; `1 - m(np.sum(dz))`<br>
&#9744; `m(np.sum(dz))`<br>
&#9744; `1 * m(np.sum(dz))`<br>

In [None]:
&#9744; 

In [96]:
x = np.array([
    [0.1, 0.1],
    [0.2, 0.2],
    [0.3, 0.3]
])
m = x.shape[0]
w = np.array([0.2,0.3])
b = [1.0, 1.0]
y = np.array([1, 2, 3])
z = np.zeros((m, 1))
a = np.zeros((m, 1))
dz = np.zeros((m, 1))
w = np.array([0.2, 0.3])
n = w.shape[0]
dw = np.zeros((n, 1)) 
b = np.array([1.0])
db = np.zeros(b.shape)

J = 0

for i in range(x.shape[0]):
    z[i] = np.dot(w, x[i]) + b
    a[i] = sigmoid(z[i])
    J += -(y[i] * math.log(a[i]) + (1 - y[i]) * math.log(1 - a[i]))
    dz[i] = a[i] - y[i]
    dw += dz[i] * x[i]
    db += dz[i]
db

ValueError: non-broadcastable output operand with shape (2,1) doesn't match the broadcast shape (2,2)

In [108]:
x.T

array([[0.1, 0.1],
       [0.2, 0.2],
       [0.3, 0.3]])

In [110]:
x.T[0]

array([0.1, 0.2, 0.3])

In [None]:
J, dw1, dw2, db = 0.0, 0.0, 0.0, 0.0
for i in range(x.shape[0]):
    z[i] = np.dot(x[i], w) + b
    a[i] = sigmoid(z[i])
    J += -(y[i] * math.log(a[i]) + (1 - y[i]) * math.log(1 - a[i]))
    dz[i] = a[i] - y[i]
    print x.shape
    print dz.shape
    dw += x.T[i] * dz[i]
    db += dz[i]
J /= m
dw /= m
db /= m
    
w = w - lr * dw
b = b - lr * db

print('Initial error: 0.0')
print('Error: {}'.format(J))
print('Initial weights: [0.2, 0.3]')
print('Updated weights: {}'.format(w))