# Optional Lab: Python, Numpy, Vectorization

## Goals
- Crash course for NumPy and vectorization

## Tools
- NumPy: a popular data science library
- time: a builtin python library to measure time

In [1]:
import numpy as np
import time

## NumPy
NumPy is a library that extends the base capabilities of Python with a richer set of data types, including more numeric types, vectors, matrices, and many matrix functions. NumPy and Python work together fairly seamlessly: Python operators work on NumPy data types, and NumPy functions can often accept Python built-in data types.

## Vectors
Vectors are ordered arrays of numbers of the same type. We use bold letters to denote a vector (e.g. $\mathbf{x}$). The dimension of a vector is the number of elements in the array, typically denoted by $n$. In NumPy, the basic data structure is an indexable, n-dimensional array containing elements of the same type; that type is reported by the array's `dtype` attribute. Note that the term "dimension" is overloaded: for a vector it means the number of elements, whereas for a NumPy array it means the number of indices (axes) needed to address an element. A vector can thus be represented as a 1-D array of shape `(n,)`, indexed from `0` to `n-1`.

In [11]:
# NumPy allocation routines: each call below creates a new array and fills it with values.

# A plain integer n is taken as the shape, giving a vector of dimension 4
a = np.zeros(4)
print(f"np.zeros(4): a = {a}, a shape = {a.shape}, a data type = {a.dtype}")

# The same shape can also be written as a one-element tuple
a = np.zeros((4,))
print(f"np.zeros((4,)): a = {a}, a shape = {a.shape}, a data type = {a.dtype}")

# Four random numbers between 0 and 1
a = np.random.random_sample(4)
print(f"np.random.random_sample(4): a = {a}, a shape = {a.shape}, a data type = {a.dtype}")

np.zeros(4): a = [0. 0. 0. 0.], a shape = (4,), a data type = float64
np.zeros((4,)): a = [0. 0. 0. 0.], a shape = (4,), a data type = float64
np.random.random_sample(4): a = [0.60307233 0.52077608 0.11420784 0.83325203], a shape = (4,), a data type = float64


In [20]:
# These functions do not take a shape tuple:
# np.arange takes a stop value, np.random.rand takes each dimension as its own argument.

# Fill the array with the 4 values 0., 1., 2., 3. -- like range(4) in Python.
# Passing the float 4. makes the resulting array float64.
a = np.arange(4.)
print(f"np.arange(4.): a = {a}, a shape = {a.shape}, a data type = {a.dtype}")

# Another way to fill a vector with 4 random numbers.
# The dimension argument must be an integer, so the printed label is rand(4), not rand(4.)
a = np.random.rand(4)
print(f"np.random.rand(4): a = {a}, a shape = {a.shape}, a data type = {a.dtype}")

np.arange(4.): a = [0. 1. 2. 3.], a shape = (4,), a data type = float64
np.random.rand(4.): a = [0.85634924 0.03584002 0.24521609 0.73932412], a shape = (4,), a data type = float64


In [22]:
# Arrays can also be built from values listed by hand.

# All-integer input produces an array with an integer dtype
a = np.array([5, 4, 3, 2])
print(f"np.array([5,4,3,2]): a = {a}, a shape = {a.shape}, a data type = {a.dtype}")

# A single float in the list (5.) makes the whole array float64
a = np.array([5., 4, 3, 2])
print(f"np.array([5.,4,3,2]): a = {a}, a shape = {a.shape}, a data type = {a.dtype}")

np.array([5,4,3,2]): a = [5 4 3 2], a shape = (4,), a data type = int64
np.array([5.,4,3,2]): a = [5. 4. 3. 2.], a shape = (4,), a data type = float64


## Vectors: Indexing
Recall that the index starts at 0, like in Python

In [33]:
# Indexing a 1-D array
a = np.arange(10)
print(a)

# A single index selects one element; the result has the empty shape ()
print(f"a[2].shape = {a[2].shape} a[2] = {a[2]}, Accessing an element returns a scalar")

# Negative indices count back from the end, so -1 is the last element
print(f"a[-1] = {a[-1]}")

# Valid indices run from 0 to 9 -- index 10 is out of range.
# The exception is caught so its message can be displayed.
try:
    c = a[10]
except Exception as err:
    print("The error message you will see is:")
    print(err)

[0 1 2 3 4 5 6 7 8 9]
a[2].shape = () a[2] = 2, Accessing an element returns a scalar
a[-1] = 9
The error message you will see is:
index 10 is out of bounds for axis 0 with size 10


## Vectors: Slicing
We can slice vectors to get subarrays, using `[start:stop:step]`

In [45]:
# Slicing a 1-D array with [start:stop:step]
a = np.arange(10)

# Five consecutive elements, index 2 through 6 (the stop index 7 is excluded)
c = a[2:7:1]
print("a[2:7:1] =", c)

# Same range with a step of 2: the three elements 2, 4, 6
c = a[2:7:2]
print("a[2:7:2] =", c)

# No stop given: every element from index 3 onward
c = a[3:]
print("a[3:] =", c)

# No start given: every element below index 3, i.e. up to index 2
c = a[:3]
print("a[:3] =", c)

# Neither start nor stop given: all elements
c = a[:]
print("a[:] = ", c)

a[2:7:1] = [2 3 4 5 6]
a[2:7:2] = [2 4 6]
a[3:] = [3 4 5 6 7 8 9]
a[:3] = [0 1 2]
a[:] =  [0 1 2 3 4 5 6 7 8 9]


## Single vector operations
These are operations on a single vector.

In [51]:
# Operations that act on one vector
a = np.array([1, 2, 3, 4])
print(f"a = {a}")

# Unary minus flips the sign of every element
b = -a
print(f"b = -a: {b}")

# np.sum adds up all elements and returns a scalar
b = np.sum(a)
print(f"b = np.sum(a): {b}")

# np.mean returns the average of the elements
b = np.mean(a)
print(f"b = np.mean(a): {b}")

# ** 2 squares each element individually
b = a ** 2
print(f"b = a ** 2: {b}")

a = [1 2 3 4]
b = -a: [-1 -2 -3 -4]
b = np.sum(a): 10
b = np.mean(a): 2.5
b = a ** 2: [ 1  4  9 16]


## Vector-vector element wise operations
Arithmetic, logical, and comparison operators between two vectors work on an element by element basis.

In [54]:
# Two vectors of equal length; + adds them position by position
a = np.array([1, 2, 3, 4])
b = np.array([-1, -2, 3, 4])

print(f"Binary operators work element wise: {a+b}")

Binary operators work element wise: [0 0 6 8]


Make sure the vectors are of the same size; otherwise an exception may occur.

In [55]:
# Adding vectors of different sizes: c has 2 elements, so a + c fails.
# The exception is caught so its message can be displayed.
c = np.array([1, 2])
try:
    d = a + c
except Exception as err:
    print("The error message you'll see:")
    print(err)

The error message you'll see:
operands could not be broadcast together with shapes (4,) (2,) 


Vectors can be *scaled*, by numbers or scalars.

In [57]:
# Vector to be scaled
a = np.array([1, 2, 3, 4])

# Multiplying by a scalar scales every element
b = 5 * a
print(f"b = 5 * a: {b}")

b = 5 * a: [ 5 10 15 20]


## Vector-vector dot product
We'll implement the dot product in a for loop, and using vectorization, then we'll compare both approaches.

In [63]:
def my_dot(a, b):
    """
    Dot product of two vectors, accumulated one element pair at a time
    with an explicit Python loop (no vectorization).

    Args:
        a (ndarray (n,)): first input vector; its length sets the loop count
        b (ndarray (n,)): second input vector, same dimension as a
    Returns:
        x (scalar): sum of a[i] * b[i] over all indices i of a
    """
    total = 0
    # Walk the indices of a and add each pairwise product to the running sum
    for idx in range(a.shape[0]):
        total = total + a[idx] * b[idx]
    return total

In [66]:
# Try the loop-based my_dot on a small example
a, b = np.array([1, 2, 3, 4]), np.array([-1, 4, 3, 2])
print(f"my_dot(a,b) = {my_dot(a,b)}")

my_dot(a,b) = 24


In [70]:
# Test with np.dot
a = np.array([1, 2, 3, 4])
b = np.array([-1, 4, 3, 2])

# np.dot of two 1-D vectors returns a scalar, so the shape is the empty tuple ()
c = np.dot(a,b)
print(f"np.dot(a,b) = {c}, np.dot(a,b).shape = {c.shape}")

# The dot product is commutative (a.b = b.a); the label now names the call made here
c = np.dot(b,a)
print(f"np.dot(b,a) = {c}, np.dot(b,a).shape = {c.shape}")

np.dot(a,b) = 24, np.dot(a,b).shape = ()
np.dot(b,a) = 24, np.dot(a,b).shape = ()


In [85]:
np.random.seed(1) # Get reproducible random numbers
# Large arrays: 100 million float64 values each (roughly 800 MB per array)
a = np.random.rand(100000000)
b = np.random.rand(100000000)

# time.perf_counter() is the clock intended for measuring short intervals;
# the difference between two readings is in seconds.
tic = time.perf_counter() # start time
c = np.dot(a,b)
toc = time.perf_counter() # end time

print(f"np.dot(a,b) = {c:.4f}")
print(f"Vectorized version duration: {toc - tic:.4f} s")

tic = time.perf_counter() # start time
c = my_dot(a,b)
toc = time.perf_counter() # end time

print(f"my_dot(a,b) = {c:.4f}")
print(f"loop version duration: {toc - tic:.4f} s")

# Drop the references to the big arrays so their memory can be reclaimed
del a, b

np.dot(a,b) = 25000669.8073
Vectorized version duration: 0.0675 s
my_dot(a,b) = 25000669.8073
loop version duration: 13.3759 s


A common example of vectorization would be calculating the prediction for a training example from X_train.

In [97]:
# Three training examples with two features each
X_train = np.array([[1, 2],
                    [3, 4],
                    [5, 6]])
# Model parameters: one weight per feature, plus a bias
w = np.array([2, 4])
b = 5

# Prediction for the second training example (row index 1)
f_wb = np.dot(w, X_train[1]) + b

print(f"X_train[1] has shape {X_train[1].shape}")
print(f"w has shape {w.shape}")
print(f"f_wb has shape {f_wb.shape}")

X_train[1] has shape (2,)
w has shape (2,)
f_wb has shape ()


## Matrices
Matrices are two-dimensional arrays whose elements all have the same type. They are denoted by capital bold letters such as $\mathbf{X}$. We denote by $m$ the number of rows (in machine learning, these may refer to the training examples) and by $n$ the number of columns (these may refer to the features). In NumPy, matrix elements are accessed with a 2-D index `[m,n]` (row first, then column), and both indices are zero-based.

In [101]:
# The allocation functions used for 1-D vectors also build matrices (or any ndarray)
# when they are given a shape tuple.

# One row, five columns
a = np.zeros((1, 5))
print(f"a shape = {a.shape}, a = {a}")

# Two rows, one column
a = np.zeros((2, 1))
print(f"a shape = {a.shape}, a = {a}")

# A 1 x 1 matrix holding a single random number
a = np.random.random_sample((1, 1))
print(f"a shape = {a.shape}, a = {a}")

a shape = (1, 5), a = [[0. 0. 0. 0. 0.]]
a shape = (2, 1), a = [[0.]
 [0.]]
a shape = (1, 1), a = [[0.11907518]]


In [103]:
# A 2-D array can also be written out by hand as a list of rows
a = np.array([[5], [4], [3]])
print(f"a shape = {a.shape}, a = {a}")

# The same matrix, laid out one row per line for readability
a = np.array([
    [5],
    [4],
    [3],
])
print(f"a shape = {a.shape}, a = {a}")

a shape = (3, 1), a = [[5]
 [4]
 [3]]
a shape = (3, 1), a = [[5]
 [4]
 [3]]


## Matrices: Indexing

In [114]:
# Indexing a 2-D array

# reshape is a convenient way to create matrices: reshape(m, n) with n = 2 columns.
# Passing -1 for the row count lets NumPy work it out from the number of elements
# (6 elements / 2 columns = 3 rows).
a = np.arange(6).reshape(-1, 2)
print(f"a.shape = {a.shape}, \na={a}")

# A [row, column] pair selects a single element
print(f"a[2,0].shape = {a[2,0].shape}, a[2,0] = {a[2,0]}, type(a[2, 0]) = {type(a[2,0])}, Accessing an element returns a scalar")

# A single index selects a whole row, returned as a 1-D array
print(f"a[2].shape = {a[2].shape}, a[2] = {a[2]}, type(a[2]) = {type(a[2])}")

a.shape = (3, 2), 
a=[[0 1]
 [2 3]
 [4 5]]
a[2,0].shape = (), a[2,0] = 4, type(a[2, 0]) = <class 'numpy.int64'>, Accessing an element returns a scalar
a[2].shape = (2,), a[2] = [4 5], type(a[2]) = <class 'numpy.ndarray'>


## Matrices slicing
We can use slicing to get a subset of the 2d array, using `[start:stop:step,start:stop:step]`.

In [127]:
# Slicing a 2-D array with [start:stop:step, start:stop:step]

# 20 values arranged in rows of 10 columns, i.e. a 2 x 10 matrix
a = np.arange(20).reshape(-1, 10)
print(f"a=\n{a}")

# Five consecutive elements (columns 2 through 6) of row 0
print(f"a[0, 2:7:1] = {a[0, 2:7:1]}, a[0,2:7:1].shape = {a[0, 2:7:1].shape}, a 1D array")

# The same five columns taken from every row
print(f"a[:, 2:7:1] = {a[:, 2:7:1]}, a[:,2:7:1].shape = {a[:, 2:7:1].shape}, a 2D array")

# Every row and every column: all elements
print(f"a[:, :] = {a[:, :]}, a[:,:].shape = {a[:, :].shape}, a 2D array")

# All elements of a single row (very common usage) ...
print(f"a[1, :] = {a[1, :]}, a[1,:].shape = {a[1, :].shape}, a 1D array")

# ... which is the same as indexing with the row number alone
print(f"a[1] = {a[1]}, a[1].shape = {a[1].shape}, a 1D array")

a=
[[ 0  1  2  3  4  5  6  7  8  9]
 [10 11 12 13 14 15 16 17 18 19]]
a[0, 2:7:1] = [2 3 4 5 6], a[0,2:7:1].shape = (5,), a 1D array
a[:, 2:7:1] = [[ 2  3  4  5  6]
 [12 13 14 15 16]], a[:,2:7:1].shape = (2, 5), a 2D array
a[:, :] = [[ 0  1  2  3  4  5  6  7  8  9]
 [10 11 12 13 14 15 16 17 18 19]], a[:,:].shape = (2, 10), a 2D array
a[1, :] = [10 11 12 13 14 15 16 17 18 19], a[1,:].shape = (10,), a 1D array
a[1] = [10 11 12 13 14 15 16 17 18 19], a[1].shape = (10,), a 1D array


## End!