# 3 | Arrays and Vectorization (NumPy)

   > **`numpy`** (NumPy) is the core array-computing and linear algebra library for Python. Almost all of the libraries in the PyData ecosystem rely on NumPy as one of their main building blocks. NumPy is also very fast, because its array operations run in compiled C code rather than in the Python interpreter.

Some basic topics we cover here include:
- Array indexing, slicing
- Boolean array
- Array operations 

*Resources*:
- **Coursera Python for Informatics** (University of Michigan)
- **Coursera Applied Data Science with Python** (University of Michigan)
- **Python for Data Analysis** - Wes McKinney (O'Reilly)

In [43]:
# import numpy as a library
import numpy as np

In [44]:
# Draw a 2x3 array of samples from the standard normal distribution
data = np.random.randn(2, 3)
data

array([[-1.75542644, -0.20876584,  0.21371833],
       [ 0.60604735,  0.02245854,  2.60988812]])

In [45]:
# shape is a tuple: (number of rows, number of columns)
data.shape

(2, 3)

In [47]:
# save a single array to a binary .npy file, then load it back
np.save('array_archive.npy', data)
arch = np.load('array_archive.npy')
arch

array([[-1.75542644, -0.20876584,  0.21371833],
       [ 0.60604735,  0.02245854,  2.60988812]])

In [48]:
# save to an .npz archive; np.load on an .npz file returns an NpzFile
# object (see the repr below), not the array itself. Arrays passed
# positionally to np.savez are stored under the keys arr_0, arr_1, ...,
# so the array saved here is retrieved with arch['arr_0']
np.savez('array_archive.npz', data)
arch = np.load('array_archive.npz')
arch

<numpy.lib.npyio.NpzFile at 0x7f2cf252dd68>

### Array Indexing and Slicing

In [106]:
# compare the time needed to double one million values, 10 times over:
# a vectorized NumPy multiply vs. a list comprehension over a Python list
my_arr = np.arange(1000000)
my_list = list(range(1000000))

%time for _ in range(10): my_arr2 = my_arr * 2
%time for _ in range(10): my_list2 = [x * 2 for x in my_list]

CPU times: user 16.7 ms, sys: 0 ns, total: 16.7 ms
Wall time: 16.7 ms
CPU times: user 621 ms, sys: 79 ms, total: 700 ms
Wall time: 699 ms


In [62]:
# array indexing and slicing
arr = np.arange(10)
# print the element at index 5 (indices start at 0)
print(arr[5])
# print the slice from index 5 up to, but not including, index 8
# (the result is an array, not a list)
print(arr[5:8])

5
[5 6 7]


In [63]:
# select the first two elements (indices 0 and 1)
arr[:2]

array([0, 1])

In [66]:
# select the last two elements (negative indices count from the end)
arr[-2:]

array([8, 9])

In [64]:
# select everything from index 2 onward (i.e. skip the first two elements)
arr[2:]

array([2, 3, 4, 5, 6, 7, 8, 9])

In [86]:
# assign 64 to every element in place (the scalar is broadcast over the whole slice)
arr[:] = 64
arr

array([64, 64, 64, 64, 64, 64, 64, 64, 64, 64])

In [87]:
# create an uninitialized 8x4 array, then fill each row with its own row index
arr = np.empty((8, 4))
for i in range(8):
    # assigning a scalar to arr[i] sets every column of row i
    arr[i] = i
arr

array([[0., 0., 0., 0.],
       [1., 1., 1., 1.],
       [2., 2., 2., 2.],
       [3., 3., 3., 3.],
       [4., 4., 4., 4.],
       [5., 5., 5., 5.],
       [6., 6., 6., 6.],
       [7., 7., 7., 7.]])

In [77]:
# fancy indexing: select rows 4, 3, 0 and 6, in that order
arr[[4, 3, 0, 6]]

array([[4., 4., 4., 4.],
       [3., 3., 3., 3.],
       [0., 0., 0., 0.],
       [6., 6., 6., 6.]])

In [99]:
# create the integers 0..31 and reshape them to 8 rows, 4 cols
arr = np.arange(32).reshape((8, 4))
arr

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15],
       [16, 17, 18, 19],
       [20, 21, 22, 23],
       [24, 25, 26, 27],
       [28, 29, 30, 31]])

In [98]:
# two index lists are paired element-wise: this picks the four single
# elements at (1, 0), (5, 3), (7, 1) and (2, 2). Its value is not
# displayed because it is not the last expression in the cell
arr[[1, 5, 7, 2], [0, 3, 1, 2]]
# select rows 1, 5, 7, 2, then reorder the columns of that result to 0, 3, 1, 2
arr[[1, 5, 7, 2]][:, [0, 3, 1, 2]]

array([[ 4,  7,  5,  6],
       [20, 23, 21, 22],
       [28, 31, 29, 30],
       [ 8, 11,  9, 10]])

In [92]:
# transpose: rows become columns and columns become rows
arr.T

array([[ 0,  4,  8, 12, 16, 20, 24, 28],
       [ 1,  5,  9, 13, 17, 21, 25, 29],
       [ 2,  6, 10, 14, 18, 22, 26, 30],
       [ 3,  7, 11, 15, 19, 23, 27, 31]])

In [102]:
# arithmetic between equal-shaped arrays is applied element by element
arr = np.array([[1., 2., 3.], [4., 5., 6.]])
arr * arr

array([[ 1.,  4.,  9.],
       [16., 25., 36.]])

In [109]:
# element-wise exponential and square root; a cell displays only the value
# of its last expression, so np.sqrt is placed last to match the output
# shown below
np.exp(arr)
np.sqrt(arr)

array([[1.        , 1.41421356, 1.73205081],
       [2.        , 2.23606798, 2.44948974]])

In [115]:
# np.where(cond, x, y) takes x where cond is True and y elsewhere;
# the result is a new array, arr itself is not modified
np.where(arr > 3, 2, arr) # set values over 3 to 2

array([[1., 2., 3.],
       [2., 2., 2.]])

### Sorting

In [None]:
# get the unique values of each array; only the value of the last
# expression (np.unique(ints)) is displayed when the cell runs
names = np.array(['Bob', 'Joe', 'Will', 'Bob', 'Will', 'Joe', 'Joe'])
np.unique(names)
ints = np.array([3, 3, 3, 2, 2, 1, 1, 4, 4])
np.unique(ints)

In [145]:
# sort a 1-D array in place, in ascending order
arr = np.random.randn(6)
arr.sort()
arr

array([-2.57375558, -0.72978227, -0.42859617, -0.26789914, -0.24723564,
        0.22196228])

In [148]:
# sort in place along axis 1, i.e. sort the values within each row
arr = np.random.randn(5, 3)
arr.sort(1)
arr

array([[-1.66091765,  0.16869059,  0.96102335],
       [-1.43384897, -1.07503402,  0.37440258],
       [-1.23793702, -0.4938382 ,  1.1474374 ],
       [-1.88190473, -1.13518833,  1.28889036],
       [-1.4368096 ,  0.76533697,  0.85441244]])

In [None]:
# approximate the 5% quantile: sort in place, then take the element
# located 5% of the way into the sorted array
large_arr = np.random.randn(1000)
large_arr.sort()
large_arr[int(0.05 * len(large_arr))] 

In [151]:
# ravel and flatten both return the values as a single 1-D array;
# flatten always returns a copy, ravel returns a view when it can.
# A 5x3 array of 0..14 is used so the row-by-row order is easy to see
arr = np.arange(15).reshape((5, 3))
arr.ravel()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

In [154]:
# concatenate arrays: axis=0 stacks arr2 below arr1 (4x3 result),
# axis=1 places arr2 to the right of arr1 (2x6 result); only the
# value of the last expression is displayed
arr1 = np.array([[1, 2, 3], [4, 5, 6]])
arr2 = np.array([[7, 8, 9], [10, 11, 12]])
np.concatenate([arr1, arr2], axis=0)
np.concatenate([arr1, arr2], axis=1)

array([[ 1,  2,  3,  7,  8,  9],
       [ 4,  5,  6, 10, 11, 12]])

### Broadcasting

In [127]:
# 4x3 array of samples from the standard normal distribution
arr = np.random.randn(4, 3)
arr

array([[-7.05451575e-01,  2.18422321e-01, -1.90948873e+00],
       [ 4.88405563e-01,  7.99977613e-01, -1.01415202e+00],
       [ 9.39938707e-01,  7.66537970e-01, -9.58758828e-01],
       [-1.57136231e-03, -1.85162188e-01,  5.37654430e-01]])

In [138]:
# mean of all elements (no axis given, so the result is a single number)
arr.mean()

-0.08530400836595321

In [122]:
# mean of each column: axis=0 averages down the rows, giving one value per column
arr.mean(0)

array([ 0.18033033,  0.39994393, -0.83618629])

In [139]:
# mean of each row: axis=1 averages across the columns, giving one value per row
arr.mean(1)

array([-0.79883933,  0.09141039,  0.24923928,  0.11697363])

In [117]:
# fill arr by broadcasting: col[:, np.newaxis] turns the 4-element vector
# into a 4x1 column, so every column of row i receives col[i]
col = np.array([1.28, -0.42, 0.44, 1.6])
arr[:] = col[:, np.newaxis]
arr

array([[ 1.28,  1.28,  1.28],
       [-0.42, -0.42, -0.42],
       [ 0.44,  0.44,  0.44],
       [ 1.6 ,  1.6 ,  1.6 ]])

In [118]:
# the 2x1 nested list is broadcast across the columns of the first two rows:
# row 0 becomes all -1.37 and row 1 becomes all 0.509
arr[:2] = [[-1.37], [0.509]]
arr

array([[-1.37 , -1.37 , -1.37 ],
       [ 0.509,  0.509,  0.509],
       [ 0.44 ,  0.44 ,  0.44 ],
       [ 1.6  ,  1.6  ,  1.6  ]])