For additional information, see Python Data Science Handbook chapter 2

In [1]:
import numpy as np

# Basic Math

In [2]:
x = np.arange(4)
print(x)

[0 1 2 3]


In [3]:
print(x + 5)

[5 6 7 8]


In [4]:
print(x - 5)

[-5 -4 -3 -2]


In [5]:
print(x * 2)

[0 2 4 6]


In [6]:
print(x / 2)

[0.  0.5 1.  1.5]


In [7]:
print(-x)

[ 0 -1 -2 -3]


In [8]:
print(x ** 2)

[0 1 4 9]


In [9]:
print(x % 2) # modulo: the remainder after dividing each element by 2

[0 1 0 1]


In [10]:
print(abs(x)) # abs

[0 1 2 3]


# Trig functions
Note that the functions are preceded by `np.` (for example, `np.sin`).

In [12]:
theta = np.linspace(0, np.pi, 5)
print(theta)

[0.         0.78539816 1.57079633 2.35619449 3.14159265]


In [13]:
print(np.sin(theta))

[0.00000000e+00 7.07106781e-01 1.00000000e+00 7.07106781e-01
 1.22464680e-16]


In [14]:
print(np.cos(theta))

[ 1.00000000e+00  7.07106781e-01  6.12323400e-17 -7.07106781e-01
 -1.00000000e+00]


In [15]:
print(np.tan(theta))

[ 0.00000000e+00  1.00000000e+00  1.63312394e+16 -1.00000000e+00
 -1.22464680e-16]


# Log and Exp

In [16]:
x = np.array([1, 10, 100])
print(np.log(x))   # natural log
print(np.log10(x)) # common log

[0.         2.30258509 4.60517019]
[0. 1. 2.]


In [17]:
y = np.arange(3)
print(np.exp(y))  # e^y

[1.         2.71828183 7.3890561 ]


In [18]:
print(np.exp2(y))  # 2^y

[1. 2. 4.]


In [19]:
print(np.power(3, y)) # 3^y: the base 3 raised to each element of y

[1 3 9]


# Aggregates

you can use `sum()`

or `np.sum()`

`np.sum()` is faster than the built-in `sum()`, but the two don't always behave the same way (for example, on multi-dimensional arrays)

In [20]:
x = np.arange(100)
print(sum(x))

4950


In [21]:
print(np.sum(x))

4950


In [22]:
big_array = np.random.rand(10000)
%timeit sum(big_array)
%timeit np.sum(big_array)  # the np version is much faster

1.13 ms ± 167 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
11.4 µs ± 2.21 µs per loop (mean ± std. dev. of 7 runs, 100000 loops each)


## min and max

In [23]:
print(min(big_array))
print(max(big_array))

7.310935831383869e-05
0.9998333341240515


In [24]:
print(np.min(big_array))
print(np.max(big_array))

7.310935831383869e-05
0.9998333341240515


In [25]:
%timeit min(big_array)
%timeit np.min(big_array)  # the np version is much faster

1.1 ms ± 235 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
9.63 µs ± 806 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


## summaries for matrices

In [26]:
np.random.seed(1)  # fix the seed so the shuffle below is reproducible
# (alternative: M = np.random.random((3, 4)) would give a 3x4 matrix of random floats)
M = np.arange(12)  # the integers 0 through 11
np.random.shuffle(M)  # shuffles M in place
M = np.reshape(M, [3,4])  # arrange the 12 values as 3 rows by 4 columns
print(M)

[[ 2  3  4 10]
 [ 1  6  0  7]
 [11  9  8  5]]


In [27]:
sum(M) # built-in sum iterates over the rows of M and adds them element-wise, giving column totals

array([14, 18, 12, 22])

In [28]:
np.sum(M) # np.sum with no axis adds up every element of the matrix

66

In [29]:
np.sum(M, axis = 0)  # np.sum function with axis specified
# matrices have two dimensions
# axis 0 runs down the rows, axis 1 runs across the columns
# np.sum with axis = 0 adds the rows together (collapsing axis 0), so you end up with column totals

array([14, 18, 12, 22])

In [30]:
np.sum(M, axis = 1)  # axis = 1 adds across the columns, so you get one total per row

array([19, 14, 33])

In [31]:
np.min(M, axis = 0)  # the minimum of each column

array([1, 3, 0, 5])

In [32]:
np.std(M)  # standard deviation of all 12 elements; NumPy defaults to the population formula (ddof=0), whereas R's sd() divides by n - 1

3.452052529534663

In [33]:
np.std(M, axis = 0)  # standard deviation of each column

array([4.49691252, 2.44948974, 3.26598632, 2.05480467])

## dealing with nan
`nan` is the floating-point value for something that is "not a number". We often use it in place of a missing value.
`nan` exists only as a float; integer types have no way to represent it.

In [34]:
x = float("nan")  # direct creation of nan
print(x)
print(type(x))

nan
<class 'float'>


In [35]:
y = float("inf")  # y is the float representation of infinity
print(y / y)  # these calculations will yield a nan result
print(y - y)

nan
nan


In [37]:
np.sum([x, 2])  # x is nan here; a nan anywhere in the input makes the ordinary sum nan

nan

In [38]:
np.nansum([x, 2])   # in R you have the option na.rm = TRUE

2.0

The following table provides a list of useful aggregation functions available in NumPy:

|Function Name      |   NaN-safe Version  | Description                                   |
|-------------------|---------------------|-----------------------------------------------|
| ``np.sum``        | ``np.nansum``       | Compute sum of elements                       |
| ``np.prod``       | ``np.nanprod``      | Compute product of elements                   |
| ``np.mean``       | ``np.nanmean``      | Compute mean of elements                      |
| ``np.std``        | ``np.nanstd``       | Compute standard deviation                    |
| ``np.var``        | ``np.nanvar``       | Compute variance                              |
| ``np.min``        | ``np.nanmin``       | Find minimum value                            |
| ``np.max``        | ``np.nanmax``       | Find maximum value                            |
| ``np.argmin``     | ``np.nanargmin``    | Find index of minimum value                   |
| ``np.argmax``     | ``np.nanargmax``    | Find index of maximum value                   |
| ``np.median``     | ``np.nanmedian``    | Compute median of elements                    |
| ``np.percentile`` | ``np.nanpercentile``| Compute rank-based statistics of elements     |
| ``np.any``        | N/A                 | Evaluate whether any elements are true        |
| ``np.all``        | N/A                 | Evaluate whether all elements are true        |

## Broadcasting

This is a similar concept to recycling values in R, but it only works when the dimensions are compatible

In [39]:
a = np.array([1,2,3])
b = np.array([4,5,6])
print(a + b)

[5 7 9]


In [40]:
c = np.array([7,8])
print(a + c)  # raises ValueError: shapes (3,) and (2,) cannot be broadcast together

ValueError: operands could not be broadcast together with shapes (3,) (2,) 

In [41]:
print(a)

[1 2 3]


In [42]:
e = np.ones([3,3])
print(e)

[[1. 1. 1.]
 [1. 1. 1.]
 [1. 1. 1.]]


In [43]:
print(e + a)  # the array a gets 'broadcast' across all three rows

[[2. 3. 4.]
 [2. 3. 4.]
 [2. 3. 4.]]


In [44]:
print(a.reshape([3,1]))  # we reshape a to be a 3x1 array

[[1]
 [2]
 [3]]


In [45]:
print(e + a.reshape([3,1])) # the reshaped array is broadcast across columns

[[2. 2. 2.]
 [3. 3. 3.]
 [4. 4. 4.]]


In [46]:
d = np.vstack([a,b])  # we stack the arrays a and b vertically
print(d)

[[1 2 3]
 [4 5 6]]


In [47]:
print(d + a)  # a is broadcast across both rows of d (it is added to each row)

[[2 4 6]
 [5 7 9]]


In [48]:
print(c)

[7 8]


In [49]:
print(d + c)  # c does not have compatible dimensions

ValueError: operands could not be broadcast together with shapes (2,3) (2,) 

In [50]:
print(d + c.reshape([2,1]))  # after we reshape c to be a column, we can broadcast it

[[ 8  9 10]
 [12 13 14]]


In [51]:
e = np.arange(10).reshape((10, 1))  # a column: 10 rows by 1 column
f = np.arange(11)  # a one-dimensional array of length 11
print(e)
print(f)

[[0]
 [1]
 [2]
 [3]
 [4]
 [5]
 [6]
 [7]
 [8]
 [9]]
[ 0  1  2  3  4  5  6  7  8  9 10]


In [52]:
print(e + f)  ## e and f are broadcast into compatible matrices and then added

[[ 0  1  2  3  4  5  6  7  8  9 10]
 [ 1  2  3  4  5  6  7  8  9 10 11]
 [ 2  3  4  5  6  7  8  9 10 11 12]
 [ 3  4  5  6  7  8  9 10 11 12 13]
 [ 4  5  6  7  8  9 10 11 12 13 14]
 [ 5  6  7  8  9 10 11 12 13 14 15]
 [ 6  7  8  9 10 11 12 13 14 15 16]
 [ 7  8  9 10 11 12 13 14 15 16 17]
 [ 8  9 10 11 12 13 14 15 16 17 18]
 [ 9 10 11 12 13 14 15 16 17 18 19]]


In [53]:
print(e * f)  ## e and f are broadcast into compatible matrices and then multiplied element-wise

[[ 0  0  0  0  0  0  0  0  0  0  0]
 [ 0  1  2  3  4  5  6  7  8  9 10]
 [ 0  2  4  6  8 10 12 14 16 18 20]
 [ 0  3  6  9 12 15 18 21 24 27 30]
 [ 0  4  8 12 16 20 24 28 32 36 40]
 [ 0  5 10 15 20 25 30 35 40 45 50]
 [ 0  6 12 18 24 30 36 42 48 54 60]
 [ 0  7 14 21 28 35 42 49 56 63 70]
 [ 0  8 16 24 32 40 48 56 64 72 80]
 [ 0  9 18 27 36 45 54 63 72 81 90]]


# Boolean Operators in NumPy

In [54]:
x = np.arange(6)
print(x)

[0 1 2 3 4 5]


In [55]:
print(x < 3)

[ True  True  True False False False]


In [56]:
print(x >= 3)

[False False False  True  True  True]


In [57]:
print(x == 3)

[False False False  True False False]


In [58]:
# the results can then be used to subset
print(x[x >= 3])

[3 4 5]


In [59]:
np.sum(x >= 3) # True = 1, False = 0, so sum counts how many are true

3

In [60]:
np.mean(x >= 3)  # finds the proportion that is True

0.5

### Working with matrices

In [61]:
y = np.arange(12).reshape([3,4])
print(y)

[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]


In [62]:
print(y >= 6)

[[False False False False]
 [False False  True  True]
 [ True  True  True  True]]


In [63]:
np.sum(y >= 6)

6

In [64]:
np.sum(y >= 6, axis = 0)  # you can perform sums and other aggregate functions axis-wise on the boolean matrix

array([1, 1, 2, 2])

In [65]:
np.sum(y >= 6, axis = 1)  # how many values in each row are >= 6

array([0, 2, 4])

## Bitwise (element-wise) Boolean operators

In [66]:
a = np.array([True, True, False, False])
b = np.array([True, False, True, False])
print(a)
print(b)

[ True  True False False]
[ True False  True False]


In [67]:
print(a & b) # bitwise and

[ True False False False]


In [68]:
print(a | b) # bitwise or

[ True  True  True False]


In [69]:
print(a ^ b) # bitwise xor (exclusive or)

[False  True  True False]


In [70]:
print(~a)  # bitwise not

[False False  True  True]


In [71]:
np.any(a)  # True if at least one element is True

True

In [72]:
np.all(a)  # True only if every element is True

False

# fancy indexing
Regular lists in Python do not support fancy indexing, but NumPy arrays do!

In [73]:
np.random.seed(1)
x = np.random.randint(100, size = 10)
print(x)

[37 12 72  9 75  5 79 64 16  1]


In [74]:
index = [0, 1, 5]  # the positions we want to pull out of x
print(x[index])  # indexing with a list of positions returns those elements, in that order

[37 12  5]


In [75]:
a = [1, 4, 7]
b = [2, 3, 8]
ind = np.vstack([a,b])  # stack the two lists of positions into a 2x3 index array
print(ind)

[[1 4 7]
 [2 3 8]]


In [76]:
print(x[ind])  # the result has the same shape as the index array, not the shape of x

[[12 75 64]
 [72  9 16]]


In [77]:
X = np.arange(12).reshape((3, 4))
print(X)

[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]


In [78]:
row = np.array([0, 1, 2])
col = np.array([2, 1, 3])
X[row, col]  # row and col are paired up: this selects X[0, 2], X[1, 1] and X[2, 3]

array([ 2,  5, 11])

# sorting

- `np.sort()` is like `sort()` in R
- `np.argsort()` is like `order()` in R. It returns the (0-based) indices that would put the values in sorted order

In [79]:
np.random.seed(2)
x = np.arange(5)
np.random.shuffle(x)
print(x)

[2 4 1 3 0]


In [81]:
x.sort() # sorts x in place
print(x)

[0 1 2 3 4]


In [82]:
y = np.array([5, 2, 1, 4])
print(y)
print(y.argsort())  # indices that would sort y: the smallest value is at index 2, the next at index 1, ...

[5 2 1 4]
[2 1 3 0]


In [83]:
d = y.argsort()  # the ordering indices
y[d]  # indexing y with them gives the values in sorted order

array([1, 2, 4, 5])

## Sorting along rows or columns

A useful feature of NumPy's sorting algorithms is the ability to sort along specific rows or columns of a multidimensional array using the axis argument. For example:

In [84]:
np.random.seed(1)
X = np.random.randint(0, 10, (4, 6))
print(X)

[[5 8 9 5 0 0]
 [1 7 6 9 2 4]
 [5 2 4 2 4 7]
 [7 9 1 7 0 6]]


In [85]:
# sort each column of X
# np.sort returns a sorted copy of X; X itself is not modified
np.sort(X, axis=0)

array([[1, 2, 1, 2, 0, 0],
       [5, 7, 4, 5, 0, 4],
       [5, 8, 6, 7, 2, 6],
       [7, 9, 9, 9, 4, 7]])

In [87]:
# sort each row of X
np.sort(X, axis=1)

array([[0, 0, 5, 5, 8, 9],
       [1, 2, 4, 6, 7, 9],
       [2, 2, 4, 4, 5, 7],
       [0, 1, 6, 7, 7, 9]])