<img style="float: left;" src="pic2.png">

### Sridhar Palle, Ph.D, spalle@emory.edu (Applied ML & DS with Python Program)

<img style = 'float:left;' src = 'nump.jpg'> 

* Numerical Python
* Numpy is the fundamental package for scientific computing with Python
* Much more efficient data storage and operations capability
* The entire ecosystem of Python data science tools depends on NumPy

In [1]:
import numpy as np
# Numpy is the linear algebra library for python
np.__version__


'1.16.4'

## 1. Creating Numpy Arrays

### 1.1 Creating Arrays from Scratch

**np.arange(start,stop,step)**

In [2]:
np.arange(10) # Creates numbers 0 to 9

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

**np.zeros()**

In [3]:
np.zeros(5) # creates 5 zeroes

array([0., 0., 0., 0., 0.])

**np.ones()**

In [4]:
np.ones(8) # creates a 1D array of 8 ones (floats by default)

array([1., 1., 1., 1., 1., 1., 1., 1.])

**np.eye()**

In [5]:
np.eye(3) # creates identity matrix

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

**np.full()**

In [6]:
np.full((2,3),1) # creates a 2 x 3 array filled with 1.  np.full((nrows,ncols),value)

array([[1, 1, 1],
       [1, 1, 1]])

**np.random.randint()**

In [7]:
np.random.randint(0,10,(3,4)) # creates a 3 x 4 array of random integers from 0 to 9 (the upper bound 10 is excluded)

array([[8, 5, 1, 3],
       [0, 1, 7, 0],
       [2, 8, 0, 2]])

In [8]:
np.random.randint

<function RandomState.randint>

In [9]:
np.random.randint(9,20,5) # for just 1d array

array([10, 12, 12, 10,  9])

**np.random.random()**

In [10]:
np.random.random((3,4)) # creates a 3 x 4 array with random uniform values between 0 and 1

array([[0.2693589 , 0.26391039, 0.07878305, 0.54330681],
       [0.6420051 , 0.98177263, 0.65517224, 0.5041905 ],
       [0.1771923 , 0.48333681, 0.79920184, 0.58515783]])

In [11]:
np.random.rand(3,4) # another way to do the same thing as above; note that rand() takes the dimensions as separate arguments, not as a tuple

array([[0.8534699 , 0.88169765, 0.18599832, 0.86477791],
       [0.09072075, 0.31256879, 0.62402615, 0.34670983],
       [0.91625781, 0.70362606, 0.69354228, 0.52956138]])

**np.random.normal()**

In [12]:
np.random.normal(0,1,(4,5)) # creates a 4 x 5 array with normally distributed random values having mean 0 and SD 1

array([[-1.08734423, -1.00745782,  0.50957734,  0.60131996,  1.10917726],
       [ 0.42151226, -0.409     , -0.95902727, -0.89242495,  0.45688809],
       [ 0.45329306,  0.1265311 ,  0.01785138,  2.75482731,  0.11175045],
       [ 1.44360977, -0.08031946,  0.57000679,  0.36666556,  0.13873083]])

In [13]:
np.random.randn(4,5) # here mean is by default 0 and SD is 1. Similar to the above method

array([[-0.15981144, -1.81377264,  1.40493131,  0.79926429,  0.74762172],
       [-1.34491275, -0.94963482,  0.28806595,  0.66941665, -0.15149477],
       [ 0.0986632 , -1.00805645, -0.76018389,  2.19092579, -0.27570964],
       [ 0.66984201,  1.59493803,  0.72539654, -0.94953581,  1.13053113]])

**np.linspace()**

In [14]:
np.linspace(0,10,5) # (start,stop,# of samples), creates an array from 0 to 10 with 5 equally spaced values

array([ 0. ,  2.5,  5. ,  7.5, 10. ])

### 1.2 Creating numpy arrays from lists or tuples

In [15]:
np.array([4,5,6])

array([4, 5, 6])

In [16]:
alist = [9, 7, 5, 6]
np.array(alist)

array([9, 7, 5, 6])

In [17]:
atuple = (4, 5, 6, 7)
np.array(atuple)

array([4, 5, 6, 7])

In [18]:
# List of lists: each inner list pairs a number (0 through 4) with that number plus 2.
list_list = [[first, first + 2] for first in range(5)]
list_list

[[0, 2], [1, 3], [2, 4], [3, 5], [4, 6]]

In [19]:
my_arr = np.array(list_list) # creating a 2D numpy array from lists of lists.

In [20]:
my_arr2 = np.arange(10)
my_arr2

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [21]:
list(my_arr2) # convert array back to list

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [22]:
tuple(my_arr2)

(0, 1, 2, 3, 4, 5, 6, 7, 8, 9)

## 2. Indexing, Modifying Arrays

### 2.1 Attributes of Arrays

**.shape** gives the dimensions of an array

In [23]:
my_arr = np.array([[0, 2], [1, 3], [2, 4], [3, 5], [4, 6]])
my_arr

array([[0, 2],
       [1, 3],
       [2, 4],
       [3, 5],
       [4, 6]])

In [24]:
my_arr.shape # .shape gives the shape of the array.

(5, 2)

**.ndim** gives the number of dimensions (axes) of an array

In [25]:
my_arr.ndim

2

**.size** gives the total number of elements in an array

In [26]:
my_arr.size # total number of elements

10

**.reshape()**

In [27]:
my_arr.reshape(10) # doesn't happen inplace. Original array remains the same.

array([0, 2, 1, 3, 2, 4, 3, 5, 4, 6])

In [28]:
my_arr 

array([[0, 2],
       [1, 3],
       [2, 4],
       [3, 5],
       [4, 6]])

### 2.2 Indexing of numpy arrays (similar to lists) npa[start:stop:step]

In [29]:
my_arr2 = np.array([31, 89, 94, 56, 34, 69, 98, 41, 53, 83, 77])
my_arr2

array([31, 89, 94, 56, 34, 69, 98, 41, 53, 83, 77])

***Q. how to get 69***

In [30]:
my_arr2[5]

69

In [None]:
 # indexing works the same way as before

***Q. How to get all the numbers in reverse order***

In [31]:
my_arr2[::-1]

array([77, 83, 53, 41, 98, 69, 34, 56, 94, 89, 31])

In [None]:
my_arr2

***Q. Get numbers from 98 to 89 in reverse order***

In [33]:
my_arr2[1:7][::-1]

array([98, 69, 34, 56, 94, 89])

**indexing 2D arrays, [start:stop:step, start:stop:step]**

In [34]:
np.random.seed(32) # .seed just makes sure that all of our random numbers match. there is no other meaning
my_arr = np.random.rand(4,5)
my_arr

array([[0.85888927, 0.37271115, 0.55512878, 0.95565655, 0.7366696 ],
       [0.81620514, 0.10108656, 0.92848807, 0.60910917, 0.59655344],
       [0.09178413, 0.34518624, 0.66275252, 0.44171349, 0.55148779],
       [0.70371249, 0.58940123, 0.04993276, 0.56179184, 0.76635847]])

In [35]:
my_arr[:,:] # my_arr[start:stop:step, start:stop:step]

array([[0.85888927, 0.37271115, 0.55512878, 0.95565655, 0.7366696 ],
       [0.81620514, 0.10108656, 0.92848807, 0.60910917, 0.59655344],
       [0.09178413, 0.34518624, 0.66275252, 0.44171349, 0.55148779],
       [0.70371249, 0.58940123, 0.04993276, 0.56179184, 0.76635847]])

In [36]:
my_arr[0:2,0:2] # get rows 0,1 and columns 0,1

array([[0.85888927, 0.37271115],
       [0.81620514, 0.10108656]])

In [None]:
# in Python, we generally use ':' after ',' to get all rows or columns. in R just ',' is enough

In [37]:
my_arr[1,:] #get row 1, all columns. In R, just my_arr[2,] would have worked.

array([0.81620514, 0.10108656, 0.92848807, 0.60910917, 0.59655344])

In [38]:
my_arr[:,1] # Get all rows, and column with index 1

array([0.37271115, 0.10108656, 0.34518624, 0.58940123])

In [39]:
my_arr

array([[0.85888927, 0.37271115, 0.55512878, 0.95565655, 0.7366696 ],
       [0.81620514, 0.10108656, 0.92848807, 0.60910917, 0.59655344],
       [0.09178413, 0.34518624, 0.66275252, 0.44171349, 0.55148779],
       [0.70371249, 0.58940123, 0.04993276, 0.56179184, 0.76635847]])

In [40]:
my_arr[2,2] # to get a particular element

0.6627525231855876

In [41]:
my_arr[1,1:3] # row 1 and columns from 1 to 2

array([0.10108656, 0.92848807])

In [42]:
my_arr

array([[0.85888927, 0.37271115, 0.55512878, 0.95565655, 0.7366696 ],
       [0.81620514, 0.10108656, 0.92848807, 0.60910917, 0.59655344],
       [0.09178413, 0.34518624, 0.66275252, 0.44171349, 0.55148779],
       [0.70371249, 0.58940123, 0.04993276, 0.56179184, 0.76635847]])

***Q. Get columns 0, 2, 4 in reverse order***

In [64]:
my_arr[:,::-2]

array([[0.7366696 , 0.55512878, 0.85888927],
       [0.59655344, 0.92848807, 0.81620514],
       [0.55148779, 0.66275252, 0.09178413],
       [0.76635847, 0.04993276, 0.70371249]])

### 2.3 Boolean Masking

**Using conditionals in slicing**

In [65]:
np.random.seed(32)
my_arr = np.random.rand(10)
my_arr

array([0.85888927, 0.37271115, 0.55512878, 0.95565655, 0.7366696 ,
       0.81620514, 0.10108656, 0.92848807, 0.60910917, 0.59655344])

In [66]:
my_arr > 0.5

array([ True, False,  True,  True,  True,  True, False,  True,  True,
        True])

In [67]:
my_arr[my_arr > 0.5] # get only those array elements whose value is greater than 0.5

array([0.85888927, 0.55512878, 0.95565655, 0.7366696 , 0.81620514,
       0.92848807, 0.60910917, 0.59655344])

In [68]:
np.random.seed(45)
my_arr2D = np.random.rand(3,5)
my_arr2D

array([[0.98901151, 0.54954473, 0.2814473 , 0.07728957, 0.4444695 ],
       [0.47280797, 0.048522  , 0.16332445, 0.11595071, 0.62739168],
       [0.85618205, 0.65010242, 0.99072168, 0.47035075, 0.61829448]])

In [69]:
my_arr2D > 0.5 # get a boolean array 

array([[ True,  True, False, False, False],
       [False, False, False, False,  True],
       [ True,  True,  True, False,  True]])

In [70]:
my_arr2D[my_arr2D > 0.5]

array([0.98901151, 0.54954473, 0.62739168, 0.85618205, 0.65010242,
       0.99072168, 0.61829448])

In [71]:
my_arr2D

array([[0.98901151, 0.54954473, 0.2814473 , 0.07728957, 0.4444695 ],
       [0.47280797, 0.048522  , 0.16332445, 0.11595071, 0.62739168],
       [0.85618205, 0.65010242, 0.99072168, 0.47035075, 0.61829448]])

In [None]:
# Get all rows, for which column 2 values are greater than 0.2

In [72]:
my_arr2D[:,2] > 0.2

array([ True, False,  True])

In [73]:
my_arr2D[(my_arr2D[:,2] > 0.2), 0:2] # keep only the rows whose column 2 value is greater than 0.2, and from those rows return columns 0 and 1

array([[0.98901151, 0.54954473],
       [0.85618205, 0.65010242]])

In [74]:
# Fix the seed so both draws are reproducible, then draw two 1D arrays
# of 6 random integers each (values from 0 up to, but not including, 10).
np.random.seed(32)
my_arr1 = np.random.randint(low=0, high=10, size=6)
my_arr2 = np.random.randint(low=0, high=10, size=6)

# Print each array next to its variable name.
for label, values in (("my_arr1", my_arr1), ("my_arr2", my_arr2)):
    print("{} is {}".format(label, values))

my_arr1 is [7 5 6 8 3 7]
my_arr2 is [9 3 5 9 4 1]


### 2.4 Modifying arrays

In [75]:
import numpy as np
np.random.seed(32)
my_arr = np.random.rand(4,5)
my_arr

array([[0.85888927, 0.37271115, 0.55512878, 0.95565655, 0.7366696 ],
       [0.81620514, 0.10108656, 0.92848807, 0.60910917, 0.59655344],
       [0.09178413, 0.34518624, 0.66275252, 0.44171349, 0.55148779],
       [0.70371249, 0.58940123, 0.04993276, 0.56179184, 0.76635847]])

In [76]:
my_arr[1,2] = 9
my_arr

array([[0.85888927, 0.37271115, 0.55512878, 0.95565655, 0.7366696 ],
       [0.81620514, 0.10108656, 9.        , 0.60910917, 0.59655344],
       [0.09178413, 0.34518624, 0.66275252, 0.44171349, 0.55148779],
       [0.70371249, 0.58940123, 0.04993276, 0.56179184, 0.76635847]])

In [77]:
my_arr[:,4] = np.array([8, 6, 9, 22]) # replacing every row of the column at index 4 (the 5th and last column) with new values
my_arr

array([[ 0.85888927,  0.37271115,  0.55512878,  0.95565655,  8.        ],
       [ 0.81620514,  0.10108656,  9.        ,  0.60910917,  6.        ],
       [ 0.09178413,  0.34518624,  0.66275252,  0.44171349,  9.        ],
       [ 0.70371249,  0.58940123,  0.04993276,  0.56179184, 22.        ]])

**appending, inserting, deleting**

In [78]:
np.random.seed(32)
my_arr1D = np.random.randint(0,20,5)
my_arr1D

array([11,  5, 19,  7,  3])

In [79]:
np.append(my_arr1D,100) # add an element at the end. Note that append is a function of the numpy module, not a method of the array object.

array([ 11,   5,  19,   7,   3, 100])

In [80]:
np.insert(my_arr1D,3,200) # insert the value 200 at index 3

array([ 11,   5,  19, 200,   7,   3])

In [81]:
my_arr1D # for all the above methods, changes are not in place.

array([11,  5, 19,  7,  3])

In [82]:
np.delete(my_arr1D,1) # delete element at index 1

array([11, 19,  7,  3])

**Concatenation and splitting**

In [83]:
my_arr1 = np.array([1,2,3, 4])
my_arr2 = np.array([9, 9, 9, 9])

In [84]:
np.vstack([my_arr1,my_arr2]) # vertical stacking

array([[1, 2, 3, 4],
       [9, 9, 9, 9]])

In [85]:
np.hstack([my_arr1,my_arr2]) # horizontal stacking, np.concatenate also does something similar

array([1, 2, 3, 4, 9, 9, 9, 9])

In [86]:
np.random.seed(32)
my_arr = np.random.randint(0,10,(4,5))
my_arr

array([[7, 5, 6, 8, 3],
       [7, 9, 3, 5, 9],
       [4, 1, 3, 1, 2],
       [3, 8, 2, 4, 2]])

In [87]:
upsplit, losplit = np.vsplit(my_arr,[2]) # row index, where the split should happen. also try np.hsplit()

In [88]:
upsplit

array([[7, 5, 6, 8, 3],
       [7, 9, 3, 5, 9]])

In [89]:
losplit

array([[4, 1, 3, 1, 2],
       [3, 8, 2, 4, 2]])

In [90]:
lsplit, rsplit = np.hsplit(my_arr,[2])

In [91]:
lsplit

array([[7, 5],
       [7, 9],
       [4, 1],
       [3, 8]])

In [92]:
rsplit

array([[6, 8, 3],
       [3, 5, 9],
       [3, 1, 2],
       [2, 4, 2]])

**For several more array manipulation methods, please see the link below**
* https://docs.scipy.org/doc/numpy/reference/routines.array-manipulation.html

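**Sub-arrays obtained by slicing are views (aliases) of the original array, not copies. If we mutate a sub-array, the original is also mutated. Use `.copy()` when an independent copy is needed.**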

In [93]:
sub_arr = my_arr[:,1:2]
sub_arr

array([[5],
       [9],
       [1],
       [8]])

In [94]:
sub_arr[:,0] = np.array([9,9,9,9])
sub_arr

array([[9],
       [9],
       [9],
       [9]])

In [95]:
my_arr

array([[7, 9, 6, 8, 3],
       [7, 9, 3, 5, 9],
       [4, 9, 3, 1, 2],
       [3, 9, 2, 4, 2]])

### 2.5 UFuncs and Algebraic operations

In [96]:
al = [1,2,3,4] # With lists elementwise operations need for loops, 
# in R however, we can do algebraic operations on vectors directly.

In [97]:
al*4

[1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4]

In [98]:
al + [1,2,3,4]

[1, 2, 3, 4, 1, 2, 3, 4]

In [99]:
for i in al:
    print (i*4)

4
8
12
16


In [100]:
[4*i for i in al] # this is more tedious. (Of course list comprehensions are better than traditional for loops)

[4, 8, 12, 16]

**See the power of numpy**

In [101]:
import numpy as np
my_arr = np.arange(10)
my_arr

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [102]:
my_arr*4

array([ 0,  4,  8, 12, 16, 20, 24, 28, 32, 36])

In [103]:
my_arr-500

array([-500, -499, -498, -497, -496, -495, -494, -493, -492, -491])

In [104]:
my_arr/5

array([0. , 0.2, 0.4, 0.6, 0.8, 1. , 1.2, 1.4, 1.6, 1.8])

In [105]:
my_arr*5+3 # we can even do operations in an expression. 

array([ 3,  8, 13, 18, 23, 28, 33, 38, 43, 48])

**All these operations are basically wrappers around NumPy's built-in ufuncs (universal functions)**

In [106]:
np.multiply(my_arr,4) # this is the same as my_arr*4; similarly we have np.add, np.subtract, np.divide, etc.

array([ 0,  4,  8, 12, 16, 20, 24, 28, 32, 36])

**Some more ufuncs and aggregation functions: max, min, mean, sum, sqrt, exp, sin, cos, log, argmin, argmax, std, var — just call them as np.&lt;function&gt;**
#### Go to this link https://docs.scipy.org/doc/numpy/reference/ufuncs.html for more functions.


In [107]:
my_arr

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [108]:
np.mean(my_arr)

4.5

In [109]:
np.prod(my_arr)

0

In [110]:
np.sum(my_arr) # for several of these aggregating functions, we could also use methods on the numpy array object itself (see below)

45

In [111]:
np.sqrt(my_arr)

array([0.        , 1.        , 1.41421356, 1.73205081, 2.        ,
       2.23606798, 2.44948974, 2.64575131, 2.82842712, 3.        ])

In [112]:
np.exp(my_arr)

array([1.00000000e+00, 2.71828183e+00, 7.38905610e+00, 2.00855369e+01,
       5.45981500e+01, 1.48413159e+02, 4.03428793e+02, 1.09663316e+03,
       2.98095799e+03, 8.10308393e+03])

**multi-dimensional aggregation**

In [113]:
np.random.seed(32)
my_arr = np.random.randn(3,5)
my_arr

array([[-0.34889445,  0.98370343,  0.58092283,  0.07028444,  0.77753268],
       [ 0.58195875,  1.47179053,  1.66318101, -0.26117712, -0.68867681],
       [-0.69492326,  1.94042346,  1.80541519,  0.45631385, -0.57481204]])

In [114]:
np.mean(my_arr,0) # the second argument is the axis: 0 aggregates down the rows (one value per column), 1 aggregates across the columns (one value per row)

array([-0.15395299,  1.46530581,  1.34983968,  0.08847372, -0.16198539])

In [115]:
np.sum(my_arr,1)

array([2.06354893, 2.76707635, 2.9324172 ])

**Methods on numpy objects**

In [116]:
my_arr.max() # similar to Ufuncs, numpy objects themselves have several methods

1.9404234595994223

In [117]:
my_arr.sum()

7.763042476841536

In [118]:
my_arr.argmax() # index of the maximum value in the flattened array (here 11, i.e. row 2, column 1 of the 3 x 5 array)

11

***Extras (Optional)***

**Few other numpy methods**

In [119]:
# np.any, np.all, np.prod, np.bincount, np.where
import numpy as np

In [120]:
np.random.seed(32)
my_arr = np.random.randint(0,10,20)
my_arr

array([7, 5, 6, 8, 3, 7, 9, 3, 5, 9, 4, 1, 3, 1, 2, 3, 8, 2, 4, 2])

**np.bincount**

In [121]:
np.bincount(my_arr) # bincount gives count (frequency) of each value starting from 0 to the largest value.

array([0, 2, 3, 4, 2, 2, 1, 2, 2, 2])

**np.where**

In [122]:
np.where(my_arr == 5) # gives the index position where value = 5

(array([1, 8]),)

**np.sort**

In [123]:
np.sort(my_arr) # this doesn't happen inplace. To mutate the original use my_arr.sort()

array([1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 5, 5, 6, 7, 7, 8, 8, 9, 9])

**np.argsort**

In [124]:
my_arr

array([7, 5, 6, 8, 3, 7, 9, 3, 5, 9, 4, 1, 3, 1, 2, 3, 8, 2, 4, 2])

In [125]:
np.argsort(my_arr) # this will give indexes after sorting the array. order(my_arr) does something similar in R.

array([11, 13, 19, 17, 14,  4, 15,  7, 12, 18, 10,  1,  8,  2,  0,  5, 16,
        3,  6,  9])