In [1]:
import numpy as np
import time

In [2]:
# 100 million floats sampled uniformly from [0, 1); this large array is the
# input for the plain-Python vs NumPy timing comparison in the next two cells
x = np.random.random(100000000)

In [3]:
# plain python: the built-in sum() walks the ndarray one element at a time.
# time.perf_counter() is used instead of time.time() because it is a
# monotonic, high-resolution clock meant for measuring elapsed time, whereas
# time.time() follows the wall clock and can jump if the system time changes.
start = time.perf_counter()
sum(x) / len(x)
print(time.perf_counter() - start)

11.586775779724121


In [4]:
# numpy: np.mean computes the same average with a single vectorized call.
# time.perf_counter() is used instead of time.time() because it is a
# monotonic, high-resolution clock meant for measuring elapsed time.
start = time.perf_counter()
np.mean(x)
print(time.perf_counter() - start)

0.04247713088989258


In [5]:
# Build a rank 1 ndarray from a plain Python list of integers
x = np.array([1,2,3,4,5])
print("x = ", x, end="\n\n")

# info about x: its shape, its Python type and the dtype of its elements
for description, detail in (
    ("x has dimensions:", x.shape),
    ("x is an object of type:", type(x)),
    ("The elements in x are of type:", x.dtype),
):
    print(description, detail)

x =  [1 2 3 4 5]

x has dimensions: (5,)
x is an object of type: <class 'numpy.ndarray'>
The elements in x are of type: int64


In [6]:
# ndarrays can hold strings too; the dtype is a fixed-width unicode type
# sized to fit the longest string in the list
w = np.array(["Hello", "hapless", "programmer"])
print("w = ", w, end="\n\n")

# info about w: its shape, its Python type and the dtype of its elements
for description, detail in (
    ("w has dimensions:", w.shape),
    ("w is an object of type:", type(w)),
    ("The elements in w are of type:", w.dtype),
):
    print(description, detail)

w =  ['Hello' 'hapless' 'programmer']

w has dimensions: (3,)
w is an object of type: <class 'numpy.ndarray'>
The elements in w are of type: <U10


In [7]:
# Rank 2 ndarray of integers built from a nested list: four rows, three columns
Y = np.array([[1,2,3],[4,5,6],[7,8,9], [10,11,12]])

# Show Y with a blank line before and after it
print()
print('Y = \n', Y, end='\n\n')

# Summarise Y: shape (rows & columns), element count, Python type, element dtype
for report_line in (
    ('Y has dimensions (rows & cols):', Y.shape),
    ('Y has a total of', Y.size, 'elements'),
    ('Y is an object of type:', type(Y)),
    ('The elements in Y are of type:', Y.dtype),
):
    print(*report_line)


Y = 
 [[ 1  2  3]
 [ 4  5  6]
 [ 7  8  9]
 [10 11 12]]

Y has dimensions (rows & cols): (4, 3)
Y has a total of 12 elements
Y is an object of type: <class 'numpy.ndarray'>
The elements in Y are of type: int64


In [8]:
# We create a rank 1 ndarray
x = np.array([1, 2, 3, 4, 5])

# We save x into the current directory as my_array.npy
# (np.save appends the .npy extension when the file name does not already end with it)
np.save('my_array', x)

In [9]:
# We load the array stored in my_array.npy (in the current directory) into g
g = np.load('my_array.npy')

# We print g
print()
print('g = ', g)
print()

# We print information about g: its Python type and the dtype of its elements
print('g is an object of type:', type(g))
print('The elements in g are of type:', g.dtype)


g =  [1 2 3 4 5]

g is an object of type: <class 'numpy.ndarray'>
The elements in g are of type: int64


In [10]:
# create numpy array of letters a-j

# ord maps a single character to its integer Unicode code point and chr is
# the inverse mapping, so applying chr to every code point from 'a' through
# 'j' (inclusive, hence the + 1) yields the ten letters.
letter_array = np.array(list(map(chr, range(ord('a'), ord('j') + 1))))

print("Letter Array: ", letter_array)

# dtype, shape and size of the array, in that order
print('The elements in letter_array are of type:', letter_array.dtype)
print('letter_array has dimensions (rows & cols):', letter_array.shape) # rows & columns
print('letter_array has a total of', letter_array.size, 'elements')



Letter Array:  ['a' 'b' 'c' 'd' 'e' 'f' 'g' 'h' 'i' 'j']
The elements in letter_array are of type: <U1
letter_array has dimensions (rows & cols): (10,)
letter_array has a total of 10 elements


In [11]:
# Built-in functions to create ndArrays with prefilled values.
# Each example prints the call that was made followed by the resulting array.

# zeros - the shape is passed as a (rows, cols) tuple
z = np.zeros((3,4))
print('\nz = np.zeros((3,4))', z, sep='\n')

# ones - same calling convention as zeros
z = np.ones((2,3))
print('\nz = np.ones((2,3))', z, sep='\n')

# full - shape, plus the constant value used for every element
z = np.full((2,3), 7)
print('\nz = np.full((2,3), 7)', z, sep='\n')

# eye - identity matrix (linear algebra): square, 1s on the diagonal, here 4 x 4
z = np.eye(4)
print('\nz = np.eye(4)', z, sep='\n')

# diag - square matrix with the given values on the diagonal
z = np.diag([10, 20, 30, 50])
print('\nz = np.diag([10, 20, 30, 50])', z, sep='\n')


z = np.zeros((3,4))
[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]

z = np.ones((2,3))
[[1. 1. 1.]
 [1. 1. 1.]]

z = np.full((2,3), 7)
[[7 7 7]
 [7 7 7]]

z = np.eye(4)
[[1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 0. 1.]]

z = np.diag([10, 20, 30, 50])
[[10  0  0  0]
 [ 0 20  0  0]
 [ 0  0 30  0]
 [ 0  0  0 50]]


In [12]:
# More built-in functions

# We create a rank 1 ndarray that has sequential integers from 0 to 9
x = np.arange(10)
print(x)

# We create a rank 1 ndarray that has sequential integers from 4 to 9
# (the stop value, 10, is excluded)
x = np.arange(4,10) # start, stop
print(x)

# We create a rank 1 ndarray that has evenly spaced integers from 1 to 13 in steps of 3.
x = np.arange(1,14,3) # start, stop, step
print(x)

# use linspace when non-integer steps are required
# We create a rank 1 ndarray that has 10 evenly spaced floating point numbers
# from 0 to 25, with both end points included.
x = np.linspace(0,25,10) # start, stop, N - evenly spaced numbers
print(x)


[0 1 2 3 4 5 6 7 8 9]
[4 5 6 7 8 9]
[ 1  4  7 10 13]
[ 0.          2.77777778  5.55555556  8.33333333 11.11111111 13.88888889
 16.66666667 19.44444444 22.22222222 25.        ]


In [13]:
# reshaping arrays
# np.reshape(ndarray, new_shape) function converts the given ndarray into the 
# specified new_shape

# important to note that the new_shape should be compatible with the number of 
# elements in the given ndarray (e.g. 20 elements fit a 4 x 5 shape)

# We create a rank 1 ndarray with sequential integers from 0 to 19
x = np.arange(20)
print('Original x = ', x)

# We reshape x into a 4 x 5 ndarray 
x = np.reshape(x, (4,5))

# We print the reshaped x
print()
print('Reshaped x = \n', x)

# reshape is also available as a method, so calls can be chained:
# here arange builds 0..8 and the result is reshaped to 3 x 3 in one expression
Y = np.arange(9).reshape(3,3)
print()
print('Y = \n', Y)
print()

# We create a rank 1 ndarray with 10 numbers evenly spaced between 0 and 50,
# with 50 excluded. We then reshape it to a 5 x 2 ndarray
X = np.linspace(0,50,10, endpoint=False).reshape(5,2)

# We print X
print()
print('X = \n', X)
print()

# We create a 3 x 2 ndarray with random integers in the half-open interval [4, 15).
# No random seed is set in this cell, so the values differ from run to run.
X = np.random.randint(4,15,size=(3,2))

# We print X
print()
print('X = \n', X)
print()

Original x =  [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]

Reshaped x = 
 [[ 0  1  2  3  4]
 [ 5  6  7  8  9]
 [10 11 12 13 14]
 [15 16 17 18 19]]

Y = 
 [[0 1 2]
 [3 4 5]
 [6 7 8]]


X = 
 [[ 0.  5.]
 [10. 15.]
 [20. 25.]
 [30. 35.]
 [40. 45.]]


X = 
 [[11 14]
 [ 4  4]
 [ 6 10]]



In [14]:
# create ndarrays with random numbers that satisfy certain statistical properties
#
# NumPy allows you to create random ndarrays with numbers drawn from various
# probability distributions. The function
# np.random.normal(mean, standard deviation, size=shape), for example, creates
# an ndarray with the given shape that contains random numbers picked from a
# normal (Gaussian) distribution with the given mean and standard deviation.
#
# Let's create a 10 x 10 ndarray of random floating point numbers drawn from a
# normal distribution with a mean (average) of zero and a standard deviation
# of 0.1. The array is kept small so that it can be printed in full.

X = np.random.normal(0, 0.1, size=(10, 10))
print(X)

# print information about X
print('X has dimensions:', X.shape)
print('X is an object of type:', type(X))
print('The elements in X are of type:', X.dtype)
print('The elements in X have a mean of:', X.mean())
print('The maximum value in X is:', X.max())
print('The minimum value in X is:', X.min())
# comparing against 0 gives a boolean array; summing it counts the True entries
print('X has', (X < 0).sum(), 'negative numbers')
print('X has', (X > 0).sum(), 'positive numbers')



[[-0.06347468 -0.08112269  0.08459421 -0.09780561 -0.08384177  0.04072446
  -0.03454816 -0.00988358  0.00820429 -0.00943733]
 [-0.07492392  0.0197714  -0.0528978   0.1091107  -0.11676699 -0.12293378
   0.05063024  0.08695253 -0.08030881 -0.13500324]
 [ 0.04078093 -0.05653333  0.17096031 -0.15813246 -0.13135574 -0.10344093
   0.06131897 -0.07688188 -0.10951096  0.11559493]
 [ 0.1052446  -0.12014729 -0.10351364  0.01018026  0.00953101 -0.1676241
   0.01339895  0.07054145  0.24105565 -0.01965469]
 [ 0.15450269  0.06746988  0.07022623  0.04505787 -0.12373757  0.11764302
   0.00103695 -0.1514922   0.09512082  0.00740852]
 [ 0.03093239  0.04016848  0.03054737 -0.13042133  0.17912598  0.01379863
  -0.03430649  0.14154568  0.14697946  0.00408366]
 [-0.09540897  0.06725478 -0.04280244  0.03363569  0.07538655 -0.02296048
  -0.19356809  0.07841085 -0.08993796 -0.02630227]
 [-0.11316475 -0.13913465  0.06706058 -0.00504972  0.09924033  0.04421298
  -0.01082834 -0.04019501 -0.05423322  0.09250761]
 

In [15]:
# Exercise: using the built-in functions from the previous lesson, create a
# 4 x 4 ndarray that only contains consecutive even numbers from 2 to 32
# (inclusive). Two equivalent constructions are shown.

# arange version: stop is exclusive, so 34 is needed to include 32 (integer result)
X = np.reshape(np.arange(2, 34, 2), (4, 4))
print()
print('X = \n', X, end='\n\n')

# linspace version: 16 evenly spaced points from 2 to 32 inclusive (float result)
Y = np.reshape(np.linspace(2,32,16), (4,4))
print()
print('Y = \n', Y, end='\n\n')


X = 
 [[ 2  4  6  8]
 [10 12 14 16]
 [18 20 22 24]
 [26 28 30 32]]


Y = 
 [[ 2.  4.  6.  8.]
 [10. 12. 14. 16.]
 [18. 20. 22. 24.]
 [26. 28. 30. 32.]]



In [16]:
# Rank 1 ndarray holding the integers 1 through 5
x = np.array([1, 2, 3, 4, 5])

# Show x with a blank line before and after it
print()
print('x = ', x, end='\n\n')

# Positive indices count from the front, starting at 0
print('This is First Element in x:', x[0]) 
print('This is Second Element in x:', x[1])
print('This is Fifth (Last) Element in x:', x[4], end='\n\n')

# Negative indices count from the back, so -1 is the last element;
# these three lines pick the same elements as the three above
print('This is First Element in x:', x[-5])
print('This is Second Element in x:', x[-4])
print('This is Fifth (Last) Element in x:', x[-1])

# A slice selects a range of elements and gives back an ndarray, not a scalar
print('\nAccess a range in the ndarray:', x[0:2])
print('* note the above yielded another ndarray *')


x =  [1 2 3 4 5]

This is First Element in x: 1
This is Second Element in x: 2
This is Fifth (Last) Element in x: 5

This is First Element in x: 1
This is Second Element in x: 2
This is Fifth (Last) Element in x: 5

Access a range in the ndarray: [1 2]
* note the above yielded another ndarray *


In [17]:
# We create a 3 x 3 rank 2 ndarray that contains integers from 1 to 9
X = np.array([[1,2,3],[4,5,6],[7,8,9]])

# We print X before the change
print()
print(X)

# modify in place: rank 2 arrays are indexed as X[row, column], so this
# overwrites the element in the first row, second column (originally 2)
X[0,1] = 15

# We print X again to show the changed element
print()
print(X)


[[1 2 3]
 [4 5 6]
 [7 8 9]]

[[ 1 15  3]
 [ 4  5  6]
 [ 7  8  9]]


In [18]:
"""
DELETE
np.delete(ndarray, indices, axis) function. 
This function returns a new ndarray with the entries at the given indices removed along the specified axis. 
For rank 1 ndarrays the axis keyword is not required. 
For rank 2 ndarrays, axis = 0 is used to select rows, and axis = 1 is used to select columns
"""
# We create a rank 1 ndarray 
x = np.array([1, 2, 3, 4, 5])

# We create a rank 2 ndarray
Y = np.array([[1,2,3],[4,5,6],[7,8,9]])

# We print x
print()
print('Original x = ', x)

# We delete the first and last element of x (indices 0 and 4)
x = np.delete(x, [0,4])

# We print x with the first and last element deleted
print()
print('Modified x = ', x)

# We print Y
print()
print('Original Y = \n', Y)

# We delete the first row of Y (index 0 along axis 0)
w = np.delete(Y, 0, axis=0)

# We delete the first and last column of Y (indices 0 and 2 along axis 1)
v = np.delete(Y, [0,2], axis=1)

# We print w
# (v is computed above but is not printed in this cell)
print()
print('w = \n', w)



Original x =  [1 2 3 4 5]

Modified x =  [2 3 4]

Original Y = 
 [[1 2 3]
 [4 5 6]
 [7 8 9]]

w = 
 [[4 5 6]
 [7 8 9]]


In [19]:
"""
APPEND
We can append values to ndarrays using the np.append(ndarray, elements, axis) function. 
This function appends the given list of elements to ndarray along the specified axis

Notice that when appending rows or columns to rank 2 ndarrays the rows or columns must 
have the correct shape, so as to match the shape of the rank 2 ndarray
"""
# Starting data: a rank 1 ndarray and a 2 x 3 rank 2 ndarray
x = np.array([1, 2, 3, 4, 5])
Y = np.array([[1,2,3],[4,5,6]])

# Every printed item below is followed by one blank line (end='\n\n'),
# and the output starts with a blank line
print()
print('Original x = ', x, end='\n\n')

# Append a single value (the integer 6) to x
x = np.append(x, 6)
print('x = ', x, end='\n\n')

# Append several values at once (the integers 7 and 8) to x
x = np.append(x, [7,8])
print('x = ', x, end='\n\n')

print('Original Y = \n', Y, end='\n\n')

# Append a new row containing 7,8,9: a 1 x 3 block stacked along axis 0
v = np.append(Y, [[7,8,9]], axis=0)

# Append a new column containing 9 and 10: a 2 x 1 block added along axis 1
q = np.append(Y,[[9],[10]], axis=1)

print('v = \n', v, end='\n\n')
print('q = \n', q)



Original x =  [1 2 3 4 5]

x =  [1 2 3 4 5 6]

x =  [1 2 3 4 5 6 7 8]

Original Y = 
 [[1 2 3]
 [4 5 6]]

v = 
 [[1 2 3]
 [4 5 6]
 [7 8 9]]

q = 
 [[ 1  2  3  9]
 [ 4  5  6 10]]


In [20]:
"""
INSERT
Insert values to ndarrays using the np.insert(ndarray, index, elements, axis) function. 
This function inserts the given list of elements to ndarray right before the given index 
along the specified axis.
"""
# Starting data: a rank 1 ndarray with a gap (3 and 4 missing) and a
# 2 x 3 rank 2 ndarray with a missing middle row
x = np.array([1, 2, 5, 6, 7])
Y = np.array([[1,2,3],[7,8,9]])

# Every printed item below is followed by one blank line (end='\n\n'),
# and the output starts with a blank line
print()
print('Original x = ', x, end='\n\n')

# Insert the integers 3 and 4 before index 2, i.e. between 2 and 5
x = np.insert(x,2,[3,4])
print('x = ', x, end='\n\n')

print('Original Y = \n', Y, end='\n\n')

# Insert a row before row index 1, i.e. between the first and last row of Y
w = np.insert(Y,1,[4,5,6],axis=0)

# Insert a column full of 5s before column index 1, i.e. between the first
# and second column of Y (the scalar 5 is repeated for every row)
v = np.insert(Y,1,5, axis=1)

print('w = \n', w, end='\n\n')
print('v = \n', v)


Original x =  [1 2 5 6 7]

x =  [1 2 3 4 5 6 7]

Original Y = 
 [[1 2 3]
 [7 8 9]]

w = 
 [[1 2 3]
 [4 5 6]
 [7 8 9]]

v = 
 [[1 5 2 3]
 [7 5 8 9]]


In [21]:
"""
STACK
Stack on top of each other (np.vstack) or side-by-side (np.hstack)

* the shape of the ndarrays must match *
"""
# We create a rank 1 ndarray 
x = np.array([1,2])

# We create a rank 2 ndarray 
Y = np.array([[3,4],[5,6]])

# We print x
print()
print('x = ', x)

# We print Y
print()
print('Y = \n', Y)

# We stack x on top of Y: x has 2 elements, matching the 2 columns of Y
z = np.vstack((x, Y))

# We stack x on the right of Y. We need to reshape x in order to stack it on the right of Y:
# reshape(2,1) turns x into a 2 x 1 column so its row count matches the 2 rows of Y
w = np.hstack((Y, x.reshape(2,1)))

# We print z
print()
print('z = \n', z)

# We print w
print()
print('w = \n', w)


x =  [1 2]

Y = 
 [[3 4]
 [5 6]]

z = 
 [[1 2]
 [3 4]
 [5 6]]

w = 
 [[3 4 1]
 [5 6 2]]


### Slicing ndArrays

**1) ndarray[start:end]**  
**2) ndarray[start:]**  
**3) ndarray[:end]**  


The first method is used to select elements between the start and end indices. 

The second method is used to select all elements from the start index till the last index. 

The third method is used to select all elements from the first index till the end index. 

_Note that in methods one and three, the end index is excluded._ 

We should also note that since ndarrays can be multidimensional, when doing slicing you usually have to specify a slice for each dimension of the array.

In [22]:
# We create a 4 x 5 ndarray that contains integers from 0 to 19
X = np.arange(20).reshape(4, 5)

# We print X
print()
print('X = \n', X)
print()

# We select all the elements that are in the 2nd through 4th rows and in the 3rd to 5th columns
# (the end index of each slice is excluded, so 1:4 covers row indices 1, 2 and 3)
Z = X[1:4,2:5]

# We print Z
print('Z = \n', Z)

# We can select the same elements as above using method 2
# (1: runs from row index 1 through the last row)
W = X[1:,2:5]

# We print W
print()
print('W = \n', W)

# We select all the elements that are in the 1st through 3rd rows and in the 3rd to 5th columns
Y = X[:3,2:5]

# We print Y
print()
print('Y = \n', Y)

# We select all the elements in the 3rd row
v = X[2,:]

# We print v
print()
print('v = ', v)

# We select all the elements in the 3rd column (an integer index gives a rank 1 result)
q = X[:,2]

# We print q
print()
print('q = ', q)

# We select all the elements in the 3rd column but return a rank 2 ndarray
# (the slice 2:3 keeps the column axis, giving a 4 x 1 result)
R = X[:,2:3]

# We print R
print()
print('R = \n', R)


X = 
 [[ 0  1  2  3  4]
 [ 5  6  7  8  9]
 [10 11 12 13 14]
 [15 16 17 18 19]]

Z = 
 [[ 7  8  9]
 [12 13 14]
 [17 18 19]]

W = 
 [[ 7  8  9]
 [12 13 14]
 [17 18 19]]

Y = 
 [[ 2  3  4]
 [ 7  8  9]
 [12 13 14]]

v =  [10 11 12 13 14]

q =  [ 2  7 12 17]

R = 
 [[ 2]
 [ 7]
 [12]
 [17]]


### Slicing and Copying
It is important to note that when we _perform slices_ on ndarrays and save them into new variables, as we did above, the _data is not copied into the new variable_. This is one feature that often causes confusion for beginners. Therefore, we will look at this in a bit more detail.

In the above examples, when we make assignments, such as:

Z = X[1:4,2:5]

the slice of the original array X is not copied in the variable Z. Rather, **Z is a view onto part of X: the two arrays have different shapes but share the same underlying data**. 

We say that _slicing only creates a different view of the original array_. This means that if you make changes in Z you will be in effect changing the elements in X as well. 

**Copy**

If you want to create a new ndarray that contains a copy of the values in the slice we need to use the **np.copy()** function. The np.copy(ndarray) function creates a copy of the given ndarray. This function can also be used as a method, in the same way as we did before with the reshape function.

create a copy of the slice using the np.copy() function

Z = np.copy(X[1:4,2:5])

create a copy of the slice using the copy as a method

W = X[1:4,2:5].copy()


### Boolean Indexing, Set Operations, and Sorting

**Boolean Indexing**
Allows us to select elements using logical arguments instead of explicit indices.

X = np.arange(25).reshape(5, 5)

print('The elements in X that are between 10 and 17:', X[(X > 10) & (X < 17)])

We use Boolean indexing to assign the elements that are between 10 and 17 the value of -1

X[(X > 10) & (X < 17)] = -1



In [23]:
# 5 x 5 ndarray holding the integers 0 through 24
X = np.arange(25).reshape(5, 5)
print()
print('X = \n', X, end='\n\n')

# Boolean mask that is True where an element is strictly between 10 and 17
between_10_and_17 = (X > 10) & (X < 17)

# Indexing with the mask returns the selected elements as a rank 1 ndarray
print('The elements in X that are between 10 and 17:', X[between_10_and_17], end='\n\n')

# Assigning through the mask overwrites exactly those elements in place
X[between_10_and_17] = -1
print('X = \n', X, end='\n\n')



X = 
 [[ 0  1  2  3  4]
 [ 5  6  7  8  9]
 [10 11 12 13 14]
 [15 16 17 18 19]
 [20 21 22 23 24]]

The elements in X that are between 10 and 17: [11 12 13 14 15 16]

X = 
 [[ 0  1  2  3  4]
 [ 5  6  7  8  9]
 [10 -1 -1 -1 -1]
 [-1 -1 17 18 19]
 [20 21 22 23 24]]



### Set Operations

**numpy.intersect1d(ar1, ar2, assume_unique=False, return_indices=False)**

    Find the intersection of two arrays.

    Return the sorted, unique values that are in both of the input arrays.

**numpy.setdiff1d(ar1, ar2, assume_unique=False)**

    Find the set difference of two arrays.

    Return the unique values in ar1 that are not in ar2.

**numpy.union1d(ar1, ar2)**

    Find the union of two arrays.

    Return the unique, sorted array of values that are in either of the two input arrays.


In [24]:
# Two rank 1 ndarrays that share some values (2 and 4)
x = np.array([1,2,3,4,5])
y = np.array([6,7,2,8,4])

# Show both inputs, each followed by a blank line
print()
print('x = ', x, end='\n\n')
print('y = ', y, end='\n\n')

# Compare x and y with set operations; each result is a sorted array of unique values
print('The elements that are both in x and y:: np.intersect1d(x,y):', np.intersect1d(x,y))
print('The elements that are in x that are not in y:: np.setdiff1d(x,y):', np.setdiff1d(x,y))
print('All the elements of x and y:: np.union1d(x,y):',np.union1d(x,y))


x =  [1 2 3 4 5]

y =  [6 7 2 8 4]

The elements that are both in x and y:: np.intersect1d(x,y): [2 4]
The elements that are in x that are not in y:: np.setdiff1d(x,y): [1 3 5]
All the elements of x and y:: np.union1d(x,y): [1 2 3 4 5 6 7 8]


In [25]:
# Exercise: create a 5 x 5 ndarray with consecutive integers from 1 to 25
# (inclusive), then use Boolean indexing to pick out only the odd numbers.

# arange's stop value is exclusive, so 26 is needed to include 25
X = np.arange(1, 26).reshape(5, 5)
print()
print('X = ', X, end='\n\n')

# An integer is odd when it leaves a non-zero remainder on division by 2;
# indexing with that Boolean mask returns the odd values as a rank 1 ndarray
Y = X[X % 2 != 0]
print('Y = ', Y)



X =  [[ 1  2  3  4  5]
 [ 6  7  8  9 10]
 [11 12 13 14 15]
 [16 17 18 19 20]
 [21 22 23 24 25]]

Y =  [ 1  3  5  7  9 11 13 15 17 19 21 23 25]


### Arithmetic Operations and Broadcasting

#### Element wise and matrix operations

[Broadcasting](https://docs.scipy.org/doc/numpy-1.13.0/user/basics.broadcasting.html) is the term used to describe how NumPy handles element-wise arithmetic operations with ndarrays of different shapes.

NumPy provides a functional approach, where we use functions such as np.add(), or by using arithmetic symbols, such as +, that resembles more how we write mathematical equations. Both forms will do the same operation, the only difference is that if you use the function approach, the functions usually have options that you can tweak using keywords.




In [26]:
# Two rank 1 ndarrays of equal length: one of integers, one of floats
x = np.array([1,2,3,4])
y = np.array([5.5,6.5,7.5,8.5])

# Show both inputs, each followed by a blank line
print()
print('x = ', x, end='\n\n')
print('y = ', y, end='\n\n')

# We perform basic element-wise operations twice: once with the arithmetic
# symbol and once with the equivalent NumPy function. Each pair prints the
# same result and is followed by a blank line (except the last pair).
print('x + y = ', x + y)
print('add(x,y) = ', np.add(x,y), end='\n\n')

print('x - y = ', x - y)
print('subtract(x,y) = ', np.subtract(x,y), end='\n\n')

print('x * y = ', x * y)
print('multiply(x,y) = ', np.multiply(x,y), end='\n\n')

print('x / y = ', x / y)
print('divide(x,y) = ', np.divide(x,y))


x =  [1 2 3 4]

y =  [5.5 6.5 7.5 8.5]

x + y =  [ 6.5  8.5 10.5 12.5]
add(x,y) =  [ 6.5  8.5 10.5 12.5]

x - y =  [-4.5 -4.5 -4.5 -4.5]
subtract(x,y) =  [-4.5 -4.5 -4.5 -4.5]

x * y =  [ 5.5 13.  22.5 34. ]
multiply(x,y) =  [ 5.5 13.  22.5 34. ]

x / y =  [0.18181818 0.30769231 0.4        0.47058824]
divide(x,y) =  [0.18181818 0.30769231 0.4        0.47058824]


#### NumPy has a wide variety of statistical functions. Statistical functions provide us with statistical information about the elements in an ndarray.

In [27]:
# We create a 2 x 2 ndarray
X = np.array([[1,2], [3,4]])

# We print X
print()
print('X = \n', X)
print()

# For every statistic below:
#   - no axis argument: the statistic is computed over all elements of X
#   - axis=0: computed down each column, giving one value per column
#   - axis=1: computed across each row, giving one value per row
print('Average of all elements in X:', X.mean())
print('Average of all elements in the columns of X:', X.mean(axis=0))
print('Average of all elements in the rows of X:', X.mean(axis=1))
print()
print('Sum of all elements in X:', X.sum())
print('Sum of all elements in the columns of X:', X.sum(axis=0))
print('Sum of all elements in the rows of X:', X.sum(axis=1))
print()
print('Standard Deviation of all elements in X:', X.std())
print('Standard Deviation of all elements in the columns of X:', X.std(axis=0))
print('Standard Deviation of all elements in the rows of X:', X.std(axis=1))
print()
# median is called here as a NumPy function (np.median) rather than as an ndarray method
print('Median of all elements in X:', np.median(X))
print('Median of all elements in the columns of X:', np.median(X,axis=0))
print('Median of all elements in the rows of X:', np.median(X,axis=1))
print()
print('Maximum value of all elements in X:', X.max())
print('Maximum value of all elements in the columns of X:', X.max(axis=0))
print('Maximum value of all elements in the rows of X:', X.max(axis=1))
print()
print('Minimum value of all elements in X:', X.min())
print('Minimum value of all elements in the columns of X:', X.min(axis=0))
print('Minimum value of all elements in the rows of X:', X.min(axis=1))


X = 
 [[1 2]
 [3 4]]

Average of all elements in X: 2.5
Average of all elements in the columns of X: [2. 3.]
Average of all elements in the rows of X: [1.5 3.5]

Sum of all elements in X: 10
Sum of all elements in the columns of X: [4 6]
Sum of all elements in the rows of X: [3 7]

Standard Deviation of all elements in X: 1.118033988749895
Standard Deviation of all elements in the columns of X: [1. 1.]
Standard Deviation of all elements in the rows of X: [0.5 0.5]

Median of all elements in X: 2.5
Median of all elements in the columns of X: [2. 3.]
Median of all elements in the rows of X: [1.5 3.5]

Maximum value of all elements in X: 4
Maximum value of all elements in the columns of X: [3 4]
Maximum value of all elements in the rows of X: [2 4]

Minimum value of all elements in X: 1
Minimum value of all elements in the columns of X: [1 2]
Minimum value of all elements in the rows of X: [1 3]


Use Broadcasting to create a 4 x 4 ndarray that has its first column full of 1s, its second column full of 2s, its third column full of 3s, etc.

The `np.arange` function is useful here: it returns evenly spaced values within a given interval.

Values are generated within the half-open interval \[start, stop) (in other words, the interval including start but excluding stop). For integer arguments the function is equivalent to the Python built-in range function, but returns an ndarray rather than a list.

[arange](https://docs.scipy.org/doc/numpy/reference/generated/numpy.arange.html)


In [28]:
import numpy as np

# Exercise: use Broadcasting to create a 4 x 4 ndarray that has its first
# column full of 1s, its second column full of 2s, its third column full
# of 3s, etc.
#
# Multiplying a 4 x 4 array of integer ones by the rank 1 array [1 2 3 4]
# broadcasts that array across every row, so column j ends up holding j + 1.

X = np.ones((4, 4), dtype=int) * np.arange(1, 5)

# Show the rank 1 array that was broadcast, a blank line, then the result
print(np.arange(1,5), end='\n\n')
print(X)

[1 2 3 4]

[[1 2 3 4]
 [1 2 3 4]
 [1 2 3 4]
 [1 2 3 4]]


## Pandas Series

In [29]:
import pandas as pd

# A Series pairs each value with an index label: here, a count for each fruit
fruit_labels = ['apples', 'oranges', 'kiwi']
fruits = pd.Series(data=[6,3, 5], index=fruit_labels)

In [30]:
# Show the labels and the Series, each followed by a blank line
print(fruit_labels, end='\n\n')
print(fruits, end='\n\n')

# Arithmetic on a Series is element-wise: every count is increased by 2
# and the index labels are kept
fruits = fruits.add(2)

print(fruits)

['apples', 'oranges', 'kiwi']

apples     6
oranges    3
kiwi       5
dtype: int64

apples     8
oranges    5
kiwi       7
dtype: int64


In [31]:
# Exercise: create a Pandas Series that contains the distance of some planets
# from the Sun, using the planet names as the index and the distances as the
# data. The distance from the Sun is in units of 10^6 km.

distance_from_sun = [149.6, 1433.5, 227.9, 108.2, 778.6]

planets = ['Earth','Saturn', 'Mars','Venus', 'Jupiter']

# Series with the planet names as index labels and the distances as values
dist_planets = pd.Series(distance_from_sun, planets)

print()
print('dist_planets = \n', dist_planets)

# Number of minutes sunlight takes to reach each planet: distance divided by
# the speed of light. Distances are in units of 10^6 km, so the speed of
# light is expressed in the same units: light travels 18 x 10^6 km/minute.
speed_of_light = 18
time_light = dist_planets.div(speed_of_light)

print()
print('time_light = \n', time_light)

# Boolean indexing: keep only the planets that sunlight reaches in less
# than 40 minutes
close_planets = time_light[time_light.lt(40)]

print('\nPlanets closest to sun - time of sunlight to reach them being less than 40 minutes', close_planets, sep='\n')


dist_planets = 
 Earth       149.6
Saturn     1433.5
Mars        227.9
Venus       108.2
Jupiter     778.6
dtype: float64

time_light = 
 Earth       8.311111
Saturn     79.638889
Mars       12.661111
Venus       6.011111
Jupiter    43.255556
dtype: float64

Planets closest to sun - time of sunlight to reach them being less than 40 minutes
Earth     8.311111
Mars     12.661111
Venus     6.011111
dtype: float64


## Pandas DataFrame

In [32]:
# We import Pandas as pd into Python
import pandas as pd

# One Series per shopper: the values are item prices and the index labels are
# the item names (without index labels, integer indexes would be used instead)
bob_cart = pd.Series(data = [245, 25, 55], index = ['bike', 'pants', 'watch'])
alice_cart = pd.Series(data = [40, 110, 500, 45], index = ['book', 'glasses', 'bike', 'pants'])

# A dictionary of Series: each key becomes a column label of the DataFrame
items = {'Bob' : bob_cart, 'Alice' : alice_cart}

# items itself is a plain Python dictionary
print('items is of type', type(items), end='\n\n')

# Build the DataFrame; rows are the union of the item labels, and an item a
# shopper does not have shows up as NaN
shopping_carts = pd.DataFrame(items)

print(shopping_carts, end='\n\n')

# Basic facts about shopping_carts
print('shopping_carts has shape:', shopping_carts.shape)
print('shopping_carts has dimension:', shopping_carts.ndim)
print('shopping_carts has a total of:', shopping_carts.size, 'elements', end='\n\n')

# The three parts of a DataFrame: the data, the row index and the column index
print('The data in shopping_carts is:\n', shopping_carts.values, end='\n\n')
print('The row index in shopping_carts is:', shopping_carts.index, end='\n\n')
print('The column index in shopping_carts is:', shopping_carts.columns)

items is of type <class 'dict'>

           Bob  Alice
bike     245.0  500.0
book       NaN   40.0
glasses    NaN  110.0
pants     25.0   45.0
watch     55.0    NaN

shopping_carts has shape: (5, 2)
shopping_carts has dimension: 2
shopping_carts has a total of: 10 elements

The data in shopping_carts is:
 [[245. 500.]
 [ nan  40.]
 [ nan 110.]
 [ 25.  45.]
 [ 55.  nan]]

The row index in shopping_carts is: Index(['bike', 'book', 'glasses', 'pants', 'watch'], dtype='object')

The column index in shopping_carts is: Index(['Bob', 'Alice'], dtype='object')


In [33]:
# A list of Python dictionaries: each dictionary becomes one row and its keys
# become column labels
items2 = [{'bikes': 20, 'pants': 30, 'watches': 35}, 
          {'watches': 10, 'glasses': 50, 'bikes': 15, 'pants':5}]

# Build the DataFrame and label the two rows
store_items = pd.DataFrame(items2, index = ['store 1', 'store 2'])

# Show the whole DataFrame; each printed item below is followed by a blank line
print(store_items, end='\n\n')

# Access rows, columns and elements using labels:
# a list of column labels selects columns, .loc with a list of row labels
# selects rows, and [column][row] selects a single element
print('How many bikes are in each store:\n', store_items[['bikes']], end='\n\n')
print('How many bikes and pants are in each store:\n', store_items[['bikes', 'pants']], end='\n\n')
print('What items are in Store 1:\n', store_items.loc[['store 1']], end='\n\n')
print('How many bikes are in Store 2:', store_items['bikes']['store 2'])

         bikes  pants  watches  glasses
store 1     20     30       35      NaN
store 2     15      5       10     50.0

How many bikes are in each store:
          bikes
store 1     20
store 2     15

How many bikes and pants are in each store:
          bikes  pants
store 1     20     30
store 2     15      5

What items are in Store 1:
          bikes  pants  watches  glasses
store 1     20     30       35      NaN

How many bikes are in Store 2: 15


---
### Note when accessing individual elements in a DataFrame, as we did in the last example above, the labels should always be provided with the column label first, i.e. in the form dataframe[column][row]

It is also possible to insert new columns into a DataFrame anywhere we want. The **dataframe.insert(loc,label,data)** method allows us to insert a new column in the dataframe at location loc, with the given column label, and given data. 

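Add a new column named shoes at numerical position 4. In the original lesson this places shoes right before the suits column, which has numerical index 4; the store_items DataFrame built in this notebook has only four columns (indices 0 to 3), so using 4 as loc adds shoes as the last column.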

In [34]:
# We create a list holding one Python dictionary with the number of items at the new store
new_items = [{'bikes': 20, 'pants': 30, 'watches': 35, 'glasses': 4}]

# We create a new DataFrame from new_items and give its single row the index label store 3
new_store = pd.DataFrame(new_items, index = ['store 3'])

# We display the items at the new store
print(new_store)

# We append store 3 to our store_items DataFrame.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat stacks the rows of the two DataFrames in the same way and works
# on both old and new pandas versions.
store_items = pd.concat([store_items, new_store])

# We display the modified DataFrame
store_items

         bikes  pants  watches  glasses
store 3     20     30       35        4


Unnamed: 0,bikes,pants,watches,glasses
store 1,20,30,35,
store 2,15,5,10,50.0
store 3,20,30,35,4.0


In [35]:
# Insert a new column labelled shoes at column position 4, with one value per
# store. store_items has four columns at this point (positions 0 to 3), so
# position 4 places shoes after the last existing column.
store_items.insert(loc=4, column='shoes', value=[8,5, 2])

# Display the modified DataFrame
store_items

Unnamed: 0,bikes,pants,watches,glasses,shoes
store 1,20,30,35,,8
store 2,15,5,10,50.0,5
store 3,20,30,35,4.0,2


### Dealing with NaN - clean up the data

In Pandas, logical True values have numerical value 1 and logical False values have numerical value 0.

**Eliminate rows or columns from our DataFrame that contain any NaN values.**

* The .dropna(axis) method eliminates any rows with NaN values when axis = 0 is used and will eliminate any columns with NaN values when axis = 1 is used.


**Replace all NaN values with zero**
* store_items.fillna(0)

**Can also forward fill (ffill) or back fill (backfill) from values contained in DF**
* We replace NaN values with the previous value in the column
  * store_items.fillna(method = 'ffill', axis = 0)
* We replace NaN values with the previous value in the row
  * store_items.fillna(method = 'ffill', axis = 1)
* We replace NaN values with the next value in the column
  * store_items.fillna(method = 'backfill', axis = 0)
* We replace NaN values with the next value in the row
  * store_items.fillna(method = 'backfill', axis = 1) 

**Interpolate**
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.interpolate.html

---
**Mean**
* https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.mean.html
* Use with fillna() -- see below

---
**Moving Average**
* https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.rolling.html
* https://www.datacamp.com/community/tutorials/moving-averages-in-pandas



In [43]:
# A list of Python dictionaries, one per store; items missing from a
# dictionary become NaN in that store's row
items2 = [{'bikes': 20, 'pants': 30, 'watches': 35, 'shirts': 15, 'shoes':8, 'suits':45},
{'watches': 10, 'glasses': 50, 'bikes': 15, 'pants':5, 'shirts': 2, 'shoes':5, 'suits':7},
{'bikes': 20, 'pants': 30, 'watches': 35, 'glasses': 4, 'shoes':10}]

# Build the DataFrame and provide the row index
store_items = pd.DataFrame(items2, index = ['store 1', 'store 2', 'store 3'])

# Boolean DataFrame of the same shape: True wherever store_items holds NaN
nan_mask = store_items.isnull()

# Show, each followed by a blank line: the data, the NaN mask, and the NaN
# count per column (True counts as 1 when summed)
print()
print(store_items, end='\n\n')
print(nan_mask, end='\n\n')
print(nan_mask.sum(), end='\n\n')

# Summing the per-column counts gives the total for the whole DataFrame
print('Total number of NaN values in our DataFrame:', nan_mask.sum().sum())


         bikes  pants  watches  shirts  shoes  suits  glasses
store 1     20     30       35    15.0      8   45.0      NaN
store 2     15      5       10     2.0      5    7.0     50.0
store 3     20     30       35     NaN     10    NaN      4.0

         bikes  pants  watches  shirts  shoes  suits  glasses
store 1  False  False    False   False  False  False     True
store 2  False  False    False   False  False  False    False
store 3  False  False    False    True  False   True    False

bikes      0
pants      0
watches    0
shirts     1
shoes      0
suits      1
glasses    1
dtype: int64

Total number of NaN values in our DataFrame: 3


In [47]:
import pandas as pd
import numpy as np

# Since we will be working with ratings, we will set the precision of our
# dataframes to one decimal place.
# NOTE: the option must be spelled out as 'display.precision'. The bare
# pattern 'precision' also matches 'styler.format.precision' in newer pandas
# releases and raises an OptionError because the key is ambiguous.
pd.set_option('display.precision', 1)

# Create a Pandas DataFrame that contains the ratings some users have given to a
# series of books. The ratings given are in the range from 1 to 5, with 5 being
# the best score. The names of the books, the authors, and the ratings of each user
# are given below:

books = pd.Series(data = ['Great Expectations', 'Of Mice and Men', 'Romeo and Juliet', 
                          'The Time Machine', 'Alice in Wonderland' ])

authors = pd.Series(data = ['Charles Dickens', 'John Steinbeck', 'William Shakespeare', 
                            ' H. G. Wells', 'Lewis Carroll' ])

# The user Series are deliberately shorter than the book list: when the
# DataFrame is built, rows with no entry for a user are filled with NaN.
user_1 = pd.Series(data = [3.2, np.nan ,2.5])
user_2 = pd.Series(data = [5., 1.3, 4.0, 3.8])
user_3 = pd.Series(data = [2.0, 2.3, np.nan, 4])
user_4 = pd.Series(data = [4, 3.5, 4, 5, 4.2])

# Users that have np.nan values means that the user has not yet rated that book.
# Use the data above to create a Pandas DataFrame that has the following column
# labels: 'Author', 'Book Title', 'User 1', 'User 2', 'User 3', 'User 4'. Let Pandas
# automatically assign numerical row indices to the DataFrame. 

# Create a dictionary with the data given above
dat = {'Author': authors,
       'Book Title': books,
       'User 1': user_1,
       'User 2': user_2,
       'User 3': user_3,
       'User 4': user_4
      }

# Use the dictionary to create a Pandas DataFrame
book_ratings = pd.DataFrame(dat)

print('\n book_ratings:')
print(book_ratings)

# If you created the dictionary correctly you should have a Pandas DataFrame
# that has column labels: 'Author', 'Book Title', 'User 1', 'User 2', 'User 3',
# 'User 4' and row indices 0 through 4.

# Now replace all the NaN values in your DataFrame with the average rating in
# each column. Replace the NaN values in place. HINT: you can use the fillna()
# function with the keyword inplace = True, to do this. Write your code below:

# Fills the NaN with the mean of the appropriate column.
# numeric_only = True restricts the mean to the rating columns; without it,
# newer pandas tries to average the 'Author' and 'Book Title' string columns
# and raises a TypeError. The resulting Series is indexed by the 'User N'
# column labels, so fillna() only touches those columns.
book_ratings.fillna(book_ratings.mean(numeric_only = True), inplace = True)

print('\n book_ratings:')
print(book_ratings)


 book_ratings:
                Author           Book Title  User 1  User 2  User 3  User 4
0      Charles Dickens   Great Expectations     3.2     5.0     2.0     4.0
1       John Steinbeck      Of Mice and Men     NaN     1.3     2.3     3.5
2  William Shakespeare     Romeo and Juliet     2.5     4.0     NaN     4.0
3          H. G. Wells     The Time Machine     NaN     3.8     4.0     5.0
4        Lewis Carroll  Alice in Wonderland     NaN     NaN     NaN     4.2

 book_ratings:
                Author           Book Title  User 1  User 2  User 3  User 4
0      Charles Dickens   Great Expectations     3.2     5.0     2.0     4.0
1       John Steinbeck      Of Mice and Men     2.9     1.3     2.3     3.5
2  William Shakespeare     Romeo and Juliet     2.5     4.0     2.8     4.0
3          H. G. Wells     The Time Machine     2.9     3.8     4.0     5.0
4        Lewis Carroll  Alice in Wonderland     2.9     3.5     2.8     4.2


### Loading Data

**We can load CSV files into Pandas DataFrames using the pd.read_csv() function**
* https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html

**Can also load data from Excel**
* https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_excel.html

**Check for any nulls in a dataset**
* .isnull().any()

**Get descriptive statistics**
* .describe()

**Correlation**
* Checks whether the data in different columns are correlated
* .corr()

**GROUPBY**
* https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.groupby.html
* A groupby operation involves some combination of splitting the object, applying a function, and combining the results. 
* This can be used to group large amounts of data and compute operations on these groups.
