# The Numpy Package

## 1: Numpy Array Basics

In [1]:
import numpy as np

In [2]:
L = list(range(1, 11)) #Creates a list L

In [3]:
L

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

In [4]:
type(L)

list

In [5]:
#Transform the list L into a numpy array - (ndarray)
my_array = np.array(L)

In [6]:
my_array

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10])

In [7]:
type(my_array)

numpy.ndarray

In [8]:
#ndarrays are iterable, just like lists
for i in my_array:
    print(i)

1
2
3
4
5
6
7
8
9
10


In [9]:
#Lists can store different data types
L = [1, 2.5, "Dog", True]

In [10]:
for i in L:
    print(type(i))

<class 'int'>
<class 'float'>
<class 'str'>
<class 'bool'>


In [11]:
#In an ndarray, all elements must have the same data type; numpy converts them automatically
a = np.array(L)

In [12]:
a

array(['1', '2.5', 'Dog', 'True'], dtype='<U32')

In [13]:
for i in a:
    print(type(i))

<class 'numpy.str_'>
<class 'numpy.str_'>
<class 'numpy.str_'>
<class 'numpy.str_'>


In [14]:
b = np.array([1, 2, 3])

In [15]:
type(b)

numpy.ndarray

In [16]:
#We can check the single data type that b contains using the dtype attribute
b.dtype

dtype('int64')

## 1.2 Numpy Array (Element-wise Operations / Vectorization )

In [17]:
#np.arange(start(incl), end(excl), step)
np.arange(1, 11)

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10])

In [18]:
np.arange(1, 11, 2) #Skips every second element

array([1, 3, 5, 7, 9])

In [19]:
L = [1, 2, 3, 4]

In [20]:
L * 2 #NB This is not an element-wise operation

[1, 2, 3, 4, 1, 2, 3, 4]

In [21]:
#Element-wise operations with a list require looping through the list
l1 = []
for i in L:
    l1.append(i * 2)
    
print(l1)

[2, 4, 6, 8]


In [22]:
#Create an ndarray from 1-4
a = np.arange(1, 5)

In [23]:
a

array([1, 2, 3, 4])

In [24]:
#Element-wise or Vectorized operations are pretty simple in numpy
a * 2 #Vectorized multiplication by 2

array([2, 4, 6, 8])

In [25]:
#Addition also works
a + 2

array([3, 4, 5, 6])

In [26]:
#Squaring also works
a ** 2

array([ 1,  4,  9, 16])

In [27]:
#a can also work as an exponent
2 ** a 

array([ 2,  4,  8, 16])

In [28]:
#Exponentiation with e
np.exp(a)

array([ 2.71828183,  7.3890561 , 20.08553692, 54.59815003])

In [29]:
#Natural logarithm
np.log(a)

array([0.        , 0.69314718, 1.09861229, 1.38629436])

In [30]:
#sum all elements - ndarray
a.sum()

10

In [31]:
np.sum(a)

10

In [32]:
sum(a) #pythonic

10

In [33]:
#Number of elements in ndarray
a.size

4

In [34]:
len(a) #pythonic

4

In [35]:
b = np.array([-2, -1, 0, 0.5, 1, 2, 3])

In [36]:
#Absolute values
np.abs(b)

array([2. , 1. , 0. , 0.5, 1. , 2. , 3. ])

In [37]:
c = np.array([-1.7, -1.5, -0.2, 0.2, 1.7, 1.5, 2.0, 3.5])

In [38]:
#Ceiling
np.ceil(c)

array([-1., -1., -0.,  1.,  2.,  2.,  2.,  4.])

In [39]:
#Flooring
np.floor(c)

array([-2., -2., -1.,  0.,  1.,  1.,  2.,  3.])

In [40]:
#Evenly round all elements to a given number of decimals
np.around([-1.22, -1.58, -3.99, 1.22, 1.58, 3.99, 7.64, 8.55], decimals=0)

array([-1., -2., -4.,  1.,  2.,  4.,  8.,  9.])

# Numpy Array Indexing and Slicing

In [41]:
d = np.arange(1, 11)

In [42]:
d

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10])

In [43]:
#Zero-based indexing
d[0] #first element

1

In [44]:
#2nd element
d[1]

2

In [45]:
#last element
d[-1]

10

In [46]:
#2nd last element
d[-2]

9

In [47]:
#A list of (index, value) tuples
list(enumerate(d))

[(0, 1),
 (1, 2),
 (2, 3),
 (3, 4),
 (4, 5),
 (5, 6),
 (6, 7),
 (7, 8),
 (8, 9),
 (9, 10)]

In [48]:
#Slicing from index 2(incl) to 6(excl)
d[2:6]

array([3, 4, 5, 6])

In [49]:
#Slicing All elements
d[:]

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10])

In [50]:
#Slicing from index 0(incl) to 5(excl)
d[:5]

array([1, 2, 3, 4, 5])

In [51]:
#Slicing starting from index 6(incl) to the last index
d[6:]

array([ 7,  8,  9, 10])

In [52]:
#Taking every 2nd element - array[start : end : step]
d[::2]

array([1, 3, 5, 7, 9])

In [53]:
#Taking every 3rd element
d[::3]

array([ 1,  4,  7, 10])

In [54]:
#Take every 3rd element starting from index position 2
d[2::3]

array([3, 6, 9])

In [55]:
#ndarrays are mutable
d[0] = 100 #Changes the value of element at index position zero to 100

In [56]:
d

array([100,   2,   3,   4,   5,   6,   7,   8,   9,  10])

In [57]:
#In contrast to lists, ndarrays allow broadcasting - assigning a single value to every element of a slice
d[1:4] = 101 #Assigns all value 101

In [58]:
d

array([100, 101, 101, 101,   5,   6,   7,   8,   9,  10])

In [59]:
d[4:7] = [102, 103, 104] #From index pos 4(incl) to 6 (incl) assign values 102, 103, 104 respectively

In [60]:
d

array([100, 101, 101, 101, 102, 103, 104,   8,   9,  10])

# Numpy Array - ( Shape and Multiple Dimensions )

In [61]:
#Create a one dimensional array from (1 - 12)
e = np.arange(1, 13)

In [62]:
e

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12])

In [63]:
type(e)

numpy.ndarray

In [64]:
e.shape #One dimensional ndarray - has one row

(12,)

In [65]:
#Reshaping to a 2 by 6 - 2 rows and 6 columns
e = e.reshape(2, 6)

In [66]:
e

array([[ 1,  2,  3,  4,  5,  6],
       [ 7,  8,  9, 10, 11, 12]])

In [67]:
e.shape #Two dimensional ndarray

(2, 6)

In [68]:
#Reshaping 6 rows / 2 columns
e = e.reshape(6, 2)

In [69]:
e

array([[ 1,  2],
       [ 3,  4],
       [ 5,  6],
       [ 7,  8],
       [ 9, 10],
       [11, 12]])

In [70]:
e.shape #Also a two dimensional ndarray

(6, 2)

In [71]:
type(e)

numpy.ndarray

In [72]:
#Vectorized operations still work
e + 100

array([[101, 102],
       [103, 104],
       [105, 106],
       [107, 108],
       [109, 110],
       [111, 112]])

In [73]:
#Reshaping to three dimensional
e = e.reshape(2, 2, 3)

In [74]:
e

array([[[ 1,  2,  3],
        [ 4,  5,  6]],

       [[ 7,  8,  9],
        [10, 11, 12]]])

In [75]:
e.shape

(2, 2, 3)

In [76]:
type(e)

numpy.ndarray

In [77]:
#Creating a two dimensional ndarray in one line of code
f = np.arange(1, 101).reshape(25, 4)

In [78]:
f

array([[  1,   2,   3,   4],
       [  5,   6,   7,   8],
       [  9,  10,  11,  12],
       [ 13,  14,  15,  16],
       [ 17,  18,  19,  20],
       [ 21,  22,  23,  24],
       [ 25,  26,  27,  28],
       [ 29,  30,  31,  32],
       [ 33,  34,  35,  36],
       [ 37,  38,  39,  40],
       [ 41,  42,  43,  44],
       [ 45,  46,  47,  48],
       [ 49,  50,  51,  52],
       [ 53,  54,  55,  56],
       [ 57,  58,  59,  60],
       [ 61,  62,  63,  64],
       [ 65,  66,  67,  68],
       [ 69,  70,  71,  72],
       [ 73,  74,  75,  76],
       [ 77,  78,  79,  80],
       [ 81,  82,  83,  84],
       [ 85,  86,  87,  88],
       [ 89,  90,  91,  92],
       [ 93,  94,  95,  96],
       [ 97,  98,  99, 100]])

# Slicing and Indexing Multi-Dimensional ndarrays

In [79]:
#Slicing first row
f[0]

array([1, 2, 3, 4])

In [80]:
#Slicing last row
f[-1]

array([ 97,  98,  99, 100])

In [81]:
#Slicing first column
f[:, 0]

array([ 1,  5,  9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61, 65,
       69, 73, 77, 81, 85, 89, 93, 97])

In [82]:
#Slicing second column
f[:, 1]

array([ 2,  6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 62, 66,
       70, 74, 78, 82, 86, 90, 94, 98])

In [83]:
#Indexing the element in the second row and second column
f[1][1]

6

In [84]:
f[1, 1]

6

In [85]:
f[:4, 0:2]

array([[ 1,  2],
       [ 5,  6],
       [ 9, 10],
       [13, 14]])

In [86]:
f[:4, 2:4]

array([[ 3,  4],
       [ 7,  8],
       [11, 12],
       [15, 16]])

In [87]:
#Slicing from value 49 to 68
f[12:17, :]

array([[49, 50, 51, 52],
       [53, 54, 55, 56],
       [57, 58, 59, 60],
       [61, 62, 63, 64],
       [65, 66, 67, 68]])

In [88]:
#Slicing value 70 to 80
f[17:20, 1:4]

array([[70, 71, 72],
       [74, 75, 76],
       [78, 79, 80]])

In [89]:
g = f[17:20, 1:4]

In [90]:
g

array([[70, 71, 72],
       [74, 75, 76],
       [78, 79, 80]])

In [91]:
#Transposing with T - Switching axis
g.T

array([[70, 74, 78],
       [71, 75, 79],
       [72, 76, 80]])

In [92]:
#OR
g.transpose()

array([[70, 74, 78],
       [71, 75, 79],
       [72, 76, 80]])

In [93]:
#Vectorized operations still work
g * 2

array([[140, 142, 144],
       [148, 150, 152],
       [156, 158, 160]])

In [94]:
g / 4

array([[17.5 , 17.75, 18.  ],
       [18.5 , 18.75, 19.  ],
       [19.5 , 19.75, 20.  ]])

In [95]:
g.sum()

675

In [96]:
#Sum of each column
g.sum(axis=0)

array([222, 225, 228])

In [97]:
#Sum of each row
g.sum(axis = 1)

array([213, 225, 237])

In [98]:
#Cumulative sum of all elements
g.cumsum()

array([ 70, 141, 213, 287, 362, 438, 516, 595, 675])

In [99]:
#Cumulative sum down each column (along axis 0)
g.cumsum(axis = 0)

array([[ 70,  71,  72],
       [144, 146, 148],
       [222, 225, 228]])

# Numpy Array - Boolean Indexing

In [100]:
f = np.arange(1, 11)

In [101]:
f

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10])

In [102]:
#Element-wise if greater than 5
mask1 = f > 5

In [103]:
mask1

array([False, False, False, False, False,  True,  True,  True,  True,
        True])

In [104]:
#Element-wise check if smaller than 8
mask2 = f < 8

In [105]:
mask2

array([ True,  True,  True,  True,  True,  True,  True, False, False,
       False])

In [106]:
#Element-wise check if greater than 5 and smaller than 8 (Logical AND)
mask3 = mask1 & mask2

In [107]:
mask3

array([False, False, False, False, False,  True,  True, False, False,
       False])

In [108]:
#Element-wise check if greater than 5 or smaller than 8 (Logical OR)
mask4 = mask1 | mask2

In [109]:
mask4

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True])

In [110]:
#Negation
~mask3

array([ True,  True,  True,  True,  True, False, False,  True,  True,
        True])

In [111]:
#Slicing all elements greater than 5 - mask1
f[mask1]

array([ 6,  7,  8,  9, 10])

In [112]:
#Slicing all elements smaller than 8 - mask2
f[mask2]

array([1, 2, 3, 4, 5, 6, 7])

In [113]:
#Slicing elements  greater than 5 and smaller than 8 (Logical AND)
f[mask3]

array([6, 7])

In [114]:
#Slicing elements greater than 5 OR smaller than 8 (Logical OR) - All elements fulfill at least one condition
f[mask4]

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10])

# Generating Random Numbers with Numpy

#Syntax
np.random.randint(low(incl), high(excl), size)

In [115]:
#Creating 10 (size) random integers from 1 (incl) to 101 (excl)
k = np.random.randint(1, 101, 10)

In [116]:
k

array([96,  5, 52, 98, 27, 88, 92, 92, 84, 36])

In [117]:
#Setting a seed makes the pseudo-random numbers reproducible
np.random.seed(123)

In [118]:
l = np.random.randint(1, 101, 10)
l

array([67, 93, 99, 18, 84, 58, 87, 98, 97, 48])

In [119]:
m = np.random.randint(1, 101, 10)
m

array([74, 33, 47, 97, 26, 84, 79, 37, 97, 81])

In [120]:
#Creating 10 normally distributed numbers with mean 5 and std 2
k = np.random.normal(5, 2, 10)
k

array([3.46113306, 6.15349204, 5.25305184, 2.39702205, 9.41485487,
       6.04548494, 5.93128951, 6.44983045, 7.99165305, 6.49316118])

In [121]:
#Shuffling
np.random.shuffle(l)
l

array([97, 84, 67, 48, 99, 18, 98, 87, 58, 93])

In [122]:
#Sorting
l.sort()
l

array([18, 48, 58, 67, 84, 87, 93, 97, 98, 99])

In [123]:
#Reversing the sorted array gives descending order
l[::-1]

array([99, 98, 97, 93, 87, 84, 67, 58, 48, 18])

# Case Study - Numpy Vs Python Standard Library

1: We want to create 100 random integers between 1 and 10

2: Count how many times we get the integer 1

3: To do this - we have to simulate this many times

In [124]:
np.random.seed(122)

### Step 1

In [125]:
#Create 100 random integers from 1 to 11(excl)
np.random.randint(1, 11, 100)

array([ 7,  1,  3,  7,  6, 10,  7,  4,  5,  4,  4,  1,  5,  2,  7,  7,  1,
        6,  9,  8,  1,  9, 10,  3,  3,  6,  9,  4, 10,  6, 10, 10,  5,  3,
        1,  8,  1,  5,  1,  6,  1,  3,  1,  3,  8,  2,  6,  8,  8,  7,  2,
        7,  1,  1,  3, 10,  6,  4,  4,  7,  3,  1,  3, 10,  9,  5, 10,  5,
        5,  6,  9,  9,  8,  3,  3,  3,  2,  5,  6,  8, 10,  4,  7,  4,  9,
        6,  4, 10,  2,  3,  8,  5,  3,  2,  4,  5, 10,  4,  5,  2])

### Step 2

In [126]:
#Check which elements of a freshly drawn sample are equal to 1 (this call draws new numbers)
(np.random.randint(1, 11, 100) == 1)

array([False, False,  True, False, False, False, False, False,  True,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False,  True, False,  True, False, False,
       False, False, False, False, False, False, False,  True, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
        True, False, False,  True, False,  True, False, False, False,
       False, False, False, False, False, False, False,  True, False,
       False])

### Step 3

In [127]:
#Count the occurrences of True - i.e. where we have integer 1 - by summing up.
(np.random.randint(1, 11, 100) == 1).sum()

15

### Step 4

In [128]:
#Let's simulate this process 10,000 times (one million random integers in total). Reshape to 10,000 rows and 100 columns
(np.random.randint(1, 11, 10000*100) == 1).reshape(10000, 100)

array([[False, False, False, ...,  True, False, False],
       [False, False, False, ..., False, False,  True],
       [ True, False, False, ..., False, False, False],
       ...,
       [False,  True, False, ..., False, False, False],
       [ True, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False]])

In [129]:
#Count the occurrences of True in each row - i.e. how many times integer 1 appears in each simulation.
(np.random.randint(1, 11, 10000*100).reshape(10000, 100) == 1).sum(axis=1)

array([9, 7, 9, ..., 6, 6, 8])

### Step 5

In [130]:
#Calculate the mean
(np.random.randint(1, 11, 10000*100) == 1).reshape(10000, 100).sum(axis=1).mean()

9.9376

From above - we can say that in a random sample of 100 integers drawn from 1 to 10, the integer value 1 occurs about 10 times on average

### Timing our Numpy Library calculation

In [131]:
%timeit (np.random.randint(1, 11, 10000*100) == 1).reshape(10000, 100).sum(axis=1).mean()

18.3 ms ± 1.67 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


From above - It takes numpy about 18.3 milliseconds to do the calculation

## Simulating the same with Python Libraries

In [132]:
import random

In [133]:
def simulation(n_trials=10000, n_draws=100, low=1, high=10, target=1):
    """Estimate the average number of times ``target`` is drawn per trial.

    Each trial draws ``n_draws`` random integers with
    ``random.randint(low, high)`` (both bounds inclusive) and counts how many
    of them equal ``target``. The function returns the mean of that count
    over ``n_trials`` trials.

    Called with no arguments it reproduces the original experiment:
    10,000 trials of 100 draws from 1..10, counting the value 1.

    Parameters
    ----------
    n_trials : int
        Number of independent trials to run (must be >= 1).
    n_draws : int
        Number of random integers drawn in each trial (must be >= 0).
    low, high : int
        Inclusive bounds passed to ``random.randint``.
    target : int
        The value whose occurrences are counted.

    Returns
    -------
    float
        Mean number of occurrences of ``target`` per trial.

    Raises
    ------
    ValueError
        If ``n_trials`` is less than 1 or ``n_draws`` is negative.
    """
    # Guard against the division by zero a zero-trial run would cause.
    if n_trials < 1:
        raise ValueError("n_trials must be at least 1")
    if n_draws < 0:
        raise ValueError("n_draws must be non-negative")

    total_hits = 0
    for _trial in range(n_trials):
        # True counts as 1 when summed, so this is the number of hits in the
        # trial; no intermediate list of booleans is needed.
        total_hits += sum(
            random.randint(low, high) == target for _draw in range(n_draws)
        )
    return total_hits / n_trials

In [134]:
simulation()

9.9787

### Timing our Python Library calculation

In [135]:
%timeit simulation()

1.7 s ± 368 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


From above - It takes pure Python about 1.7 seconds to do the calculation