In [2]:
# Numpy
# Numpy is what is happening in the background of Pandas (data tool)
# Numpy is built on C++
# We could potentially do everything we are about to do ourselves
# But that would be incredibly inefficient and take a lot of time

In [49]:
# Numpy is a LOT more efficient than Python
# ~50x - 100x faster
# Arrays, C++ Arrays = Contiguous memory, optimized searches
# Lists in python can contain multiple types, and are not "Arrays"
# Numpy Arrays are all one data type, and it is stored in an n-dimensional array

In [None]:
# Usage:
# np.array(list, dtype='') - creates a np array using list of type dtype
# arr[0,1] - np indexing example
# np.ndim - returns the dimension of the array
# np.dtype - returns the array's data type
# np.shape - returns the array's shape
# np.reshape() - reshapes the array into the specified shape
# np.nditer() - returns an iterable of the given array to avoid nested loops
# np.concatenate((tuple of arrs), axis=) - concatenates arrs based on axis

In [38]:
# Let's start using it
import numpy as np

print(np.__version__)

1.26.2


In [41]:
# Creating an array
list = [1,2,3,4,5]
arr = np.array(list)

print(list)
print(arr)

[1, 2, 3, 4, 5]
[1 2 3 4 5]


In [26]:
# Dimensions - n-dimensional array

# 0-3D arrays

# 0D Array AKA a Scalar
zero_arr = np.array(42)

zero_arr

#1D Array AKA a Array
one_arr = np.array(list)

one_arr

# 2D Array AKA a Matrix
list2 = [6,7,8,9,10]
two_arr = np.array([list,list2])

two_arr

# 3D Array AKA a Tensor
list3 = [11, 12, 13, 14, 15]
three_arr = np.array((list,list2,list3))
three_arr = np.array([[list,list2],[list3,list3]])

three_arr

print(zero_arr.ndim, one_arr.ndim, two_arr.ndim, three_arr.ndim)

(zero_arr, one_arr, two_arr, three_arr)

0 1 2 3


(array(42),
 array([1, 2, 3, 4, 5]),
 array([[ 1,  2,  3,  4,  5],
        [ 6,  7,  8,  9, 10]]),
 array([[[ 1,  2,  3,  4,  5],
         [ 6,  7,  8,  9, 10]],
 
        [[11, 12, 13, 14, 15],
         [11, 12, 13, 14, 15]]]))

In [31]:
# Indexing into our nd arrays

# Python indexing - Do not  do this
print(two_arr[0][1])

# Numpy indexing (much faster)
print(two_arr[0,1])

print(three_arr[0,1,2])
three_arr

2
2
8


array([[[ 1,  2,  3,  4,  5],
        [ 6,  7,  8,  9, 10]],

       [[11, 12, 13, 14, 15],
        [11, 12, 13, 14, 15]]])

In [43]:
# Numpy arrays are a single data type

# Numpy has its own data types

# i - integers up to int64
# b - boolean
# u - unsigned ints
# f - float - float128
# c - complex
# m - timedelta
# M - datetime
# O - object
# S - string
# U - unicode string
# V - void type - fixed chunk of memory that is reserved

print(arr.dtype)

# Creating arr as a string
arr = np.array([list], dtype='S')

print(arr.dtype)

# Casting array into a float
new_arr = arr.astype('f')
print(new_arr.dtype)

arr

|S1
|S1
float32


array([[b'1', b'2', b'3', b'4', b'5']], dtype='|S1')

In [46]:
# Checking out matrix's shape

print(arr.shape)
print(two_arr.shape)
print(three_arr.shape)

(1, 5)
(2, 5)
(2, 2, 5)


In [50]:
# Reshape arrays

arr = np.array([1,2,3,4,5,6,7,8,9,10,11,12])
print(arr)
new_arr = arr.reshape(4, 3)
print(new_arr)

[ 1  2  3  4  5  6  7  8  9 10 11 12]
[[ 1  2  3]
 [ 4  5  6]
 [ 7  8  9]
 [10 11 12]]


In [53]:
# If we do not know our full dimensions, we can substitute it
new_arr = arr.reshape(-1, 3) # automatically figures out it should be 4
print(new_arr)

[[ 1  2  3]
 [ 4  5  6]
 [ 7  8  9]
 [10 11 12]]


In [56]:
# To flatten our arrays

arr = new_arr.reshape(-1)
newer_arr = three_arr.reshape(2, -1)
print(arr)
print(newer_arr)

[ 1  2  3  4  5  6  7  8  9 10 11 12]
[[ 1  2  3  4  5  6  7  8  9 10]
 [11 12 13 14 15 11 12 13 14 15]]


In [60]:
# Iterating through arrays

print(three_arr)
for x in np.nditer(three_arr):
    print(x)

[[[ 1  2  3  4  5]
  [ 6  7  8  9 10]]

 [[11 12 13 14 15]
  [11 12 13 14 15]]]
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
11
12
13
14
15


In [63]:
# Joining arrays
# Join based on axis
# axis = 0 is based on rows, axis = 1 is based on columns

arr1 = np.array([[1,2],[3,4]])
arr2 = np.array([[5,6],[7,8]])
print(arr1)
print(arr2)

arr = np.concatenate((arr1, arr2), axis=0)
print(f"Concatenating rows: {arr}")

arr = np.concatenate((arr1, arr2), axis=1)
print(f"Concatenating cols:{arr}")

[[1 2]
 [3 4]]
[[5 6]
 [7 8]]
Concatenating rows: [[1 2]
 [3 4]
 [5 6]
 [7 8]]
Concatenating cols:[[1 2 5 6]
 [3 4 7 8]]


In [70]:
# Stacking
# Stack(), hstack, dstack()

arr1 = np.array([1,2,3])
arr2 = np.array([4,5,6])
arr3 = np.array([7,8,9])
arr = np.stack((arr1,arr2,arr3))
print(f"Stack: {arr}")

arr = np.hstack((arr1,arr2,arr3))
print(f"Horizontal stack: {arr}")

arr = np.dstack((arr1, arr2, arr3))
print(f"Depth stack: {arr}")

Stack: [[1 2 3]
 [4 5 6]
 [7 8 9]]
Horizontal stack: [1 2 3 4 5 6 7 8 9]
Depth stack: [[[1 4 7]
  [2 5 8]
  [3 6 9]]]


In [75]:
# Split array into n number of parts

arr = np.array([1, 2, 3, 4, 5, 6])
new_arr = np.array_split(arr, 3)

print(new_arr)

# Will split as evenly as it can
arr = np.array([1, 2, 3, 4, 5, 6])
new_arr = np.array_split(arr, 4)

print(new_arr)

[array([1, 2]), array([3, 4]), array([5, 6])]
[array([1, 2]), array([3, 4]), array([5]), array([6])]


In [81]:
# Searching our arrays is done with the where() method

# Returns an array with the indices of all occurences

arr = np.array([1,2,3,4,5,6,4,4])
x = np.where(arr == 4)
print(x)

(array([3, 6, 7], dtype=int64),)


In [86]:
# Searchsorted() - only for sorted arrays

arr = np.array([1,2,3,4,5,6,7,8,9,10])
x = np.searchsorted(arr,6)

print(x)

5


In [88]:
# Sorting arrays
# Can still use python sort() method
# np.sort()

arr = np.array([[5,2,1],[6,3,9]])
print(np.sort(arr))

[[1 2 5]
 [3 6 9]]


In [91]:
# Filtering our arrays
# Filter based off a condition
# or we can use a boolean array

arr = np.array([40,41,42,430])
x = [False, True, False, True]

filtered_arr = arr[x]
print(filtered_arr)

[ 41 430]


In [100]:
# Copies vs Views
# When we work with Big Data we need to carefully consider what we copy
# 4 GB file -> 8 GB
# 11 GB file -> 22 GB
# Use a view - a reference to the array

arr = np.array([1,2,3,4])

view = arr.view()
copy = arr.copy()
arr[0] = 15

print(arr)
print(view)
print(copy)
print()
print(view.base)
print(copy.base)

[15  2  3  4]
[15  2  3  4]
[1 2 3 4]

[15  2  3  4]
None


In [105]:
# Time comparison
import time

# Python
start = time.time()
list = [i for i in range(1000000)]
squared = [x**2 for x in list]
stop = time.time()

pytime = stop-start
print(f"Python took: {stop-start} seconds")

# Numpy arrs
start = time.time()
arr = np.arange(1_000_000)
squared = arr**2
stop = time.time()

nptime = stop-start
print(f"Numpy took:  {stop-start} seconds")

print(f"Numpy was {pytime/nptime*100}% faster")

Python took: 0.07569217681884766 seconds
Numpy took:  0.006699800491333008 seconds
Numpy was 1129.767623927974% faster
