# NumPy vs. Pandas Collections

In [52]:
import time
import sys
import numpy as np

In [33]:
# NumPy is faster
# 1. Contiguous storage
# 2. Leverage datatypes

# ten million
big_number = 10000000

# List
python_list = list(range(big_number))

start_time = time.time()
sum_list = sum(python_list)
list_time = time.time() - start_time

# NumPy Array
numpy_array = np.array(range(big_number), dtype=np.int64)

start_time_np = time.time()
sum_array = np.sum(numpy_array)
numpy_time = time.time() - start_time_np

print(f"Python List Time: {list_time}")
print(f"NumPy Array Time: {numpy_time}")
print(f"Numpy performing {list_time/numpy_time} times faster than Python Lists")

Python List Time: 0.24396467208862305
NumPy Array Time: 0.005988359451293945
Numpy performing 40.73981765338217 times faster than Python Lists


In [60]:
# one million, two million
lst1 = [i for i in range(1000000)]
lst2 = [i for i in range(1000000, 2000000)]

arr1 = np.array(lst1)
arr2 = np.array(lst2)

# Python List
start_time1 = time.time()
result_list = [a + b for a, b in zip(lst1, lst2)]
list_time1 = time.time() - start_time1

# NumPy Array
start_time_np1 = time.time()
result_array = arr1 + arr2
numpy_time1 = time.time() - start_time_np1

print(f"Python List Operation Time: {list_time1}")
print(f"NumPy Array Operation Time: {numpy_time1}")
print(f"Numpy performing {list_time1/numpy_time1} times faster than Python Lists")

Python List Operation Time: 0.07099771499633789
NumPy Array Operation Time: 0.0020058155059814453
Numpy performing 35.395934862712465 times faster than Python Lists


In [23]:
# handling inconsistent data in NumPy
try:
    # specify the dtype - makes things more reliable (and faster!)
    # inconsistent_array = np.array([1, "two", 3, '!'])
    # inconsistent_array = np.array([1, "two", 3, '!'], dtype=np.int32)
    inconsistent_array = np.array([1, 2, 3, 4], dtype=np.int64)
except Exception as e:
    print(
        e
    )  # Usually raises a numpy VisibleDeprecationWarning for inconsistent datatypes

print(inconsistent_array.dtype)

int64


In [24]:
inconsistent_array.strides

(8,)

In [76]:
# NumPy array
arr = np.array([1, 2, 3, 4], dtype=np.int32)    
# 
print(f"NumPy ctypes {arr.ctypes.data}\n{arr.ctypes.strides}\n{arr.nbytes}")
print(f"NumPy ctypes.data {arr.ctypes.data}")
# add another element to the array - see how size changes
arr = np.append(arr, [5])
print(f"NumPy ctypes {arr.ctypes.data}\n{arr.ctypes.strides}\n{arr.nbytes}")
print(f"NumPy ctypes.data {arr.ctypes.data}")
# 
for idx in range(len(arr)):
    print(f"NumPy array element {idx}: memory address = {arr[idx].__array_interface__['data']}")

# Python list
lst = [1, 2, 3, 4]
for idx, item in enumerate(lst):
    print(f"Python list element {idx}: memory address = {id(item)}")

NumPy ctypes 2502847002304
<numpy.core._internal.c_longlong_Array_1 object at 0x00000246CF82D250>
16
NumPy ctypes.data 2502847002304
NumPy ctypes 2502847002400
<numpy.core._internal.c_longlong_Array_1 object at 0x00000246CF82D5D0>
20
NumPy ctypes.data 2502847002400
NumPy array element 0: memory address = (2502848075312, False)
NumPy array element 1: memory address = (2502848075312, False)
NumPy array element 2: memory address = (2502848075312, False)
NumPy array element 3: memory address = (2502848075312, False)
NumPy array element 4: memory address = (2502848075312, False)
Python list element 0: memory address = 140708916073256
Python list element 1: memory address = 140708916073288
Python list element 2: memory address = 140708916073320
Python list element 3: memory address = 140708916073352


Python lists have a significant memory overhead because they store more than just the data (like object type info, size, reference count, etc.).  
NumPy arrays, being homogeneous, cut down on this overhead.

In [58]:
# NumPy consuming less memory
lst = list(range(big_number))
print(f"Size of Python list: {sys.getsizeof(lst)} bytes")

arr = np.array(lst)
print(f"Size of NumPy array: {arr.nbytes} bytes")

Size of Python list: 80000056 bytes
Size of NumPy array: 40000000 bytes


In [100]:
# row order, column order

arr_c = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], order='C')
arr_f = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], order='F')

print("Row-major order:")
print(arr_c.ravel(order='C'))
print(arr_c)
# 
print("\nColumn-major order:")
print(arr_f.ravel(order='F'))
print(arr_f)
# 
arr_c_to_f = arr_c.ravel(order='F').reshape(arr_c.shape, order = 'F')
arr_f_to_c = arr_f.ravel(order='C').reshape(arr_f.shape, order = 'C')
# this gets mondo confusing.. 
# add lots of comments
# even if they explain the code over and over
# over-communicate - confusion confuses
print("\nRow-major to Column-major:")
print(arr_c_to_f.ravel(order='F'))
print(arr_c_to_f)
print("\nColumn-major to Row-major:")
print(arr_f_to_c.ravel(order='C'))
print(arr_f_to_c)

Row-major order:
[1 2 3 4 5 6 7 8 9]
[[1 2 3]
 [4 5 6]
 [7 8 9]]

Column-major order:
[1 4 7 2 5 8 3 6 9]
[[1 2 3]
 [4 5 6]
 [7 8 9]]

Row-major to Column-major:
[1 4 7 2 5 8 3 6 9]
[[1 2 3]
 [4 5 6]
 [7 8 9]]

Column-major to Row-major:
[1 2 3 4 5 6 7 8 9]
[[1 2 3]
 [4 5 6]
 [7 8 9]]


In [80]:
def print_memory(arr):
    for i in range(arr.shape[0]):
        for j in range(arr.shape[1]):
            print(arr[i, j], arr[i, j].data)

print("Memory locations for C-order:")
print_memory(arr_c)
print("\nMemory locations for F-order:")
print_memory(arr_f)

Memory locations for C-order:
1 <memory at 0x00000246B48D3560>
2 <memory at 0x00000246B48D3560>
3 <memory at 0x00000246B48D3560>
4 <memory at 0x00000246B48D3560>
5 <memory at 0x00000246B48D3560>
6 <memory at 0x00000246B48D3560>
7 <memory at 0x00000246B48D3560>
8 <memory at 0x00000246B48D3560>
9 <memory at 0x00000246B48D3560>

Memory locations for F-order:
1 <memory at 0x00000246B48D3560>
2 <memory at 0x00000246B48D3560>
3 <memory at 0x00000246B48D3560>
4 <memory at 0x00000246B48D3560>
5 <memory at 0x00000246B48D3560>
6 <memory at 0x00000246B48D3560>
7 <memory at 0x00000246B48D3560>
8 <memory at 0x00000246B48D3560>
9 <memory at 0x00000246B48D3560>
