<h3><center>Data Munging - NumPy</center></h3>

In [2]:
# Importing numpy
import numpy as np

In [2]:
# Create a 1-D array holding the integers 0..4
array = np.arange(5)

In [3]:
# Array
array

array([0, 1, 2, 3, 4])

In [6]:
# Element-wise test: the comparison yields a boolean mask (True where the value is even)
array % 2 == 0

array([ True, False,  True, False,  True])

In [7]:
# Boolean-mask indexing: keep only the elements where the mask is True (the even values)
array[array % 2 == 0]

array([0, 2, 4])

In [8]:
# Baseline timing: filter the even values with a Python list comprehension
%timeit [x for x in array if x % 2 == 0]

2.81 µs ± 398 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [9]:
# Same filter written as NumPy boolean-mask indexing (the faster of the two in the recorded run)
%timeit array[array % 2 == 0]

1.75 µs ± 219 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


In [10]:
# Comparison operators also work element-wise: boolean mask of the values greater than 2
array > 2

array([False, False, False,  True,  True])

In [11]:
# Use the mask as an index to select the values greater than 2
array[array > 2]

array([3, 4])

In [4]:
# Second small array; used further down as an integer index array and as input to np.concatenate
array2 = np.array([1, 2])

In [16]:
# vstack: stack the 1-D arrays on top of each other, one per row (result is 2-D)
np.vstack((np.array([1, 2]), np.array([1, 2])))

array([[1, 2],
       [1, 2]])

In [18]:
# Rows of a 2-D array. This cell used np.row_stack, which is only an alias of
# np.vstack and is deprecated since NumPy 2.0, so np.vstack is called directly.
np.vstack((np.array([1, 2]), np.array([1, 2])))

array([[1, 2],
       [1, 2]])

In [17]:
# hstack on 1-D inputs joins them end to end, so the result is still 1-D
np.hstack((np.array([1, 2]), np.array([1, 2])))

array([1, 2, 1, 2])

In [19]:
# column_stack: each 1-D input becomes one column of a 2-D array
np.column_stack((np.array([1, 2]), np.array([1, 2])))

array([[1, 1],
       [2, 2]])

In [5]:
# Index with an integer array: array2 holds positions, so this returns array[1] and array[2]
array[array2]

array([1, 2])

In [12]:
# Join the two 1-D arrays end to end into a new array
result = np.concatenate((array, array2))

In [13]:
# The result is...
result

array([0, 1, 2, 3, 4, 1, 2])

In [53]:
# Create a 1-D array holding the integers 0..11
complete = np.arange(12)

In [54]:
# Array
complete

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

In [59]:
# Reshape the 12 values into 2 rows x 6 columns (the name `complete` is rebound to the 2-D array)
complete = complete.reshape(2, 6)

In [60]:
# Array
complete

array([[ 0,  1,  2,  3,  4,  5],
       [ 6,  7,  8,  9, 10, 11]])

In [65]:
# Split row-wise into two equal parts: part1 gets the top row, part2 the bottom row
part1, part2 = np.vsplit(complete, 2)

In [70]:
# The first part
part1

array([[0, 1, 2, 3, 4, 5]])

In [71]:
# The second part
part2

array([[ 6,  7,  8,  9, 10, 11]])

In [72]:
# Split column-wise into two equal halves: part1 gets the left three columns,
# part2 the right three (the names part1/part2 are rebound here)
part1, part2 = np.hsplit(complete, 2)

In [73]:
# The first part
part1

array([[0, 1, 2],
       [6, 7, 8]])

In [74]:
# The second part
part2

array([[ 3,  4,  5],
       [ 9, 10, 11]])

In [3]:
# Large synthetic input: 200 * 20000 = 4,000,000 integers drawn from [0, 200).
# A seeded Generator replaces the unseeded global np.random.randint so the data
# is identical on every Restart & Run All.
data = np.random.default_rng(42).integers(0, 200, size=200 * 20000)

In [4]:
# Simple function
def funcfun(num):
    """Return ``num * 2`` when ``num + 1 == 1``, otherwise ``num * num``.

    Scalar-only: the condition is evaluated as a single truth value, so passing
    a NumPy array with more than one element raises ``ValueError``.
    """
    return num * 2 if num + 1 == 1 else num * num

In [5]:
# funcfun branches with a plain `if`, which needs a single True/False value, so
# calling it on the whole array raises ValueError. The error is the point of this
# cell: catch it and print it so the notebook still completes under Run All.
try:
    funcfun(data)
except ValueError as exc:
    print("ValueError:", exc)

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [6]:
# np.vectorize wraps funcfun so that it can be called on an array and is applied per element.
# NOTE: per the NumPy documentation this is a convenience wrapper (essentially a loop
# over the elements), not compiled vectorization.
funcfunvec = np.vectorize(funcfun)

In [7]:
# Large input (the `data` array): np.vectorize wrapper vs. list comprehension vs. map()
%timeit funcfunvec(data)
%timeit [funcfun(i) for i in data]
%timeit list(map(funcfun, data))

520 ms ± 8.51 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
1.98 s ± 11.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
1.93 s ± 19.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [10]:
# Rebind `array` to a tiny 4-element array for the small-input timing comparison
array = np.array([1, 2, 3, 4])

In [11]:
# Small input (4 elements): the same three approaches; in the recorded run the
# np.vectorize wrapper was the slowest of the three at this size
%timeit funcfunvec(array)
%timeit [funcfun(i) for i in array]
%timeit list(map(funcfun, array))

10.9 µs ± 885 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
2.5 µs ± 91.3 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
2.5 µs ± 62.7 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
