# Comparación Numpy vs. Dask

In [2]:
import dask.dataframe as dd
from dask.distributed import Client
import dask.array as da
import time 
import numpy as np

## Creamos el cliente

In [84]:
client = Client(n_workers = 2, threads_per_worker=4, memory_limit='1GB', dashboard_address=':8883')
client

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8883/status,

0,1
Dashboard: http://127.0.0.1:8883/status,Workers: 2
Total threads: 8,Total memory: 1.86 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:49226,Workers: 2
Dashboard: http://127.0.0.1:8883/status,Total threads: 8
Started: Just now,Total memory: 1.86 GiB

0,1
Comm: tcp://127.0.0.1:49234,Total threads: 4
Dashboard: http://127.0.0.1:49237/status,Memory: 0.93 GiB
Nanny: tcp://127.0.0.1:49229,
Local directory: /var/folders/dv/l82lzhjj64v3xgj4xs_hdqn80000gn/T/dask-scratch-space/worker-6dvbhsul,Local directory: /var/folders/dv/l82lzhjj64v3xgj4xs_hdqn80000gn/T/dask-scratch-space/worker-6dvbhsul

0,1
Comm: tcp://127.0.0.1:49235,Total threads: 4
Dashboard: http://127.0.0.1:49236/status,Memory: 0.93 GiB
Nanny: tcp://127.0.0.1:49231,
Local directory: /var/folders/dv/l82lzhjj64v3xgj4xs_hdqn80000gn/T/dask-scratch-space/worker-j7slux76,Local directory: /var/folders/dv/l82lzhjj64v3xgj4xs_hdqn80000gn/T/dask-scratch-space/worker-j7slux76


## Creación de datos

In [3]:
dNP = np.random.randint(0, 100, size=(5000, 5000))
dNP


array([[41, 21, 69, ..., 92, 51, 75],
       [78, 99, 17, ..., 77, 24, 15],
       [ 2,  3,  9, ..., 53, 67,  9],
       ...,
       [68, 36, 37, ..., 61,  2, 71],
       [15, 61, 11, ..., 56, 48, 10],
       [72, 75, 77, ..., 53, 89, 13]])

In [6]:
dD =da.random.randint(0, 100, size=(5000, 5000),chunks=(100, 100))
dD

Unnamed: 0,Array,Chunk
Bytes,190.73 MiB,78.12 kiB
Shape,"(5000, 5000)","(100, 100)"
Dask graph,2500 chunks in 1 graph layer,2500 chunks in 1 graph layer
Data type,int64 numpy.ndarray,int64 numpy.ndarray
"Array Chunk Bytes 190.73 MiB 78.12 kiB Shape (5000, 5000) (100, 100) Dask graph 2500 chunks in 1 graph layer Data type int64 numpy.ndarray",5000  5000,

Unnamed: 0,Array,Chunk
Bytes,190.73 MiB,78.12 kiB
Shape,"(5000, 5000)","(100, 100)"
Dask graph,2500 chunks in 1 graph layer,2500 chunks in 1 graph layer
Data type,int64 numpy.ndarray,int64 numpy.ndarray


## Operaciones Matemáticas

In [91]:
#Sacar media de las filas
#Numpy
startTimeNP = time.time()
media_filasN = np.mean(dNP, axis=1)
print(f"Medias de las filas: {media_filasN}")
print(f"Numpy se tardó: {time.time() - startTimeNP}")
#Dask
startTimeD = time.time()
media_filasD = dD.mean(axis=1)
print(f"Medias de las filas: {media_filasD}")
print(f"Dask se tardó: {time.time() - startTimeD}")

Medias de las filas: [49.6804 49.5872 49.9398 ... 49.1466 49.9824 48.9294]
Numpy se tardó: 0.5395169258117676
Medias de las filas: dask.array<mean_agg-aggregate, shape=(5000,), dtype=float64, chunksize=(100,), chunktype=numpy.ndarray>
Dask se tardó: 0.06389403343200684


In [54]:
media_filasD

Unnamed: 0,Array,Chunk
Bytes,39.06 kiB,800 B
Shape,"(5000,)","(100,)"
Dask graph,50 chunks in 5 graph layers,50 chunks in 5 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 39.06 kiB 800 B Shape (5000,) (100,) Dask graph 50 chunks in 5 graph layers Data type float64 numpy.ndarray",5000  1,

Unnamed: 0,Array,Chunk
Bytes,39.06 kiB,800 B
Shape,"(5000,)","(100,)"
Dask graph,50 chunks in 5 graph layers,50 chunks in 5 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray


In [9]:
#Máximos
#Numpy
startTimeNP = time.time()
maxN = np.max(dNP)
print(f"Máximo es: {maxN}")
print(f"Numpy se tardó: {time.time() -startTimeNP}")
#Dask
startTimeD = time.time()
maxD = dD.max()
print(f"Máximo es: {maxD}")
print(f"Dask se tardó: {time.time() -startTimeD}")

Máximo es: 99
Numpy se tardó: 0.13172388076782227
Máximo es: dask.array<max-aggregate, shape=(), dtype=int64, chunksize=(), chunktype=numpy.ndarray>
Dask se tardó: 0.026259899139404297


In [64]:
#Suma de dos columnas
#NP
startTimeNP = time.time()
suma_colN = np.sum(dNP[:, [0, 1]], axis=1)
print(f"La suma es: {suma_colN}")
print(f"Numpy se tardó: {time.time() - startTimeNP}")
#Dask
startTimeD = time.time()
suma_colD =  dD[:, [0, 1]].sum(axis=1)
print(f"La suma es: {suma_colD}")
print(f"Dask se tardó: {time.time() - startTimeD}")



La suma es: [ 87 120  60 ...  72 131 149]
Numpy se tardó: 0.030443906784057617
La suma es: dask.array<sum-aggregate, shape=(5000,), dtype=int64, chunksize=(100,), chunktype=numpy.ndarray>
Dask se tardó: 0.0061299800872802734


## Manipulación de los arreglos

In [71]:
#Slicing: Obtener las primeras dos filas y columnas de cada array
#Numpy
startTimeNP = time.time()
subNP = dNP[slice(0, 10000), slice(0, 10000)]
subNP
print(f"Numpy se tardó: {time.time() - startTimeNP}")
#Dask
startTimeD = time.time()
subD =  dD[slice(0, 10000), slice(0, 10000)]
print(f"Dask se tardó: {time.time() - startTimeD}")

#En este caso, si usamos un subconjunto pequeño es más rápido np array

Numpy se tardó: 0.00014591217041015625
Dask se tardó: 0.0002830028533935547


In [74]:
#Slicing: Obtener las primeras dos filas y columnas de cada array
#Numpy
startTimeNP = time.time()
subNP = dNP[slice(0, 10000), slice(0, 10000)]
subNP
print(f"Numpy se tardó: {time.time() - startTimeNP}")
#Dask
startTimeD = time.time()
subD =  dD[slice(0, 10000), slice(0, 10000)]
print(f"Dask se tardó: {time.time() - startTimeD}")

Numpy se tardó: 0.00011897087097167969
Dask se tardó: 0.0007169246673583984


In [76]:
#Trasponer filas y columnas
#Numpy
startTimeNP = time.time()
traspN = np.transpose(dNP)
print(f"Numpy se tardó: {time.time() - startTimeNP}")
#Dask
startTimeD = time.time()
traspN = da.transpose(dD)
print(f"Dask se tardó: {time.time() - startTimeD}")


Numpy se tardó: 0.000640869140625
Dask se tardó: 0.0009899139404296875


## Concatenar (unión)

In [60]:
dNP2 = np.random.randint(0, 100, size=(5000, 5000))
dD2 = da.random.randint(0, 100, size=(5000, 5000),chunks=(100, 100))

In [80]:
#Dask
startTimeD = time.time()
concat_resultD = da.concatenate([dD, dD2], axis=1)
print(f"Dask se tardó: {time.time() -startTimeD}")

Dask se tardó: 0.019779205322265625


In [83]:
#NP
startTimeNP = time.time()
concat_resultN = np.concatenate((dNP, dNP2), axis=1)
print(f"Numpy se tardó: {time.time() -startTimeNP}")

Numpy se tardó: 0.6801128387451172
