# Comparación Numpy vs. Dask

In [2]:
import dask.dataframe as dd
from dask.distributed import Client
import dask.array as da
import time 
import numpy as np



In [3]:
#Creación de datos
dNP = np.random.randint(0, 100, size=(50000, 50000))
dNP


array([[49, 28, 26, ..., 99, 53, 53],
       [48, 65, 89, ..., 66, 93, 38],
       [91,  9, 37, ..., 25, 73, 86],
       ...,
       [ 5,  8, 22, ..., 15, 97, 99],
       [75, 12, 83, ..., 16, 92,  9],
       [67,  8, 68, ..., 10,  1, 48]])

In [4]:
dD =da.random.randint(0, 100, size=(50000, 50000),chunks=(1000, 1000))

Operaciones Matemáticas

In [15]:
#Sacar media de las filas
#NP
startTimeNP = time.time()
media_filasN = np.mean(dNP, axis=1)
print(f"Medias de las filas: {media_filas}")
print(time.time() -startTimeNP)
#Dask
startTimeD = time.time()
media_filasD = dD.mean(axis=1)
print(f"Medias de las filas: {media_filas}")
print(time.time() -startTimeD)



Medias de las filas: dask.array<mean_agg-aggregate, shape=(50000,), dtype=float64, chunksize=(4096,), chunktype=numpy.ndarray>
24.116183042526245
Medias de las filas: dask.array<mean_agg-aggregate, shape=(50000,), dtype=float64, chunksize=(4096,), chunktype=numpy.ndarray>
0.031744956970214844


In [17]:
media_filasD

Unnamed: 0,Array,Chunk
Bytes,390.62 kiB,32.00 kiB
Shape,"(50000,)","(4096,)"
Dask graph,13 chunks in 4 graph layers,13 chunks in 4 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 390.62 kiB 32.00 kiB Shape (50000,) (4096,) Dask graph 13 chunks in 4 graph layers Data type float64 numpy.ndarray",50000  1,

Unnamed: 0,Array,Chunk
Bytes,390.62 kiB,32.00 kiB
Shape,"(50000,)","(4096,)"
Dask graph,13 chunks in 4 graph layers,13 chunks in 4 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray


In [6]:
#Máximos
#NP
startTimeNP = time.time()
maxN = np.max(dNP)
print(f"Máximo es: {maxN}")
print(time.time() -startTimeNP)
#Dask
startTimeD = time.time()
maxD = dD.max()
print(f"Máximo es: {maxD}")
print(time.time() -startTimeD)

Máximo es: 99
22.37541389465332
Máximo es: dask.array<max-aggregate, shape=(), dtype=int64, chunksize=(), chunktype=numpy.ndarray>
0.03896594047546387


In [21]:
#Suma de dos columnas
#NP
startTimeNP = time.time()
suma_colN = np.sum(dNP[:, [0, 1]], axis=1)
print(f"La suma es: {suma_colN}")
print(time.time() -startTimeNP)
#Dask
startTimeD = time.time()
suma_colD =  dD[:, [0, 1]].sum(axis=1)
print(f"La suma es: {suma_colD}")
print(time.time() -startTimeD)



La suma es: [ 77 113 100 ...  13  87  75]
8.14670705795288
La suma es: dask.array<sum-aggregate, shape=(50000,), dtype=int64, chunksize=(4096,), chunktype=numpy.ndarray>
0.03273177146911621


In [24]:
suma_colD.visualize(engine="cytoscape")

CytoscapeWidget(cytoscape_layout={'name': 'dagre', 'rankDir': 'BT', 'nodeSep': 10, 'edgeSep': 10, 'spacingFact…

Manipulación de los arreglos

In [8]:
#Slicing: Obtener las primeras dos filas y columnas de cada array
#NP
startTimeNP = time.time()
subNP = dNP[slice(0, 10000), slice(0, 10000)]
subNP
print(time.time() -startTimeNP)
#Dask
startTimeD = time.time()
subD =  dD[slice(0, 10000), slice(0, 10000)]
print(time.time() -startTimeD)

#En este caso, si usamos un subconjunto pequeño es más rápido np array

0.00037789344787597656
0.007956981658935547


In [9]:
#Slicing: Obtener las primeras dos filas y columnas de cada array
#NP
startTimeNP = time.time()
subNP = dNP[slice(0, 10000), slice(0, 10000)]
subNP
print(time.time() -startTimeNP)
#Dask
startTimeD = time.time()
subD =  dD[slice(0, 10000), slice(0, 10000)]
print(time.time() -startTimeD)

5.1975250244140625e-05
0.0002772808074951172


In [12]:
#Trasponer filas y columnas
#NP
startTimeNP = time.time()
traspN = np.transpose(dNP)
print(time.time() -startTimeNP)
#Dask
startTimeD = time.time()
traspN = da.transpose(dD)
print(time.time() -startTimeD)


9.512901306152344e-05
0.0011589527130126953


In [25]:
dNP2 = np.random.randint(0, 100, size=(5, 50000))
dD2 = da.random.randint(0, 100, size=(5, 50000),chunks=(1000, 1000))


In [26]:
#NP
startTimeNP = time.time()
concat_result = np.concatenate((dNP, dNP2), axis=1)
print(time.time() -startTimeNP)
#Dask
startTimeD = time.time()
da.stack([dD, dD2], axis=1)
print(time.time() -startTimeD)

: 